def test_Mafft_with_Clustalw_output(self):
    """Simple round-trip through app with clustal output."""
    cmdline = MafftCommandline(mafft_exe)
    # Use some properties:
    cmdline.input = self.infile1
    cmdline.clustalout = True
    # repr() must evaluate back to an equivalent command line.
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    stdoutdata, stderrdata = cmdline()
    # e.g. "CLUSTAL format alignment by MAFFT ..."
    # or "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)"
    self.assertTrue(stdoutdata.startswith("CLUSTAL"), stdoutdata)
    # assertNotIn shows the offending stderr text when the check fails.
    self.assertNotIn("$#=0", stderrdata)
def test_Mafft_with_Clustalw_output(self):
    """Simple round-trip through app with clustal output."""
    cmdline = MafftCommandline(mafft_exe)
    # Use some properties:
    cmdline.input = self.infile1
    cmdline.clustalout = True
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    # The first item returned is the result object (it carries
    # return_code and the command line), not a stdin handle.
    result, stdout, stderr = Application.generic_run(cmdline)
    self.assertEqual(result.return_code, 0)
    # assertTrue replaces the deprecated assert_ alias.
    self.assertTrue(
        stdout.read().startswith("CLUSTAL format alignment by MAFFT"))
    self.assertTrue("$#=0" not in stderr.read())
    self.assertEqual(str(result._cl),
                     mafft_exe + " --clustalout Fasta/f002")
def mafft_align(fa_path, afa_path):
    """Align amino acid FASTA file.

    Takes amino-acid seqs from fa_path and writes aligned amino-acids to
    afa_path. Whatever MAFFT printed on stderr is written next to it as
    "<afa_path>.err".
    """
    mafft_call = MafftCommandline(input=fa_path)
    mafft_call.maxiterate = 1000
    mafft_call.retree = 2
    stdout, stderr = mafft_call()
    # Context managers make sure both files are flushed and closed.
    with open(afa_path, "w") as aln_handle:
        aln_handle.write(stdout)
    with open("%s.err" % afa_path, "w") as err_handle:
        err_handle.write(stderr)
def align(cls, seq_records, outfile=None):
    '''Align given sequences
    @param seq_records: a list of SeqRecords objects
    @param outfile: a filename for the output alignment or None
    @return: if the outfile is none, return an AlignmentExt object;
    otherwise return True on success. Return None when the aligner run
    fails, and False when it ran but produced a missing or empty output.'''
    remove_out = not outfile
    if remove_out:
        outfile = mktmp_name('.aln.fasta')
    msafile = mktmp_fasta(seq_records)
    args = dict(thread=-1, input=msafile)
    if len(seq_records) < 10000:
        args['auto'] = True
    else:
        args['parttree'] = True
        args['partsize'] = 1000
    ali = None
    try:
        if run_cline(MafftCommandline(**args), stdout=outfile):
            if os.path.isfile(outfile) and os.path.getsize(outfile) > 0:
                if remove_out:
                    ali = AlignmentExt.from_msa(AlignIO.read(outfile, 'fasta'))
                else:
                    ali = True
            else:
                ali = False
    finally:
        # Clean up the temporary files even if running or parsing raised.
        if remove_out:
            safe_unlink(outfile)
        safe_unlink(msafile)
    return ali
def align_sequences(fasta_temp_dir, alignment_temp_dir, wd):
    """Run MAFFT on every ``*.fasta`` file found in ``fasta_temp_dir``.

    The working directory is changed to ``fasta_temp_dir``. For each FASTA
    file a running count is printed, and the alignment is written to
    ``<alignment_temp_dir>/<name>.alignment.fasta`` unless that file already
    exists. ``wd`` is accepted but not used here.
    """
    os.chdir(fasta_temp_dir)
    print('aligning each sample sequence to reference genome')
    for count, fasta_name in enumerate(glob.glob('*.fasta'), start=1):
        print(count)
        sample_seq_name = fasta_name.split('.fasta')[0]
        # output path for this sample's alignment
        target = os.path.join(
            alignment_temp_dir, '%s.alignment.fasta' % sample_seq_name)
        if os.path.isfile(target):
            # already aligned on a previous run
            continue
        cline = MafftCommandline(input=fasta_name)
        print(cline)
        aligned_text, _ = cline()
        with open(target, 'w') as out:
            out.write(aligned_text)
def test_Mafft_with_Clustalw_output(self):
    """Simple round-trip through app with clustal output."""
    cmdline = MafftCommandline(mafft_exe)
    # Use some properties:
    cmdline.input = self.infile1
    cmdline.clustalout = True
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    result, stdout, stderr = Application.generic_run(cmdline)
    self.assertEqual(result.return_code, 0)
    output = stdout.read()
    # e.g. "CLUSTAL format alignment by MAFFT ..."
    # or "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)"
    # assertTrue replaces the deprecated assert_ alias.
    self.assertTrue(output.startswith("CLUSTAL"), output)
    self.assertTrue("$#=0" not in stderr.read())
    self.assertEqual(str(result._cl),
                     mafft_exe + " --clustalout Fasta/f002")
def mafft_align(file):
    """Align ``file`` with MAFFT (auto mode) and save the result.

    The alignment is written to the input path with its extension replaced
    by ``.fasta.mafft``.
    """
    cline = MafftCommandline(input=file, auto=True)
    aligned_text, _ = cline()
    stem = os.path.splitext(file)[0]
    with open(f"{stem}.fasta.mafft", "w") as aligned:
        aligned.write(aligned_text)
def call_mafft(genefile):
    """Run MAFFT on a FASTA file and save the alignment.

    The output path is ``genefile`` with ``_prot.fasta`` replaced by
    ``_aligned.fasta``.

    Parameters
    ----------
    genefile : str
        Name/path of the FASTA file to align.

    Returns
    -------
    bool
        True if successful, False otherwise (the error is printed).
    """
    options = dict(
        input=genefile,
        adjustdirection=True,
        treeout=True,
        thread=1,
        retree=1,
        maxiterate=0,
    )
    try:
        aligned_text, _ = MafftCommandline(**options)()
        # NOTE(review): if genefile lacks "_prot.fasta" this equals the
        # input path, so the input would be overwritten — confirm callers.
        path_to_save = genefile.replace("_prot.fasta", "_aligned.fasta")
        with open(path_to_save, "w") as out:
            out.write(aligned_text)
    except Exception as e:
        print(e)
        return False
    return True
def mafft_alignment(mafft_cmd, *args):
    """Align the given sequences with MAFFT.

    Each positional sequence is written to a temporary FASTA file under a
    header equal to its position, aligned with the ``mafft_cmd`` executable,
    and read back.

    Returns a list of upper-cased aligned sequences, in the order the
    records appear in MAFFT's output.
    """
    # NOTE(review): fixed file names — concurrent callers would clash.
    fa_fpath = '/dev/shm/tmp.fa'
    mafft_fpath = '/dev/shm/tmp.mafft'
    try:
        # Write seqs to fasta file
        with open(fa_fpath, 'w') as out:
            for i, s in enumerate(args):
                out.write('>{}\n'.format(i))
                out.write('{}\n'.format(s))
        # Align
        mf_cline = MafftCommandline(mafft_cmd, input=fa_fpath)
        stdout, stderr = mf_cline()
        with open(mafft_fpath, 'w') as out:
            out.write(stdout)
        # Read output; the handle is closed by the context manager.
        # The original sorted on the enumeration index, which is a no-op,
        # so records are returned in output order.
        with open(mafft_fpath) as handle:
            return [str(rec.seq).upper()
                    for rec in SeqIO.parse(handle, 'fasta')]
    finally:
        # Delete files, also when MAFFT or parsing failed
        for path in (fa_fpath, mafft_fpath):
            if os.path.exists(path):
                os.remove(path)
def pool_write_microalignment(mblocknum, targetdata, extendedsourcedata, nbinitialsource, all_ids, msamethod):
    """Align the sequence segments of one block and return them by id.

    ``mblocknum`` is an ``(index, mblock)`` pair; ``mblock`` maps a sequence
    id to a pair whose first two items are the segment start and end.
    Segments with ``end > start`` from ``targetdata`` (``(id, seq)`` pairs)
    and from the first ``nbinitialsource`` entries of ``extendedsourcedata``
    (4-tuples starting with ``(id, seq, ...)``) are written to a temporary
    FASTA file and aligned with MUSCLE when ``msamethod == "muscle"``,
    otherwise with MAFFT.

    Returns a dict mapping every aligned id to its aligned sequence; ids in
    ``all_ids`` that were not aligned map to a run of '-' as long as the
    alignment (empty when nothing was aligned).
    """
    block_index, mblock = mblocknum[0], mblocknum[1]
    input_muscle_file = "input_muscle.fasta" + str(block_index)
    output_muscle_file = "output_muscle.fasta" + str(block_index)

    def has_segment(seqid):
        # Only non-empty segments take part in the alignment.
        return seqid in mblock and mblock[seqid][1] > mblock[seqid][0]

    aln = {}
    try:
        nbseq = 0
        with open(input_muscle_file, "w") as input_muscle:
            for geneid, geneseq in targetdata:
                if has_segment(geneid):
                    input_muscle.write(">" + geneid + "\n" + geneseq[mblock[geneid][0]:mblock[geneid][1]] + "\n")
                    nbseq += 1
            for j in range(nbinitialsource):
                cdsid, cdsseq, cdsgeneid, null = extendedsourcedata[j]
                if has_segment(cdsid):
                    input_muscle.write(">" + cdsid + "\n" + cdsseq[mblock[cdsid][0]:mblock[cdsid][1]] + "\n")
                    nbseq += 1

        msa = []
        if nbseq > 0:
            if msamethod == "muscle":
                muscle_cline = MuscleCommandline(input=input_muscle_file, out=output_muscle_file, gapopen=-800.0)
                stdout, stderr = muscle_cline()
            else:  # msamethod == "mafft"
                mafft_cline = MafftCommandline(input=input_muscle_file)
                stdout, stderr = mafft_cline()
                with open(output_muscle_file, "w") as handle:
                    handle.write(stdout)
            msa = AlignIO.read(output_muscle_file, "fasta")

        length = 0
        for record in msa:
            aln[record.id] = record.seq
            length = len(record.seq)
        # aln holds exactly the aligned ids here, so a dict lookup replaces
        # the original's separate list of present ids.
        for seq_id in all_ids:
            if seq_id not in aln:
                aln[seq_id] = '-' * length
    finally:
        # Remove the temporary files even when the aligner failed.
        for path in (input_muscle_file, output_muscle_file):
            if os.path.exists(path):
                os.remove(path)
    return aln
def do_alignment(fasta, threads):
    """Align ``fasta`` with MAFFT and return the parsed alignment.

    MAFFT is run with ``retree=1``, ``maxiterate=0`` and ``threads``
    (converted to int) threads; its FASTA output is parsed with AlignIO.
    """
    cline = MafftCommandline(
        input=fasta,
        retree=1,
        maxiterate=0,
        thread=int(threads),
    )
    fasta_text, _ = cline()
    return AlignIO.read(io.StringIO(fasta_text), "fasta")
def MakeAlignments(seqs, name, path):
    """Align exported data with MAFFT unless the result already exists.

    The alignment of the FASTA file ``seqs`` is written to
    ``path + name + '_aligned.txt'`` (``path`` is used as a plain string
    prefix). Nothing is done when that file is already present.
    """
    aligned_path = path + name + '_aligned.txt'
    if os.path.isfile(aligned_path):
        return
    mafft_cline = MafftCommandline(input=seqs, auto=True, reorder=True)
    stdout, stderr = mafft_cline()
    # The context manager closes the file even if the write fails.
    with open(aligned_path, 'w') as handle:
        handle.write(stdout)
def test_Mafft_with_options(self):
    """Round-trip through the app with an input file plus options; output on stdout."""
    cmdline = MafftCommandline(mafft_exe)
    settings = (
        ("input", self.infile1),
        ("maxiterate", 100),
        ("--localpair", True),
    )
    for option, value in settings:
        cmdline.set_parameter(option, value)
    # repr() must evaluate back to an equivalent command line.
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    out, err = cmdline()
    self.assertTrue(out.startswith(">gi|1348912|gb|G26680|G26680"))
    self.assertNotIn("$#=0", err)
def run_mafft(mafftdir, outputDir, fastaFileList, outputFileNameList, startNum, endNum, algorithm):
    """Run MAFFT on ``fastaFileList[startNum:endNum]``.

    The executable is ``mafft.bat`` (Windows) or ``mafft`` inside
    ``mafftdir``. ``algorithm`` may be None or one of 'genafpair',
    'localpair', 'globalpair' (case-insensitive); other values are ignored.
    Each alignment is written to the matching entry of
    ``outputFileNameList``. ``outputDir`` is accepted but not used here.

    Raises Exception when MAFFT produces no output for a file.
    """
    # Set up
    import platform
    exe_name = 'mafft.bat' if platform.system() == 'Windows' else 'mafft'
    mafft_exe = os.path.join(mafftdir, exe_name)
    algo = algorithm.lower() if algorithm is not None else None
    for i in range(startNum, endNum):
        # Run MAFFT
        mafft_cline = MafftCommandline(mafft_exe, input=fastaFileList[i])
        if algo == 'genafpair':
            mafft_cline.genafpair = True
        elif algo == 'localpair':
            mafft_cline.localpair = True
        elif algo == 'globalpair':
            mafft_cline.globalpair = True
        stdout, stderr = mafft_cline()
        if stdout == '':
            raise Exception('MAFFT error text below' + str(stderr))
        # Create output alignment files
        with open(outputFileNameList[i], 'w') as fileOut:
            fileOut.write(_strip_mafft_trailer(stdout))


def _strip_mafft_trailer(text):
    """Drop trailing blank lines and the 'Terminate batch job' prompt line."""
    lines = text.split('\n')
    # After split('\n') no item contains a newline, so compare stripped
    # text; the emptiness guard avoids IndexError on all-blank output.
    junk = ('', 'Terminate batch job (Y/N)?')
    while lines and lines[-1].strip() in junk:
        del lines[-1]
    return '\n'.join(lines)
def mafft_align(query_seq, target_seq, query_name, target_name, align_method="local", directory="./", quiet=False):
    """Pairwise-align a target and a query nucleotide sequence with MAFFT.

    The two sequences are written (target first) to a temporary FASTA file
    whose name starts with ``directory``, then aligned with ``--localpair``
    when ``align_method == "local"`` and ``--globalpair`` otherwise.

    Returns ``(target id, target seq, query id, query seq)`` taken from the
    first two records of the alignment.
    """
    # add time to file name to make it unique, e.g. '20160809_144522_'
    file_name = directory + datetime.now().strftime("%Y%m%d_%H%M%S_") + query_name + ".fasta"
    with open(file_name, 'w') as data_out:
        data_out.write(">{}\n{}\n>{}\n{}".format(target_name, target_seq, query_name, query_seq))
    try:
        pair_mode = "localpair" if align_method == "local" else "globalpair"
        # file_name already contains ``directory``; prefixing it again (as
        # the original did) breaks for anything but "./".
        options = {"input": file_name, "nuc": True, "maxiterate": 1000,
                   "quiet": quiet, pair_mode: True}
        mafft_cline = MafftCommandline(**options)
        out, _ = mafft_cline()
        align = AlignIO.read(StringIO(out), "fasta")
    finally:
        # Remove the temporary input even if MAFFT or parsing failed.
        os.remove(file_name)
    my_list = list(align)
    # target name, target seq, query name, query seq
    return my_list[0].id, my_list[0].seq, my_list[1].id, my_list[1].seq
def align(fasta):
    """Align ``fasta`` with MAFFT and return its first two aligned rows.

    Returns a two-item list of strings: the aligned first and second
    sequences.
    """
    # MAFFT needs to be in the path
    cline = MafftCommandline(input=os.path.relpath(fasta))
    fasta_text, _ = cline()
    msa = AlignIO.read(StringIO(fasta_text), "fasta")
    return [str(msa[row].seq) for row in (0, 1)]
def test_Mafft_with_Clustalw_output(self):
    """Simple round-trip through app with clustal output."""
    cmdline = MafftCommandline(mafft_exe)
    # Use some properties:
    cmdline.input = self.infile1
    cmdline.clustalout = True
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    child = subprocess.Popen(str(cmdline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    # communicate() drains both pipes while waiting; wait() followed by
    # read() can deadlock once a pipe buffer fills up.
    stdoutdata, stderrdata = child.communicate()
    return_code = child.returncode
    self.assertEqual(return_code, 0,
                     "Got error code %i back from:\n%s"
                     % (return_code, cmdline))
    # e.g. "CLUSTAL format alignment by MAFFT ..."
    # or "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)"
    self.assertTrue(stdoutdata.startswith("CLUSTAL"), stdoutdata)
    self.assertTrue("$#=0" not in stderrdata)
    del child
def create_msa(fasta_infile, msa_fasta, msa_phy):
    """Build a MAFFT alignment and store it as FASTA and relaxed PHYLIP.

    MAFFT's output for ``fasta_infile`` is written to ``msa_fasta`` and then
    converted to ``msa_phy`` in "phylip-relaxed" format.
    """
    fasta_text, _ = MafftCommandline(input=fasta_infile)()
    # keep the FASTA alignment on disk; it is the source for the conversion
    with open(msa_fasta, 'w') as fasta_out:
        fasta_out.write(fasta_text)
    AlignIO.convert(msa_fasta, "fasta", msa_phy, "phylip-relaxed")
def mafft(infile):
    """Align ``infile`` with MAFFT and write the result in Clustal format.

    The command line is printed before it is run. The output path is
    ``infile`` with '.fasta' replaced by '_mafft.aln'.
    """
    from Bio.Align.Applications import MafftCommandline
    from io import StringIO
    from Bio import AlignIO

    cline = MafftCommandline("mafft", input=infile)
    print(cline)
    fasta_text, _ = cline()
    msa = AlignIO.read(StringIO(fasta_text), "fasta")
    AlignIO.write(msa, infile.replace('.fasta', '_mafft.aln'), "clustal")
def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
    '''
    takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
    note that this suppresses any compensated frameshift mutations

    Parameters:
    - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
    - prune: when true, sequences with a stop codon before the last
      position are discarded
    - verbose: when true, print the id of every discarded sequence

    Sets self.aln and self.sequence_lookup; returns None. The work happens
    inside self.run_dir, which is removed afterwards.
    '''
    from Bio import AlignIO, SeqIO
    from Bio.SeqRecord import SeqRecord

    make_dir(self.run_dir)
    start_dir = os.getcwd()
    os.chdir(self.run_dir)
    try:
        # translate
        aa_seqs = {}
        bad_seq = 0
        for seq in self.seqs.values():
            tempseq = seq.seq.translate()
            has_stop = '*' in str(tempseq)[:-1]
            # use only sequences that translate without trouble
            if not has_stop or not prune:
                aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                aa_seqs[seq.id].attributes = seq.attributes
            else:
                if verbose:
                    print(seq.id, "has premature stops, discarding")
                bad_seq += 1

        print('Number of sequences with stops:', bad_seq, 'out of total', len(self.seqs))

        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

        if alignment_tool == 'muscle':
            from Bio.Align.Applications import MuscleCommandline
            aligned_fname = tmpfname[:-5] + 'aligned.fasta'
            cline = MuscleCommandline(input=tmpfname, out=aligned_fname)
            cline()
            aln_aa = AlignIO.read(aligned_fname, "fasta")
        elif alignment_tool == 'mafft':
            from Bio.Align.Applications import MafftCommandline
            # The StringIO module only exists on Python 2; fall back to io.
            try:
                from StringIO import StringIO
            except ImportError:
                from io import StringIO
            mafft_cline = MafftCommandline(input=tmpfname)
            stdout, stderr = mafft_cline()
            aln_aa = AlignIO.read(StringIO(stdout), "fasta")
        else:
            print('Alignment tool not supported:', alignment_tool)
            return

        # generate nucleotide alignment
        self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        self.sequence_lookup = {seq.id: seq for seq in self.aln}
        # add attributes to alignment
        for seq in self.seqs.values():
            if seq.id in self.sequence_lookup:
                self.sequence_lookup[seq.id].attributes = seq.attributes
    finally:
        # Restore the caller's directory and drop the scratch dir on every
        # exit path (the early return used to leave the process inside it).
        os.chdir(start_dir)
        remove_dir(self.run_dir)
def align_with_mafft(filepath, localpair=False, maxiterate=1000):
    """
    Run MAFFT on a file and parse the resulting alignment.

    :param filepath: The file to align
    :param localpair: Passed to MAFFT as its ``localpair`` option
    :param maxiterate: Passed to MAFFT as its ``maxiterate`` option
    :return: The MAFFT alignment, parsed from FASTA output by AlignIO
    """
    options = {
        "input": filepath,
        "localpair": localpair,
        "maxiterate": maxiterate,
    }
    fasta_text, _ = MafftCommandline(**options)()
    return AlignIO.read(io.StringIO(fasta_text), "fasta")
def test_Mafft_simple(self):
    """Simple round-trip through app with infile.

    Result passed to stdout.
    """
    # Use a keyword argument at init,
    cmdline = MafftCommandline(mafft_exe, input=self.infile1)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    stdoutdata, stderrdata = cmdline()
    self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
    # Used to get "Progressive alignment ..." but in v7.245 this became
    # "Progressive alignment 1/2..." and "Progressive alignment 2/2...",
    # so accept either form.
    self.assertTrue(("Progressive alignment ..." in stderrdata) or
                    ("Progressive alignment 1/" in stderrdata), stderrdata)
    self.assertTrue("$#=0" not in stderrdata)
def align_cluster(self, cluster_file):
    """Worker function for align_clusters.

    Takes a FASTA file holding an unaligned sequence cluster, aligns it with
    MAFFT (``--auto --adjustdirection``) and writes the result under
    "alignments/". Returns the path of the alignment file.
    """
    mafft_cline = MafftCommandline(input=cluster_file)
    for flag in ("--auto", "--adjustdirection"):
        mafft_cline.set_parameter(flag, True)
    color = Color()
    print(color.red + str(mafft_cline) + color.done)
    sys.stdout.flush()
    # Swap everything before the first "/" for "alignments".
    slash_at = cluster_file.find("/")
    if slash_at == -1:
        alignment_file = "alignments/" + cluster_file
    else:
        alignment_file = "alignments" + cluster_file[slash_at:]
    aligned_text, _ = mafft_cline()
    with open(alignment_file, "w") as out:
        out.write(aligned_text)
    return alignment_file
def align(self):
    """Align ``self.pair_pep_file`` with the configured aligner.

    With 'mafft' the FASTA output is parsed and re-written to
    ``self.prot_align_file``; with 'muscle' the tool writes that file
    itself. Any other ``self.align_software`` value does nothing.
    """
    tool = self.align_software
    if tool == 'mafft':
        cline = MafftCommandline(
            cmd=self.mafft_path, input=self.pair_pep_file, auto=True)
        fasta_text, _ = cline()
        msa = AlignIO.read(StringIO(fasta_text), "fasta")
        AlignIO.write(msa, self.prot_align_file, "fasta")
    if tool == 'muscle':
        cline = MuscleCommandline(
            cmd=self.muscle_path,
            input=self.pair_pep_file,
            out=self.prot_align_file,
            seqtype="protein",
            clwstrict=True,
        )
        cline()
def align_fasta(in_file_loc):
    """Align ``in_file_loc`` with MAFFT and save the result as ``*.aln``.

    The output name is everything before the first ".fa" in the input path,
    followed by ".aln".
    """
    # text before the first ".fa"
    stem = in_file_loc.partition(".fa")[0]
    aln_path = stem + ".aln"
    aligned_text, _ = MafftCommandline(input=in_file_loc)()
    with open(aln_path, "w") as out:
        out.write(aligned_text)
def test_Mafft_with_Clustalw_output(self):
    """Run MAFFT with clustal output in a subprocess and check the result."""
    cmdline = MafftCommandline(mafft_exe)
    # Configure through the property interface.
    cmdline.input = self.infile1
    cmdline.clustalout = True
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    popen_options = dict(
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=(sys.platform!="win32"),
    )
    child = subprocess.Popen(str(cmdline), **popen_options)
    stdoutdata, stderrdata = child.communicate()
    return_code = child.returncode
    failure_note = "Got error code %i back from:\n%s" % (return_code, cmdline)
    self.assertEqual(return_code, 0, failure_note)
    # Header looks like "CLUSTAL format alignment by MAFFT ..." or
    # "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)"
    self.assertTrue(stdoutdata.startswith("CLUSTAL"), stdoutdata)
    self.assertTrue("$#=0" not in stderrdata)
    del child
def call_mafft_0(in_file, out_file):
    """Align ``in_file`` with MAFFT and write the result to ``out_file``.

    The command line is printed before it is run. Returns ``out_file``.
    """
    # MAFFT is taken from the default command; an explicit executable path
    # could be passed as the first MafftCommandline argument instead.
    cline = MafftCommandline(input=in_file)
    print(cline)
    aligned_text, _ = cline()
    with open(out_file, "w") as out:
        out.write(aligned_text)
    return out_file
def run_msa(fasta_path, out_dir, bubble_num):
    """Run MAFFT on ``fasta_path`` and save the MSA for one bubble.

    The alignment is written to ``<out_dir>/msa-<bubble_num>.fasta``.
    ``bubble_num`` may be a string or a number.
    """
    mafft_cline = MafftCommandline(input=fasta_path)
    print('Performing MSA on bubble number', bubble_num)
    # run MAFFT
    stdout, stderr = mafft_cline()
    # write the MSA to a file; format() accepts integer bubble numbers,
    # where '+' concatenation raised TypeError
    msa_path = os.path.join(out_dir, 'msa-{}.fasta'.format(bubble_num))
    with open(msa_path, 'w') as fh:
        fh.write(stdout)
def align_cluster(self, cluster_file):
    """Worker function for align_clusters.

    Inputs a FASTA file containing an unaligned sequence cluster and uses
    MAFFT (``--auto --adjustdirection``) to align it. The alignment is
    written under "alignments/".

    Returns the alignment file path. On failure an error is printed and the
    path is still returned, although the file may not have been written.
    """
    mafft_cline = MafftCommandline(input=cluster_file)
    mafft_cline.set_parameter("--auto", True)
    mafft_cline.set_parameter("--adjustdirection", True)
    color = Color()
    print(color.red + str(mafft_cline) + color.done)
    sys.stdout.flush()
    if "/" in cluster_file:
        alignment_file = "alignments" + cluster_file[cluster_file.index("/"):]
    else:
        alignment_file = "alignments/" + cluster_file
    try:
        stdout, stderr = mafft_cline()
        with open(alignment_file, "w") as handle:
            handle.write(stdout)
    except Exception as err:
        # Still best-effort, but no longer traps KeyboardInterrupt or
        # SystemExit, and the underlying cause is reported.
        print(
            color.red +
            "Error: alignment file not generated. Please check your MAFFT installation."
            + color.done)
        print(color.red + str(err) + color.done)
    return alignment_file
def align(self):
    """Align ``self.sequencefile`` into ``self.alignfile``.

    With 'mafft' the FASTA output is parsed and re-written to the alignment
    file; with 'muscle' the tool writes that file itself. Any other
    ``self.align_software`` value does nothing.
    """
    tool = self.align_software
    if tool == 'mafft':
        cline = MafftCommandline(
            cmd=self.mafft_path, input=self.sequencefile, auto=True)
        fasta_text, _ = cline()
        msa = AlignIO.read(StringIO(fasta_text), "fasta")
        AlignIO.write(msa, self.alignfile, "fasta")
    if tool == 'muscle':
        cline = MuscleCommandline(
            cmd=self.muscle_path,
            input=self.sequencefile,
            out=self.alignfile,
        )
        cline()
def test_Mafft_with_PHYLIP_output(self):
    """Round-trip through the app with PHYLIP output enabled."""
    options = dict(input=self.infile1, phylipout=True)
    cmdline = MafftCommandline(mafft_exe, **options)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    out, err = cmdline()
    # Header is e.g. " 3 706\n"; allow some variation in the column count
    self.assertTrue(out.startswith(" 3 70"), out)
    # The shortened name must appear, the full identifier must not.
    self.assertTrue("gi|1348912 " in out, out)
    self.assertTrue("gi|1348912|gb|G26680|G26680" not in out, out)
    self.assertTrue("$#=0" not in err)
def alignSeqs(in_file, out_file, mafft_bat):
    """Align ``in_file`` using the MAFFT executable ``mafft_bat``.

    The alignment goes to ``out_file`` and MAFFT's stderr text to
    'error.txt' in the current directory. Always returns True.
    """
    aligned_text, err_text = MafftCommandline(mafft_bat, input=in_file)()
    outputs = ((out_file, aligned_text), ('error.txt', err_text))
    for path, text in outputs:
        with open(path, 'w+') as handle:
            handle.write(text)
    return True
def test_Mafft_simple(self):
    """Round-trip through the app with an input file; output goes to stdout."""
    # The input is given as a keyword argument at construction time.
    cmdline = MafftCommandline(mafft_exe, input=self.infile1)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    out, err = cmdline()
    self.assertTrue(out.startswith(">gi|1348912|gb|G26680|G26680"))
    # Older versions print "Progressive alignment ..."; from v7.245 the
    # message is "Progressive alignment 1/2..." and "... 2/2...".
    progress_markers = ("Progressive alignment ...", "Progressive alignment 1/")
    progress_seen = any(marker in err for marker in progress_markers)
    self.assertTrue(progress_seen, err)
    self.assertNotIn("$#=0", err)
def test_Mafft_with_PHYLIP_namelength(self):
    """PHYLIP output keeps the full identifiers when namelength is 50."""
    options = dict(input=self.infile1, phylipout=True, namelength=50)
    cmdline = MafftCommandline(mafft_exe, **options)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    out, err = cmdline()
    # Header is e.g. " 3 706\n" or " 3 681"; allow some variation in the
    # column count
    accepted_headers = (" 3 68", " 3 69", " 3 70")
    self.assertTrue(out.startswith(accepted_headers), out)
    self.assertTrue("gi|1348912|gb|G26680|G26680" in out, out)
    self.assertTrue("$#=0" not in err)
def call_mafft(path_to_save, genefile):
    """Align ``genefile`` with MAFFT and write the result to ``path_to_save``.

    Returns True on success. On any error the exception is printed and
    False is returned.
    """
    try:
        # Single-argument print(...) behaves the same on Python 2 and 3;
        # the statement form is a SyntaxError on Python 3.
        print("maffting " + os.path.basename(genefile))
        mafft_cline = MafftCommandline(input=genefile)
        stdout, stderr = mafft_cline()
        with open(path_to_save, "w") as handle:
            handle.write(stdout)
        return True
    except Exception as e:
        print(e)
        return False
def align_seqs(pool_input):
    """Align one sequence collection and optionally trim it with trimal.

    ``pool_input`` is a 13-item sequence: counter, total,
    sequence_collection, aligner ('mafft' or 'muscle'), gap_opening_penalty,
    gap_extension_penalty, no_trim, trimal_setting, window_size,
    seq_proportion, conserve_alignment_percentage, min_length, outdir.

    The alignment is written to ``outdir`` under the collection's base name
    with the 'sequence_collection_locus_' prefix removed. Unless ``no_trim``
    is set, trimal rewrites that file in place. When ``min_length`` is set
    and the alignment is shorter, the file is deleted.

    Returns the locus name (file name without '.fasta') when the alignment
    was deleted for being too short, otherwise None.

    Raises ValueError for an unsupported ``aligner``.
    """
    (counter, total, sequence_collection, aligner, gap_opening_penalty,
     gap_extension_penalty, no_trim, trimal_setting, window_size,
     seq_proportion, conserve_alignment_percentage, min_length,
     outdir) = pool_input
    filename = os.path.basename(sequence_collection).replace(
        'sequence_collection_locus_', '')
    if aligner == 'mafft':
        cline = MafftCommandline(input=sequence_collection,
                                 adjustdirection=True,
                                 maxiterate=1000,
                                 op=gap_opening_penalty,
                                 ep=gap_extension_penalty)
    elif aligner == 'muscle':
        cline = MuscleCommandline(input=sequence_collection,
                                  maxiters=1000,
                                  gapopen=gap_opening_penalty,
                                  gapextend=gap_extension_penalty)
    else:
        # Previously this fell through to an unbound ``cline``.
        raise ValueError(
            "Unsupported aligner: %r (expected 'mafft' or 'muscle')"
            % (aligner,))
    stdout, stderr = cline()
    alignment_out = os.path.join(outdir, filename)
    sys.stdout.write('\rAligning sequence collections %i/%i ' %
                     (int(counter + 1), total))
    sys.stdout.flush()
    with open(alignment_out, "w") as handle:
        handle.write(stdout)
    if not no_trim:
        # trim alignments with trimal (input and output are the same file)
        cmd = ["trimal", "-in", alignment_out, "-out", alignment_out]
        if trimal_setting != 'manual':
            cmd.append('-%s' % trimal_setting)
        else:
            cmd.extend([
                '-w', str(window_size),
                '-gt', str(seq_proportion),
                '-cons', str(conserve_alignment_percentage),
            ])
        # run trimal command; communicate() drains the pipes and waits.
        # Its output is unused (the original unpacked it in swapped order).
        proc = subprocess.Popen(cmd,
                                stderr=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        proc.communicate()
    if min_length:
        align = AlignIO.read(alignment_out, "fasta")
        al_length = len(align[0])
        if al_length < min_length:
            # delete file if smaller than minlength
            os.remove(alignment_out)
            # Return locus name in case alignment is too short
            return filename.replace('.fasta', '')
    return None
def mafft(in_file: str):
    """Run MAFFT on a file and save the multiple sequence alignment.

    The command line is printed after the run, and the alignment is
    written to "gisaid_results/aligned.fasta".

    Args:
        in_file: Input file
    """
    cline = MafftCommandline(input=in_file)
    aligned_text, _ = cline()
    print(cline)
    with open("gisaid_results/aligned.fasta", "w") as out:
        out.write(aligned_text)
def test_Mafft_with_options(self):
    """Simple round-trip through app with infile and options.

    Result passed to stdout.
    """
    cmdline = MafftCommandline(mafft_exe)
    cmdline.set_parameter("input", self.infile1)
    cmdline.set_parameter("maxiterate", 100)
    cmdline.set_parameter("--localpair", True)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    result, stdout, stderr = Application.generic_run(cmdline)
    self.assertEqual(result.return_code, 0)
    # assertTrue replaces the deprecated assert_ alias.
    self.assertTrue(
        stdout.read().startswith(">gi|1348912|gb|G26680|G26680"))
    self.assertTrue("$#=0" not in stderr.read())
    self.assertEqual(str(result._cl),
                     mafft_exe + " --localpair --maxiterate 100 Fasta/f002")
def test_Mafft_with_options(self):
    """Simple round-trip through app with infile and options.

    Result passed to stdout.
    """
    cmdline = MafftCommandline(mafft_exe)
    cmdline.set_parameter("input", self.infile1)
    cmdline.set_parameter("maxiterate", 100)
    cmdline.set_parameter("--localpair", True)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    child = subprocess.Popen(str(cmdline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    # communicate() drains both pipes while waiting; wait() followed by
    # read() can deadlock once a pipe buffer fills up.
    stdoutdata, stderrdata = child.communicate()
    return_code = child.returncode
    self.assertEqual(return_code, 0,
                     "Got error code %i back from:\n%s"
                     % (return_code, cmdline))
    self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
    self.assertTrue("$#=0" not in stderrdata)
    del child
def test_Mafft_with_options(self):
    """Round-trip through the app with an input file plus options.

    MAFFT runs in a subprocess and its result is read from stdout.
    """
    cmdline = MafftCommandline(mafft_exe)
    settings = (
        ("input", self.infile1),
        ("maxiterate", 100),
        ("--localpair", True),
    )
    for option, value in settings:
        cmdline.set_parameter(option, value)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    popen_options = dict(
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=(sys.platform!="win32"),
    )
    child = subprocess.Popen(str(cmdline), **popen_options)
    stdoutdata, stderrdata = child.communicate()
    return_code = child.returncode
    failure_note = "Got error code %i back from:\n%s" % (return_code, cmdline)
    self.assertEqual(return_code, 0, failure_note)
    self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
    self.assertTrue("$#=0" not in stderrdata)
    del child
def test_Mafft_with_complex_command_line(self):
    """Round-trip with a command line that sets many options."""
    cmdline = MafftCommandline(mafft_exe)
    # Option names are given both with and without the leading dashes.
    settings = (
        ("input", self.infile1),
        ("--localpair", True),
        ("--weighti", 4.2),
        ("retree", 5),
        ("maxiterate", 200),
        ("--nofft", True),
        ("op", 2.04),
        ("--ep", 0.51),
        ("--lop", 0.233),
        ("lep", 0.2),
        ("--reorder", True),
        ("--treeout", True),
        ("nuc", True),
    )
    for option, value in settings:
        cmdline.set_parameter(option, value)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    expected = (
        mafft_exe
        + " --localpair --weighti 4.2 --retree 5 "
        "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
        " --lop 0.233 --lep 0.2 --reorder --treeout"
        " --nuc Fasta/f002"
    )
    self.assertEqual(str(cmdline), expected)
    out, err = cmdline()
    self.assertTrue(out.startswith(">gi|1348912|gb|G26680|G26680"))
    self.assertTrue("$#=0" not in err)
def test_Mafft_with_complex_command_line(self):
    """Round-trip with a command line that sets many options (subprocess run)."""
    cmdline = MafftCommandline(mafft_exe)
    # Option names are given both with and without the leading dashes.
    settings = (
        ("input", self.infile1),
        ("--localpair", True),
        ("--weighti", 4.2),
        ("retree", 5),
        ("maxiterate", 200),
        ("--nofft", True),
        ("op", 2.04),
        ("--ep", 0.51),
        ("--lop", 0.233),
        ("lep", 0.2),
        ("--reorder", True),
        ("--treeout", True),
        ("nuc", True),
    )
    for option, value in settings:
        cmdline.set_parameter(option, value)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    expected = (
        mafft_exe
        + " --localpair --weighti 4.2 --retree 5 "
        "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
        " --lop 0.233 --lep 0.2 --reorder --treeout"
        " --nuc Fasta/f002"
    )
    self.assertEqual(str(cmdline), expected)
    popen_options = dict(
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=(sys.platform!="win32"),
    )
    child = subprocess.Popen(str(cmdline), **popen_options)
    stdoutdata, stderrdata = child.communicate()
    return_code = child.returncode
    failure_note = "Got error code %i back from:\n%s" % (return_code, cmdline)
    self.assertEqual(return_code, 0, failure_note)
    self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
    self.assertTrue("$#=0" not in stderrdata)
    del child
def test_Mafft_with_complex_command_line(self):
    """Round-trip with complex command line."""
    cmdline = MafftCommandline(mafft_exe)
    cmdline.set_parameter("input", self.infile1)
    cmdline.set_parameter("--localpair", True)
    cmdline.set_parameter("--weighti", 4.2)
    cmdline.set_parameter("retree", 5)
    cmdline.set_parameter("maxiterate", 200)
    cmdline.set_parameter("--nofft", True)
    cmdline.set_parameter("op", 2.04)
    cmdline.set_parameter("--ep", 0.51)
    cmdline.set_parameter("--lop", 0.233)
    cmdline.set_parameter("lep", 0.2)
    cmdline.set_parameter("--reorder", True)
    cmdline.set_parameter("--treeout", True)
    cmdline.set_parameter("nuc", True)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    self.assertEqual(str(cmdline), mafft_exe
                     + " --localpair --weighti 4.2 --retree 5 "
                     + "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
                     + " --lop 0.233 --lep 0.2 --reorder --treeout"
                     + " --nuc Fasta/f002")
    child = subprocess.Popen(str(cmdline),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    # communicate() drains both pipes while waiting; wait() followed by
    # read() can deadlock once a pipe buffer fills up.
    stdoutdata, stderrdata = child.communicate()
    return_code = child.returncode
    self.assertEqual(return_code, 0,
                     "Got error code %i back from:\n%s"
                     % (return_code, cmdline))
    self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
    self.assertTrue("$#=0" not in stderrdata)
    del child
def main():
    """Look up taxids, download sequences and align them with MAFFT.

    Steps, in order:
    1. Read taxon names (first whitespace-separated field of each line of
       the module-level ``taxa_file``), resolve each with ``get_taxon_id``
       and write "name<TAB>taxid" lines to taxids.txt.
    2. Fetch sequences with ``get_sequences`` for every taxid other than
       "not found"; all records are kept.
    3. Write the records to output_unaligned_gb_format.fasta and, with a
       custom description line, to output_unaligned_custom_format.fasta.
    4. Align the custom-format file with MAFFT into output_aligned.fasta;
       if this step fails the alignment is skipped and the error printed.
    """
    import time
    from Bio import Entrez
    from Bio import SeqIO

    print("\n\nmatrix_maker.py\n\n")

    print("Getting all taxid...\n")
    print("Writing taxids to file taxids.txt...\n")
    taxids = []
    # Both files are closed by the context manager, also on error.
    with open("taxids.txt", "w") as taxids_file, open(taxa_file) as name_file:
        for line in name_file:
            fields = line.split()
            if not fields:
                # A blank line used to raise IndexError.
                continue
            name = fields[0]
            taxid = get_taxon_id(name)
            name_taxid_text = name + "\t" + taxid
            print(name_taxid_text)
            taxids_file.write(name_taxid_text + "\n")
            taxids.append(taxid)
            # dont overload genbank
            time.sleep(0.1)

    print("\nDownloading sequences for each taxid...\n")
    final_records = []
    for taxid in taxids:
        if taxid != "not found":
            # keep all records; extend avoids re-copying the list each time
            final_records.extend(get_sequences(taxid))
            # dont overload genbank
            time.sleep(0.2)

    print("\nGenerating unaligned FASTA file with GenBank formatted description...\n")
    SeqIO.write(final_records, "output_unaligned_gb_format.fasta", "fasta")

    print("Generating unaligned FASTA file with custom formatted description...\n")
    with open("output_unaligned_custom_format.fasta", "w") as unaligned_file:
        for record in final_records:
            organism = record.annotations["organism"]
            # remove the organism name from the description
            description = record.description.replace(organism + " ", "")
            # custom format: >Organism name_accession_description
            description = organism + "_" + record.id + "_" + description
            description = description.replace(" ", "_")
            unaligned_file.write(">" + description + "\n")
            unaligned_file.write(str(record.seq) + "\n")

    print("Making alignment with MAFFT...")
    try:
        from Bio.Align.Applications import MafftCommandline
        mafft_cline = MafftCommandline(input="output_unaligned_custom_format.fasta")
        mafft_cline.set_parameter("--auto", True)
        mafft_cline.set_parameter("--adjustdirection", True)
        print(str(mafft_cline))
        stdout, stderr = mafft_cline()
        print("Writing alignment to FASTA file...\n")
        with open("output_aligned.fasta", "w") as handle:
            handle.write(stdout)
    except Exception as err:
        # Still best-effort, but Ctrl-C is no longer swallowed and the real
        # cause (import, run or write failure) is shown.
        print("Problem finding MAFFT, alignment skipped.")
        print(err)
    print("Done!\n")
def _open_csv(path):
    """Open *path* for reading in the mode the ``csv`` module expects.

    Python 2's csv module wants binary files; Python 3's wants text files
    opened with ``newline=""`` and fails on bytes.
    """
    if sys.version_info[0] < 3:
        return open(path, "rb")
    return open(path, "r", newline="")


def _print_progress(done, total):
    """Overwrite the current console line with ``Completed: done/total (x%)``."""
    percent = str(round(100 * done / float(total), 2))
    sys.stdout.write("\r" + "Completed: " + str(done) + "/" + str(total) + " (" + percent + "%)")
    sys.stdout.flush()


def _read_genes(path):
    """Parse the gene list CSV at *path* into a list of ``Gene`` objects.

    File format (one rule per row)::

        gene_name,include,rbcL,RBCL
        gene_name,exclude,RRRBCL

    An ``include`` row creates a gene and records its non-empty search names;
    an ``exclude`` row adds non-empty exclusions to every already-read gene
    with that name. Rows with fewer than two columns are ignored.
    """
    genes = []
    with _open_csv(path) as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            if len(row) < 2:
                continue
            if row[1] == "include":
                gene = Gene(row[0])
                for value in row[2:]:
                    if value != "":
                        gene.gene_names.append(value)
                genes.append(gene)
            elif row[1] == "exclude":
                for gene in genes:
                    if gene.name == row[0]:
                        for value in row[2:]:
                            if value != "":
                                gene.exclusions.append(value)
    return genes


def _align_with_mafft(unaligned_path, aligned_path):
    """Run MAFFT on *unaligned_path* and write its stdout to *aligned_path*.

    Best-effort: any failure is reported on stdout and the alignment is
    skipped, so the remaining genes and the summary are still produced.
    """
    print("Making alignment with MAFFT...")
    try:
        from Bio.Align.Applications import MafftCommandline

        mafft_cline = MafftCommandline(input=unaligned_path)
        mafft_cline.set_parameter("--auto", True)
        mafft_cline.set_parameter("--adjustdirection", True)
        print(str(mafft_cline))
        stdout, _ = mafft_cline()
        print("Writing alignment to FASTA file...")
        with open(aligned_path, "w") as handle:
            handle.write(stdout)
    except Exception as err:
        # Narrower than a bare except: Ctrl-C / SystemExit propagate, and the
        # underlying reason is shown to the user.
        print("Problem finding MAFFT, alignment skipped.")
        print("Reason: " + str(err))


def main():
    """Command-line entry point: download gene sequences per taxon and align them.

    Workflow:

    1. Parse and validate ``--email``, ``--species``, ``--genes`` and the
       optional ``--max_seq_length`` / ``--taxids`` arguments. On a missing
       or invalid required argument a message is printed and the process
       exits (exit status 0, kept as-is for existing callers).
    2. Load known taxids from ``--taxids`` if given, then walk the species
       list, looking up any missing taxid through ``Taxon.get_taxid`` and
       writing every ``binomial,taxid`` pair to ``taxids.csv``.
    3. For each gene: fetch sequences for every taxon with a taxid, write
       the longest sequence per taxon to ``<gene>.fasta`` and align it with
       MAFFT into ``aligned_<gene>.fasta``.
    4. Write ``result.csv`` with one row per taxon and one accession column
       per gene.
    """
    # parse the command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--email", "-e", help="Email address for NCBI database searches.")
    parser.add_argument("--genes", "-g", help="Text file that contains a list of all gene names.")
    parser.add_argument(
        "--max_seq_length",
        "-m",
        help="Optional. Sets the maximum sequence length to include. Use this to exclude genomes.",
    )
    parser.add_argument(
        "--species", "-s", help="Text file that contains a list of all species binomials and their synonyms."
    )
    parser.add_argument(
        "--taxids",
        "-t",
        help="Optional. Text file that contains a list of all taxids. Use this to avoid repeating the NCBI taxid lookups.",
    )
    args = parser.parse_args()

    print("\n\nmatrix_maker.py\n\n")

    if not args.email:
        print(
            "NCBI requires an email address for database searches. Please use the --email flag to specify an email address.\n"
        )
        sys.exit(0)
    email = args.email

    if not args.species or not os.path.isfile(args.species):
        print("Please specify a valid list of taxa to search for.\n")
        sys.exit(0)

    # -1 is passed through to get_longest_seq when no limit was requested.
    max_seq_length = int(args.max_seq_length) if args.max_seq_length else -1

    if not args.genes or not os.path.isfile(args.genes):
        print("Please specify a valid list of genes to search for.\n")
        sys.exit(0)
    genes = _read_genes(args.genes)

    # list of all taxon objects
    taxa = []

    # check for taxid
    print("Checking for taxids csv file...")
    if args.taxids and os.path.isfile(args.taxids):
        with _open_csv(args.taxids) as csvfile:
            print("Found taxids csv file, reading taxids...\n")
            for row in csv.reader(csvfile, delimiter=","):
                if len(row) < 2:
                    continue
                taxa.append(Taxon(row[0], row[1]))
    else:
        print("No taxids csv file found.\n")

    # First taxon seen for each binomial, so lookups are O(1) while matching
    # the "first match wins" behaviour of a linear scan over ``taxa``.
    known = {}
    for taxon in taxa:
        known.setdefault(taxon.binomial, taxon)

    # Count lines up front for the progress display; the handle is closed
    # before the file is re-opened for parsing.
    with open(args.species) as species_file:
        num_lines = sum(1 for _ in species_file)

    # open species list file, get synonyms and any missing taxids
    with _open_csv(args.species) as csvfile, open("taxids.csv", "w") as taxids_file:
        print("Checking list of species, getting missing taxids from NCBI...")
        for i, row in enumerate(csv.reader(csvfile, delimiter=","), 1):
            _print_progress(i, num_lines)
            if not row:
                # Blank line in the species list.
                continue
            taxon = known.get(row[0])
            if taxon is None:
                # get the taxid from NCBI
                taxon = Taxon(row[0])
                taxon.get_taxid(email)
                # dont overload genbank
                time.sleep(0.1)
                taxa.append(taxon)
                known.setdefault(taxon.binomial, taxon)
            taxids_file.write(taxon.binomial + "," + taxon.taxid + "\n")
            # add synonyms
            for synonym in row[1:]:
                taxon.synonyms.append(synonym)

    print("\nWriting all taxids to file taxids.csv...")
    print("\nDownloading sequences from NCBI...")
    for gene in genes:
        print("\nSearching for gene: " + gene.name)
        num_taxa = len(taxa)
        for i, taxon in enumerate(taxa, 1):
            # Progress is measured against the taxa being processed, so the
            # count and the percentage agree.
            _print_progress(i, num_taxa)
            if taxon.taxid != "not found":
                taxon.get_sequences(email, gene)
                # dont overload genbank
                time.sleep(0.2)

        print("\nGenerating unaligned FASTA file...")
        unaligned_path = gene.name + ".fasta"
        with open(unaligned_path, "w") as unaligned_file:
            for taxon in taxa:
                record = taxon.get_longest_seq(gene.name, max_seq_length)
                # Identity check: does not depend on how the record type
                # implements ``!=``.
                if record is not None:
                    # output format: >binomial_accession_description
                    description = taxon.binomial + "_" + record.id + "_" + record.description
                    description = description.replace(" ", "_")
                    unaligned_file.write(">" + description + "\n")
                    unaligned_file.write(str(record.seq) + "\n\n")

        _align_with_mafft(unaligned_path, "aligned_" + gene.name + ".fasta")

    print("\nGenerating summary results spreadsheet...\n")
    with open("result.csv", "w") as summary:
        summary.write("taxon," + "".join(gene.name + "," for gene in genes) + "\n")
        for taxon in taxa:
            cells = [taxon.binomial]
            for gene in genes:
                # each column will be the longest sequences accession
                record = taxon.get_longest_seq(gene.name, max_seq_length)
                cells.append(record.id if record is not None else "")
            # Every cell, including the last, is followed by a comma.
            summary.write(",".join(cells) + ",\n")
    print("Done!\n")
def test_Mafft_with_complex_command_line(self):
    """Round-trip with complex command line.

    Sets a mix of switches and valued options (given both with and without
    the leading ``--``), checks that ``repr`` of the command line evaluates
    back to an equivalent object, runs it, and verifies the exit status,
    the start of the output, and the exact command string that was run.
    """
    cmdline = MafftCommandline(mafft_exe)
    cmdline.set_parameter("input", self.infile1)
    cmdline.set_parameter("--localpair", True)
    cmdline.set_parameter("--weighti", 4.2)
    cmdline.set_parameter("retree", 5)
    cmdline.set_parameter("maxiterate", 200)
    cmdline.set_parameter("--nofft", True)
    cmdline.set_parameter("op", 2.04)
    cmdline.set_parameter("--ep", 0.51)
    cmdline.set_parameter("--lop", 0.233)
    cmdline.set_parameter("lep", 0.2)
    cmdline.set_parameter("--reorder", True)
    cmdline.set_parameter("--treeout", True)
    cmdline.set_parameter("nuc", True)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    # NOTE(review): Application.generic_run is the runner this test was
    # written against; confirm it exists in the Biopython version under test.
    result, stdout, stderr = Application.generic_run(cmdline)
    self.assertEqual(result.return_code, 0)
    # assertTrue replaces the deprecated assert_ alias; the captured text is
    # passed as the failure message to make a failing run diagnosable.
    stdoutdata = stdout.read()
    self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"), stdoutdata)
    stderrdata = stderr.read()
    self.assertTrue("$#=0" not in stderrdata, stderrdata)
    self.assertEqual(
        str(result._cl),
        mafft_exe
        + " --localpair --weighti 4.2 --retree 5 "
        + "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
        + " --lop 0.233 --lep 0.2 --reorder --treeout"
        + " --nuc Fasta/f002",
    )
atpA_records.append(SeqIO.read(handle, 'fasta')) handle.close() sleep(0.02) SeqIO.write(atpA_records, "atpA_unaligned.fasta", "fasta") for accession in rbcL_accessions: if accession.strip() != '': handle = Entrez.efetch(db='nucleotide', rettype='fasta', retmode='text', id=accession) rbcL_records.append(SeqIO.read(handle, 'fasta')) handle.close() sleep(0.02) SeqIO.write(rbcL_records, "rbcL_unaligned.fasta", "fasta") print("Aligning atpA with MAFFT...") mafft_cline = MafftCommandline(input="atpA_unaligned.fasta") mafft_cline.set_parameter("--auto", True) mafft_cline.set_parameter("--adjustdirection", True) print(str(mafft_cline)) stdout, stderr = mafft_cline() print("Writing atpA alignment to FASTA file...") with open("atpA_aligned.fasta", "w") as handle: handle.write(stdout) print("Aligning rbcL with MAFFT...") mafft_cline = MafftCommandline(input="rbcL_unaligned.fasta") mafft_cline.set_parameter("--auto", True) mafft_cline.set_parameter("--adjustdirection", True) print(str(mafft_cline)) stdout, stderr = mafft_cline()