def check_convert(in_filename, in_format, out_format, alphabet=None):
    """Check that AlignIO.convert matches a manual parse-then-write round trip.

    The file is first parsed and written to an in-memory handle to obtain the
    reference count and text.  AlignIO.convert is then run twice, once given
    the input filename and once given an open input handle; both runs must
    reproduce the reference count and text.  A ValueError raised while writing
    or converting is treated as a count of zero.
    """
    # Reference result: parse everything, then write it out in one call.
    reference = StringIO()
    alignments = list(AlignIO.parse(in_filename, in_format, None, alphabet))
    try:
        expected = AlignIO.write(alignments, reference, out_format)
    except ValueError:
        expected = 0

    # convert() with an input filename and an output handle.
    via_filename = StringIO()
    try:
        converted = AlignIO.convert(
            in_filename, in_format, via_filename, out_format, alphabet
        )
    except ValueError:
        converted = 0
    assert expected == converted
    assert reference.getvalue() == via_filename.getvalue()

    # convert() with an input handle and an output handle.
    via_handle = StringIO()
    try:
        with open(in_filename) as source:
            converted = AlignIO.convert(
                source, in_format, via_handle, out_format, alphabet
            )
    except ValueError:
        converted = 0
    assert expected == converted
    assert reference.getvalue() == via_handle.getvalue()
def create_msa(fasta_infile, msa_fasta, msa_phy):
    """Align sequences with MAFFT and store the result as FASTA and PHYLIP.

    The text MAFFT prints to stdout is saved to ``msa_fasta``; that file is
    then converted to relaxed PHYLIP and written to ``msa_phy``.
    """
    run_mafft = MafftCommandline(input=fasta_infile)
    aligned_text, _mafft_stderr = run_mafft()
    with open(msa_fasta, 'w') as fasta_out:
        fasta_out.write(aligned_text)
    # Convert only after the FASTA file has been closed (and so flushed).
    AlignIO.convert(msa_fasta, "fasta", msa_phy, "phylip-relaxed")
def main():
    """Convert an alignment file to the format named on the command line.

    Expects exactly three command-line arguments: the input file, its format,
    and the desired output format.  The output file is the input name with its
    last extension replaced by the output format name.

    * ``nexus`` output is converted with an ambiguous-DNA alphabet.
    * ``mega`` output is written by hand; the input is always parsed as
      relaxed PHYLIP for this branch, regardless of the stated input format.
    * Any other format is passed straight to ``AlignIO.convert``.
    """
    if len(sys.argv) != 4:
        print("Please provide file, the file format, and the desired file format ")
        sys.exit(1)

    f = sys.argv[1]
    formatin = sys.argv[2]
    formatout = sys.argv[3]
    # Drop only the final extension: "a.b.fasta" -> "a.b", and a name without
    # any dot is kept whole instead of collapsing to an empty string.
    fout = f.rsplit('.', 1)[0]
    target = fout + '.' + formatout

    # One branch per output format.  These must be mutually exclusive:
    # a 'nexus' request must not also fall through to the generic conversion.
    if formatout == 'nexus':
        AlignIO.convert(f, formatin, target, formatout,
                        alphabet=IUPAC.ambiguous_dna)
    elif formatout == 'mega':
        with open(f, "rU") as handle:
            record_dict = SeqIO.to_dict(SeqIO.parse(handle, "phylip-relaxed"))
        with open(target, 'w') as outfile:
            outfile.write('#mega' + "\n")
            outfile.write('!Title Mytitle;' + "\n")
            outfile.write('!Format DataType=DNA indel=-;' + "\n\n")
            for n in record_dict:
                outfile.write('#' + n + "\n")
                # Sequence text is wrapped to 60 characters per line.
                for s in wrap(str(record_dict[n].seq), 60):
                    outfile.write(s + "\n")
    else:
        AlignIO.convert(f, formatin, target, formatout)
def muscle2phy():
    """Convert the Clustal files Malign<N>.aln to relaxed PHYLIP Malign<N>.phy.

    One file is converted for each entry of the module-level
    ``interesting_list`` (N runs from 1 to its length), so that a phylogenetic
    tree can be built from the PHYLIP files afterwards.  Any conversion error
    is reported with a short message instead of being raised.
    """
    try:
        for number in range(1, len(interesting_list) + 1):
            stem = "Malign" + str(number)
            AlignIO.convert(stem + ".aln", "clustal",
                            stem + ".phy", "phylip-relaxed")
        print("All MultiAlignments converted!")
    except Exception:
        # Still best-effort, but Ctrl-C and SystemExit are no longer swallowed.
        print("Converting Error!")
def convert(arg):
    """Convert each FASTA file in ``arg.input`` to NEXUS and repoint ``arg.input``.

    Every input path gets a sibling ``<path>.nexus`` file written with an
    ambiguous-DNA alphabet.  ``arg.input`` is then replaced by the list of
    NEXUS paths and the same ``arg`` object is returned.
    """
    def _to_nexus(fasta_path):
        # Write <fasta_path>.nexus and hand back the new path.
        nexus_path = fasta_path + '.nexus'
        AlignIO.convert(fasta_path, 'fasta', nexus_path, 'nexus',
                        alphabet=IUPAC.ambiguous_dna)
        return nexus_path

    arg.input = [_to_nexus(path) for path in arg.input]
    return arg
def main(argv):
    """Command-line entry point: convert an alignment between formats.

    Options (see ``usage``): ``-i/--infile``, ``-x/--informat``,
    ``-o/--outfile``, ``-f/--outformat``.  ``infile`` and ``outformat`` are
    mandatory.  A missing input format is obtained from ``guess_format``; a
    missing output name is derived from the input name and
    ``get_extension(outformat)``.

    Special names select the standard streams: ``pipe``, ``stdin``, ``STDIN``
    or ``|`` for input; ``pipe``, ``stdout``, ``STDOUT``, ``|`` or ``>`` for
    output.

    ``phylip`` output is cleaned with ``remove_blank`` and written with
    ``write_phylip`` (the program exits silently if nothing is left);
    ``nexus`` output goes through ``write_nexus``; every other format is
    handled by ``AlignIO.convert``.  All reads use an ambiguous-DNA alphabet.
    """
    usage = 'ConvertAln -i <infile> -x <informat> -o <outfile> -f <outformat>'
    stdin_names = ('pipe', 'stdin', 'STDIN', '|')
    stdout_names = ('pipe', 'stdout', 'STDOUT', '|', '>')
    infile = ''
    informat = ''
    outfile = ''
    outformat = ''
    try:
        opts, args = getopt.getopt(
            argv, "hi:x:o:f:",
            ["infile=", "informat=", "outfile=", "outformat="])
    except getopt.GetoptError:
        sys.exit(usage)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--infile"):
            infile = arg
        elif opt in ("-x", "--informat"):
            informat = arg
        elif opt in ("-o", "--outfile"):
            outfile = arg
        elif opt in ("-f", "--outformat"):
            outformat = arg

    if not infile:
        sys.exit("must specify infile! %s" % usage)
    if not outformat:
        sys.exit("must specify format to convert to! %s" % usage)
    if not informat:
        informat = guess_format(infile)
    if not outfile:
        # Swap the last extension of the input name, or append one if the
        # input name has no dot at all.
        if '.' in infile:
            outfile = '.'.join(
                (infile.split('.')[:-1] + [get_extension(outformat)]))
        else:
            outfile = '.'.join((infile, get_extension(outformat)))

    if infile in stdin_names:
        infile = sys.stdin
    to_stdout = outfile in stdout_names

    if outformat == 'phylip':
        alignment = AlignIO.read(infile, informat,
                                 alphabet=IUPAC.ambiguous_dna)
        alignment = remove_blank(alignment)
        if len(alignment) == 0 or len(alignment[0]) == 0:
            sys.exit()
        if to_stdout:
            write_phylip(alignment, sys.stdout)
        else:
            # The context manager closes the file even if writing fails.
            with open(outfile, 'w') as out_fh:
                write_phylip(alignment, out_fh)
    else:
        if to_stdout:
            outfile = sys.stdout
        if outformat == 'nexus':
            alignment = AlignIO.read(infile, informat,
                                     alphabet=IUPAC.ambiguous_dna)
            write_nexus(alignment, outfile)
        else:
            AlignIO.convert(infile, informat, outfile, outformat,
                            alphabet=IUPAC.ambiguous_dna)
def get_alignment(pfam_id,my_db):
    """Store the alignment of a Pfam family as gzipped FASTA, unless present.

    Nothing is done when ``<alignment_folder>/<pfam_id>.fasta.gz`` already
    exists as a file.  Otherwise ``get_pfam_alignment_by_id`` is asked to
    write ``<pfam_id>.sth``, which is converted from Stockholm to FASTA and
    then compressed with the external ``gzip`` program.

    Args:
        pfam_id: family identifier; used to build all file names.
        my_db: passed through unchanged as ``db`` to
            ``get_pfam_alignment_by_id``.

    Raises:
        subprocess.CalledProcessError: if ``rm`` or ``gzip`` fails.
    """
    outpt_fname = alignment_folder + '/%s' % pfam_id
    fasta_path = outpt_fname + ".fasta"
    gz_path = fasta_path + ".gz"
    if os.path.isfile(gz_path):
        return

    print("Saving alignment for %s" % pfam_id)
    print("")
    get_pfam_alignment_by_id(pfam_id=pfam_id,
                             outpt_fname=outpt_fname + ".sth", db=my_db)
    AlignIO.convert(outpt_fname + ".sth", "stockholm", fasta_path, "fasta")
    # Argument lists (no shell) keep paths containing spaces or shell
    # metacharacters intact; failures still raise CalledProcessError.
    if os.path.exists(gz_path):
        subprocess.check_call(["rm", gz_path])
    subprocess.check_call(["gzip", fasta_path])
def align(fas, clean):
    """Align ``fas/<fas>`` with MUSCLE and convert the result to PHYLIP.

    MUSCLE is run (Clustal output to ``aln/<fas>.aln``) when that file is
    missing or when ``clean`` is true.  The Clustal file is then converted to
    ``phy/<fas>.phy``; a failed conversion only prints a warning and the
    exception.
    """
    clustal_path = 'aln/' + fas + ".aln"
    phylip_path = 'phy/' + fas + ".phy"

    already_aligned = os.path.exists(clustal_path)
    if not already_aligned or clean:
        muscle = MuscleCommandline(input='fas/' + fas, out=clustal_path,
                                   clw=True)
        print(str(muscle) + '\n')
        muscle()

    try:
        AlignIO.convert(clustal_path, "clustal", phylip_path, "phylip")
    except Exception as problem:
        print('WARNING: BAD ALIGNMENT')
        print(problem)
def realign(msa, algorithm):
    """Realign ``msa`` with the chosen algorithm and return Stockholm text.

    The alignment is serialised to FASTA text, handed to
    ``algorithms[algorithm]`` (which returns FASTA text), and the result is
    converted to Stockholm.  An empty string is returned when the conversion
    reports zero alignments.
    """
    with closing(StringIO()) as fasta_buffer:
        AlignIO.write(msa, fasta_buffer, "fasta")
        fasta_text = fasta_buffer.getvalue()

    realigned_fasta = algorithms[algorithm](fasta_text)

    with closing(StringIO()) as stockholm_buffer:
        with closing(StringIO(realigned_fasta)) as fasta_source:
            converted = AlignIO.convert(fasta_source, "fasta",
                                        stockholm_buffer, "stockholm")
        if converted:
            return stockholm_buffer.getvalue()
        return ""
def generate_npbs(path, i):
    """Write permuted copies of every alignment in ``path``, unless present.

    The working directory comes from ``get_dirs(path, i)['wdir']``.  The work
    counts as done when, for every record of the collection, a non-empty
    ``<wdir>/<name>.phy`` or ``<wdir>/<name>.phy.bz2`` exists; each missing
    file is logged as an error.  Otherwise a permuted copy of the collection
    is created and each record is written as PHYLIP, then re-written through
    ``AlignIO.convert`` in relaxed PHYLIP under the same name.

    Raises:
        OSError: if the re-written file cannot be moved over the original
            (previously a failing ``mv`` was ignored silently).
    """
    def _has_content(filename):
        # True for an existing file of non-zero size.
        return os.path.exists(filename) and os.path.getsize(filename) > 0

    c = treeCl.Collection(input_dir=path, file_format='phylip')
    working_dir = get_dirs(path, i)['wdir']

    # Check if work already done
    work_done = True
    for rec in c:
        looking_for = '{}.phy'.format(os.path.join(working_dir, rec.name))
        if not _has_content(looking_for) and not _has_content(looking_for + '.bz2'):
            logger.error("File not found or is empty: {}".format(looking_for))
            work_done = False
    if work_done:
        return

    npbs = c.permuted_copy()
    if not os.path.exists(working_dir):
        os.mkdir(working_dir)
    for rec in npbs:
        phy_path = '{}.phy'.format(os.path.join(working_dir, rec.name))
        tmp_path = phy_path + '_'
        rec.write_alignment(phy_path, 'phylip', True)
        AlignIO.convert(phy_path, 'phylip-relaxed', tmp_path, 'phylip-relaxed')
        # os.rename replaces the shell 'mv': no quoting problems with unusual
        # file names, and a failure raises instead of passing unnoticed.
        os.rename(tmp_path, phy_path)
def clustalw(inputseqfile, outputmsafile):
    """Make a multiple sequence alignment with clustalw.

    Runs ``/usr/bin/clustalw`` on ``inputseqfile``, then converts the Clustal
    alignment (expected at the input path with its extension replaced by
    ``.aln``) to FASTA at ``outputmsafile``.
    """
    import os.path

    clustalw_exe = "/usr/bin/clustalw"
    clustalw_cline = ClustalwCommandline(clustalw_exe, infile=inputseqfile)
    clustalw_cline()

    # Replace only the final extension.  Splitting on the first "." anywhere
    # in the path broke inputs such as "./seqs.fasta" or "my.data/seqs.fasta".
    outff = os.path.splitext(inputseqfile)[0] + ".aln"

    # Parse once first so an unreadable or empty alignment still raises,
    # exactly as the original read did.
    AlignIO.read(outff, "clustal")
    AlignIO.convert(outff, "clustal", outputmsafile, "fasta")
# Demonstration of building, slicing, reading and writing multiple sequence
# alignments with Biopython.
# NOTE(review): SeqRecord, Alphabet and MultipleSeqAlignment are used below
# but not imported in this snippet - presumably imported elsewhere; confirm.
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq

# Three gapped protein sequences of 30 characters each.
seq1 = "MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW"
seq2 = "MH--IFIYQIGYALKSGYIQSIRSPEY-NW"
seq3 = "MHQAIFI-QIGYALKSGY-QSIRSPEYDNW"
seqr1 = SeqRecord(Seq(seq1, Alphabet.Gapped(IUPAC.protein)), id="seq1")
seqr2 = SeqRecord(Seq(seq2, Alphabet.Gapped(IUPAC.protein)), id="seq2")
seqr3 = SeqRecord(Seq(seq3, Alphabet.Gapped(IUPAC.protein)), id="seq3")
alin = MultipleSeqAlignment([seqr1, seqr2, seqr3])
print(alin)
print(alin[1])  # 2nd sequence
print(alin[:, 2])  # 3rd column
print(alin[:, 3:7])  # 4th to 7th columns (all sequences)
print(alin[0].seq[:3])  # first 3 columns of seq1
print(alin[1:3, 5:12])  # sequences 2 and 3; 6th to 12th columns

from Bio import AlignIO

# Read a Clustal alignment and list its sequences.
alin2 = AlignIO.read("PF05371_seed.aln", "clustal")
print("Size:", alin2.get_alignment_length())
for record in alin2:
    print(record.seq, record.id)
# Two ways of producing the same FASTA file: write the alignment already in
# memory, or convert file to file (the second call overwrites the first).
AlignIO.write(alin2, "example_alin.fasta", "fasta")
AlignIO.convert("PF05371_seed.aln", "clustal", "example_alin.fasta", "fasta")
# For every input file: align, trim, export to NEXUS, infer a tree, and dump
# pairwise phylogenetic distances.  External tools are run through the shell
# with their output discarded, and their exit status is not checked.
for i in files:
    print("\n" + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " ::: Aligning, Trimming, and Inferring Phylogeny for " + i + " :::\n")
    # Text before the first "." of the file name; used to name the NEXUS file.
    gene = i.split(".")[0]
    # mafft writes <i>.aln; trimal then writes <i>.trim from it.
    sp.call("mafft " + i + " > " + i + ".aln", shell=True, stdout=sp.DEVNULL, stderr=sp.DEVNULL)
    sp.call("trimal -in " + i + ".aln -out " + i + ".trim -automated1", shell=True, stdout=sp.DEVNULL, stderr=sp.DEVNULL)
    # The trimmed alignment is read as FASTA and written as <gene>.nex.
    AlignIO.convert(i + ".trim", "fasta", gene + ".nex", 'nexus', alphabet=IUPAC.ambiguous_dna)
    ############################################### RUNNING IQTREE FOR EACH GENE & PRINT RESULTS
    sp.call("iqtree -s " + i + ".trim -bb 1000 -seed 12345", shell=True, stdout=sp.DEVNULL, stderr=sp.DEVNULL)
    # Only continue when the consensus tree file was actually produced.
    if os.path.isfile(i + ".trim.contree"):
        t = dendropy.Tree.get_from_path(i + ".trim.contree", schema='newick', preserve_underscores=True)
        d = t.phylogenetic_distance_matrix()
        # NOTE(review): written with write_csv but named .tsv - confirm the
        # delimiter that downstream readers expect.
        d.write_csv(i + ".phylodist.tsv")
def test_bootstrap_AlignIO_protein(self):
    """Bootstrap a PHYLIP protein alignment that AlignIO has just written."""
    phylip_path = "Phylip/hedgehog.phy"
    # Regenerate the PHYLIP file from the Clustal alignment first.
    written = AlignIO.convert(
        "Clustalw/hedgehog.aln", "clustal", phylip_path, "phylip"
    )
    self.check_bootstrap(phylip_path, "phylip", "p")
def write_AlignIO_protein():
    """Convert hedgehog.aln to a phylip file.

    Raises:
        AssertionError: if the conversion does not report exactly one
            alignment.
    """
    # The conversion is performed outside the assert on purpose: assert
    # statements are removed under ``python -O``, which used to skip the
    # conversion (and so the file creation) altogether.
    converted = AlignIO.convert("Clustalw/hedgehog.aln", "clustal",
                                "Phylip/hedgehog.phy", "phylip")
    assert 1 == converted
# -*- coding: utf-8 -*-
# Convert a Clustal alignment file to relaxed PHYLIP.
# Options: -input <clustal file>  -output <phylip file>
from Bio import AlignIO
import argparse

parser = argparse.ArgumentParser()
# ``action`` and ``dest`` must be strings; the bare names ``store`` and
# ``input`` were an undefined variable and the builtin function respectively.
parser.add_argument("-input", action="store", dest="input", type=str)
parser.add_argument("-output", action="store", dest="output", type=str)
args = parser.parse_args()

AlignIO.convert(args.input, "clustal", args.output, "phylip-relaxed")
from Bio import AlignIO
from Bio import Alphabet
import sys, os

# Batch-convert every alignment in a directory between FASTA, sequential
# PHYLIP and NEXUS, selecting files by their suffix.
if __name__ == "__main__":
    if len(sys.argv) != 5:
        print("usage: python seqformat_converter.py inDIR outDIR inputFormat(.fa/.phy/.nex) outputFormat(.fa/.phy/.nex)")
        sys.exit()

    inDIR = sys.argv[1]+"/"
    outDIR = sys.argv[2]+"/"
    inputFormat = sys.argv[3]
    outputFormat = sys.argv[4]

    # (input suffix, output suffix) -> (AlignIO input format, output format).
    # Suffix pairs that are not listed are silently ignored.
    supported = {
        ('.fa', '.phy'): ("fasta", "phylip-sequential"),
        ('.phy', '.fa'): ("phylip-sequential", "fasta"),
        ('.phy', '.nex'): ("phylip-sequential", "nexus"),
        ('.fa', '.nex'): ("fasta", "nexus"),
    }
    formats = supported.get((inputFormat, outputFormat))

    for i in os.listdir(inDIR):
        if not i.endswith(inputFormat):
            continue
        # Part of the file name that precedes the input suffix.
        clusterID = i.split(inputFormat)[0]
        if formats is None:
            continue
        AlignIO.convert(inDIR+i, formats[0],
                        outDIR+clusterID+outputFormat, formats[1],
                        alphabet=Alphabet.generic_dna)
def convertFasta2Phylip(input_fasta, output_prefix, format='phylip-sequential'):
    """Convert a FASTA alignment to ``<output_prefix>.phy`` using Biopython.

    ``format`` is the AlignIO output format name; it defaults to
    'phylip-sequential'.
    """
    phylip_path = output_prefix + ".phy"
    AlignIO.convert(input_fasta, 'fasta', phylip_path, format)
print >>out, d, x, "does not fall within the linear genome range" else: if str(Seq(data[d][x][1]).translate()) != str(data[d][x][2]): print >>out, d, x, "the translation does not match" else: for e2bs in data[d][x]: if int(e2bs.split("..")[1]) not in range(0, len(data[d]["CG"][1])+1): print >>log, d, x, e2bs, "does not fall within the linear genome range" accessions=[] QC=test_start_stop("./results/results.csv") if QC!=None: print >>log, QC os.system("mafft --add ./results/L1.fas --quiet --reorder ./files/all_L1.mafft.fas > output.fas") AlignIO.convert("output.fas", "fasta", "output.phy", "phylip") os.system("phyml output.phy 0 i 1 0 GTR 4.0 e 1 1.0 BIONJ n n") new_L1 = [] for seq_record in SeqIO.parse("./results/L1.fas", "fasta"): new_L1.append(seq_record.id) for nL1 in new_L1: presence("output.fas", nL1, log) #if doing more than a single record at a time, this needs to be edited to get a list of accession numbers and feed these into the function print 'all done' onlyfiles = [ f for f in listdir("./") if isfile(join("./",f)) ]
def alignSeqs(sequencedict):
    """Align the sequences of every entry and write per-entry Stockholm files.

    For each key of ``sequencedict`` the sequences are written to
    ``temp.fasta``, aligned with ``clustalw2``, converted to Stockholm,
    annotated with a ``#=GC SS_cons`` line taken from RNAalifold output and a
    ``#=GF ID`` line, and moved to ``./StockholmAlignments/<key>.aln``.  The
    Clustal alignment and the FASTA text of every entry are also appended to
    ``clustal_alignments.aln`` and ``UTRfastas.fa``.

    The bare triple-quoted strings in the body are disabled code from when a
    single combined Stockholm file was written; they have no effect.

    NOTE(review): the loop nesting was reconstructed from a whitespace-
    collapsed source - confirm that the append and cleanup steps belong
    inside the per-entry loop.
    NOTE(review): each value of ``sequencedict`` is presumably a list of
    one-item dicts mapping a species name to a sequence - verify with caller.
    """
    # Create (or truncate) the cumulative output files up front.
    clustalfh = open('clustal_alignments.aln', 'w')
    ''' stockholmfh = open('stockholm_alignments.aln', 'w') '''
    UTRfastasfh = open('UTRfastas.fa', 'w')
    clustalfh.close()
    ''' stockholmfh.close() '''
    UTRfastasfh.close()
    if os.path.exists('./StockholmAlignments/') == False:
        os.mkdir('./StockholmAlignments')
    for UTR in sequencedict:
        UTRID = str(UTR)
        #Write fasta file from dictionary entry
        fastafh = open('temp.fasta', 'w')
        fastastring = ''
        for species in sequencedict[UTR]:
            # Uses the first key/value of each item as header and sequence.
            fastastring += '>' + str(species.keys()[0]) + '\n' + str(species.values()[0]) + '\n'
        fastafh.write(fastastring)
        fastafh.close()
        # Read the FASTA text back so it can be appended to UTRfastas.fa later.
        tempfastafh = open('temp.fasta', 'r')
        tempfastalines = []
        for line in tempfastafh:
            tempfastalines.append(line)
        tempfastafh.close()
        #Align fasta using clustalw
        cline = ClustalwCommandline('clustalw2', infile = 'temp.fasta')
        cline()  #alignment now in temp.aln
        clustallines = []
        tempclustalfh = open('temp.aln', 'r')
        for line in tempclustalfh:
            clustallines.append(line)
        tempclustalfh.close()
        #Convert clustal to stockholm
        AlignIO.convert('temp.aln', 'clustal', 'tempstockholm.aln', 'stockholm')
        #Get secondary structure line from RNAalifold
        # The first space in the output is turned into a newline and the
        # third line from the end of the result is taken as the structure.
        ss = subprocess.check_output(['RNAalifold', 'temp.aln']).replace(' ', '\n', 1).split('\n')[-3]
        ssline = '#=GC SS_cons ' + ss + '\n' + '//' + '\n'
        #Replace '//' in stockholm file with secondary structure line
        replace_in_file.replace('tempstockholm.aln', '//', ssline)
        #Add ID line to file. This is necessary for Infernal.
        titleline = '# STOCKHOLM 1.0'
        IDline = '#=GF ID ' + UTRID
        replacement = titleline + '\n' + IDline + '\n'
        replace_in_file.replace('tempstockholm.aln', '# STOCKHOLM 1.0', replacement)
        #Rename stockholm file
        os.rename('tempstockholm.aln', './StockholmAlignments/' + UTRID + '.aln')
        #Now making many small stockholm files instead of one big one.
        ''' tempstockholmfh = open('tempstockholm.aln', 'r') stockholmlines = [] for line in tempstockholmfh: stockholmlines.append(line) tempstockholmfh.close() '''
        #Append current temp aln files to their respective alignment files
        with open('clustal_alignments.aln', 'a') as clustalfile:
            for line in clustallines:
                clustalfile.write(line)
        #Now making many small stockholm files instead of one big one.
        ''' with open('stockholm_alignments.aln', 'a') as stockholmfile: for line in stockholmlines: stockholmfile.write(line) '''
        with open('UTRfastas.fa', 'a') as UTRfastafile:
            for line in tempfastalines:
                UTRfastafile.write(line)
            # Three blank lines follow each entry.
            UTRfastafile.write('\n' + '\n' + '\n')
        #Cleanup
        os.remove('alirna.ps')
        os.remove('temp.aln')
        os.remove('temp.dnd')
        os.remove('temp.fasta')
@author: Shaurita D. Hutchins """ # Part 3: Use multiple sequence alignments in phylip format to create phylogenetic trees. # Mark start of program with printed text description/title. print( "\n" + (81 * "#") + "\n" + "#### Part 3: Use multiple sequence alignments to create phylogenetic trees. ####" + "\n" + (81 * "#") + "\n") # List of modules imported. from Bio import Phylo from Bio.Phylo.TreeConstruction import DistanceCalculator from Bio import AlignIO #alignment = AlignIO.read("HTR1E_aligned.phy", "phylip") #print(alignment) #print("\n") #for record in alignment: # print(record.seq + " " + record.id + "\n") #calculator = DistanceCalculator('identity') #dm = calculator.get_distance(alignment) #print(dm) x = AlignIO.convert("HTR1E_aligned.fasta", "fasta", "HTR1E_aligned.phy", "phylip-relaxed") print(x) #tree = Phylo.read('outtree.txt', 'newick') #tree.ladderize() # Flip branches so deeper clades are displayed at top #Phylo.draw(tree)
output = args.output delim = args.delimiter ############################################### CODE print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " ::: Converting alignments to NEXUS and creating partitions :::") files = sorted(glob.glob(input + '/*')) sp.call("mkdir -p " + output + "/nexus", shell=True) x = 1 partitions = open(output + "/partitions.txt", "w") for file in tqdm(files): filename = os.path.basename(file) lociname = filename.split(delim)[0] outfile = output + '/nexus/' + lociname + '.nex' AlignIO.convert(file, type, outfile, 'nexus', alphabet=IUPAC.ambiguous_dna) aln = AlignIO.read(file, type) partitions.write(lociname + " = " + str(x) + "-" + str(x - 1 + aln.get_alignment_length()) + ";") partitions.write("\n") x = x + aln.get_alignment_length() partitions.close() print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " ::: Concatenating NEXUS alignments :::") loci = sorted([ os.path.basename(x).split(".nex")[0] for x in glob.glob(output + "/nexus/*.nex") ]) nexi = [(locus, Nexus.Nexus(output + "/nexus/" + locus + ".nex"))
#usage
#python fasta.to.stockholm.py <input.file.fasta> <output.file.sto>
#       0                     1                  2
import sys
from Bio import AlignIO

# Both files are opened in one ``with`` statement so that they are closed,
# and the output flushed, even if the conversion raises.
#   filein:  input alignment (FASTA)
#   fileout: output alignment (Stockholm)
with open(sys.argv[1], "r") as filein, open(sys.argv[2], 'w') as fileout:
    AlignIO.convert(filein, "fasta", fileout, "stockholm")
# Fail early if the Clustal W executable is missing, then run the prepared
# command line.
assert os.path.isfile(clustalw_exe), "Clustal W executable missing"
stdout, stderr = cline()

# %% convert the MUSCLE alignment (FASTA, produced in Poseidon) to phylip-relaxed
# Both names start with "/" because they are appended to the working
# directory by plain string concatenation below.
msaFile = "/muscleAlignmentCoralSeq.msa"
outFile = "/muscleAlignmentCoralSeq.phy"
import os
path = os.getcwd()
#path = "/Users/kgrabb/Documents/2018.05CoralLarvae/Genomes/Poseidon/blastResults/v2"
print(path)
print(path + msaFile)
inputFile = path + msaFile
outputFile = path + outFile
# Quick look at the first rows of the input file.
# NOTE(review): the file is FASTA rather than CSV - presumably only a visual
# sanity check; confirm.
viewFile = pd.read_csv(inputFile)
print(viewFile.head(5))
AlignIO.convert(inputFile, "fasta", outputFile, "phylip-relaxed")

# %% use PhyML. feed in phy alignment with the command line wrapper
from Bio.Phylo.Applications import PhymlCommandline
# Amino-acid data, WAG model, alpha="e", 100 bootstrap replicates.
cmdline = PhymlCommandline(input=outputFile, datatype="aa", model="WAG", alpha="e", bootstrap=100)
out_log, err_log = cmdline()
# %%
def add_pplaced(pfam_id):
    """Place the not-yet-placed query domains of a Pfam family onto its tree.

    Does nothing unless ``pfam_id`` is a key of ``pplacer_queries``.  Queries
    whose ``pplacer_id`` is already recorded in the family's pplace log are
    skipped; if none remain the function returns early.  Otherwise it unpacks
    the stored log, tree and alignment into temporary copies, builds an HMM
    and a reference package, adds the query sequences to the alignment, runs
    the placement and tree-generation helpers, replaces the stored alignment
    and tree with the updated gzipped versions, and appends the placed ids to
    the log.

    NOTE(review): nesting was reconstructed from a whitespace-collapsed
    source; the whole body is assumed to sit under the initial ``if``.
    """
    if pfam_id in pplacer_queries.keys():
        print "Running PPlacer for: %s"%pfam_id
        pplace_log=pplacer_folder+'/%s_pplace_log.txt'%pfam_id
        # Collect ids already placed: each usable log line has two
        # tab-separated fields, the second being a comma-separated id list.
        Already_placed=[]
        if os.path.exists(pplace_log):
            with open(pplace_log, "r") as myfile:
                for line in myfile:
                    line=line.strip()
                    if not line:
                        continue
                    line=line.split('\t')
                    if not len(line)==2:
                        continue
                    Already_placed.extend(line[1].split(','))
        # Build a SeqRecord for every query that has not been placed yet.
        Sequnces=[]
        p_ids=[]
        for new_gene in pplacer_queries[pfam_id]:
            p_id = new_gene['pplacer_id']
            if p_id in Already_placed:
                continue
            p_ids.append(p_id)
            # seq_start is shifted by one for slicing; seq_end is used as is.
            p_seq = gene_seq[new_gene['id']][(new_gene['seq_start']-1):new_gene['seq_end']]
            Sequnces.append(SeqRecord(p_seq, id=p_id))
        if not p_ids:
            print "All %s domains for family %s have already been pplaced."%(len(Already_placed),pfam_id)
            return
        # Random suffixes for the temporary uncompressed copies.
        rand_id_1=random.randint(1000000,9999999)
        rand_id_2=random.randint(1000000,9999999)
        rand_id_3=random.randint(1000000,9999999)
        subprocess.check_call("gunzip -c %s/%s.log.gz > %s/%s.log.%d"%(tree_folder,pfam_id,tree_folder,pfam_id,rand_id_1),shell=True)
        subprocess.check_call("gunzip -c %s/%s.nw.gz > %s/%s.nw.%d"%(tree_folder,pfam_id,tree_folder,pfam_id,rand_id_2),shell=True)
        subprocess.check_call("gunzip -c %s/%s.fasta.gz > %s/%s.fasta.%d"%(alignment_folder,pfam_id,alignment_folder,pfam_id,rand_id_3),shell=True)
        # The Stockholm copy of the alignment is the input for hmm_build.
        AlignIO.convert("%s/%s.fasta.%d"%(alignment_folder,pfam_id,rand_id_3),"fasta","%s/%s.sth.%d"%(alignment_folder,pfam_id,rand_id_3),"stockholm")
        hmm_build(hmmbuild_executable_loc=path_to_hmmbuild,
                  sequence_file='%s/%s.sth.%d'%(alignment_folder,pfam_id,rand_id_3),
                  output_file='%s/%s.hmm'%(pplacer_folder,pfam_id))
        taxit_create(taxit_executable_loc=path_to_taxit,
                     aln_fasta='%s/%s.fasta.%d'%(alignment_folder,pfam_id,rand_id_3),
                     hmm_file='%s/%s.hmm'%(pplacer_folder,pfam_id),
                     tree_file='%s/%s.nw.%d'%(tree_folder,pfam_id,rand_id_2),
                     tree_stats='%s/%s.log.%d'%(tree_folder,pfam_id,rand_id_1),
                     pfam_acc=pfam_id,
                     output_location='%s/%s_pplacer'%(pplacer_folder,pfam_id),
                     aln_stockholm='%s/%s_pplacer/%s.sto.%d'%(pplacer_folder,pfam_id,pfam_id,rand_id_3),
                     )
        # Remove the temporary uncompressed copies.
        if os.path.exists("%s/%s.log.%d"%(tree_folder,pfam_id,rand_id_1)):
            subprocess.check_call("rm %s/%s.log.%d"%(tree_folder,pfam_id,rand_id_1),shell=True)
        if os.path.exists("%s/%s.nw.%d"%(tree_folder,pfam_id,rand_id_2)):
            subprocess.check_call("rm %s/%s.nw.%d"%(tree_folder,pfam_id,rand_id_2),shell=True)
        if os.path.exists("%s/%s.fasta.%d"%(alignment_folder,pfam_id,rand_id_3)):
            subprocess.check_call("rm %s/%s.fasta.%d"%(alignment_folder,pfam_id,rand_id_3),shell=True)
        if os.path.exists("%s/%s.sth.%d"%(alignment_folder,pfam_id,rand_id_3)):
            subprocess.check_call("rm %s/%s.sth.%d"%(alignment_folder,pfam_id,rand_id_3),shell=True)
        # Paths of the outputs and of the files inside the reference package.
        output_prefix = '%s/%s_pplaced'%(pplacer_folder,pfam_id)
        updated_aln = output_prefix + '.sto'
        jplace_output_file = output_prefix + '.jplace'
        tree_output_file = output_prefix + '.tre'
        sequence_file='%s/%s.sth'%(alignment_folder,pfam_id)
        aln_fasta='%s/%s.fasta'%(alignment_folder,pfam_id)
        tree_file='%s/%s.nw'%(tree_folder,pfam_id)
        pplacer_pkg_dir ='%s/%s_pplacer'%(pplacer_folder,pfam_id)
        pplacer_pkg_hmm = '%s/%s.hmm'%(pplacer_pkg_dir,pfam_id)
        pplacer_pkg_aln = '%s/%s.sto.%d'%(pplacer_pkg_dir,pfam_id,rand_id_3)
        tmpf='%s/%s.tmpf'%(pplacer_pkg_dir,pfam_id)
        # Update alignment to include the query sequence for the hypothetical domain.
        aln_res = update_hmmer_alignment(Sequnces, orig_alignment=pplacer_pkg_aln, hmm=pplacer_pkg_hmm,tmpf=tmpf)
        aln_out = open(updated_aln,'w')
        AlignIO.write(aln_res[0], aln_out, 'stockholm')
        aln_out.close()
        # Call pplacer to generate placements onto the tree.
        pplaced = pplacer_call(pplacer_package=pplacer_pkg_dir, aln_file=updated_aln, jplace_output_file=jplace_output_file)
        # Use the "guppy" tool to generate the best-placement tree with query as a leaf.
        gt = guppy_tree(jplace_file=jplace_output_file, tree_output_file=tree_output_file)
        #Phylo.convert(tree_output_file, 'newick', tree_output_file_xml, 'phyloxml')
        # Discard the reference package and intermediate files (the exit
        # status of these os.system calls is not checked).
        os.system('rm -rf %s'%(pplacer_pkg_dir))
        os.system('rm %s/%s.hmm'%(pplacer_folder,pfam_id))
        os.system('rm %s/%s_pplaced.jplace'%(pplacer_folder,pfam_id))
        # The updated alignment becomes the stored one; a gzipped FASTA copy
        # replaces any previous .fasta.gz.
        os.system('mv %s %s'%(updated_aln,sequence_file))
        AlignIO.convert(sequence_file,"stockholm",aln_fasta,"fasta")
        if os.path.exists(aln_fasta+'.gz'):
            subprocess.check_call("rm %s.gz"%(aln_fasta),shell=True)
        subprocess.check_call("gzip %s"%(aln_fasta),shell=True)
        # The new tree replaces the stored one and is gzipped as well.
        cmd='mv %s %s'%(tree_output_file,tree_file)
        os.system(cmd)
        if os.path.exists(tree_file+'.gz'):
            subprocess.check_call("rm %s.gz"%(tree_file),shell=True)
        subprocess.check_call("gzip %s"%(tree_file),shell=True)
        # Record the newly placed ids in the log.
        # NOTE(review): my_sequence_file is not assigned anywhere in this
        # function - presumably a module-level name; verify, otherwise this
        # line raises NameError.
        with open(pplace_log, "a") as myfile:
            myfile.write("%s\t%s\n"%(my_sequence_file,','.join(p_ids)))
# Concatenate the trimmed peptide alignments into a single FASTA, then build
# a tree inside the output directory with the program the user picks.
concatenate_seqs(out_trimed_peps, tree_input_fasta)
os.chdir(out_trees)
# Menu (printed in Chinese) listing the choices: fasttree, raxml, or q to
# quit and build the tree manually.
print(""" @@@@@@@@@@@@@@ 建树软件选择 @@@@@@@@@@@@@@ [fasttree] 使用fasttree 建树,参数为 '-lg -gamma' [raxml] 使用raxmlHPC-PTHREADS-SSE3建树, 参数为 '-f a -n tre -m PROTGAMMALGX -x 1000 -# 1000 -p 1000 -T 40' 后续版本会提供更多参数选择 [q] 退出程序,自定义建树 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ """)
# Keep prompting until one of the three accepted answers is entered.
tree_software = ''
while tree_software not in ['fasttree', 'raxml', 'q']:
    tree_software = input("请输入 fasttree/raxml/q 进行选择---> ")
if tree_software == 'fasttree':
    # FastTree reads the FASTA directly and writes lg_gamma.tree.
    fasttreeCMD = [ 'fasttree -lg -gamma {} > lg_gamma.tree'.format(tree_input_fasta) ]
    sp.call(fasttreeCMD, shell=True)
elif tree_software == 'raxml':
    # RAxML is given a relaxed-PHYLIP copy of the alignment.
    # NOTE(review): the conversion reads the literal "tree_input.fasta"
    # rather than tree_input_fasta - confirm both name the same file.
    from Bio import AlignIO
    AlignIO.convert("tree_input.fasta", "fasta", "tree_input.phy", "phylip-relaxed")
    raxmlCMD = [ 'raxmlHPC-PTHREADS-SSE3 -f a -s tree_input.phy -n tre -m PROTGAMMALGX -x 1000 -# 1000 -p 1000 -T 40' ]
    sp.call(raxmlCMD, shell=True)
elif tree_software == 'q':
    # Print a farewell message and stop.
    print('运行结束,祝愉快!')
    sys.exit()
from Bio import AlignIO

# FASTA -> Stockholm, file to file.
count = AlignIO.convert("/home/koreanraichu/agrobacterium.fasta", "fasta",
                        "/home/koreanraichu/agrobacterium.sth", "stockholm")
print("Converted %i alignments" % count)

# FASTA -> ClustalW, using parse + write.
alignments = AlignIO.parse("/home/koreanraichu/enterobacter.fasta", "fasta")
count = AlignIO.write(alignments, "/home/koreanraichu/enterobacter.aln", "clustal")
print("Converted %i alignments" % count)

# Stockholm -> ClustalW: read one alignment, wrap it in a list, write it.
# The result of this write is stored so the message reports this conversion
# rather than the previous one.
alignment = AlignIO.read("/home/koreanraichu/PF00096_seed.txt", "stockholm")
count = AlignIO.write([alignment], "/home/koreanraichu/PF00096_seed.aln", "clustal")
print("Converted %i alignments" % count)

# Stockholm -> PHYLIP, file to file (report count2, not the stale count).
count2 = AlignIO.convert("/home/koreanraichu/PF08449_seed.txt", "stockholm",
                         "/home/koreanraichu/PF08449_seed.phy", "phylip")
print("Converted %i alignments" % count2)

# Stockholm -> PHYLIP with short numbered ids; the original ids are kept in
# name_mapping.  The loop and the write operate on alignment2 (PF08449),
# the alignment that the output file is named after.
alignment2 = AlignIO.read("/home/koreanraichu/PF08449_seed.txt", "stockholm")
name_mapping = {}
for i, record in enumerate(alignment2):
    name_mapping[i] = record.id
    record.id = "seq%i" % i
AlignIO.write([alignment2], "/home/koreanraichu/PF08449_seed_ID.phy", "phylip")
#!/usr/bin/env python
# Convert a multiple sequence alignment file from one format to another.
from Bio import AlignIO
import csv
import argparse
import pprint
import sys

parser = argparse.ArgumentParser(description="convert msa file")
parser.add_argument('msa_file', help="multiple sequence alignment (MSA) file", type=argparse.FileType('r'))
parser.add_argument('-i', '--informat', default="clustal", help="MSA format")
parser.add_argument('-o', '--outfile', type=argparse.FileType('w'), help="MSA output file", required=True)
parser.add_argument('-v', '--outformat', default="clustal", help="MSA output format")
args = parser.parse_args()

# argparse.FileType opens the files but never closes them; close both
# explicitly so the output is flushed even if the conversion fails.
try:
    AlignIO.convert(args.msa_file, args.informat, args.outfile, args.outformat)
finally:
    args.msa_file.close()
    args.outfile.close()
#usage
#python <this script> <input.file.sto> <output.file.fasta>
#       0             1                2
# NOTE(review): the previous header described FASTA -> Stockholm, but the
# code reads Stockholm and writes FASTA; the header now follows the code.
# Confirm this is the intended direction.
import sys
from Bio import AlignIO

# Both files are opened in one ``with`` statement so that they are closed,
# and the output flushed, even if the conversion raises.
#   filein:  input alignment (Stockholm)
#   fileout: output alignment (FASTA)
with open(sys.argv[1], "r") as filein, open(sys.argv[2], 'w') as fileout:
    AlignIO.convert(filein, "stockholm", fileout, "fasta")
def main(fasta, phylip):
    """Convert the FASTA alignment ``fasta`` to sequential PHYLIP at ``phylip``.

    Both arguments are rendered with ``%s`` before being used as paths.
    """
    source_path = "%s" % fasta
    target_path = "%s" % phylip
    AlignIO.convert(source_path, "fasta", target_path, "phylip-sequential")
def convertFastaToClustal(in_dir, out_dir):
    """Convert every ``*.fasta`` file in ``in_dir`` to Clustal in ``out_dir``.

    Each output keeps the input's base name with the extension replaced by
    ``.aln``.
    """
    for in_path in glob(os.path.join(in_dir, '*.fasta')):
        # Swap the extension only; str.replace would also rewrite any other
        # ".fasta" inside the name (e.g. "a.fasta.old.fasta").
        stem = os.path.splitext(os.path.basename(in_path))[0]
        out_path = os.path.join(out_dir, stem + '.aln')
        AlignIO.convert(in_path, 'fasta', out_path, 'clustal')
def _phyml(exe, msa, model, cat, gamma, alpha, freq, invp, start_tree,
           constraint_tree, seed, outfile):
    """
    Infer ML phylogenetic tree using PhyML.

    The FASTA alignment ``msa`` is converted to PHYLIP inside a temporary
    directory created next to it, PhyML is run there, and the resulting tree
    is copied to ``outfile`` (default ``<basename(msa)>.PhyML.ML.newick``).
    The temporary directory is always removed.

    Command-line options are derived as follows:

    * ``-m``: ``model.name`` (``LG`` if empty) for a builtin model, otherwise
      ``custom`` together with ``--aa_rate_file model.name``.
    * ``--free_rates``: added when ``cat`` is set (``None``/``'None'``/
      ``'none'`` count as unset).
    * ``-c``: ``gamma`` or, failing that, ``model.gamma``.
    * ``-a``: ``alpha`` when given.
    * ``-f``: ``e`` when ``freq`` (or ``model.frequency``) equals
      ``'estimate'``, otherwise ``m``.
    * ``-u`` / ``--constraint_file``: start and constraint trees when given.
    * ``-v``: ``invp`` when given, else ``e`` if ``model.invp`` is set.

    Returns the path of the saved tree.  Exits with status 1 if PhyML fails,
    cannot be started, or the tree cannot be saved.
    """
    wd = tempfile.mkdtemp(dir=os.path.dirname(os.path.abspath(msa)))
    alignment = 'temporary.alignment.phylip'
    AlignIO.convert(msa, 'fasta', os.path.join(wd, alignment), 'phylip')

    if model.type == 'builtin':
        m = ['-m', model.name] if model.name else ['-m', 'LG']
    else:
        m = ['-m', 'custom', '--aa_rate_file', model.name]

    info('Inferring ML tree for {} using PhyML.'.format(msa))
    args = [exe, '-i', alignment, '-d', 'aa', '--r_seed', str(seed), '--quiet']
    args.extend(m)

    cat = cat if cat not in ('None', 'none', None) else 0
    if cat:
        # Popen needs string arguments; cat may arrive as an int.
        args.extend(['--free_rates', str(cat)])

    gamma = gamma or model.gamma
    if gamma:
        args.extend(['-c', str(gamma)])
    if alpha:
        args.extend(['-a', str(alpha)])

    # Compare the requested setting itself.  It used to be overwritten with
    # 'X'/'F' before this test, so '-f e' could never be selected.
    frequency = freq or model.frequency
    args.extend(['-f', 'e' if frequency == 'estimate' else 'm'])

    if start_tree:
        args.extend(['-u', start_tree])
    if constraint_tree:
        args.extend(['--constraint_file', constraint_tree])

    # An explicit proportion wins; otherwise let PhyML estimate it when the
    # model asks for invariable sites.
    if invp:
        args.extend(['-v', str(invp)])
    elif model.invp:
        args.extend(['-v', 'e'])

    try:
        process = Popen(args, cwd=wd, stdout=PIPE, stderr=PIPE,
                        universal_newlines=True)
        # communicate() drains both pipes while waiting; wait() alone can
        # deadlock once a pipe buffer fills up.
        stdout_text, stderr_text = process.communicate()
        if process.returncode:
            msg = indent(stderr_text or stdout_text, prefix='\t')
            error('Tree inferring failed for {}\n{}'.format(msa, msg))
            sys.exit(1)
        else:
            tree = outfile if outfile else '{}.PhyML.ML.newick'.format(
                basename(msa))
            try:
                out = '{}{}'.format(os.path.join(wd, alignment),
                                    '_phyml_tree.txt')
                tree = shutil.copy(out, tree)
                info('Successfully save inferred ML tree to {}.'.format(
                    tree))
            except OSError:
                error('Path of outfile {} is not writeable, saving tree to '
                      'file failed.'.format(tree))
                sys.exit(1)
    except OSError:
        error('Tree inferring failed for {}, executable (exe) {} of PhyML '
              'is invalid.'.format(msa, exe))
        sys.exit(1)
    finally:
        shutil.rmtree(wd)
    return tree
# Outputs are written into the same directory as the inputs.
inFile = sys.argv[1]

# Make sure the directory ends with "/" before file names are appended.
if re.search("/$", inFile) is None:
    inFile += "/"

all_sites = inFile + "populations.all.phylip"
var_sites = inFile + "populations.var.phylip"

# Both input files must be present in the directory.
assert os.path.exists(all_sites) and os.path.exists(var_sites), "Input files not present"

# One entry per conversion:
# (message, source, source format, target, target format, extra arguments).
conversions = (
    ("Converting to all sites phylip to sequential phylip",
     all_sites, "phylip",
     inFile + "populations.all.seq.phylip", "phylip-sequential", ()),
    ("Converting to all sites phylip to fasta",
     all_sites, "phylip",
     inFile + "populations.all.fasta", "fasta", ()),
    ("Converting to all sites phylip to nexus",
     all_sites, "phylip",
     inFile + "populations.all.nexus", "nexus", ("DNA",)),
    ("Converting to variable sites phylip to nexus",
     var_sites, "phylip-relaxed",
     inFile + "populations.var.nexus", "nexus", ("DNA",)),
)
for message, source, source_format, target, target_format, extra in conversions:
    print(message)
    AlignIO.convert(source, source_format, target, target_format, *extra)
def main():
    """Convert the alignment described by the parsed options.

    The options come from ``get_para()`` and must provide ``infile``,
    ``input_format``, ``outfile`` and ``output_format``.
    """
    options = get_para()
    AlignIO.convert(
        options.infile,
        options.input_format,
        options.outfile,
        options.output_format,
    )
def build_tree(msa_file, original, file_name="tree", bootstrap=10):
    """
    Build, colour and draw a phylogenetic tree from a Clustal alignment.

    The alignment is converted to relaxed PHYLIP (written to ``file_name``),
    PhyML is run on it as amino-acid data with the LG model, and the
    resulting Newick tree is read back, coloured by novelty and saved as two
    PNG images (white and transparent background).

    PhyML site: http://www.atgc-montpellier.fr/
    "New Algorithms and Methods to Estimate Maximum-Likelihood Phylogenies:
    Assessing the Performance of PhyML 3.0."
    Guindon S., Dufayard J.F., Lefort V., Anisimova M., Hordijk W., Gascuel O.
    Systematic Biology, 59(3):307-21, 2010.
    See also https://biopython.org/wiki/Phylo and
    http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc217
    """
    # Clade colours (colorblind-safe; can be checked with ColorOracle:
    # http://colororacle.org/).
    red_known = "#e4002b"
    blue_original = "#006db6"
    black_comparator = "#000000"
    gray_internal = "#63666a"
    green_novel = "#00bf71"

    # The PHYLIP file shares its name with the output prefix.
    tree_file = file_name
    AlignIO.convert(msa_file, "clustal", tree_file, "phylip-relaxed",
                    alphabet=IUPAC.protein)

    # NOTE: the executable is expected at ./PhyML
    PhyML = PhymlCommandline(
        "./PhyML",
        input=tree_file,
        datatype='aa',        # amino-acid input
        model='LG',           # amino-acid substitution matrix
        alpha='e',
        bootstrap=bootstrap,  # non-parametric bootstrap replicates
    )

    # Run tree generation and show whatever PhyML printed.
    print("Building distance tree from multiple sequence alignment...\n")
    stdout, stderr = PhyML()
    print(stdout + stderr)
    print(f"Newick tree saved as {tree_file + '_phyml_tree.txt'}")

    # Load the tree as PhyloXML so that colours and widths can be attached.
    tree = Phylo.read(tree_file + "_phyml_tree.txt", "newick").as_phyloxml()

    for clade in tree.find_clades():
        clade.width = 3  # bold lines
        label = str(clade.name)
        if label.startswith("gi|"):
            # known gene or false positive
            clade.color = red_known
        elif clade.name == original.id:
            # the gene that was originally searched
            clade.color = blue_original
        elif clade.name is not None and not clade.color:
            # comparator nodes
            clade.color = black_comparator
        elif not clade.name:
            # non-terminal nodes
            clade.color = gray_internal
        # Novel genes override any colour chosen above.
        if label.endswith("***"):
            clade.color = green_novel

    # Figure size scales with the number of terminal nodes.
    tree_len = len(tree.get_terminals())
    plt.rc("font", size=18)  # larger font for easier reading
    fig = plt.figure(figsize=(1.6 * tree_len, tree_len), dpi=300)
    axes = fig.add_subplot(1, 1, 1)
    Phylo.draw(tree, axes=axes, do_show=False)

    # White-background image first, then the transparent variant.
    fig.savefig(f"{tree_file}.png", format='png', bbox_inches='tight',
                dpi=300)
    fig.savefig(f"{tree_file}_transparent.png", format='png',
                bbox_inches='tight', dpi=300, transparent=True)
    print(f"Tree images saved as {tree_file + '.png'} "
          f"and {tree_file + '_transparent.png'} to {os.getcwd()}\n")
def pairwiseDistance(self, id1, id2, method=None):
    """
    Write the query-to-sequence distances of two datasets to ./Data/matrix.txt.

    For each dataset the first record of ``./Data/<id>.fasta`` is the query.
    Every entry of ``self.ord_sequences1`` / ``self.ord_sequences2`` holds a
    description and one or more ':'-separated candidate sequences; each
    candidate is aligned against the query with ClustalW and scored with
    ``getDistance``. The lowest-scoring candidate is kept, appended to the
    dataset FASTA file, and its score recorded.

    According to the original documentation the supported methods are
    ClustalW distance, p-distance, Jukes-Cantor and alignment score with a
    BLOSUM62 or PAM250 matrix (configured in Parameters.py).

    Args:
        id1: Identifier of the first dataset (file stem under ./Data/).
        id2: Identifier of the second dataset.
        method: Distance method forwarded to ``getDistance``. When omitted
            the configured ``pairwise_distance`` is used. (Previously this
            argument was accepted but always overwritten.)

    Raises:
        ValueError: If a dataset FASTA file contains no query record.
        IndexError: If the second dataset yields fewer distances than the
            first when the matrix is written.
    """
    if method is None:
        method = pairwise_distance
    align_matrix = alignscore_matrix

    distances1 = self._closestDistances(id1, self.ord_sequences1,
                                        method, align_matrix)
    distances2 = self._closestDistances(id2, self.ord_sequences2,
                                        method, align_matrix)

    # One row per compared sequence: "1", its 2-based index, then the
    # distance in each dataset.
    with open("./Data/" + "matrix.txt", "w") as out_distance:
        for i, dist1 in enumerate(distances1):
            out_distance.write("1" + "\t" + str(i + 2) + "\t" +
                               str(dist1) + "\t" +
                               str(distances2[i]) + "\n")


def _closestDistances(self, seq_id, ord_sequences, method, align_matrix):
    """Return the best (lowest) query distance for each entry of ord_sequences."""
    base = "./Data/" + seq_id
    query_fasta = base + ".fasta"
    pair = base + ".pair"
    output_align = base + ".aln"
    output_tree = base + ".dnd"
    distance = base + ".distance"
    output_fasta = base + "_pair.fasta"

    # Only the first record of the dataset file is the query.
    q_desc = q_seq = None
    for record in SeqIO.parse(query_fasta, "fasta", IUPAC.protein):
        q_desc = str(record.description)
        q_seq = str(record.seq)
        break

    distances = []
    for entry in ord_sequences:
        if q_desc is None:
            raise ValueError("No query sequence found in " + query_fasta)
        p_desc = str(entry[0])
        candidates = str(entry[1]).rstrip(":").split(":")
        scored = []
        for candidate in candidates:
            with open(pair, "w") as out_pair:
                out_pair.write("\n" + ">" + q_desc + "\n" + q_seq + "\n" +
                               "\n" + ">" + p_desc + "\n" + candidate + "\n")
            # ClustalW's console output is captured in the .distance file;
            # the code relies on it also producing the .aln read back below.
            system("clustalw " + pair + " > " + distance)
            AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
            msa = [str(rec.seq) for rec in
                   SeqIO.parse(output_fasta, "fasta", IUPAC.protein)]
            score = getDistance(msa[0], msa[1], method, align_matrix,
                                distance)
            scored.append((score, candidate))
        # min() keeps the first of equal scores, like the stable sort it replaces
        best_dist, best_seq = min(scored, key=lambda item: item[0])
        distances.append(best_dist)
        with open(query_fasta, "a") as out_fasta:
            out_fasta.write("\n" + ">" + p_desc + "\n" + best_seq + "\n")

    # Best-effort cleanup: a missing file must not stop the others from
    # being removed.
    for tmp_path in (pair, output_align, output_tree, output_fasta, distance):
        try:
            remove(tmp_path)
        except OSError:
            pass
    return distances
# Source: https://www.biostars.org/p/327003/
#
# Convert the FASTA alignment "x.fa" into relaxed PHYLIP ("example.phy").
# Earlier drafts streamed AlignIO.parse(...) records through SeqIO.write();
# AlignIO.convert() does the read/write round trip in a single call and
# returns the number of alignments written.
from Bio import SeqIO
from Bio import AlignIO

count = AlignIO.convert(
    "x.fa",
    "fasta",
    "example.phy",
    "phylip-relaxed",
)
print("Converted %i alignments" % count)
from Bio import AlignIO
import glob
import os

##########################################################################################
## Converts a directory of sequence alignment files into another file format.
##
##
## Requirements:
##   1. Python 2.7
##   2. Biopython
##
##########################################################################################

outputDir = ''  ## where output alignments should be located.
inputDir = ''  ## where input alignments are located.
outputFormat = 'nexus'  ## format of output alignments
inputFormat = 'fasta'  ## format on input alignments
outputSuffix = '.nex'  ## ending for each output file

alignmentFiles = glob.glob(inputDir + "*")
for f in alignmentFiles:
    # Fail fast: AlignIO.read raises unless the file holds exactly one
    # alignment in the declared input format.
    AlignIO.read(f, inputFormat)
    print("Converting file %s..." % f)
    # The output name is the input's base name with its extension replaced
    # (the original referenced an undefined ``file_name`` here).
    file_name = os.path.splitext(os.path.basename(f))[0]
    AlignIO.convert(f, inputFormat,
                    outputDir + file_name + outputSuffix, outputFormat)
def fasta2phy(fa_align, phy_file):
    """Convert the FASTA alignment ``fa_align`` to relaxed PHYLIP at ``phy_file``.

    Args:
        fa_align: Path (or handle) of the FASTA alignment to read.
        phy_file: Path of the PHYLIP file to create or overwrite.
    """
    # Alternative naming scheme kept for reference:
    # phy_file = '%s.phy' % (os.path.splitext(fa_align)[0])
    # ``with`` guarantees the handle is closed even if the conversion fails.
    with open(phy_file, 'w') as ofile:
        AlignIO.convert(fa_align, 'fasta', ofile, 'phylip-relaxed')
def computeAlignment(self, id, alignment):
    """
    Compute a multiple sequence alignment with the requested method.

    Reads ``<self.dirname><id>.fasta`` and writes the aligned FASTA next to
    it as ``<id>_clustalw.fasta``, ``<id>_muscle.fasta`` or
    ``<id>_mafft.fasta``. Tool settings are read from ``self.parameterfile``
    through ``LP``.

    Args:
        id: Dataset identifier used as the file stem.
        alignment: ``"clustalw"``, ``"muscle"``; any other value selects MAFFT.
    """
    input_sequences = self.dirname + id + ".fasta"

    if alignment == "clustalw":
        gop = LP(self.parameterfile, "clustalw_gap_opening")
        gep = LP(self.parameterfile, "clustalw_gap_extension")
        d_matrix = LP(self.parameterfile, "clustalw_distance_matrix")
        output_align = self.dirname + id + ".aln"
        output_fasta = self.dirname + id + "_clustalw.fasta"
        output_tree = self.dirname + id + ".dnd"
        options = dict(infile=input_sequences, outfile=output_align,
                       newtree=output_tree, align="input", seqnos="ON",
                       outorder="input", type="PROTEIN", pwmatrix=d_matrix,
                       gapopen=gop, gapext=gep)
        tool_dir = os.getcwd() + "/src/tools/clustalw/"
        try:
            ClustalwCommandline(str(tool_dir + "clustalw.exe"), **options)()
        except Exception:
            # The .exe build could not be run; fall back to the plain binary.
            # (Exception, not a bare except, so Ctrl-C is not swallowed.)
            ClustalwCommandline(str(tool_dir + "clustalw"), **options)()
        AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
        # Best-effort cleanup; each file is attempted independently.
        for tmp_path in (output_align, output_tree):
            try:
                remove(tmp_path)
            except OSError:
                pass

    elif alignment == "muscle":
        iteration = LP(self.parameterfile, "muscle_max_iteration")
        output_align = self.dirname + id + "_muscle.aln"
        output_fasta = self.dirname + id + "_muscle.fasta"
        muscle = MuscleCommandline(input=input_sequences, out=output_align,
                                   clwstrict=True, maxiters=iteration)
        muscle()
        AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
        try:
            remove(output_align)
        except OSError:
            pass

        # Rewrite the aligned records in the order of the input file.
        organism_order = [record.description for record in
                          SeqIO.parse(input_sequences, "fasta", IUPAC.protein)]
        rec = {}
        for record in SeqIO.parse(output_fasta, "fasta", IUPAC.protein):
            rec[str(record.description)] = str(record.seq)
        with open(output_fasta, "w") as fasta:
            for org in organism_order:
                fasta.write(">" + org + "\n" + rec[org] + "\n")

    else:
        configuration = LP(self.parameterfile, "mafft_configuration")
        threads = LP(self.parameterfile, "mafft_threading")
        output_fasta = self.dirname + id + "_mafft.fasta"

        # "fftnsi" selects --retree 2; every other value selects --localpair.
        if configuration == "fftnsi":
            strategy = "--retree 2"
        else:
            strategy = "--localpair"

        # A false or non-numeric threading setting means "no --threads flag".
        thread_flag = ""
        if threads:
            try:
                thread_flag = "--threads %i " % int(threads)
            except (TypeError, ValueError, OverflowError):
                thread_flag = ""

        command = ("mafft " + strategy + " --maxiterate 1000 --inputorder " +
                   thread_flag)
        system(command + input_sequences + ">" + output_fasta)
try: bootstrap = sys.argv[3] except: bootstrap = raw_input("Number of bootstrap: ") try: threads = sys.argv[4] except: threads = raw_input("Number of threads: ") try: name = sys.argv[5] except: name = raw_input("Introduce_number: ") AlignIO.convert(file, "fasta", file+".phy", "phylip-relaxed") file_phy = file + ".phy" try: print "raxmlHPC-PTHREADS-AVX -T %s -m GTRCAT -p 12345 -# %s -s %s -n run1" % (threads, trees, file_phy) call("raxmlHPC-PTHREADS-AVX -T %s -m GTRCAT -p 12345 -# %s -s %s -n run1" % (threads, trees, file_phy), shell=True) except: print "IT IS NOT GOOD. PLEASE, CHECK YOUR INPUT FILE(S)" sys.exit() call("raxmlHPC-PTHREADS-AVX -T %s -m GTRCAT -p 12345 -b 12345 -# %s -s %s -n run2" % (threads, bootstrap, file_phy), shell=True) try: call("raxmlHPC -m GTRCAT -p 12345 -f b -t RAxML_bestTree.run1 -z RAxML_bootstrap.run2 -n %s.run3" % (name), shell=True) call("rm *.run1*", shell=True)
def convert_fas_to_phylip(input_file, output_file):
    """Convert ``input_file`` (FASTA_FORMAT) into ``output_file`` (PHYLIP_FORMAT)."""
    source = (input_file, FASTA_FORMAT)
    target = (output_file, PHYLIP_FORMAT)
    AlignIO.convert(*(source + target))
import sys
from Bio import SeqIO, AlignIO, Phylo
from Bio.Alphabet import generic_protein, generic_dna

# Command line: <seq|align|tree> <infile> <outfile> <intype> <outtype>
options = sys.argv[1:]
if len(options) < 5:
    # sys.exit with a string prints it to stderr and exits with status 1.
    sys.exit("usage: %s <seq|align|tree> <infile> <outfile> <intype> <outtype>"
             % sys.argv[0])

incheck = options[0]  # kind of data: 'seq', 'align' or 'tree'
infile = options[1]
outfile = options[2]
intype = options[3]  # Biopython format name of the input
outtype = options[4]  # Biopython format name of the output

if incheck == 'seq':
    SeqIO.convert(infile, intype, outfile, outtype, generic_dna)
elif incheck == 'align':
    AlignIO.convert(infile, intype, outfile, outtype, generic_dna)
elif incheck == 'tree':
    Phylo.convert(infile, intype, outfile, outtype)
else:
    # Previously an unknown kind did nothing and still exited successfully.
    sys.exit("Unknown conversion kind %r; expected 'seq', 'align' or 'tree'"
             % incheck)
def convert(InfileName, OutfileName):
    """Convert a NEXUS alignment file to PHYLIP with Biopython and report it.

    Prints the number of alignments converted and the output file name.
    """
    n_converted = AlignIO.convert(InfileName, "nexus", OutfileName, "phylip")
    summary = "\nConverted %i alignments" % n_converted
    print(summary)
    destination = "\nOutput saved as %s" % OutfileName
    print(destination)
def write_AlignIO_dna():
    """Convert opuntia.aln to a phylip file.

    Raises:
        AssertionError: If the conversion does not yield exactly one alignment.
    """
    # The conversion must not live inside an ``assert`` statement: under
    # ``python -O`` asserts are stripped and the file would never be written.
    count = AlignIO.convert("Clustalw/opuntia.aln", "clustal",
                            "Phylip/opuntia.phy", "phylip")
    if count != 1:
        raise AssertionError("Expected 1 alignment, converted %r" % (count,))
def coevol(line):
    # Scores, for one bound complex, how many of the top-ranked column pairs
    # of a merged alignment are close in the structure, and saves the scores
    # to "<bound_pro>.res".
    #
    # ``line`` is a tab-separated record. Field 0 is "<id>_<chains1>:<chains2>"
    # for the bound complex; fields 1 and 2 are "<id>_<chains>" for the two
    # unbound partners (parsed below but only used by the disabled code).
    #
    # NOTE(review): the nesting was reconstructed from a whitespace-flattened
    # source; statements marked "nesting assumed" should be confirmed.
    # NOTE(review): ``.keys()[0]`` only works on Python 2 dictionaries.
    result={}  # placeholder; rebound to an array before it is first used
    entry = line.split("\t")
    bound_pro = entry[0].split("_")[0]
    bound_pro_c1 = entry[0].split("_")[1].split(":")[0]
    bound_pro_c2 = entry[0].split("_")[1].split(":")[1]
    unbound_pro1 = entry[1].split("_")[0]
    unbound_pro1_c = entry[1].split("_")[1]
    unbound_pro2 = entry[2].split("_")[0]
    # [:-1] drops the last character of the field — presumably the trailing
    # newline of ``line``; verify against the caller
    unbound_pro2_c = entry[2].split("_")[1][:-1]
    pdb = parsePDB(bound_pro)
    header = parsePDBHeader(bound_pro, 'polymers')
    # Chain identifiers in header order
    bp_chid = []
    for polymer in header:
        bp_chid.append(polymer.chid)
    # Positions (within ``header``) of the chains of each binding partner
    bp1_chid_idx = []
    for c in bound_pro_c1:
        if c in bp_chid:
            bp1_chid_idx.append(bp_chid.index(c))
    bp2_chid_idx = []
    for c in bound_pro_c2:
        if c in bp_chid:
            bp2_chid_idx.append(bp_chid.index(c))
    # Disabled: the same chain lookup for the two unbound structures.
    # header = parsePDBHeader(unbound_pro1, 'polymers')
    # up1_chid = []
    # for polymer in header:
    #     up1_chid.append(polymer.chid)
    # up1_chid_idx = []
    # for c in unbound_pro1_c:
    #     if c in up1_chid:
    #         up1_chid_idx.append(up1_chid.index(c))
    # header = parsePDBHeader(unbound_pro2, 'polymers')
    # up2_chid = []
    # for polymer in header:
    #     up2_chid.append(polymer.chid)
    # up2_chid_idx = []
    # for c in unbound_pro2_c:
    #     if c in up2_chid:
    #         up2_chid_idx.append(up2_chid.index(c))
    seqs = []  # per-chain sequences, partner 1 first, then partner 2
    pfam1 = []
    # Partner 1: fetch one Pfam MSA per chain and merge them chain by chain.
    for i in range(len(bp1_chid_idx)):
        if i == 0:
            unip_raw=str(header[bp1_chid_idx[i]].dbrefs).split(" ")[1]
            try:
                unip = searchUniprotID(unip_raw)
                pfamid=searchPfam(unip).keys()[0]
                seq = pdb.getHierView()[bp_chid[bp1_chid_idx[i]]].getSequence()
                good = 1
            except IndexError:
                # Fallback: search Pfam with the chain sequence instead
                pdb = parsePDB(bound_pro)
                seq = pdb.getHierView()[bp_chid[bp1_chid_idx[i]]].getSequence()
                pfamid=searchPfam(seq).keys()[0]
                good = 0
            fetchPfamMSA(pfamid)
            pfam1.append(pfamid)
            raw_msa = parseMSA(pfamid + '_full.sth')
            # Refinement by label is only done when ``unip`` was resolved
            if good == 1:
                refined_msa = refineMSA(raw_msa, label=unip)
            else:
                refined_msa = raw_msa
            # First chain starts the running MSA and sequence
            total_msa = refined_msa
            total_seq = seq
            seqs.append(seq)
        else:
            unip_raw=str(header[bp1_chid_idx[i]].dbrefs).split(" ")[1]
            try:
                unip = searchUniprotID(unip_raw)
                pfamid=searchPfam(unip).keys()[0]
                seq = pdb.getHierView()[bp_chid[bp1_chid_idx[i]]].getSequence()
                good = 1
            except IndexError:
                pdb = parsePDB(bound_pro)
                seq = pdb.getHierView()[bp_chid[bp1_chid_idx[i]]].getSequence()
                pfamid=searchPfam(seq).keys()[0]
                good = 0
            fetchPfamMSA(pfamid)
            pfam1.append(pfamid)
            raw_msa = parseMSA(pfamid + '_full.sth')
            if good == 1:
                refined_msa = refineMSA(raw_msa, label=unip)
            else:
                refined_msa = raw_msa
            # Later chains are merged into the running MSA and sequence
            total_msa = mergeMSA(total_msa, refined_msa)
            total_seq = total_seq + seq
            seqs.append(seq)
    total_msa1 = total_msa
    total_seq1 = total_seq
    pfam2 = []
    # Partner 2: same procedure as for partner 1.
    for i in range(len(bp2_chid_idx)):
        if i == 0:
            unip_raw=str(header[bp2_chid_idx[i]].dbrefs).split(" ")[1]
            try:
                unip = searchUniprotID(unip_raw)
                pfamid=searchPfam(unip).keys()[0]
                seq = pdb.getHierView()[bp_chid[bp2_chid_idx[i]]].getSequence()
                good = 1
            except IndexError:
                pdb = parsePDB(bound_pro)
                seq = pdb.getHierView()[bp_chid[bp2_chid_idx[i]]].getSequence()
                pfamid=searchPfam(seq).keys()[0]
                good = 0
            fetchPfamMSA(pfamid)
            pfam2.append(pfamid)
            raw_msa = parseMSA(pfamid + '_full.sth')
            if good == 1:
                refined_msa = refineMSA(raw_msa, label=unip)
            else:
                refined_msa = raw_msa
            total_msa = refined_msa
            total_seq = seq
            seqs.append(seq)
        else:
            unip_raw=str(header[bp2_chid_idx[i]].dbrefs).split(" ")[1]
            try:
                unip = searchUniprotID(unip_raw)
                pfamid=searchPfam(unip).keys()[0]
                seq = pdb.getHierView()[bp_chid[bp2_chid_idx[i]]].getSequence()
                good = 1
            except IndexError:
                pdb = parsePDB(bound_pro)
                seq = pdb.getHierView()[bp_chid[bp2_chid_idx[i]]].getSequence()
                pfamid=searchPfam(seq).keys()[0]
                good = 0
            fetchPfamMSA(pfamid)
            # NOTE(review): appends to pfam1 inside the partner-2 loop —
            # looks like a copy-paste slip for pfam2; neither list is read
            # later in this function.
            pfam1.append(pfamid)
            raw_msa = parseMSA(pfamid + '_full.sth')
            if good == 1:
                refined_msa = refineMSA(raw_msa, label=unip)
            else:
                refined_msa = raw_msa
            total_msa = mergeMSA(total_msa, refined_msa)
            total_seq = total_seq + seq
            seqs.append(seq)
    total_msa2 = total_msa
    total_seq2 = total_seq
    # Combine both partners' MSAs and write them as "<bound_pro>.fasta"
    mergedMSA = specMergeMSA(total_msa1, total_msa2)
    finalMergedMSA = refineMSA(mergedMSA, colocc=0.8)
    writeMSA(bound_pro + '.fasta', finalMergedMSA)
    # Concatenated structure sequence, written as a one-record FASTA file
    merged_seq = total_seq1 + total_seq2
    g = open(bound_pro + '_one.fasta','w')
    g.write(">" + bound_pro + "\n")
    g.write(merged_seq)
    g.close()
    # Profile-align the single sequence against the merged MSA with ClustalW;
    # the result is read back from "<bound_pro>_one.aln"
    call(["clustalw -profile1=" + bound_pro + '.fasta -sequences -profile2=' + bound_pro + '_one.fasta'], shell=True)
    AlignIO.convert(bound_pro + "_one.aln","clustal",bound_pro + "_final.fasta","fasta")
    finalMSA = parseMSA(bound_pro + "_final.fasta")
    finalRefinedMSA = refineMSA(finalMSA, colocc=0.95)
    # Row of the structure's own sequence inside the refined MSA
    idx_real = finalRefinedMSA.getIndex(bound_pro)
    seq_from_alignment = str(finalRefinedMSA[idx_real])
    # For every chain, pair its residue positions (res_idx_str) with the
    # positions in the MSA row (res_idx_ali) via a global pairwise alignment.
    res_idx_str = []
    res_idx_ali = []
    for seq in seqs:
        alignment = pairwise2.align.globalms(seq, seq_from_alignment,5,-20,-5,-1)
        p1res = []
        p2res = []
        count = 0   # gaps seen so far in the chain sequence
        count1 = 0  # gaps seen so far in the MSA row
        for i in range(0,len(alignment[0][0])):
            if alignment[0][0][i] == '-':
                count = count+1
            if alignment[0][1][i] == '-':
                count1 = count1 + 1
            # Both aligned here: record the ungapped index on each side
            if alignment[0][0][i] != '-' and alignment[0][1][i] != '-':
                p1res.append(i-count)
                p2res.append(i-count1)
        res_idx_str.append(p1res)
        res_idx_ali.append(p2res)
    # Keep only groups from group_consecutives() with more than 10 elements
    res_idx_after_ali = []
    for i in range(len(res_idx_ali)):
        res_idx_after_ali.append([])
        groups=group_consecutives(np.array(res_idx_ali[i]))
        for g in groups:
            if len(g)>10:
                res_idx_after_ali[i].append(g)
    # Drop from each chain the MSA positions also claimed by a later chain.
    # NOTE(review): nesting assumed — one append per chain ``i`` plus one
    # final append. ``arr1`` is rebuilt from chain ``i`` on every ``j``, so
    # only the last ``j`` affects the stored result; confirm this is intended.
    res_idx_filtered_ali = []
    for i in range(len(res_idx_after_ali)-1):
        for j in range(i+1,len(res_idx_after_ali)):
            arr1 = np.array(res_idx_after_ali[i])
            arr2 = np.array(res_idx_after_ali[j])
            arr1 = np.setdiff1d(arr1, arr2)
        res_idx_filtered_ali.append(arr1)
    # Last chain: relies on ``j`` left over from the loop above (undefined
    # when there are fewer than two chains)
    res_idx_filtered_ali.append(np.setdiff1d(np.array(res_idx_after_ali[j]),np.ones(1)*-1))
    # Map the surviving MSA positions back to indices into the per-chain lists
    mapping_on_str = []
    for i in range(len(res_idx_filtered_ali)):
        mapping_on_str.append([])
        for j in range(len(res_idx_filtered_ali[i])):
            mapping_on_str[i].append(res_idx_ali[i].index(res_idx_filtered_ali[i][j]))
    # ... and from there to residue indices within each chain
    str_idx = []
    for i in range(len(mapping_on_str)):
        str_idx.append([])
        for j in range(len(mapping_on_str[i])):
            str_idx[i].append(res_idx_str[i][mapping_on_str[i][j]])
    # Collect the 'ca' subset coordinates of the selected residues, chain by
    # chain. ``gate`` flips to 1 once the first non-empty block is stored.
    # NOTE(review): nesting assumed — ``count`` advances once per chain
    # whether or not the chain contributed coordinates.
    count = 0
    total_length = 0
    gate = 0
    coords = np.array([])
    for i in range(len(bp1_chid_idx)):
        if gate == 0:
            if len(str_idx[count])!=0:
                coords = parsePDB(bound_pro, subset='ca', chain=bp_chid[bp1_chid_idx[i]]).getCoords()[np.array(str_idx[count]),:]
                total_length += len(coords)
            count = count + 1
            if len(coords) != 0:
                gate = 1
        else:
            if len(str_idx[count])!=0:
                new_coords = parsePDB(bound_pro, subset='ca',chain=bp_chid[bp1_chid_idx[i]]).getCoords()[np.array(str_idx[count]),:]
                total_length += len(new_coords)
            count = count + 1
            if len(coords)!=0 and len(new_coords)!=0:
                coords = np.concatenate((coords, new_coords), axis=0)
    for i in range(len(bp2_chid_idx)):
        if len(str_idx[count])!=0:
            new_coords = parsePDB(bound_pro, subset='ca',chain=bp_chid[bp2_chid_idx[i]]).getCoords()[np.array(str_idx[count]),:]
            total_length += len(new_coords)
        count = count + 1
        if len(coords)!=0 and len(new_coords)!=0:
            coords = np.concatenate((coords, new_coords), axis=0)
    # Contact map: True where the pairwise distance is below 8
    # (units are whatever getCoords() returns — presumably angstroms; confirm)
    dist = buildDistMatrix(coords)
    dist_ini = dist < 8
    PC = buildPCMatrix(finalRefinedMSA)
    PC_ranked = calcRankorder(PC)
    # For each threshold in the module-level ``thold``: the fraction of the
    # first ``num`` ranked pairs that are in contact.
    # NOTE(review): ``zeros(8)`` assumes ``thold`` has at most 8 entries.
    result = zeros(8)
    for z in range(len(thold)):
        num = int(len(PC_ranked[0])*thold[z])
        for i in range(num):
            result[z]+=dist_ini[PC_ranked[0][i],PC_ranked[1][i]]
        result[z]/=num
    np.savetxt(bound_pro + '.res', np.array(result))
def infer_tree(clwfile, phyfile):
    """Convert a Clustal alignment to relaxed PHYLIP and run PhyML on it.

    PhyML is invoked for amino-acid data with the WAG model, estimated alpha
    and ``bootstrap=-1``. Its captured output is discarded.
    """
    AlignIO.convert(clwfile, "clustal", phyfile, "phylip-relaxed")

    # Nucleotide alternative used previously: datatype='nt', model='HKY85'
    phyml_options = {
        "input": phyfile,
        "datatype": 'aa',
        "model": 'WAG',
        "alpha": 'e',
        "bootstrap": -1,
    }
    run_phyml = PhymlCommandline("tools/PhyML_3.0/PhyML_3.0_win32.exe",
                                 **phyml_options)
    run_phyml()
def test_distances_from_AlignIO_DNA(self):
    """Check distance calculation on a DNA alignment that AlignIO wrote."""
    phylip_path = "Phylip/opuntia.phy"
    converted = AlignIO.convert("Clustalw/opuntia.aln", "clustal",
                                phylip_path, "phylip")
    # Exactly one alignment must have been written
    self.assertEqual(converted, 1)
    self.distances_from_alignment(phylip_path)
#!/usr/bin/python
from subprocess import call
from Bio import AlignIO
import os
import sys

# The FASTA alignment comes from the command line, or is asked for
# interactively when no argument was given.
try:
    file = sys.argv[1]
except IndexError:
    file = raw_input("Introduce FASTA file: ")

# Resolve the input before changing directory, so absolute paths and paths
# with sub-directories keep working (the old "../" + file did not).
fasta_path = os.path.abspath(file)
phy_name = os.path.basename(file) + ".phy"

# All PhyML files are produced inside ./phyml
if not os.path.isdir("phyml"):
    os.mkdir("phyml")
os.chdir("phyml")

AlignIO.convert(fasta_path, "fasta", phy_name, "phylip-relaxed")

# Argument list instead of a shell string: file names containing spaces or
# shell metacharacters are passed through untouched.
# NOTE(review): "-b" is given twice (1000 and -4), exactly as before; decide
# which of the two is intended.
call(["nice", "phyml", "-i", phy_name, "-d", "nt", "-b", "1000", "-b", "-4",
      "-m", "GTR", "-s", "BEST", "-v", "e", "-c", "4", "-a", "e"])
def test_distances_from_protein_AlignIO(self):
    """Check distance calculation on a protein alignment that AlignIO wrote."""
    phylip_path = "Phylip/hedgehog.phy"
    converted = AlignIO.convert("Clustalw/hedgehog.aln", "clustal",
                                phylip_path, "phylip")
    # Exactly one alignment must have been written
    self.assertEqual(converted, 1)
    self.distances_from_alignment(phylip_path, DNA=False)
def get_phylogeny ( binary, infile, infile_format, args = 'default',
                    outfile = None, outfile_format = 'newick',
                    bootstraps = 0 ) :
    """
    Infer the phylogeny from the input alignment using the phylogenetic
    inference tool and arguments given. The resultant phylogeny is returned as
    a Bio.Phylo.BaseTree object and saved in the output file (if provided). If
    'infile' or 'outfile' contain a relative path, the current working
    directory will be used to get the absolute path. If the output file already
    exists, the old file will be overwritten without any warning.

    Arguments :
        binary  ( string )
            Name or path of the phylogenetic inference tool.
        infile  ( string )
            Sequence alignment file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the phylogenetic
            inference tool, excluding infile and outfile arguments. By default,
            'default' arguments are used.
        outfile  ( Optional[string] )
            Phylogenetic tree output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, NEWICK format.
        bootstraps  ( Optional[int] )
            Number of bootstraps to generate. By default, 0 (only use the
            input alignment).

    Returns :
        Bio.Phylo.BaseTree
            Resultant phylogenetic tree.
        float
            Log-likelihood score of the phylogeny.

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.
        IOError
            If the input path or the input file provided doesn't exist
            (not raised here directly; presumably by get_abspath).
        RuntimeError
            If the call to the phylogenetic inference tool command raises an
            exception.

    * The input file format must be supported by Bio.AlignIO.
    * The output file format must be supported by Bio.Phylo.
    """
    # Get the variables associated with the given phylogenetic inference tool
    bin_name = os.path.basename(binary).lower()
    if ( bin_name not in _PHYLO_TOOL_TO_LIB ) :
        message = 'The phylogenetic inference tool "{}" isn\'t included in ' \
                  'MEvoLib.Inference'.format(bin_name)
        raise ValueError(message)
    tool_lib = _PHYLO_TOOL_TO_LIB[bin_name]
    sprt_infile_formats = tool_lib.SPRT_INFILE_FORMATS
    gen_args = tool_lib.gen_args
    get_results = tool_lib.get_results
    cleanup = tool_lib.cleanup
    # Get the command line to run in order to get the resultant phylogeny
    infile_path = get_abspath(infile)
    tmpfile = None
    try :
        # If the input file format is not supported by the phylogenetic
        # inference tool, convert it to a temporary supported file
        if ( infile_format.lower() not in sprt_infile_formats ) :
            tmpfile = tempfile.NamedTemporaryFile()
            AlignIO.convert(infile_path, infile_format, tmpfile.name,
                            sprt_infile_formats[0])
            infile_path = tmpfile.name
        # Create full command line list
        command = [binary] + gen_args(args, infile_path, bootstraps)
        # Run the phylogenetic inference process handling any Runtime exception
        try :
            output = subprocess.check_output(command, stderr=DEVNULL,
                                             universal_newlines=True)
        except subprocess.CalledProcessError as e :
            cleanup(command)
            message = 'Running "{}" raised an exception'.format(' '.join(e.cmd))
            raise RuntimeError(message)
        phylogeny, score = get_results(command, output)
        if ( outfile ) :
            # Save the resultant phylogeny in the given outfile and format
            outfile_path = get_abspath(outfile)
            Phylo.write(phylogeny, outfile_path, outfile_format)
        cleanup(command)
    finally :
        # Closing the handle is what deletes the temporary alignment; do it
        # deterministically instead of waiting for garbage collection
        if ( tmpfile is not None ) :
            tmpfile.close()
    # Return the resultant phylogeny as a Bio.Phylo.BaseTree object and its
    # log-likelihood score
    return ( phylogeny, score )
def test_parsimony_tree_from_AlignIO_DNA(self):
    """Check parsimony tree building on a DNA alignment that AlignIO wrote."""
    phylip_path = "Phylip/opuntia.phy"
    converted = AlignIO.convert("Clustalw/opuntia.aln", "clustal",
                                phylip_path, "phylip")
    # Exactly one alignment must have been written
    self.assertEqual(converted, 1)
    self.parsimony_tree(phylip_path, "phylip")
def computeAlignment(self, id, alignment):
    """
    Compute a multiple sequence alignment with the requested method.

    Reads ``./Data/<id>.fasta`` and writes the aligned FASTA next to it as
    ``<id>_clustalw.fasta``, ``<id>_muscle.fasta`` or ``<id>_mafft.fasta``.
    Tool settings come from module-level configuration names
    (``clustalw_*``, ``muscle_max_iteration``, ``mafft_*``).

    Args:
        id: Dataset identifier used as the file stem.
        alignment: ``"clustalw"``, ``"muscle"``; any other value selects MAFFT.
    """
    input_sequences = "./Data/" + id + ".fasta"

    if alignment == "clustalw":
        gop = clustalw_gap_opening
        gep = clustalw_gap_extension
        d_matrix = clustalw_distance_matrix
        output_align = "./Data/" + id + ".aln"
        output_fasta = "./Data/" + id + "_clustalw.fasta"
        output_tree = "./Data/" + id + ".dnd"
        clustalw = ClustalwCommandline(infile=input_sequences,
                                       outfile=output_align,
                                       newtree=output_tree, align="input",
                                       seqnos="ON", outorder="input",
                                       type="PROTEIN", pwmatrix=d_matrix,
                                       gapopen=gop, gapext=gep)
        clustalw()
        AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
        # Best-effort cleanup; each file is attempted independently.
        for tmp_path in (output_align, output_tree):
            try:
                remove(tmp_path)
            except OSError:
                pass

    elif alignment == "muscle":
        iteration = muscle_max_iteration
        output_align = "./Data/" + id + "_muscle.aln"
        output_fasta = "./Data/" + id + "_muscle.fasta"
        muscle = MuscleCommandline(input=input_sequences, out=output_align,
                                   clwstrict=True, maxiters=iteration)
        muscle()
        AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
        try:
            remove(output_align)
        except OSError:
            pass

        # Rewrite the aligned records in the order of the input file.
        organism_order = [record.description for record in
                          SeqIO.parse(input_sequences, "fasta", IUPAC.protein)]
        rec = {}
        for record in SeqIO.parse(output_fasta, "fasta", IUPAC.protein):
            rec[str(record.description)] = str(record.seq)
        with open(output_fasta, "w") as fasta:
            for org in organism_order:
                fasta.write(">" + org + "\n" + rec[org] + "\n")

    else:
        configuration = mafft_configuration
        threads = mafft_threading
        output_fasta = "./Data/" + id + "_mafft.fasta"

        # "fftnsi" selects --retree 2; every other value selects --localpair.
        if configuration == "fftnsi":
            strategy = "--retree 2"
        else:
            strategy = "--localpair"

        # A false or non-numeric threading setting means "no --threads flag".
        thread_flag = ""
        if threads:
            try:
                thread_flag = "--threads %i " % int(threads)
            except (TypeError, ValueError, OverflowError):
                thread_flag = ""

        command = ("mafft " + strategy + " --maxiterate 1000 --inputorder " +
                   thread_flag)
        system(command + input_sequences + ">" + output_fasta)
def test_parsimony_from_AlignIO_protein(self):
    """Make a parsimony tree from protein alignment written with AlignIO."""
    n = AlignIO.convert("Clustalw/hedgehog.aln", "clustal",
                        "Phylip/hedgehog.phy", "phylip")
    # Check the conversion result, as the sibling AlignIO tests do
    self.assertEqual(n, 1)
    # NOTE(review): the tree is built from interlaced.phy, not from the
    # hedgehog.phy file written just above — confirm which file is intended.
    self.parsimony_tree("Phylip/interlaced.phy", "phylip", DNA=False)
def test_bootstrap_AlignIO_DNA(self):
    """Check bootstrap pseudosampling of a DNA alignment that AlignIO wrote."""
    phylip_path = "Phylip/opuntia.phy"
    converted = AlignIO.convert("Clustalw/opuntia.aln", "clustal",
                                phylip_path, "phylip")
    # Exactly one alignment must have been written
    self.assertEqual(converted, 1)
    self.check_bootstrap(phylip_path, "phylip")
def _run_biopython_convert(settings, in_path, in_format, out_file, out_format,
                           alphabet):
    """Dispatch one conversion to SeqIO, Phylo or AlignIO by the format's bioclass."""
    bioclass = settings[in_format].bioclass
    source_format = in_format.lower()
    if bioclass == "seq":
        SeqIO.convert(in_path, source_format, out_file, out_format, alphabet)
        return
    if bioclass == "phylo":
        Phylo.convert(in_path, source_format, out_file, out_format)
        return
    if bioclass == "align":
        AlignIO.convert(in_path, source_format, out_file, out_format)
        return
    print("Error: invalid BioPython conversion class: %s" % bioclass)
    sys.exit(1)


def direct_convert(settings, id_results, out_path, out_formats, alphabet):
    """Convert identified files with Biopython.

    With ``out_path`` set to None, only the first entry of ``id_results`` is
    converted to the first of ``out_formats``; the result goes through the
    scratch file "./conv.tmp", is printed to stdout and the file is removed.
    Otherwise every input is converted to every output format and written to
    ``<out_path>/<input stem>.<extension of the output format>``.

    Args:
        settings: Mapping from format name to an object exposing ``bioclass``
            ("seq", "phylo" or "align") and ``extension``.
        id_results: Mapping from input path to its identified format name.
        out_path: Output directory, or None to print to stdout.
        out_formats: Sequence of output format names.
        alphabet: Passed through to ``SeqIO.convert`` only.

    Raises:
        Exception: In stdout mode, when the input format is "unidentified".
        SystemExit: With status 1 on an invalid bioclass, or (stdout mode
            only) when the conversion raises ValueError. In directory mode a
            ValueError is reported and the file is skipped.
    """
    if out_path is not None:
        on_windows = sys.platform == "win32"
        separator = "\\" if on_windows else "/"
        basename = ntpath.basename if on_windows else os.path.basename
        for out_format in out_formats:
            for in_path, in_format in id_results.items():
                target = out_path
                if target[-1] != separator:
                    target += separator
                # Stem: everything before the first dot of the base name
                target += basename(in_path).split('.')[0]
                target = target + "." + settings[out_format].extension
                print("\nConverting %s file %s to %s file %s" %
                      (in_format, in_path, out_format, target))
                try:
                    _run_biopython_convert(settings, in_path, in_format,
                                           target, out_format, alphabet)
                except ValueError as e:
                    print("\nError in conversion of " + in_path + " to " + out_format + ": " + str(e))
                    print("Skipping " + in_path + " ...\n")
                    continue
        return

    scratch = "./conv.tmp"
    in_path, in_format = list(id_results.items())[0]
    out_format = out_formats[0]
    if in_format == "unidentified":
        raise Exception("Failed to identify the file")
    try:
        _run_biopython_convert(settings, in_path, in_format, scratch,
                               out_format, alphabet)
    except ValueError as e:
        print("Error in conversion of " + in_path + " to " + out_format + ": " + str(e))
        sys.exit(1)
    with open(scratch, "r") as tmp_file:
        print(tmp_file.read())
    os.remove(scratch)  # Is this really necessary?
def convert_alignment_format(input_msa_clustal, output_msa_phylip):
    """Rewrite the Clustal alignment file as a sequential PHYLIP file."""
    source = (input_msa_clustal, "clustal")
    target = (output_msa_phylip, "phylip-sequential")
    AlignIO.convert(*(source + target))