def main():
    """Relabel Clustal alignments with full FASTA description lines.

    Reads ``indexfile.txt`` from the current directory.  Each non-blank line
    names a FASTA sequence file followed by a Clustal alignment file.  Every
    alignment row whose id equals a FASTA record id gets that record's
    description as its new id, and the alignment is written in Clustal
    format to ``'new_' + <alignment file name>``.
    """
    with open('indexfile.txt', 'r') as indexfile:
        for line in indexfile:
            files = line.split()
            if not files:
                # Blank (or whitespace-only) line: nothing to process.
                continue
            seqs = SeqIO.to_dict(SeqIO.parse(files[0], 'fasta'))
            align = AlignIO.read(files[1], 'clustal')
            for record in align:
                # Each row is looked up once, by the id read from the file,
                # so a freshly assigned description is never re-matched.
                if record.id in seqs:
                    record.id = seqs[record.id].description
            with open('new_' + files[1], "w") as newalign:
                AlignIO.write(align, newalign, 'clustal')
def check_AlignIO_to_EMBOSS(self, in_filename, in_format, skip_formats=(), alphabet=None):
    """Check that EMBOSS seqret can read back what Bio.AlignIO writes.

    The input is parsed with AlignIO, piped through
    ``emboss_piped_AlignIO_convert`` in each candidate format (always
    converting to PHYLIP), and the result is compared with the original
    alignments.  Formats listed in ``skip_formats`` are not tried.
    """
    parse_args = (in_filename, in_format)
    if alphabet:
        parse_args = parse_args + (alphabet,)
    originals = list(AlignIO.parse(*parse_args))

    candidates = ["clustal", "phylip"]
    if len(originals) == 1:
        # These two are only attempted for single-alignment inputs.
        candidates += ["fasta", "nexus"]

    for fmt in (name for name in candidates if name not in skip_formats):
        # PHYLIP is a simple format which explicitly supports
        # multiple alignments (unlike FASTA).
        try:
            round_tripped = list(
                emboss_piped_AlignIO_convert(originals, fmt, "phylip"))
        except ValueError:
            # e.g. ValueError: Need a DNA, RNA or Protein alphabet
            # from writing Nexus files...
            continue
        try:
            self.assertTrue(compare_alignments(originals, round_tripped))
        except ValueError as err:
            details = (in_format, in_filename, fmt, err)
            raise ValueError(
                "Disagree on file %s %s in %s format: %s" % details)
def setUp(self):
    """Build one codon alignment per configured test input.

    Each entry of ``self.aln_file`` is indexed as ``(paths, mode)``.
    ``paths[0]`` is a nucleotide FASTA file and ``paths[1]`` a protein
    Clustal alignment.  ``mode`` selects how the nucleotides are loaded:

    * ``'parse'`` -- ``SeqIO.parse``
    * ``'index'`` -- ``SeqIO.index``, built with ``max_score=20``
    * ``'id'``    -- ``SeqIO.parse`` plus a correspondence dict read from
      ``paths[2]`` (first two whitespace-separated fields of each line)

    The resulting alignments are stored in ``self.alns``.

    Raises:
        ValueError: if an entry carries an unrecognised mode.
    """
    self.aln_file = [TEST_ALIGN_FILE1,
                     TEST_ALIGN_FILE2,
                     TEST_ALIGN_FILE3,
                     TEST_ALIGN_FILE4,
                     TEST_ALIGN_FILE5,
                     TEST_ALIGN_FILE6]
    alns = []
    for entry in self.aln_file:
        paths, mode = entry[0], entry[1]
        build_kwargs = {'alphabet': codonalign.default_codon_alphabet}
        if mode in ('parse', 'id'):
            nucl = SeqIO.parse(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
        elif mode == 'index':
            nucl = SeqIO.index(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            build_kwargs['max_score'] = 20
        else:
            # Previously an unknown mode silently re-used the alignment
            # built for the preceding entry.
            raise ValueError("Unknown test alignment mode: %r" % (mode,))
        try:
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            if mode == 'id':
                with open(paths[2]) as handle:
                    rows = (line.split() for line in handle)
                    build_kwargs['corr_dict'] = dict(
                        (fields[0], fields[1]) for fields in rows)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(prot, nucl, **build_kwargs)
            alns.append(caln)
        finally:
            nucl.close()  # Close the indexed FASTA file
    self.alns = alns
def read_alignment(alignment, informat, outformat, start, stop):
    """Extract columns ``start:stop`` of an alignment into a new file.

    The alignment file is read with a generic DNA alphabet.  The output
    name is the input name without its extension, followed by
    ``_pos<start>to<end>.<outformat>``, where ``<end>`` is ``stop`` clamped
    to the alignment length.  A preview of the sub-alignment is printed.

    Args:
        alignment: path of the input alignment file.
        informat: AlignIO format name of the input.
        outformat: AlignIO format name (and file extension) of the output.
        start: first column of the slice.
        stop: end column of the slice (exclusive, as in a Python slice).

    Exits the process with status 1 when ``stop < start`` or ``start < 0``
    (previously the "Fatal" path exited with status 0).
    """
    align = AlignIO.read(alignment, informat, alphabet=generic_dna)
    out_basename = os.path.splitext(alignment)[0]
    algn_length = align.get_alignment_length()
    print("\nInput alignment is " + str(algn_length) + " characters.")
    end_pos = stop
    if stop > algn_length:
        print("\nNB: you have requested an end position beyond the " +
              "length of the alignment. ")
        end_pos = algn_length
    if stop < start or start < 0:
        print("\nFatal: your begin and end positions need re-assessment." +
              " Exiting now.")
        print("")
        sys.exit(1)
    outname = (out_basename + "_pos" + str(start) + "to" + str(end_pos) +
               "." + outformat)
    with open(outname, "w") as output_handle:
        # Slicing clamps at the alignment length, so start:end_pos selects
        # the same columns as start:stop.
        algn = align[:, start:end_pos]
        AlignIO.write(algn, output_handle, outformat)
    print("\nExtracted " + outformat + "-formatted sub-alignment from " +
          "positions " + str(start) + " to " + str(end_pos) +
          " and written it to " + outname + ". Here is a preview:")
    print("")
    print(algn)
    print("")
def emboss_piped_AlignIO_convert(alignments, old_format, new_format):
    """Pipe alignments through seqret and return the result as a list.

    The alignments are written to seqret's stdin in ``old_format`` and its
    stdout is parsed as ``new_format``.  All pipes are closed and the child
    process is reaped whether or not writing or parsing raises; any such
    exception propagates to the caller.
    """
    # Setup, this assumes for all the format names used
    # Biopython and EMBOSS names are consistent!
    cline = SeqretCommandline(exes["seqret"],
                              sformat=old_format,
                              osformat=new_format,
                              auto=True,  # no prompting
                              filter=True)
    # Run the tool,
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    try:
        try:
            AlignIO.write(alignments, child.stdin, old_format)
        finally:
            # stdin must be closed before parsing so seqret sees end of input.
            child.stdin.close()
            child.stderr.close()
        return list(AlignIO.parse(child.stdout, new_format))
    finally:
        child.stdout.close()
        # Reap the child so it does not linger as a zombie process.
        child.wait()
def save(cls, alignments, filename, schema=None):
    """Write alignments to ``filename`` and report success.

    The output format is whatever ``cls.schema(filename, schema)`` returns.
    This is a best-effort call: any exception is printed together with the
    file name and turned into a ``False`` return value.

    Returns:
        True if the alignments were written, False otherwise.
    """
    try:
        AlignIO.write(alignments, filename, cls.schema(filename, schema))
    except Exception as e:
        print('Unable to save alignments to: %s\n%s' % (filename, str(e)))
        return False
    return True
def main():
    """Convert an alignment file to another format from the command line.

    Expects exactly three arguments: input file, input format and output
    format.  The output is written next to the input, with the input's
    extension replaced by the output format name.

    * ``nexus`` output is converted with an ambiguous DNA alphabet.
    * ``mega`` output is written by hand; the input is always parsed as
      ``phylip-relaxed`` in that case, regardless of the input format given.
    * every other format goes through ``AlignIO.convert``.

    Exits with status 1 when the argument count is wrong.
    """
    import os  # local import: the file's top-level imports are not visible here

    if len(sys.argv) != 4:
        print("Please provide file, the file format, and the desired file format ")
        sys.exit(1)

    f = sys.argv[1]
    formatin = sys.argv[2]
    formatout = sys.argv[3]
    # splitext keeps directories and inner dots intact; joining the
    # dot-split pieces turned "./data/a.b.phy" into "/data/ab".
    fout = os.path.splitext(f)[0]
    target = fout + '.' + formatout

    if formatout == 'nexus':
        AlignIO.convert(f, formatin, target, formatout,
                        alphabet=IUPAC.ambiguous_dna)
    elif formatout == 'mega':
        # Plain "r" replaces "rU", which newer Python versions reject.
        with open(f, "r") as handle:
            record_dict = SeqIO.to_dict(SeqIO.parse(handle, "phylip-relaxed"))
        with open(target, 'w') as outfile:
            outfile.write('#mega' + "\n")
            outfile.write('!Title Mytitle;' + "\n")
            outfile.write('!Format DataType=DNA indel=-;' + "\n\n")
            for n in record_dict:
                outfile.write('#' + n + "\n")
                for s in wrap(str(record_dict[n].seq), 60):
                    outfile.write(s + "\n")
    else:
        # This used to run for nexus output too (if/if/else), converting
        # the file a second time without an alphabet.
        AlignIO.convert(f, formatin, target, formatout)
def write_alignment(self, filename, file_format, interleaved=None):
    """Write ``self._msa`` to ``filename`` with Bio.AlignIO.

    A requested format of ``'phylip'`` is written as ``'phylip-relaxed'``;
    any other format name is passed through unchanged.  ``interleaved`` is
    accepted but not used here.
    """
    target_format = 'phylip-relaxed' if file_format == 'phylip' else file_format
    AlignIO.write(self._msa, filename, target_format)
def conversion(self, prank_number, prank_ext, format):
    """Run PRANK in convert mode and compare its output with the input.

    Checks the generated command line and the tool's messages, then reads
    the FASTA input and the converted file (parsed as ``format``) and
    asserts that ids and sequences agree record by record.  The converted
    file is deleted afterwards.
    """
    filename = "%s.%s" % (self.output, prank_ext)
    # Remove any stale output so the existence check below is meaningful.
    if os.path.isfile(filename):
        os.remove(filename)

    cmdline = PrankCommandline(prank_exe, d=self.input,
                               convert=True, f=prank_number,
                               o='"%s"' % self.output)
    expected_cmd = "".join([
        _escape_filename(prank_exe),
        ' -d=%s' % self.input,
        ' -o="%s"' % self.output,
        ' -f=%i' % prank_number,
        ' -convert',
    ])
    self.assertEqual(str(cmdline), expected_cmd)
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

    message, error = cmdline()
    self.assertTrue("PRANK" in message, message)
    announcement = "converting '%s' to '%s'" % (self.input, filename)
    self.assertTrue(announcement in message, message)
    self.assertEqual(error, "")
    self.assertTrue(os.path.isfile(filename))

    original = AlignIO.read(self.input, "fasta")
    # Hack...
    if format == "phylip":
        # Truncate ids to ten characters before comparing.
        for entry in original:
            entry.id = entry.id[:10]
    converted = AlignIO.read(filename, format)
    self.assertEqual(len(original), len(converted))
    for expected, actual in zip(original, converted):
        self.assertEqual(expected.id, actual.id)
        self.assertEqual(str(expected.seq), str(actual.seq))
    os.remove(filename)
def __init__(self, file_name=None, data=None, format='fasta'):
    """Create the alignment from a file, from in-memory text, or empty.

    Args:
        file_name: path of an alignment file; takes precedence over
            ``data`` when truthy.
        data: alignment text, used only when ``file_name`` is falsy.
        format: AlignIO format name for either source.

    With neither source the alignment starts with no records.
    """
    if file_name:
        records = AlignIO.read(file_name, format)
    elif data:
        # AlignIO.parse yields whole alignments, not records; read() returns
        # the single alignment, so both sources now hand the base class the
        # same kind of object.
        # NOTE(review): assumes the base class expects an iterable of
        # records, as the file_name branch supplies -- confirm.
        records = AlignIO.read(StringIO(data), format)
    else:
        records = []
    super(Alignment, self).__init__(records)
def get_newick_tree(self):
    """Run ``quicktree`` on this alignment and return its output text.

    quicktree expects a Stockholm format input file.  When the alignment
    already has a local file in that format it is used directly; otherwise
    the alignment is written to a temporary Stockholm file first.

    Returns:
        Whatever quicktree wrote to its standard output.
    """
    temp = None
    try:
        if self.local_file.name and self.format == "stockholm":
            fname = self.local_file.path
        else:
            temp = tempfile.NamedTemporaryFile()
            print("writing stockholm format file...")
            AlignIO.write([self.biopy_alignment], temp, "stockholm")
            temp.flush()
            fname = temp.name
        print("opening quicktree on stockholm format file %s" % fname)
        # subprocess.Popen hangs the Django dev server
        # there should be some elementary error checking here...
        quicktree_out = os.popen('quicktree %s' % fname)
        try:
            newick_tree = quicktree_out.read()
        finally:
            quicktree_out.close()
        print("quicktree finished")
        return newick_tree
    finally:
        # Closing the temporary file removes it, so this must only happen
        # after quicktree's output has been read (not just after popen()).
        if temp is not None:
            temp.close()
def tree(alignment, run_id='T%05i' % (0,), bionj=False):
    """Build a tree for ``alignment`` by running the external ``phyml``.

    The alignment is written in PHYLIP format to ``infile<run_id>`` inside
    the ``phyml`` data directory, phyml is run on it, and the file
    ``infile<run_id>_phyml_tree.txt`` is read back as Newick.

    Args:
        alignment: alignment to write with ``aio.write``.
        run_id: suffix used to name the input file.
        bionj: when true phyml is passed ``-o n``, otherwise ``-o tlr``.

    Returns:
        The tree read from phyml's output file.

    The working directory is changed for the duration of the call and is
    restored even if a step fails.
    """
    old_cwd = os.getcwd()
    new_wd = config.dataPath('phyml')
    if not os.path.isdir(new_wd):
        os.mkdir(new_wd)
    os.chdir(new_wd)
    try:
        infilepath = 'infile{0}'.format(run_id)
        with open(infilepath, 'w') as infile:
            aio.write(alignment, infile, 'phylip')
        command = 'phyml --quiet -i {0} -o {1} '.format(
            infilepath, 'n' if bionj else 'tlr')
        print(command)
        # communicate() drains the pipe; subprocess.call with stdout=PIPE
        # can block once the child fills the pipe buffer.
        child = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        child.communicate()
        treefilepath = infilepath + '_phyml_tree.txt'
        with open(treefilepath) as treefile:
            return phylo.read(treefile, 'newick')
    finally:
        os.chdir(old_cwd)
def design_primers(source_dir, target_dir, settings, logfile):
    """Copy readable FASTA alignments to ``target_dir`` and design primers.

    ``target_dir`` is purged first.  Every ``*.fasta`` file in
    ``source_dir`` that can be read as an alignment is copied across;
    failures are logged as warnings and skipped.  PriFi is then run on
    each copied file and the outcome is reported to ``logfile``.
    """
    print("\nDesigning primers using PriFi...\n", file=logfile)

    # get rid of previous files
    utils.purge_dir(target_dir)

    candidates = glob(os.path.join(source_dir, '*.fasta'))
    print("\tChecking for empty alignments...", file=logfile)
    for path in candidates:
        try:
            # Read only to verify the file parses; the result is not used.
            AlignIO.read(path, 'fasta')
            destination = os.path.join(target_dir, os.path.basename(path))
            shutil.copyfile(path, destination)
        except Exception:
            print("[WARNING] Empty alignment file?! (%s)" % path, file=logfile)

    # call PriFi for actual primer design
    for path in glob(os.path.join(target_dir, '*.fasta')):
        alignment = AlignIO.read(path, 'fasta')
        info = AlignInfo.SummaryInfo(alignment)
        width = alignment.get_alignment_length()
        pairs = prifipy.findprimers(0, list(alignment), info, width,
                                    settings, logfile)
        if pairs:
            print('%s: Found %d primer pair suggestions. Writing primer files:'
                  % (path, len(pairs)), file=logfile)
            prifipy.writePrimersToFiles(path, pairs, 1, logfile)
        else:
            print("%s: No valid primer pair found" % path, file=logfile)
def main(args):
    """Remove gap-only columns from an aligned FASTA file.

    Reads ``args.fasta``, drops every column that consists solely of ``-``
    and writes the result to ``<basename of args.fasta>_degapped.fasta`` in
    the current directory.  Progress is reported through ``logging``.
    """
    with open(args.fasta, 'r') as handle:
        align = AlignIO.read(handle, "fasta")

    old_length = align.get_alignment_length()
    logging.info('Examining {} columns of aligned fasta file'.format(old_length))

    to_delete = []
    for pos in range(old_length):
        column = align[:, pos]
        if column == '-' * len(column):
            to_delete.append(pos)

    if to_delete:
        logging.info('Removing {} gap-only columns: {}'.format(len(to_delete), to_delete))
        # Stitch together the runs of kept columns between deleted ones.
        # Deleting one column at a time rebuilt the whole alignment for
        # every gap-only column.
        trimmed = align[:, 0:0]
        run_start = 0
        for pos in to_delete:  # ascending by construction
            if pos > run_start:
                trimmed = trimmed + align[:, run_start:pos]
            run_start = pos + 1
        if run_start < old_length:
            trimmed = trimmed + align[:, run_start:]
        align = trimmed

    new_length = align.get_alignment_length()
    logging.info('Done! Old length: {} New length: {} Difference: {}'.
                 format(old_length, new_length, old_length - new_length))

    output_filename = os.path.basename(args.fasta) + '_degapped.fasta'
    with open(output_filename, 'w') as handle:
        AlignIO.write(align, handle, "fasta")
def multiple_alignment(fasta_dict, alignment_type=SeqTypeData().TYPE_DEFAULT):
    """Align the sequences of ``fasta_dict`` with an external aligner.

    The sequences are written as FASTA, piped to the command selected by
    ``SeqTypeData().type2cmd[alignment_type]``, and the tool's output is
    read as a Clustal alignment.

    Returns:
        A deep copy of ``fasta_dict`` in which each aligned id has been
        set to its aligned sequence.
    """
    in_handle = StringIO()
    fasta_tools.write_fasta_handle(in_handle, fasta_dict)

    muscle_cmd = SeqTypeData().type2cmd[alignment_type]
    child = subprocess.Popen(str(muscle_cmd),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=(sys.platform != "win32"))

    payload = in_handle.getvalue()
    if not isinstance(payload, bytes):
        payload = payload.encode('utf-8')
    # communicate() feeds stdin while draining stdout and stderr, so the
    # child cannot stall on a full pipe, and it waits for the child to exit.
    stdout_data, _ = child.communicate(payload)
    if not isinstance(stdout_data, str):
        stdout_data = stdout_data.decode()
    align = AlignIO.read(StringIO(stdout_data), "clustal")

    fd = copy.deepcopy(fasta_dict)
    for a in align:
        fd.set(a.id, str(a.seq))
    return fd
def split_family_seqs():
    """Split ``rfam/Rfam.seed`` into one FASTA and one pickle per family.

    For each Stockholm alignment in the seed file, the leading ``#`` lines
    are collected into a dict (key: characters 5-6 of the line, value: the
    concatenated text from character 8 onwards).  The alignment is written
    as ``<AC>.fa`` in the alignment directory and the dict is pickled to
    ``<AC>.pickle`` in the metadata directory, where ``<AC>`` is the
    stripped ``'AC'`` entry.
    """
    alis_dir = cfg.dataPath('rfam/family_alis/')
    meta_dir = cfg.dataPath('rfam/family_metas/')
    with open(cfg.dataPath('rfam/Rfam.seed')) as fopen:
        alis = aio.parse(fopen, 'stockholm')
        while True:
            infos = {}
            # Remember where this record starts: the header is scanned by
            # hand, then the handle is rewound so the parser sees the
            # record from its beginning.
            start = fopen.tell()
            while True:
                l = fopen.readline()
                if l == '':
                    break
                if l[0] == '#':
                    ukey = str(l[5:7])
                    infos[ukey] = infos.get(ukey, '') + l[8:]
                elif l.strip() != '':
                    # First non-blank, non-comment line ends the header.
                    break
            fopen.seek(start)

            # next(..., None) lets the loop end cleanly at end of file;
            # alis.next() raised StopIteration there.
            ali = next(alis, None)
            if not ali:
                break

            rfname = infos['AC'].strip()
            with open(os.path.join(alis_dir, rfname + '.fa'), 'w') as alifile:
                aio.write(ali, alifile, 'fasta')
            # Pickle data belongs in a binary-mode file.
            with open(os.path.join(meta_dir, rfname + '.pickle'), 'wb') as metafile:
                pickle.dump(infos, metafile)
def main():
    """Strip the file-name prefix from taxon names in NEXUS alignments.

    For every input file, each sequence name has a leading
    ``<file base name>`` plus any following underscores removed.  The
    renamed sequences are gathered into one alignment per input file and
    written in NEXUS format under ``args.output`` with the same file name.
    The set of taxon names seen so far must have exactly ``args.taxa``
    members after each file.
    """
    args = get_args()
    # iterate through all the files to determine the longest alignment
    files = get_files(args.input)
    all_taxa = set([])
    for count, f in enumerate(files):
        new_align = MultipleSeqAlignment([], generic_dna)
        fname = os.path.splitext(os.path.basename(f))[0]
        # Escape the file name so regex metacharacters in it are literal.
        prefix = re.compile("^{}_*".format(re.escape(fname)))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in list(align):
                new_seq_name = prefix.sub("", seq.name)
                all_taxa.add(new_seq_name)
                seq.id = new_seq_name
                seq.name = new_seq_name
                new_align.append(seq)
        # NOTE(review): assert is stripped under -O; kept so the raised
        # exception type is unchanged.
        assert len(all_taxa) == args.taxa, "Taxon names are not identical"
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            with open(outf, 'w') as out_handle:
                AlignIO.write(new_align, out_handle, 'nexus')
        except ValueError:
            # NOTE(review): debugging hook left in place; consider logging
            # or re-raising instead of dropping into the debugger.
            pdb.set_trace()
        print(count)
    print("Taxon names in alignments: {0}".format(','.join(list(all_taxa))))
def _input(a): """The function converts alignments to matrix for further use. Arguments: -a- alignment file Example: >>>import os >>>import numpy as np >>>from Bio import AlignIO >>>_input("example.fasta")""" fileName, fileExtension = os.path.splitext(a) if fileExtension == ".phylip": try: l = list(AlignIO.read(a,"phylip")) except (ValueError): l = list(AlignIO.read(a,"phylip-relaxed")) except: pass elif fileExtension == ".fasta": l = list(AlignIO.read(a,"fasta")) else: raise Exception("Wrong format. Choose accepted format.") p = [[i for i in str(l[j].seq)] for j in range(0,len(l))] y = np.array(p) return(y)
def conversion(self, prank_number, prank_ext, format):
    """Get PRANK to do a conversion, and check it with AlignIO.

    Runs PRANK in convert mode through ``Application.generic_run``, checks
    the command line, return code and messages, then compares the FASTA
    input with the converted file record by record.  The converted file is
    removed afterwards.
    """
    filename = "%s.%s" % (self.output, prank_ext)
    if os.path.isfile(filename):
        os.remove(filename)
    cmdline = PrankCommandline(prank_exe, d=self.input,
                               convert=True, f=prank_number,
                               o='"%s"' % self.output)
    self.assertEqual(str(cmdline), prank_exe
                     + ' -d=%s' % self.input
                     + ' -o="%s"' % self.output
                     + ' -f=%i' % prank_number
                     + ' -convert')
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    result, stdout, stderr = Application.generic_run(cmdline)
    self.assertEqual(result.return_code, 0)
    message = stdout.read().strip()
    self.assertTrue(("PRANK: converting '%s' to '%s'" % (self.input, filename))
                    in message, message)
    self.assertEqual(stderr.read(), "")
    self.assertEqual(str(result._cl), str(cmdline))
    self.assertTrue(os.path.isfile(filename))
    with open(self.input) as handle:
        old = AlignIO.read(handle, "fasta")
    # Hack...
    if format == "phylip":
        # Truncate ids to ten characters before comparing.
        for record in old:
            record.id = record.id[:10]
    with open(filename) as handle:
        new = AlignIO.read(handle, format)
    # assertEqual, unlike a bare assert, still runs under -O and reports
    # both values.
    self.assertEqual(len(old), len(new))
    for old_r, new_r in zip(old, new):
        self.assertEqual(old_r.id, new_r.id)
        self.assertEqual(str(old_r.seq), str(new_r.seq))
    os.remove(filename)
def add(alignment, sequence, timeout, logger, wd, threads):
    """Align sequence(s) to an alignment using mafft (external program).

    The alignment and the new sequence(s) are written as FASTA into ``wd``,
    mafft is run there through a ``TerminationPipe`` with the given
    timeout, and the input files are removed again.

    Returns:
        The alignment read from mafft's output, or, when the pipe reports
        failure, ``genNonAlignment(len(alignment) + 1, <alignment length>)``.

    Raises:
        MafftError: if the pipe did not report failure but its output
            could not be read as a FASTA alignment.
    """
    alignment_file = "alignment_in.fasta"
    sequence_file = "sequence_in.fasta"
    output_file = "alignment_out.fasta" + ".fasta"
    command_line = "{0} --auto --thread {1} --add {2} {3} > {4}".format(
        mafft, threads, sequence_file, alignment_file, output_file
    )
    sequence_path = os.path.join(wd, sequence_file)
    alignment_path = os.path.join(wd, alignment_file)
    output_path = os.path.join(wd, output_file)

    with open(sequence_path, "w") as handle:
        SeqIO.write(sequence, handle, "fasta")
    with open(alignment_path, "w") as handle:
        AlignIO.write(alignment, handle, "fasta")

    pipe = TerminationPipe(command_line, timeout=timeout, cwd=wd)
    pipe.run()
    os.remove(alignment_path)
    os.remove(sequence_path)

    if pipe.failure:
        logger.debug(".... add timeout ....")
        # get_alignment_length() already returns the length; wrapping it in
        # len() raised TypeError on this path.
        return genNonAlignment(len(alignment) + 1,
                               alignment.get_alignment_length())

    try:
        res = AlignIO.read(output_path, "fasta")
    except Exception:
        logger.info(pipe.output)
        raise MafftError()
    os.remove(output_path)
    return res
def check_bootstrap(self, filename, format, align_type="d"):
    """Check we can use fseqboot to pseudosample an alignment.

    The align_type argument is passed to the command line object to set
    the output format to use (from [D]na, [p]rotein and [r]na).

    fseqboot is run with two replicates writing to ``test_file``; the
    output must contain two alignments whose sequence names (spaces
    replaced by underscores) match those of the original file.

    Raises:
        ValueError: if fseqboot returns a non-zero exit code.
    """
    self.assertTrue(os.path.isfile(filename), "Missing %s" % filename)
    cline = FSeqBootCommandline(exes["fseqboot"],
                                sequence=filename,
                                outfile="test_file",
                                seqtype=align_type,
                                reps=2,
                                auto=True, filter=True)
    return_code = run_command(cline)
    if return_code != 0:
        raise ValueError("Return code %s from:\n%s"
                         % (return_code, str(cline)))
    # the resultant file should have 2 alignments...
    with open("test_file", "r") as handle:
        bs = list(AlignIO.parse(handle, format))
    self.assertEqual(len(bs), 2)
    # ..and each name in the original alignment...
    with open(filename, "r") as handle:
        a_names = [s.name.replace(" ", "_")
                   for s in AlignIO.read(handle, format)]
    # ...should be in each alignment in the bootstrapped file
    for a in bs:
        self.assertEqual(a_names, [s.name.replace(" ", "_") for s in a])
def conversion(self, prank_number, prank_ext, format):
    """Get PRANK to do a conversion, and check it with AlignIO.

    Runs PRANK in convert mode as a subprocess, checks the command line,
    return code and messages, then compares the FASTA input with the
    converted file record by record.  The converted file is removed
    afterwards.
    """
    filename = "%s.%s" % (self.output, prank_ext)
    if os.path.isfile(filename):
        os.remove(filename)
    cmdline = PrankCommandline(prank_exe, d=self.input, convert=True,
                               f=prank_number, o='"%s"' % self.output)
    self.assertEqual(
        str(cmdline),
        prank_exe + " -d=%s" % self.input + ' -o="%s"' % self.output
        + " -f=%i" % prank_number + " -convert",
    )
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    child = subprocess.Popen(
        str(cmdline), stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        shell=(sys.platform != "win32")
    )
    # communicate() reads both pipes while waiting and closes them; calling
    # wait() first can block if the child fills a pipe buffer.
    stdout_data, stderr_data = child.communicate()
    self.assertEqual(child.returncode, 0)
    message = stdout_data.strip()
    self.assertTrue(("PRANK: converting '%s' to '%s'" % (self.input, filename))
                    in message, message)
    self.assertEqual(stderr_data, "")
    self.assertTrue(os.path.isfile(filename))
    with open(self.input) as handle:
        old = AlignIO.read(handle, "fasta")
    # Hack...
    if format == "phylip":
        # Truncate ids to ten characters before comparing.
        for record in old:
            record.id = record.id[:10]
    with open(filename) as handle:
        new = AlignIO.read(handle, format)
    self.assertEqual(len(old), len(new))
    for old_r, new_r in zip(old, new):
        self.assertEqual(old_r.id, new_r.id)
        self.assertEqual(str(old_r.seq), str(new_r.seq))
    os.remove(filename)
def load_tree(seqfname):
    """Load an alignment, build & prep a tree, return the tree object.

    ``.aln`` files are read as Clustal alignments; ``.fasta`` files are
    first aligned with the external ``mafft``.  The tree is built with the
    external ``fasttree`` from the blocks returned by
    ``alnutils.blocks(aln, 0.4)``.  Clades whose confidence is below the
    mean confidence are collapsed, the tree is ladderized and the root
    branch length is set to 0.0.

    Raises:
        ValueError: if the file name ends in neither ``.aln`` nor ``.fasta``.
    """
    if seqfname.endswith('.aln'):
        aln = AlignIO.read(seqfname, 'clustal')
    elif seqfname.endswith('.fasta'):
        # Run MAFFT quickly
        # universal_newlines=True yields text rather than bytes, which is
        # what StringIO needs on Python 3.
        alndata = subprocess.check_output(
            ['mafft', '--quiet', '--auto', seqfname],
            universal_newlines=True)
        aln = AlignIO.read(StringIO(alndata), 'fasta')
    else:
        raise ValueError("Input sequences must be a Clustal alignment (.aln) "
                         "or unaligned FASTA (.fasta)")

    # Use conserved (less-gappy) blocks to build the tree
    aln = alnutils.blocks(aln, 0.4)
    with tempfile.NamedTemporaryFile(mode='w') as tmp:
        AlignIO.write(aln, tmp, 'fasta')
        tmp.flush()
        treedata = subprocess.check_output(
            ['fasttree', '-pseudo', '-gamma', '-wag', tmp.name],
            universal_newlines=True)
    tree = Phylo.read(StringIO(treedata), 'newick')

    # Collapse weakly supported splits
    confs = [c.confidence
             for c in tree.find_clades()
             if c.confidence is not None]
    # ENH: accept min_confidence as an option
    if confs:
        # Without any confidence values the mean is undefined (this used to
        # divide by zero), so collapsing is skipped.
        min_confidence = math.fsum(confs) / len(confs)
        # A missing confidence is treated as below the threshold, which is
        # how Python 2 ordered None against numbers; Python 3 would raise.
        tree.collapse_all(
            lambda c: c.confidence is None or c.confidence < min_confidence)
    tree.ladderize(reverse=True)
    tree.root.branch_length = 0.0
    return tree
def concat_sequences(file1, file2, file3, file4, file5, file6, file7, file8, file9, file10):
    """Concatenate per-strain sequences across ten FASTA alignments.

    All ten files are read first.  For every record of ``file1`` the
    strain name is obtained with ``util.get_strain_name`` and a matching
    record is looked up in each of the other nine alignments with
    ``util.get_matching_sequence``.  Strains with a truthy match in all
    nine contribute one entry.

    Returns:
        A list of ``[strain_name, concatenated_seq]`` pairs, sequences
        joined in file order 1..10.
    """
    paths = (file1, file2, file3, file4, file5,
             file6, file7, file8, file9, file10)
    alignments = [AlignIO.read(path, 'fasta') for path in paths]
    reference, others = alignments[0], alignments[1:]

    complete_sequences = []
    for record in reference:
        strain_name = util.get_strain_name(record)
        matches = [util.get_matching_sequence(other, strain_name=strain_name)
                   for other in others]
        if not all(matches):
            # Strain missing from at least one alignment: skip it.
            continue
        combined = record.seq
        for match in matches:
            combined = combined + match.seq
        complete_sequences.append([strain_name, combined])
    return complete_sequences
def writing(seqs, seq_descs, seq_ids, filename):
    """Write sequences as a FASTA alignment under the output directory.

    The output directory is taken from ``sys.argv[3]`` and a subfolder
    named after ``sys.argv[1]`` is used inside it; both are created when
    missing.  The alignment is written to ``<subfolder>/<filename>.output``.

    Args:
        seqs: sequence strings, wrapped as generic protein sequences.
        seq_descs: descriptions, parallel to ``seqs``.
        seq_ids: ids, parallel to ``seqs``.
        filename: base name of the output file (without extension).
    """
    outdir = sys.argv[3]  # Output directory
    if os.path.isdir(outdir):
        print("Directory exists. New directory not created")
    else:
        # os.makedirs avoids building a shell command from the path, which
        # broke on spaces and shell metacharacters.
        os.makedirs(outdir)

    # outpath defines path of the subfolder we want to store results in
    outpath = outdir + '/' + sys.argv[1]
    if not os.path.isdir(outpath):
        os.makedirs(outpath)

    # write the result to output
    align = MultipleSeqAlignment([])
    output_file = outpath + '/' + filename + '.' + 'output'
    for index, sequence in enumerate(seqs):
        align.append(SeqRecord(Seq(sequence, generic_protein),
                               id=seq_ids[index],
                               description=seq_descs[index]))
    AlignIO.write(align, output_file, "fasta")
def __init__(self, aln, treef, cmd=None):
    """Prepare alignment and tree inputs for a rate4site run.

    Args:
        aln: path of a FASTA alignment file, or an alignment object.  An
            object is written to a temporary FASTA file kept in
            ``self.alnfile``.
        treef: a ``dendropy.Tree`` or the path of a Newick tree file.
        cmd: executable name; defaults to ``'rate4site64'`` on a 64-bit
            Python and ``'rate4site'`` otherwise.

    The tree is narrowed to the alignment with ``narrow_tree`` and written
    to a temporary file kept in ``self.treefile``.

    Raises:
        ValueError: if ``treef`` is neither a tree nor an existing file.
    """
    import sys

    try:
        aln_is_path = os.path.isfile(aln)
    except TypeError:
        # An alignment object is not a path-like value.
        aln_is_path = False
    if aln_is_path:
        self.alnfile = aln
        with open(self.alnfile) as handle:
            self.aln = AlignIO.read(handle, 'fasta')
    else:
        self.aln = aln
        self.alnfile = tempfile.NamedTemporaryFile()
        AlignIO.write(aln, self.alnfile, 'fasta')
        self.alnfile.flush()

    if not cmd:
        # sys.maxsize exists on Python 2 and 3, unlike sys.maxint.
        if sys.maxsize == 9223372036854775807:  # 64 bit
            cmd = 'rate4site64'
        else:
            cmd = 'rate4site'

    if isinstance(treef, dendropy.Tree):
        parent_tree = treef
    elif os.path.isfile(treef):
        parent_tree = dendropy.Tree.get_from_path(treef, 'newick')
    else:
        # Without this, parent_tree was simply unbound below.
        raise ValueError("treef must be a dendropy.Tree or an existing "
                         "newick file: %r" % (treef,))

    self.tree = narrow_tree(parent_tree, self.aln)
    self.treefile = tempfile.NamedTemporaryFile()
    # The first five characters of the Newick string are dropped.
    # NOTE(review): presumably a rooting prefix such as "[&U] " -- confirm.
    self.treefile.write(self.tree.as_string('newick', internal_labels=False)[5:])
    self.treefile.flush()
    self.cmd = cmd
def muscle_align_protein(recs, work_dir, outfmt="fasta", inputorder=True):
    """Align given proteins with muscle.

    recs are iterable of Biopython SeqIO objects

    The records are written to ``prot-start.fasta`` in ``work_dir`` and
    muscle writes its result to ``prot.aln`` there.  With ``inputorder``
    set, ``muscle_inputorder`` is applied to the output, which is then
    re-read as FASTA.

    Returns:
        The alignment formatted as ``outfmt`` (``"fasta"`` or
        ``"clustal"``), ``""`` if ``muscle_inputorder`` raises ValueError,
        or None for any other ``outfmt``.
    """
    fasta_file = op.join(work_dir, "prot-start.fasta")
    align_file = op.join(work_dir, "prot.aln")
    # Close (and so flush) the input before muscle reads it.
    with open(fasta_file, "w") as handle:
        SeqIO.write(recs, handle, "fasta")

    muscle_cl = MuscleCommandline(cmd=MUSCLE_BIN("muscle"),
                                  input=fasta_file, out=align_file,
                                  seqtype="protein", clwstrict=True)
    stdout, stderr = muscle_cl()
    alignment = AlignIO.read(muscle_cl.out, "clustal")

    if inputorder:
        try:
            muscle_inputorder(muscle_cl.input, muscle_cl.out)
        except ValueError:
            return ""
        alignment = AlignIO.read(muscle_cl.out, "fasta")

    sys.stderr.write("\tDoing muscle alignment: %s\n" % muscle_cl)
    if outfmt == "fasta":
        return alignment.format("fasta")
    if outfmt == "clustal":
        return alignment.format("clustal")
def check_convert(in_filename, in_format, out_format, alphabet=None):
    """Assert that AlignIO.convert agrees with an explicit parse + write.

    The reference output comes from parsing the whole file and writing it
    to an in-memory handle.  ``AlignIO.convert`` is then run twice, once
    given the file name and once given an open handle, and both the
    returned count and the text written must equal the reference.  A
    ValueError from writing or converting counts as zero alignments.
    """
    # Reference: write it out using parse/write
    reference = StringIO()
    parsed = list(AlignIO.parse(in_filename, in_format, None, alphabet))
    try:
        expected = AlignIO.write(parsed, reference, out_format)
    except ValueError:
        expected = 0

    # First pass converts from the filename, second pass from a handle.
    for from_handle in (False, True):
        sink = StringIO()
        try:
            if from_handle:
                with open(in_filename) as source:
                    written = AlignIO.convert(source, in_format, sink,
                                              out_format, alphabet)
            else:
                written = AlignIO.convert(in_filename, in_format, sink,
                                          out_format, alphabet)
        except ValueError:
            written = 0
        assert expected == written
        assert reference.getvalue() == sink.getvalue()
def filter_out_alignments_with_too_much_missing_data(input_filename, output_filename, filter_percentage, verbose):
    """Drop sequences with too many gaps or Ns from a FASTA alignment.

    A sequence is kept when the whole-number percentage of ``n``, ``N``
    and ``-`` characters in it is at most ``filter_percentage``.  Empty
    sequences are always dropped.  The kept sequences are written to
    ``output_filename`` as one FASTA alignment.

    Args:
        input_filename: FASTA alignment to read.
        output_filename: FASTA file to write.
        filter_percentage: maximum allowed percentage of missing data.
        verbose: accepted but not used here.

    Returns:
        The ids of the sequences that were excluded.

    Exits the process (via ``sys.exit`` with a message) when one or no
    sequence is left.
    """
    output_alignments = []
    taxa_removed = []
    number_of_included_alignments = 0
    # Plain "r" replaces "rU", which newer Python versions reject.  The
    # context manager closes both handles on the early-exit path too.
    with open(input_filename, "r") as input_handle, \
            open(output_filename, "w+") as output_handle:
        for alignment in AlignIO.parse(input_handle, "fasta"):
            for record in alignment:
                number_of_gaps = (record.seq.count('n')
                                  + record.seq.count('N')
                                  + record.seq.count('-'))
                sequence_length = len(record.seq)

                if sequence_length == 0:
                    taxa_removed.append(record.id)
                    print("Excluded sequence " + record.id + " because there werent enough bases in it")
                    continue

                # Floor division keeps the whole-number percentage on both
                # Python 2 and 3.
                gap_percentage = number_of_gaps * 100 // sequence_length
                if gap_percentage <= filter_percentage:
                    output_alignments.append(record)
                    number_of_included_alignments += 1
                else:
                    taxa_removed.append(record.id)
                    print("Excluded sequence " + record.id + " because it had " + str(gap_percentage) + " percentage gaps while a maximum of " + str(filter_percentage) + " is allowed")

        if number_of_included_alignments <= 1:
            sys.exit("Too many sequences have been excluded so theres no data left to work with. Please increase the -f parameter")

        AlignIO.write(MultipleSeqAlignment(output_alignments), output_handle, "fasta")
    return taxa_removed
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
    """Build a maximum likelihood tree of DNA seqs with RAxML.

    The alignment is written as relaxed PHYLIP to ``<work_dir>/work/aln.phy``
    and RAxML is run in a ``raxml_work`` directory next to it.  On success
    ``RAxML_bipartitions.aln`` is copied to ``outfile``.  The RAxML working
    directory is removed in either case.

    Args:
        alignment: alignment to write with AlignIO.
        outfile: destination path for the resulting tree file.
        work_dir: parent directory for the ``work`` folder.
        **kwargs: extra options passed to ``RaxmlCommandline``.

    Returns:
        ``(outfile, phy_file)`` on success, or None if RAxML produced no
        bipartitions file.
    """
    work_dir = op.join(work_dir, "work")
    mkdir(work_dir)
    phy_file = op.join(work_dir, "aln.phy")
    # Close (and so flush) the PHYLIP file before RAxML reads it.
    with open(phy_file, "w") as handle:
        AlignIO.write(alignment, handle, "phylip-relaxed")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"),
                                sequences=phy_file, algorithm="a",
                                model="GTRGAMMA",
                                parsimony_seed=12345,
                                rapid_bootstrap_seed=12345,
                                num_replicates=100, name="aln",
                                working_dir=raxml_work, **kwargs)

    logging.debug("Building ML tree using RAxML: %s" % raxml_cl)
    stdout, stderr = raxml_cl()

    tree_file = "{0}/RAxML_bipartitions.aln".format(raxml_work)
    if not op.exists(tree_file):
        print("***RAxML failed.", file=sys.stderr)
        sh("rm -rf %s" % raxml_work, log=False)
        return None
    sh("cp {0} {1}".format(tree_file, outfile), log=False)

    logging.debug("ML tree printed to %s" % outfile)
    sh("rm -rf %s" % raxml_work)

    return outfile, phy_file
def test_read_write_clustal(self): """Test the base alignment stuff.""" path = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln") alignment = AlignIO.read(path, "clustal", alphabet=Alphabet.Gapped( IUPAC.unambiguous_dna)) self.assertEqual(len(alignment), 7) seq_record = alignment[0] self.assertEqual(seq_record.description, "gi|6273285|gb|AF191659.1|AF191") self.assertEqual( seq_record.seq, Seq("TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCCATTGATTTAGTGTACCAGA" )) seq_record = alignment[1] self.assertEqual(seq_record.description, "gi|6273284|gb|AF191658.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATA--------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[2] self.assertEqual(seq_record.description, "gi|6273287|gb|AF191661.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[3] self.assertEqual(seq_record.description, "gi|6273286|gb|AF191660.1|AF191") self.assertEqual( seq_record.seq, "TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTATAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[4] self.assertEqual(seq_record.description, "gi|6273290|gb|AF191664.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[5] self.assertEqual(seq_record.description, "gi|6273289|gb|AF191663.1|AF191") self.assertEqual( seq_record.seq, 
"TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTATACCAGA" ) seq_record = alignment[6] self.assertEqual(seq_record.description, "gi|6273291|gb|AF191665.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) self.assertEqual(alignment.get_alignment_length(), 156) align_info = AlignInfo.SummaryInfo(alignment) consensus = align_info.dumb_consensus() self.assertIsInstance(consensus, Seq) self.assertEqual( consensus, "TATACATTAAAGXAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTXCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) dictionary = align_info.replacement_dictionary(["N"]) self.assertEqual(len(dictionary), 16) self.assertAlmostEqual(dictionary[("A", "A")], 1395.0, places=1) self.assertAlmostEqual(dictionary[("A", "C")], 3.0, places=1) self.assertAlmostEqual(dictionary[("A", "G")], 13.0, places=1) self.assertAlmostEqual(dictionary[("A", "T")], 6.0, places=1) self.assertAlmostEqual(dictionary[("C", "A")], 3.0, places=1) self.assertAlmostEqual(dictionary[("C", "C")], 271.0, places=1) self.assertAlmostEqual(dictionary[("C", "G")], 0, places=1) self.assertAlmostEqual(dictionary[("C", "T")], 16.0, places=1) self.assertAlmostEqual(dictionary[("G", "A")], 5.0, places=1) self.assertAlmostEqual(dictionary[("G", "C")], 0, places=1) self.assertAlmostEqual(dictionary[("G", "G")], 480.0, places=1) self.assertAlmostEqual(dictionary[("G", "T")], 0, places=1) self.assertAlmostEqual(dictionary[("T", "A")], 6.0, places=1) self.assertAlmostEqual(dictionary[("T", "C")], 12.0, places=1) self.assertAlmostEqual(dictionary[("T", "G")], 0, places=1) self.assertAlmostEqual(dictionary[("T", "T")], 874.0, places=1) matrix = align_info.pos_specific_score_matrix(consensus, ["N"]) self.assertEqual( 
str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 X 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 T 0.0 0.0 0.0 3.0 A 3.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 X 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 
0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) matrix = align_info.pos_specific_score_matrix(chars_to_ignore=["N"]) self.assertEqual( str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 X 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 T 
0.0 0.0 0.0 3.0 A 3.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 X 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) second_seq = alignment[1].seq matrix = align_info.pos_specific_score_matrix(second_seq, ["N"]) self.assertEqual( str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 
7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 - 0.0 0.0 0.0 3.0 - 3.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 
0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) value = align_info.information_content(5, 50, chars_to_ignore=["N"]) self.assertAlmostEqual(value, 88.42, places=2) value = align_info.information_content(chars_to_ignore=["N"]) self.assertAlmostEqual(value, 287.55, places=2) e_freq = {"G": 0.25, "C": 0.25, "A": 0.25, "T": 0.25} e_freq_table = FreqTable.FreqTable(e_freq, FreqTable.FREQ, IUPAC.unambiguous_dna) value = align_info.information_content(e_freq_table=e_freq_table, chars_to_ignore=["N"]) self.assertAlmostEqual(value, 287.55, places=2) self.assertEqual(align_info.get_column(1), "AAAAAAA") self.assertAlmostEqual(align_info.ic_vector[1], 2.00, places=2) self.assertEqual(align_info.get_column(7), "TTTATTT") self.assertAlmostEqual(align_info.ic_vector[7], 1.41, places=2) handle = StringIO() AlignInfo.print_info_content(align_info, fout=handle) self.assertEqual( handle.getvalue(), """\ 0 T 2.000 1 A 2.000 2 T 2.000 3 A 2.000 4 C 2.000 5 A 2.000 6 T 2.000 7 T 1.408 8 A 2.000 9 A 2.000 10 A 2.000 11 G 2.000 12 A 1.015 13 A 2.000 14 G 2.000 15 G 2.000 16 G 2.000 17 G 2.000 18 G 2.000 19 A 2.000 20 T 2.000 21 G 2.000 22 C 2.000 23 G 2.000 24 G 2.000 25 A 2.000 26 T 2.000 27 A 2.000 28 A 2.000 29 A 2.000 30 T 2.000 31 G 2.000 32 G 2.000 33 A 2.000 34 A 2.000 35 A 2.000 36 G 2.000 37 G 2.000 38 C 2.000 39 G 
2.000 40 A 2.000 41 A 2.000 42 A 2.000 43 G 2.000 44 A 2.000 45 A 2.000 46 A 2.000 47 G 2.000 48 A 2.000 49 A 2.000 50 T 2.000 51 A 2.000 52 T 2.000 53 A 2.000 54 T 2.000 55 A 2.000 56 - 0.682 57 - 0.682 58 - 0.333 59 - 0.333 60 - -0.115 61 - -0.115 62 - -0.115 63 - -0.115 64 - -0.115 65 - -0.115 66 A 2.000 67 T 2.000 68 A 2.000 69 T 2.000 70 A 2.000 71 T 2.000 72 T 2.000 73 T 2.000 74 C 1.408 75 A 1.408 76 A 2.000 77 A 2.000 78 T 2.000 79 T 2.000 80 T 1.015 81 C 2.000 82 C 2.000 83 T 2.000 84 T 2.000 85 A 2.000 86 T 2.000 87 A 2.000 88 T 2.000 89 A 2.000 90 C 1.137 91 C 2.000 92 C 2.000 93 A 2.000 94 A 2.000 95 A 2.000 96 T 2.000 97 A 2.000 98 T 2.000 99 A 2.000 100 A 2.000 101 A 2.000 102 A 2.000 103 A 2.000 104 T 2.000 105 A 2.000 106 T 2.000 107 C 2.000 108 T 2.000 109 A 2.000 110 A 2.000 111 T 2.000 112 A 2.000 113 A 2.000 114 A 2.000 115 T 2.000 116 T 2.000 117 A 2.000 118 G 2.000 119 A 2.000 120 T 2.000 121 G 2.000 122 A 2.000 123 A 2.000 124 T 2.000 125 A 2.000 126 T 2.000 127 C 2.000 128 A 2.000 129 A 2.000 130 A 2.000 131 G 2.000 132 A 2.000 133 A 2.000 134 T 2.000 135 C 2.000 136 C 1.408 137 A 2.000 138 T 2.000 139 T 2.000 140 G 2.000 141 A 2.000 142 T 2.000 143 T 2.000 144 T 2.000 145 A 2.000 146 G 2.000 147 T 2.000 148 G 1.408 149 T 2.000 150 A 2.000 151 C 2.000 152 C 2.000 153 A 2.000 154 G 2.000 155 A 2.000 """)
def read_gene(self, gene):
    """Read ``binning.fulltree/<gene>/<gene>.fasta`` under ``self.path``.

    The FASTA alignment is parsed with AlignIO and the result of passing
    it through ``ali_upper`` is returned.
    """
    fasta_path = os.path.join(self.path, 'binning.fulltree', gene, gene + '.fasta')
    parsed = AlignIO.read(fasta_path, 'fasta')
    return ali_upper(parsed)
def save_atree(self, t, a, gene):
    """Save tree ``t`` and alignment ``a`` for ``gene`` under ``self.path/genes``.

    The tree goes to ``<gene>.nwk`` (newick schema, unquoted underscores)
    and the alignment to ``<gene>.fasta``.
    """
    bfn = os.path.join(self.path, 'genes', gene)
    t.write(path=bfn + '.nwk', unquoted_underscores=True, schema='newick')
    # The context manager closes the file even if writing the alignment fails.
    with open(bfn + '.fasta', 'w') as f:
        AlignIO.write(a, f, 'fasta')
from Bio import AlignIO

# Load the single alignment stored in the Stockholm file.
alignment = AlignIO.read("PF05371_seed.sth", "stockholm")

# Report the number of columns, then one "sequence - id" line per row.
column_count = alignment.get_alignment_length()
print ("Aligment length %i" % column_count)
for record in alignment:
    row_text = "%s - %s" % (record.seq, record.id)
    print(row_text)

print ("Luz Maria Rosas Salcedo")
from Bio import AlignIO
import re
from common import is_valid_file

# Command line: the alignment file (validated by is_valid_file) and the
# number of bootstrap replicates.
parser = argparse.ArgumentParser(
    description='Run Phylip bootstrap analysis on protein alignments')
parser.add_argument(
    'file',
    type=lambda f: is_valid_file(f, parser),
    help='Protein alignment file in FASTA format')
parser.add_argument(
    'num_replicates',
    help='Number of bootstrap replicates to run')
args = parser.parse_args()

# Read the alignment before changing directory, so that a relative input
# path is still resolved against the original working directory.
alignment = AlignIO.read(args.file, 'fasta')

tempdir = tempfile.TemporaryDirectory()
os.chdir(tempdir.name)
infile = tempfile.NamedTemporaryFile(mode='w+t')

# Strip runs of zeros that directly follow a non-digit character and
# remember which original id each first-10-characters key came from.
zero_padding = re.compile(r'(?<=\D)0+')
short_acc_to_long_acc = {}
for align in alignment:
    original_id = align.id
    compressed_accession = zero_padding.sub('', original_id)
    short_acc_to_long_acc[compressed_accession[:10]] = original_id
    align.id = compressed_accession

AlignIO.write(alignment, infile, 'phylip')
infile.seek(0)

# Start seqboot and hand it the temporary PHYLIP file.
seqboot = pexpect.spawn('seqboot', encoding='utf-8')
seqboot.expect('Please enter a new file name')
seqboot.sendline(infile.name)
seqboot.expect('Y to accept')
# @Project: M_BioPy
# @Last modified time: 2019-04-15T11:53:16+08:00

from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO

# All inputs and outputs live in one experiment directory.
base = r"C:\Users\Nature\Desktop\M_BioPy\exp\material\BioPy_exp4"
clustalw_exe = base + r"\clustalw2.exe"
in_file = base + r"\inputFasta"
out_file = base + r"\OutFasta.aln"

# Run ClustalW on the input sequences.
clustalw_cline = ClustalwCommandline(
    clustalw_exe,
    infile=in_file,
    outfile=out_file,
)
clustalw_cline()

# Load the alignment that ClustalW produced.
alignment = AlignIO.read(out_file, "clustal")

# Dump the text form of every record, each followed by a blank line.
annotationOutFile = base + r"\OutAnnotation.txt"
with open(annotationOutFile, "wt") as outfile:
    outfile.writelines(str(record) + "\n\n" for record in alignment)

# Keep only the first ten and the last ten columns, joined side by side.
newAlignmentFile = base + r"\OutFasta2.aln"
leading_columns = alignment[:, :10]
trailing_columns = alignment[:, -10:]
newAlignment = leading_columns + trailing_columns
AlignIO.write(newAlignment, newAlignmentFile, "clustal")
def test_format_conversion(self):
    """Format the opuntia alignment as FASTA and as Clustal text."""
    source = os.path.join(os.curdir, "Clustalw", "opuntia.aln")
    parsed = AlignIO.read(source, "clustal")

    as_fasta = format(parsed, "fasta")
    self.assertEqual(as_fasta, opuntia_fasta)

    as_clustal = format(parsed, "clustal")
    self.assertEqual(as_clustal, opuntia_clustal)
def test_read_clustal1(self):
    """Read opuntia.aln and compare its Clustal rendering to the reference."""
    source = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln")
    parsed = AlignIO.read(source, "clustal")
    rendered = format(parsed, "clustal")
    self.assertEqual(rendered, opuntia_clustal)
# Wrap every raw protein sequence in a SeqRecord named test0, test1, ...
records = []
for i, seq in enumerate(seqs):
    seqObj = Seq(seq, IUPAC.protein)
    name = 'test%d' % i
    recordObj = SeqRecord(seqObj, id=name, description='demo only')
    records.append(recordObj)

# Write the FASTA input for ClustalW; the file is closed before the
# external program is started.
with open(fastaFileName, "w") as outFileObj:
    SeqIO.write(records, outFileObj, "fasta")

cmdArgs = [
    'clustalw', '-INFILE=' + fastaFileName, '-OUTFILE=' + alignFileName
]
call(cmdArgs)

# Read the ClustalW output, closing the handle once it has been parsed.
with open(alignFileName) as fileObj:
    alignment = AlignIO.read(fileObj, "clustal")

print('\nClustalW alignment\n')
print("Alignment length %i" % alignment.get_alignment_length())
for record in alignment:
    print(record.seq, record.id)

alignments = [
    alignment,
]
# Closing the handle guarantees the PHYLIP data is flushed to disk.
with open("test2.phylip", "w") as outputHandle:
    AlignIO.write(alignments, outputHandle, "phylip")
def test_read_clustal2(self):
    """Read cw02.aln and compare its Clustal rendering to the reference."""
    source = os.path.join(os.curdir, "Clustalw", "cw02.aln")
    parsed = AlignIO.read(source, "clustal")
    rendered = format(parsed, "clustal")
    self.assertEqual(rendered, cw02_clustal)
i += 1 return align def cut_gap_in_blocks(): # usefull for the future, not for this script n = float(len(align[0])) i = 0 while i < n: ct = 0 while i + ct < n and align[:, i + ct].count('-') / n > 0.5: ct += 1 if ct > 0: # delete columns [i:i+ct] if i == 0: align = align[:, ct:] elif i + ct == n: align = align[:, :i] else: align = align[:, :i] + align[:, i + ct:] n -= ct # seq. ct positions shorter else: # nothing to delete, proceed i += 1 return align alignment = AlignIO.read("in.txt", "stockholm") # print(alignment) out = cut_a_gap(alignment) SeqIO.write(out, "out.txt", "clustal")
opt_file, a_ratio=1.0, m_ratio=1.0): m_base = _type[0] a_base = _type[1] opt = open(opt_file, 'w') for i in range(len(a_aln[1, :])): col_a_group_nogap = [i for i in a_aln[:, i]] col_m_group_nogap = [i for i in m_aln[:, i] if i != '-'] a_ratio_est = col_a_group_nogap.count(a_base) / len(col_a_group_nogap) m_ratio_est = col_m_group_nogap.count(m_base) / correct_empty_lst( col_m_group_nogap) if (a_ratio_est >= a_ratio) and (m_ratio_est >= m_ratio): opt.write(str(i + 1) + '\n') opt.close() if __name__ == "__main__": pars = read_args(sys.argv) aln_ipt = AlignIO.read(pars['ipt_msa'], 'fasta') a_aln_obj = aln_partition(aln_ipt)[0] m_aln_obj = aln_partition(aln_ipt)[1] suspicious_pos_detector(a_aln_obj, m_aln_obj, 'CT', pars['output_CT'], pars['ancient_ratio'], pars['modern_ratio']) suspicious_pos_detector(a_aln_obj, m_aln_obj, 'GA', pars['output_GA'], pars['ancient_ratio'], pars['modern_ratio'])
def run_tree(fname, out_prefix, alphabet, true_tree=False, true_model=False, pc=0.1, true_rates=False):
    """
    read a tree and an alignment and optimize its branch lengths using
    different types of models. The user can specify to either use the true
    model for optimization, just the true rates, or infer the entire model
    from the data. Either the true or an inferred tree topology can be used.

    Arguments:
     - fname - alignment file name; simulation parameters are parsed from it
       and its directory is used to locate the tree, model and alignment.
     - out_prefix - prefix passed to the helpers that name the output tree.
     - alphabet - alphabet handed to TreeAnc.
     - true_tree - start from the true tree instead of the reconstructed one.
     - true_model - optimize with the true GTR model instead of inferring one.
     - pc - pseudo count passed to the optimization.
     - true_rates - re-optimize with the true rates after the first pass.

    The re-optimized tree is written as newick; nothing is returned.
    Relies on the module-level ``n_iter`` for the iteration limit.
    """
    params = parse_alignment_name(fname)
    params['pc'] = pc
    prefix = os.path.dirname(fname)
    m = params['m']
    if true_tree:
        tree = Phylo.read(tree_name(prefix, params), 'newick')
    else:
        tree = Phylo.read(reconstructed_tree_name(prefix, params), 'newick')
    tree.root.branch_length = 0.001

    old_bl = []
    print(np.mean([x for c, x in tree.depths().items() if c.is_terminal()]) * (m if true_tree else 1.0))
    print(tree.root.clades[0].branch_length / tree.root.clades[1].branch_length)
    # randomize branch length of true tree to allow fair comparison
    for n in tree.find_clades():
        old_bl.append(n.branch_length)
        if true_tree:
            # rescale with mutation rate and multiply by a random number between 0.6 and 1.0
            n.branch_length *= m * (0.6 + 0.4 * np.random.random())

    print(np.sum(old_bl) * (m if true_tree else 1.0), m)

    # load true GTR model. Use this for inference if true_model=True, else start with Jukes Cantor
    true_GTR = load_model(model_name(prefix, params))
    if true_model:
        model = true_GTR
        model.mu /= m
    else:
        model = 'JC69'

    with gzip.open(alignment_name(prefix, params), 'rt') as fh:
        aln = AlignIO.read(fh, 'fasta')

    tt = TreeAnc(tree=tree, aln=aln, gtr=model, compress=False, alphabet=alphabet, verbose=3)

    # run the tree optimization of treetime. the damping parameter slows down the iterative
    # branch length optimization to avoid oscillations and run-away solutions
    # a site-specific GTR model is inferred if true_model is False
    tt.optimize_tree(branch_length_mode='marginal', max_iter=n_iter,
                     infer_gtr=not true_model, site_specific_gtr=True, pc=pc, damping=0.75)

    # if the true rates are to be used, replace those in the model and re-optimize
    if true_rates:
        # NOTE(review): when true_model is also set, true_GTR.mu was already
        # divided by m above (model is the same object) -- confirm that the
        # second division here is intended.
        tt.gtr.mu = true_GTR.mu / m
        tt.optimize_tree(branch_length_mode='marginal', max_iter=n_iter,
                         infer_gtr=False, site_specific_gtr=True, pc=pc, damping=0.75)

    # branch lengths are collected before re-rooting, in find_clades() order
    new_bl = [n.branch_length for n in tt.tree.find_clades()]

    # save new tree to file
    tt.tree.root_at_midpoint()
    # Name the output after the true_model argument this call actually used,
    # not after the command-line namespace of whichever script imported us.
    if true_model:
        tfname = reoptimized_tree_true_model(out_prefix, params)
    else:
        tfname = reoptimized_tree(out_prefix, params, true_rates=true_rates)
    Phylo.write(tt.tree, tfname, 'newick')

    print(tt.tree.total_branch_length(), tt.gtr.average_rate().mean())
    print(np.mean([x for c, x in tt.tree.depths().items() if c.is_terminal()]),
          tt.tree.total_branch_length())
    print(tt.tree.root.clades[0].branch_length / tt.tree.root.clades[1].branch_length)
    print(np.corrcoef(old_bl, new_bl)[0, 1])
# Check for unique ref names if args.ref1name == args.ref2name: print('You cannot specify the same reference twice. Quitting.', file=sys.stderr) exit(1) # No commas in reference names if ',' in args.ref1name or ',' in args.ref2name: print('Reference names may not contain commas. Rename the sequences in', args.alignment, 'and try again. Quitting.', file=sys.stderr) exit(1) # Find the consensus and its ref alignment = AlignIO.read(args.alignment, "fasta") ref1seq = None ref2seq = None for seq in alignment: if seq.id == args.ref1name: if ref1seq != None: print('Found', args.ref1name, 'twice in', args.alignment + '. Quitting.', file=sys.stderr) quit(1) ref1seq = str(seq.seq) if seq.id == args.ref2name: if ref2seq != None: print('Found',
"P8": [('13984498', '14019652')] } #List of species names in the MAF file species_list = ["panPan_Y", "panTro_Y", "gorGor_Y", "ponAbe_Y", "hg_Y"] #all Y chromosome alignment file. #/nfs/brubeck.bx.psu.edu/scratch6/rahul/Bonobo_Y/analysis/palindrome/palindrome_coverage/multi_alignment_based input_handle = open("msa/alignment.hg_Y_centric.20191126.maf", "rU") output_handle = open(inputP + "_sizeNonRepeatBlock.tab", "w") #Obtaining the palindrome of interest Palindrome = Palindrome_coordinates[inputP] #File handle of MAF alignments = AlignIO.parse(input_handle, "maf") #Block overlap cutoff percent_cutoff = 0.95 pal_size = (int(Palindrome[0][1]) - int(Palindrome[0][0])) + 1 sequence_Bon = [0] * pal_size sequence_Gor = [0] * pal_size sequence_Orang = [0] * pal_size sequence_Chimp = [0] * pal_size #sequence_Human=[0]*pal_size #Reading each block in the MAF file for msa in alignments: for region in Palindrome: #Obtain the location of palindromes region_start = region[0] region_end = region[1]
from Bio import AlignIO
import glob
import statistics

#####################################################################
#here the actual program begins, i'm storing the file names and
#annotations in their respective lists
#####################################################################
nomes = []
sizes = []
scores = []
identities = []
similarities = []
gaps = []

for file in glob.glob("*.align"):
    # Close every alignment file right after parsing it, so that a large
    # directory does not exhaust the available file descriptors.
    with open(file) as handle:
        current = AlignIO.read(handle, "emboss")
    size = current.get_alignment_length()
    annotations = current.annotations

    nomes.append(file)
    scores.append(annotations["score"])
    # identity, similarity and gaps are stored as percentages of the
    # alignment length
    identities.append(annotations["identity"] / size * 100)
    similarities.append(annotations["similarity"] / size * 100)
    gaps.append(annotations["gaps"] / size * 100)
    sizes.append(size)

#####################################################################
#now i have to take the elements of these lists and make decrescent
#sorted ones (by the scores)
#####################################################################
nomes2 = []
scores2 = []
elif "NM_031542.2" == line.id: record = SeqRecord(line.seq, "Rat") fixed_sequences.append(record) elif "NM_204276.2" == line.id: record = SeqRecord(line.seq, "Chicken") fixed_sequences.append(record) SeqIO.write(fixed_sequences, output_handle, "fasta") input_handle.close() output_handle.close() # convert the clustalW format to phylip for the program from Bio import AlignIO AlignIO.convert("BRCA2_family_fixed.fasta", "fasta", "BRCA2_family.phy", "phylip") # Read the sequences and align aln = AlignIO.read('BRCA2_family.phy', 'phylip') # create a starting tree with NJ calculator = DistanceCalculator('identity') dm = calculator.get_distance(aln) constructor = DistanceTreeConstructor(calculator, 'nj') starting_tree = constructor.build_tree(aln) # A substitution cost matrix, used from in-lecture excise (penalty of 2 for transversion and gap, penalty of 1 for # # transition) cost_matrix = [[0], [2,0],
Tree_names[i]: Genomes_names[i] for i in range(0, len(Genomes_names)) } Reference_species = Ref_tags['Species'].to_list() Reference_idents = Ref_tags['Tag'].to_list() Refs_tag_dict = { Reference_species[i]: Reference_idents[i] for i in range(0, len(Reference_species)) } """ FUNCTIONS """ refs_dict = {} alignment_obj = AlignIO.read(cds_alignment, "clustal") for record in alignment_obj: curr = record.id if "PD" not in curr: refs_dict[curr] = [] else: for key1, value1 in Refs_tag_dict.items(): if curr.startswith(str(value1)): genome = key1 for key2, value2 in Species_tag_dict.items(): if value2 == genome: refs_dict[key2].append(curr) #Let's iterate and take the human sequence as our reference human_aligned_cds = [ record.seq for record in alignment_obj if record.id == "Homo_sapiens"
def write(sequences, handle, format):
    """Write a complete set of sequences to a file.

    Arguments:
     - sequences - A list (or iterator) of SeqRecord objects, or a single
       SeqRecord (which is treated as a one-element list).
     - handle - File handle object to write to, or filename as string.
     - format - lower case string describing the file format to write.

    A handle you opened yourself should be closed after calling this
    function.

    Returns the number of records written (as an integer).

    Raises TypeError when ``format`` is not a string or ``handle`` is a
    SeqRecord or a list, and ValueError for an empty, non-lower-case,
    read-only or unknown format.
    """
    from Bio import AlignIO

    # Validate the format name first, so mistakes get a readable message.
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # Catch swapped arguments (records passed where the handle belongs).
    if isinstance(handle, SeqRecord):
        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
    if isinstance(handle, list):
        raise TypeError("Check arguments, handle should NOT be a list")

    # A lone record is accepted and wrapped in a list.
    if isinstance(sequences, SeqRecord):
        sequences = [sequences]

    mode = 'wb' if format in _BinaryFormats else 'w'

    with as_handle(handle, mode) as stream:
        if format in _FormatToWriter:
            # A dedicated SeqIO writer exists for this format.
            count = _FormatToWriter[format](stream).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            # Alignment-only format: bundle all records into one alignment
            # and let Bio.AlignIO write it.
            alignment = MultipleSeqAlignment(sequences)
            alignment_count = AlignIO.write([alignment], stream, format)
            assert alignment_count == 1, \
                "Internal error - the underlying writer " \
                " should have returned 1, not %r" % alignment_count
            count = len(alignment)
            del alignment_count, alignment
        elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

        assert isinstance(count, int), "Internal error - the underlying %s " \
            "writer should have returned the record count, not %r" \
            % (format, count)

    return count
def __init__(self, file):
    """Parse ``file`` as a single FASTA alignment and keep it privately."""
    parsed = AlignIO.read(file, "fasta")
    self.__alignment = parsed
inactiveThreshold = float(sys.argv[3]) activityInfile = sys.argv[4] activityColumn = sys.argv[5] if len(sys.argv) > 6: treeFile = sys.argv[6] substMatrices = Bio.SubsMat.MatrixInfo.available_matrices actives = getActives(activeThreshold, activityInfile, activityColumn, operator.lt) inactives = getActives(inactiveThreshold, activityInfile, activityColumn, operator.gt) if not treeFile: from Bio import AlignIO pocketAlignment = AlignIO.read(open(pocketAlignmentFile), "fasta") print("Calculating Distance Matrix") for substMat in substMatrices: try: alnFile = pocketAlignmentFile.split("/")[-1] fname = "autotree/" + substMat + "_" + alnFile + ".pxml" print(fname) if (os.path.exists(fname)): tree = Phylo.read(fname, "phyloxml") else: calculator = DistanceCalculator(substMat) dm = calculator.get_distance(pocketAlignment) print("Building tree") constructor = DistanceTreeConstructor(calculator, 'nj') tree = constructor.build_tree(pocketAlignment) Phylo.write(tree, fname, "phyloxml")
def parse(handle, format, alphabet=None):
    r"""Turn a sequence file into an iterator returning SeqRecords.

    Arguments:
     - handle   - handle to the file, or the filename as a string
       (note older versions of Biopython only took a handle).
     - format   - lower case string describing the file format.
     - alphabet - optional Alphabet object, useful when the sequence type
       cannot be automatically inferred from the file itself
       (e.g. format="fasta" or "tab")

    Raises TypeError if ``format`` is not a string, and ValueError if it is
    empty, not lower case or unknown, or if ``alphabet`` is neither an
    Alphabet nor an AlphabetEncoder. Because this is a generator function,
    these checks (and opening the file) only run once the first record is
    requested, not when parse() itself is called.

    Typical usage, opening a file to read in, and looping over the record(s):

    >>> from Bio import SeqIO
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta"):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet SingleLetterAlphabet()

    For file formats like FASTA where the alphabet cannot be determined, it
    may be useful to specify the alphabet explicitly:

    >>> from Bio import SeqIO
    >>> from Bio.Alphabet import generic_dna
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet DNAAlphabet()

    If you have a string 'data' containing the file contents, you must
    first turn this into a handle in order to parse it:

    >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
    >>> from Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO  # Python 2
    ... except ImportError:
    ...     from io import StringIO  # Python 3
    ...
    >>> for record in SeqIO.parse(StringIO(data), "fasta"):
    ...     print("%s %s" % (record.id, record.seq))
    Alpha ACCGGATGTA
    Beta AGGCTCGGTTA

    Use the Bio.SeqIO.read(...) function when you expect a single record
    only.
    """
    # NOTE - The above docstring has some raw \n characters needed
    # for the StringIO example, hence the whole docstring is in raw
    # string mode (see the leading r before the opening quote).
    from Bio import AlignIO

    # Try and give helpful error messages. This runs BEFORE the format is
    # looked up in any container, so a bad argument (wrong type, unhashable
    # object, ...) is reported with these messages rather than whatever the
    # lookup would raise.
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None and not isinstance(
            alphabet, (Alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %r" % alphabet)

    # Hack for SFF, will need to make this more general in future
    mode = 'rb' if format in _BinaryFormats else 'rU'

    with as_handle(handle, mode) as fp:
        # Map the file format to a sequence iterator:
        if format in _FormatToIterator:
            iterator_generator = _FormatToIterator[format]
            if alphabet is None:
                i = iterator_generator(fp)
            else:
                try:
                    i = iterator_generator(fp, alphabet=alphabet)
                except TypeError:
                    # The iterator does not accept an alphabet keyword;
                    # apply the alphabet to its records afterwards.
                    i = _force_alphabet(iterator_generator(fp), alphabet)
        elif format in AlignIO._FormatToIterator:
            # Use Bio.AlignIO to read in the alignments, and flatten them
            # into a single stream of records.
            i = (r for alignment in AlignIO.parse(fp, format,
                                                  alphabet=alphabet)
                 for r in alignment)
        else:
            raise ValueError("Unknown format '%s'" % format)
        # This imposes some overhead... wait until we drop Python 2.4 to
        # fix it. The loop stays inside the ``with`` so the handle remains
        # open while records are being consumed.
        for r in i:
            yield r
if __name__ == "__main__": print("Quick test") from Bio import AlignIO from Bio.Align.Generic import Alignment filename = "../../Tests/GFF/multi.fna" format = "fasta" expected = FreqTable.FreqTable({ "A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25 }, FreqTable.FREQ, IUPAC.unambiguous_dna) alignment = AlignIO.read(open(filename), format) for record in alignment: print(record.seq) print("=" * alignment.get_alignment_length()) summary = SummaryInfo(alignment) consensus = summary.dumb_consensus(ambiguous="N") print(consensus) consensus = summary.gap_consensus(ambiguous="N") print(consensus) print("") print( summary.pos_specific_score_matrix(chars_to_ignore=['-'], axis_seq=consensus)) print("") # Have a generic alphabet, without a declared gap char, so must tell
seq_dict[key].append(
    textwrap.fill(str(records_dict[key].seq), width=60))  # formatted as FASTA

# Write one FASTA file per key of seq_dict and align each with MAFFT.
alignments = []
for k in seq_dict:
    out_fasta = os.path.join(snakemake.params['TMP_D'], k + '.fa')
    out_aln = os.path.join(snakemake.params['TMP_D'], k + '.aln')
    alignments.append(out_aln)
    with open(out_fasta, 'w') as out_fh:
        out_fh.write('\n'.join(seq_dict[k]))
    aln = MafftCommandline(quiet=True, retree=1, thread=cores, nuc=True,
                           globalpair=True, input=out_fasta)
    # print(aln())
    # NOTE(review): calling aln() runs MAFFT; if it returns a
    # (stdout, stderr) pair, the join below also appends stderr to the
    # alignment file -- confirm that is intended.
    with open(out_aln, 'w') as out_fh:
        out_fh.write('\n'.join(aln()))

# Concatenate the per-key alignments column-wise into one alignment.
# Each alignment is sorted first so that rows line up before they are added.
one_big_aln = AlignIO.read(alignments[0], 'fasta')
one_big_aln.sort()
# Start from the second file: the first one already seeds one_big_aln, and
# adding it again would duplicate its columns in the combined alignment.
for f in alignments[1:]:
    aln = AlignIO.read(f, 'fasta')
    aln.sort()
    one_big_aln = one_big_aln + aln

with open(one_big_aln_f, 'w') as out_fh:
    AlignIO.write(one_big_aln, out_fh, 'fasta')
def main():
    """Filter pairwise MAF alignments down to blocks treated as orthologous.

    For every file matching ./ortholog_maf_clean/*.axt.chain.prenet.net.axt.maf
    the function:

    1. scans the first 200 alignment blocks to pick a starting anchor
       (start_q_0, start_t_0);
    2. re-reads the file and keeps a block when both sequence ids end in the
       same number, the second record's strand is '+1', the first record's
       start is not before the current anchor, and the distance test
       ``tar_d < ref_d + max_interval`` passes;
    3. writes the kept blocks to ortholog_maf_dm_clean/<same file name>.

    Finally a per-chromosome and total summary table is printed.

    NOTE(review): the -i/-o/-v options are parsed but ``args`` is never used
    in this function; input and output locations are hard-coded.
    NOTE(review): the source this was recovered from had lost its
    indentation; the nesting below is the most plausible reading -- confirm
    against the original file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input')
    parser.add_argument('-o', '--output')
    parser.add_argument('-v', dest='verbose', action='store_true')
    args = parser.parse_args()

    # Per-chromosome accumulators, keyed by the trailing number of the
    # first record's id: source size, kept size, and rejected size.
    chrsize = defaultdict(lambda: int())
    data = defaultdict(lambda: int())
    other = defaultdict(lambda: int())
    outdir = 'ortholog_maf_dm_clean'
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    maf_raw = glob.glob('./ortholog_maf_clean/*.axt.chain.prenet.net.axt.maf')
    # Captures the trailing digits of a sequence id (used as chromosome number).
    s = re.compile(r'(\d+)$')
    print 'Clean_maf\tNumOfAlign'
    for maf in sorted(maf_raw):
        #print maf
        aligns = AlignIO.parse(maf, 'maf')
        aligns_new = '%s/%s' % (outdir, os.path.split(maf)[-1])
        aligns_clean = []
        #determine start point of orthologous region on chromosome
        sample = 200
        start_q_0 = 20000000
        start_t_0 = 20000000
        count = 0
        for align in aligns:
            # Only the first `sample` blocks are inspected.
            count += 1
            if count > sample:
                break
            chr_qry = s.search(align[0].id).groups(0)[0] if s.search(
                align[0].id) else 'NA'
            chr_tar = s.search(align[1].id).groups(0)[0] if s.search(
                align[1].id) else 'NA'
            start_q = align[0].annotations['start']
            start_t = align[1].annotations['start']
            # NOTE(review): int('NA') raises ValueError when an id does not
            # end in digits -- confirm ids always carry a trailing number.
            if int(chr_qry) == int(chr_tar):
                # Keep the pair with the smallest second-record start so far.
                if int(start_t) < start_t_0:
                    start_t_0 = int(start_t)
                    start_q_0 = int(start_q)
        print start_q_0, start_t_0
        #dynamic programming of identification of orthologous alignment
        # NOTE(review): the original comment said "less than 500 kb interval"
        # but the value is 4,000,000 -- confirm which one is intended.
        max_interval = 4000000  # we allow adjacent orthologous region to have less than 500 kb interval
        # The first iterator was consumed by the sampling pass; parse again.
        aligns = AlignIO.parse(maf, 'maf')
        # NOTE(review): the original comment here reads "allowed for
        # backward", while the branch comments below describe flag == 0 as
        # "not allowed to backward" -- confirm the meaning of the flag.
        flag = 0  # allowed for backward
        for align in aligns:
            chr_qry = s.search(align[0].id).groups(0)[0] if s.search(
                align[0].id) else 'NA'
            chr_tar = s.search(align[1].id).groups(0)[0] if s.search(
                align[1].id) else 'NA'
            start_q = align[0].annotations['start']
            start_t = align[1].annotations['start']
            size_q = align[0].annotations['size']
            size_t = align[1].annotations['size']
            strand = align[1].annotations['strand']
            # Record the source size the first time a chromosome is seen.
            if not chrsize.has_key(chr_qry):
                chrsize[chr_qry] = align[0].annotations['srcSize']
            print '>%s\t%s\t%s\t%s\t%s' % (chr_qry, chr_tar, start_q, start_t,
                                           strand)
            # NOTE(review): same int('NA') hazard as in the sampling pass.
            if int(chr_qry) == int(chr_tar) and strand == '+1' and int(
                    start_q) >= int(start_q_0):
                # Distances from the current anchor on each sequence.
                ref_d = abs(int(start_q) - int(start_q_0))
                tar_d = abs(int(start_t) - int(start_t_0))
                ##orthologous alignment
                print '%s\t%s\t%s\t%s\t%s' % (chr_qry, chr_tar, start_q,
                                              start_t, flag)
                ##small step, not allowed to backward
                if flag == 0:
                    #allowed to backward within 200kb
                    #if int(start_t) < int(start_t_0) - 200000:
                    #    print 'flag0 no'
                    #    other[chr_qry] += size_q
                    #    continue
                    if (tar_d < ref_d + max_interval):
                        # Accept the block and move the anchor to it.
                        aligns_clean.append(align)
                        data[chr_qry] += size_q
                        start_q_0 = int(start_q)
                        start_t_0 = int(start_t)
                        # update_flag is defined elsewhere; its rule for
                        # choosing the next flag value is not visible here.
                        flag = update_flag(int(flag), int(ref_d), int(tar_d))
                        print 'flag0 yes'
                    else:
                        print 'flag0 no'
                        other[chr_qry] += size_q
                ##previous step is large, allowed for backward
                elif flag == 1:
                    if (tar_d < ref_d + max_interval):
                        aligns_clean.append(align)
                        data[chr_qry] += size_q
                        # Reset the flag once the second record's start has
                        # moved back before the anchor.
                        if int(start_t) < int(start_t_0):
                            flag = 0
                        start_q_0 = int(start_q)
                        start_t_0 = int(start_t)
                        print 'flag1 yes'
                    else:
                        print 'flag1 no'
                        other[chr_qry] += size_q
                #if ref_d < 100000 and tar_d < 1000000 and flag == 0:
                #    flag = 0
                #elif ref_d < 100000 and tar_d > 100000 and flag == 0:
                #    flag = 1
                #if (tar_d < ref_d + max_interval):
                #    aligns_clean.append(align)
                #    data[chr_qry] += size_q
                #    start_q_0 = int(start_q)
                #    start_t_0 = int(start_t)
                ##not orthologous alignment
                #else:
                #    other[chr_qry] += size_q
            ##not orthologous alignment
            else:
                other[chr_qry] += size_q
        # Write the kept blocks; AlignIO.write returns how many were written.
        count = AlignIO.write(aligns_clean, aligns_new, 'maf')
        print aligns_new, count
    # Summary table: kept fraction, and kept+rejected fraction, of each
    # chromosome's recorded source size.
    print 'Chr\tSize\tAlignedSize\tAlignedRate\tRawAlignedRate'
    total = 0
    aligned = 0
    aligned_o = 0
    for c in sorted(chrsize.keys()):
        print 'Chr%s\t%s\t%s\t%s\t%s' % (
            c, chrsize[c], data[c], float(data[c]) / float(chrsize[c]),
            (float(data[c]) + float(other[c])) / float(chrsize[c]))
        total += int(chrsize[c])
        aligned += int(data[c])
        aligned_o += int(other[c])
    # NOTE(review): divides by `total`, which is 0 when no MAF file matched.
    print 'Total\t%s\t%s\t%s\t%s' % (
        total, aligned, float(aligned) / float(total),
        (float(aligned) + float(aligned_o)) / float(total))
from Bio.Alphabet import generic_dna from Bio.SeqRecord import SeqRecord from Bio.Align import MultipleSeqAlignment from Bio.Seq import Seq align = MultipleSeqAlignment([ SeqRecord(Seq('ACGTACGTACGTACGT', generic_dna), id='quercus_robur'), SeqRecord(Seq('ACGTCCGTACTTACGA', generic_dna), id='quercus_ilex'), SeqRecord(Seq('CCGTCCGGACATACGA', generic_dna), id='quercus_rubra'), SeqRecord(Seq('AGGTCAGTACTTGCGA', generic_dna), id='quercus_macrocarpa') ]) # ---- Begin exercise answer ---- # Write alignment to file from Bio import AlignIO AlignIO.write(align, 'example_alignment.phylip', 'phylip') # Read alignment from file (use Bio.AlignIO.read for single alignments, Bio.AlignIO.parse for multiple alignments) from Bio import AlignIO test_alignment_read = AlignIO.read('example_alignment.phylip', 'phylip') print(test_alignment_read) print('End: Q6') # 7. Write a function that BLAST-searches a given sequence in GenBank, prints out how many # matches passed a certain threshold, and returns the blast search results. def blastSearch(sequence_record, less_than_threshold): # Import required packages from Bio.Seq import Seq
def write_AlignIO_protein():
    """Convert hedgehog.aln to a phylip file.

    Reads ``Clustalw/hedgehog.aln`` (clustal format) and writes
    ``Phylip/hedgehog.phy`` (phylip format), then checks that exactly one
    alignment was converted.

    Raises AssertionError if the number of converted alignments is not 1.
    """
    # Run the conversion as its own statement: assert statements are dropped
    # under ``python -O``, so a call placed inside the assert would be
    # skipped and the output file would never be written.
    converted = AlignIO.convert("Clustalw/hedgehog.aln", "clustal",
                                "Phylip/hedgehog.phy", "phylip")
    assert 1 == converted, \
        "Expected 1 alignment to be converted, got %r" % converted
def PPHMMDBConstruction ( GenomeSeqFile, ShelveDir, ProteinLength_Cutoff = 100, IncludeIncompleteGenomes = True, BLASTp_evalue_Cutoff = 1E-3, BLASTp_PercentageIden_Cutoff = 50, BLASTp_QueryCoverage_Cutoff = 75, BLASTp_SubjectCoverage_Cutoff = 75, BLASTp_num_alignments = 1000000, BLASTp_N_CPUs = 20, MUSCLE_GapOpenCost = -3.0, MUSCLE_GapExtendCost = -0.0, ProtClustering_MCLInflation = 2, N_AlignmentMerging = 0, HHsuite_evalue_Cutoff = 1E-6, HHsuite_pvalue_Cutoff = 0.05, HHsuite_N_CPUs = 10, HHsuite_QueryCoverage_Cutoff = 85, HHsuite_SubjectCoverage_Cutoff = 85, PPHMMClustering_MCLInflation = 5, HMMER_PPHMMDB_ForEachRoundOfPPHMMMerging = True, ): print "################################################################################" print "#Build a database of virus protein profile hidden Markov models (PPHMMs) #" print "################################################################################" ''' Build a database of virus protein profile hidden Markov models (PPHMMs). --------------------------------------------- ''' ################################################################################ print "- Define dir/file paths" ################################################################################ print "\tto BLASTp shelve directory" #------------------------------------------------------------------------------- BLASTMainDir = ShelveDir+"/BLAST" if os.path.exists(BLASTMainDir): _ = subprocess.call("rm -rf %s" %BLASTMainDir, shell = True) os.makedirs(BLASTMainDir) print "\t\tto BLASTp query file" #------------------------------------------------------------------------------- BLASTQueryFile = BLASTMainDir+"/Query.fasta" print "\t\tto BLASTp subject file" #------------------------------------------------------------------------------- BLASTSubjectFile = BLASTMainDir+"/Subjects.fasta" print "\t\tto BLASTp output file" #------------------------------------------------------------------------------- BLASTOutputFile = BLASTMainDir+"/BLASTOutput.txt" 
print "\t\tto BLASTp bit score matrix file" #------------------------------------------------------------------------------- BLASTBitScoreFile = BLASTMainDir+"/BitScoreMat.txt" print "\t\tto protein cluster file" #------------------------------------------------------------------------------- BLASTProtClusterFile = BLASTMainDir+"/ProtClusters.txt" print "\t\tto protein cluster directory" #------------------------------------------------------------------------------- ClustersDir = BLASTMainDir+"/Clusters";os.makedirs(ClustersDir) print "\tto HMMER shelve directory" #------------------------------------------------------------------------------- HMMERDir = ShelveDir+"/HMMER" if os.path.exists(HMMERDir): _ = subprocess.call("rm -rf %s" %HMMERDir, shell = True) os.makedirs(HMMERDir) print "\t\tto HMMER PPHMM directory" #------------------------------------------------------------------------------- HMMER_PPHMMDir = HMMERDir+"/HMMER_PPHMMs";os.makedirs(HMMER_PPHMMDir) print "\t\tto HMMER PPHMM database directory" #------------------------------------------------------------------------------- HMMER_PPHMMDBDir = HMMERDir+"/HMMER_PPHMMDB";os.makedirs(HMMER_PPHMMDBDir) print "\t\t\tto HMMER PPHMM database" #------------------------------------------------------------------------------- HMMER_PPHMMDB = HMMER_PPHMMDBDir+"/HMMER_PPHMMDB" if N_AlignmentMerging != 0: print "\tto HHsuite shelve directory" #------------------------------------------------------------------------------- HHsuiteDir = ShelveDir+"/HHsuite" if os.path.exists(HHsuiteDir): _ = subprocess.call("rm -rf %s" %HHsuiteDir, shell = True) os.makedirs(HHsuiteDir) print "\t\tto HHsuite PPHMM directory" #------------------------------------------------------------------------------- HHsuite_PPHMMDir = HHsuiteDir + "/HHsuite_PPHMMs";os.makedirs(HHsuite_PPHMMDir) print "\t\tto HHsuite PPHMM database directory" #------------------------------------------------------------------------------- HHsuite_PPHMMDBDir= 
HHsuiteDir +"/HHsuite_PPHMMDB";os.makedirs(HHsuite_PPHMMDBDir) print "\t\t\tto HHsuite PPHMM database" #------------------------------------------------------------------------------- HHsuite_PPHMMDB = HHsuite_PPHMMDBDir+"/HHsuite_PPHMMDB" print "\tto program output shelve" #------------------------------------------------------------------------------- VariableShelveDir = ShelveDir+"/Shelves" ################################################################################ print "- Retrieve variables" ################################################################################ if IncludeIncompleteGenomes == True: print "\tfrom ReadGenomeDescTable.AllGenomes.shelve" #------------------------------------------------------------------------------- VariableShelveFile = VariableShelveDir+"/ReadGenomeDescTable.AllGenomes.shelve" Parameters = shelve.open(VariableShelveFile) for key in [ "BaltimoreList", "OrderList", "FamilyList", "SubFamList", "GenusList", "VirusNameList", "TaxoGroupingList", "SeqIDLists", "TranslTableList"]: globals()[key] = Parameters[key] print "\t\t"+key Parameters.close() elif IncludeIncompleteGenomes == False: print "\tfrom ReadGenomeDescTable.CompleteGenomes.shelve" #------------------------------------------------------------------------------- VariableShelveFile = VariableShelveDir+"/ReadGenomeDescTable.CompleteGenomes.shelve" Parameters = shelve.open(VariableShelveFile) for key in [ "BaltimoreList", "OrderList", "FamilyList", "SubFamList", "GenusList", "VirusNameList", "TaxoGroupingList", "SeqIDLists", "TranslTableList"]: globals()[key] = Parameters[key] print "\t\t"+key Parameters.close() if not os.path.isfile(GenomeSeqFile): ################################################################################ print "- Download GenBank file" ################################################################################ print "GenomeSeqFile doesn't exist. 
GRAViTy is downloading the GenBank file(s)" print "Here are the accession numbers to be downloaded: " print "\n".join(map(lambda x:"\n".join(x), SeqIDLists)) DownloadGenBankFile (GenomeSeqFile = GenomeSeqFile, SeqIDLists = SeqIDLists) ################################################################################ print "- Read GenBank file" ################################################################################ GenBankDict = SeqIO.index(GenomeSeqFile, "genbank") GenBankDict = {k.split(".")[0]:v for k,v in GenBankDict.iteritems()} ################################################################################ print "- Extract/predict protein sequences from virus genomes, excluding proteins with lengthes <%s aa"%ProteinLength_Cutoff ################################################################################ ProtList = [] ProtIDList = [] N_Viruses = len(SeqIDLists) Virus_i = 1.0 for SeqIDList, TranslTable, BaltimoreGroup, Order, Family, SubFam, Genus, VirusName, TaxoGrouping in zip(SeqIDLists, TranslTableList, BaltimoreList, OrderList, FamilyList, SubFamList, GenusList, VirusNameList, TaxoGroupingList): for SeqID in SeqIDList: GenBankRecord = GenBankDict[SeqID] GenBankID = GenBankRecord.name GenBankFeatures = GenBankRecord.features #Extract protein sequences #------------------------------------------------------------------------------- ContainProtAnnotation = 0 for Feature in GenBankFeatures: if(Feature.type == 'CDS' and Feature.qualifiers.has_key("protein_id") and Feature.qualifiers.has_key("translation")): ContainProtAnnotation = 1 try: ProtName = Feature.qualifiers["product"][0] except KeyError: try: ProtName = Feature.qualifiers["gene"][0] except KeyError: try: ProtName = Feature.qualifiers["note"][0] except KeyError: ProtName = "Hypothetical protein" ProtID = Feature.qualifiers["protein_id"][0] ProtSeq = Feature.qualifiers["translation"][0] if len(ProtSeq) >= ProteinLength_Cutoff: ProtRecord = SeqRecord( Seq(ProtSeq), id = GenBankID+"|"+ProtID, 
name = GenBankID+"|"+ProtID, description = ProtName, annotations = {'taxonomy':[BaltimoreGroup, Order, Family, SubFam, Genus, VirusName, TaxoGrouping]}) ProtList.append(ProtRecord) ProtIDList.append(GenBankID+"|"+ProtID) if ContainProtAnnotation == 0: #if the genome isn't annotated with any ORFs #Identifying ORFs #------------------------------------------------------------------------------- if TranslTable==1: Starts = "---M------**--*----M---------------M----------------------------" elif TranslTable==2: Starts = "----------**--------------------MMMM----------**---M------------" elif TranslTable==3: Starts = "----------**----------------------MM----------------------------" elif TranslTable==4: Starts = "--MM------**-------M------------MMMM---------------M------------" elif TranslTable==5: Starts = "---M------**--------------------MMMM---------------M------------" elif TranslTable==6: Starts = "--------------*--------------------M----------------------------" elif TranslTable==7: Starts = "--MM------**-------M------------MMMM---------------M------------" elif TranslTable==8: Starts = "---M------**--*----M---------------M----------------------------" elif TranslTable==9: Starts = "----------**-----------------------M---------------M------------" elif TranslTable==10: Starts = "----------**-----------------------M----------------------------" elif TranslTable==11: Starts = "---M------**--*----M------------MMMM---------------M------------" elif TranslTable==12: Starts = "----------**--*----M---------------M----------------------------" elif TranslTable==13: Starts = "---M------**----------------------MM---------------M------------" elif TranslTable==14: Starts = "-----------*-----------------------M----------------------------" elif TranslTable==15: Starts = "----------*---*--------------------M----------------------------" elif TranslTable==16: Starts = "----------*---*--------------------M----------------------------" elif TranslTable==17: print "Genetic code 
table 17 doesn't exist. Use the stardard code" Starts = "---M------**--*----M---------------M----------------------------" elif TranslTable==18: print "Genetic code table 18 doesn't exist. Use the stardard code" Starts = "---M------**--*----M---------------M----------------------------" elif TranslTable==19: print "Genetic code table 19 doesn't exist. Use the stardard code" Starts = "---M------**--*----M---------------M----------------------------" elif TranslTable==20: print "Genetic code table 20 doesn't exist. Use the stardard code" Starts = "---M------**--*----M---------------M----------------------------" elif TranslTable==21: Starts = "----------**-----------------------M---------------M------------" elif TranslTable==22: Starts = "------*---*---*--------------------M----------------------------" elif TranslTable==23: Starts = "--*-------**--*-----------------M--M---------------M------------" elif TranslTable==24: Starts = "---M------**-------M---------------M---------------M------------" elif TranslTable==25: Starts = "---M------**-----------------------M---------------M------------" elif TranslTable==26: Starts = "----------**--*----M---------------M----------------------------" elif TranslTable==27: Starts = "--------------*--------------------M----------------------------" elif TranslTable==28: Starts = "----------**--*--------------------M----------------------------" elif TranslTable==29: Starts = "--------------*--------------------M----------------------------" elif TranslTable==30: Starts = "--------------*--------------------M----------------------------" elif TranslTable==31: Starts = "----------**-----------------------M----------------------------" else: print "Genetic code table isn't specified or is out of range. 
Use the stardard code" Starts = "---M------**--*----M---------------M----------------------------" CodonList = [Base1+Base2+Base3 for Base1 in "TCAG" for Base2 in "TCAG" for Base3 in "TCAG"] StartCodonList = [] StopCodonList = [] for i,j in enumerate(Starts): if j == "M": StartCodonList.append(CodonList[i]) if j == "*": StopCodonList.append(CodonList[i]) GenBankSeq = GenBankRecord.seq SeqLength = len(GenBankSeq) ORF_i = 0 for strand, nuc in [(+1, GenBankSeq), (-1, GenBankSeq.reverse_complement())]: for frame in range(3): length = 3 * ((SeqLength-frame) // 3) #Multiple of three nuc_inframe = nuc[frame:(frame+length)] #In-frame nucleotide sequence nuc_codonList = [str(nuc_inframe[i:i+3]) for i in range(0, length, 3)] #Split the in-frame nucleotide sequence into codons StopCodon_indices = [i for i, codon in enumerate(nuc_codonList) if codon in StopCodonList] #Find stop codons Coding_Start_IndexList = np.array([-1]+StopCodon_indices)+1 Coding_End_IndexList = np.array(StopCodon_indices+[len(nuc_codonList)]) ProtSeqList = [] for i, j in zip(Coding_Start_IndexList, Coding_End_IndexList): for k, codon in enumerate(nuc_codonList[i:j]): if codon in StartCodonList: ProtSeqList.append(Seq("".join(nuc_codonList[i:j][k:])).translate(table = TranslTable)) break for ProtSeq in ProtSeqList: if len(ProtSeq) >= ProteinLength_Cutoff: #Exclude protein sequences with <'ProteinLength_Cutoff' aa ProtRecord = SeqRecord( ProtSeq, id = GenBankID+"|ORF%s"%ORF_i, name = GenBankID+"|ORF%s"%ORF_i, description = "Hypothetical protein", annotations = {'taxonomy':[BaltimoreGroup, Order, Family, SubFam, Genus, VirusName, TaxoGrouping]}) ProtList.append(ProtRecord) ProtIDList.append(GenBankID+"|ORF%s"%ORF_i) ORF_i = ORF_i + 1 #Progress bar sys.stdout.write("\033[K" + "Extract protein sequences: [%-20s] %d/%d viruses" % ('='*int(Virus_i/N_Viruses*20), Virus_i, N_Viruses) + "\r") sys.stdout.flush() Virus_i = Virus_i + 1.0 sys.stdout.write("\033[K") sys.stdout.flush() ProtIDList = np.array(ProtIDList) 
################################################################################ print "- ALL-VERSUS-ALL BLASTp" ################################################################################ print "\tMake BLASTp database" #------------------------------------------------------------------------------- with open(BLASTSubjectFile, "w") as BLASTSubject_txt: SeqIO.write(ProtList, BLASTSubject_txt, "fasta") _ = subprocess.Popen("makeblastdb -in %s -dbtype prot" %BLASTSubjectFile, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() if err != "": print "Something is wrong with makeblastdb:" print "#"*50+"out"+"#"*50 print out print "#"*50+"err"+"#"*50 print err print "_"*100 while True: Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y") if Input == "N" or Input == "n": raise SystemExit("GRAViTy terminated.") elif Input == "Y" or Input == "y": print "Continue GRAViTy." break else: print "Input can only be 'Y' or 'N'." 
print "\tPerforme ALL-VERSUS-ALL BLASTp analysis" #------------------------------------------------------------------------------- BitScoreMat = [] SeenPair = {} SeenPair_i = 0 N_ProtSeqs = len(ProtList) #Set Blastp outfile format #------------------------------------------------------------------------------- BLASTp_outfmt = '"6 qseqid sseqid pident qcovs qlen slen evalue bitscore"' for ProtSeq_i in range(N_ProtSeqs): #BLAST query fasta file #------------------------------------------------------------------------------- BLASTQuery = ProtList[ProtSeq_i] with open(BLASTQueryFile, "w") as BLASTQuery_txt: p = SeqIO.write(BLASTQuery, BLASTQuery_txt, "fasta") #Perform BLASTp #------------------------------------------------------------------------------- _ = subprocess.Popen('blastp -query %s -db %s -out %s -evalue %s -outfmt %s -num_alignments %s -num_threads %s' %( BLASTQueryFile, BLASTSubjectFile, BLASTOutputFile, BLASTp_evalue_Cutoff, BLASTp_outfmt, BLASTp_num_alignments, BLASTp_N_CPUs), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() if err != "": print "Something is wrong with blastp (protein ID = %s):"%ProtList[ProtSeq_i].id print "#"*50+"out"+"#"*50 print out print "#"*50+"err"+"#"*50 print err print "_"*100 while True: Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y") if Input == "N" or Input == "n": raise SystemExit("GRAViTy terminated.") elif Input == "Y" or Input == "y": print "Continue GRAViTy." break else: print "Input can only be 'Y' or 'N'." #BitScoreMat conditioned on PIden, QCovs, and SCovs #------------------------------------------------------------------------------- if os.stat(BLASTOutputFile).st_size != 0: #if BLAST returns something... 
with open(BLASTOutputFile, "r") as BLASTOutput_txt: for BLASTHit in BLASTOutput_txt.readlines(): if BLASTHit == "\n": break Line = BLASTHit.split("\t") qseqid = Line[0] sseqid = Line[1] pident = float(Line[2]) qcovs = float(Line[3]) qlen = float(Line[4]) slen = float(Line[5]) evalue = float(Line[6]) bitscore = float(Line[7][:-1]) [SeqID_I, SeqID_II] = sorted([qseqid, sseqid]) Pair = ", ".join([SeqID_I, SeqID_II]) if ((qseqid != sseqid) and (pident >= BLASTp_PercentageIden_Cutoff) and (qcovs >= BLASTp_QueryCoverage_Cutoff) and ((qcovs*qlen/slen) >= BLASTp_SubjectCoverage_Cutoff)): if Pair in SeenPair: #If the pair has already been seen... if bitscore > BitScoreMat[SeenPair[Pair]][2]: #and if the new bitscore is higher... BitScoreMat[SeenPair[Pair]][2] = bitscore else: SeenPair[Pair] = SeenPair_i BitScoreMat.append([SeqID_I, SeqID_II, bitscore]) SeenPair_i = SeenPair_i+1 #Progress bar sys.stdout.write("\033[K" + "BLASTp: [%-20s] %d/%d proteins" % ('='*int(float(ProtSeq_i+1)/N_ProtSeqs*20), ProtSeq_i+1, N_ProtSeqs) + "\r") sys.stdout.flush() sys.stdout.write("\033[K") sys.stdout.flush() BitScoreMat = np.array(BitScoreMat) print "\tSave protein-protein similarity scores (BLASTp bit scores)" #------------------------------------------------------------------------------- np.savetxt( fname = BLASTBitScoreFile, X = BitScoreMat, fmt = '%s', delimiter= "\t", header = "SeqID_I\tSeqID_II\tBit score") ################################################################################ print "- Cluster protein sequences based on BLASTp bit scores, using the MCL algorithm" ################################################################################ _ = subprocess.Popen("mcl %s --abc -o %s -I %s" %(BLASTBitScoreFile, BLASTProtClusterFile, ProtClustering_MCLInflation), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) err, out = _.communicate() if err != "": print "Something is wrong with mcl:" print "#"*50+"out"+"#"*50 print out print "#"*50+"err"+"#"*50 print err 
print "_"*100 while True: Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y") if Input == "N" or Input == "n": raise SystemExit ("GRAViTy terminated.") elif Input == "Y" or Input == "y": print "Continue GRAViTy." break else: print "Input can only be 'Y' or 'N'." SeenProtIDList = [] with open(BLASTProtClusterFile, 'r') as BLASTProtCluster_txt: for Cluster in BLASTProtCluster_txt.readlines(): SeenProtIDList.extend(Cluster.split("\r\n")[0].split("\n")[0].split("\t")) with open(BLASTProtClusterFile, 'a') as BLASTProtCluster_txt: BLASTProtCluster_txt.write("\n".join(list(set(ProtIDList)-set(SeenProtIDList)))) ################################################################################ print "- Make protein alignments" ################################################################################ N_Clusters = LineCount(BLASTProtClusterFile)+1 #Count the number of clusters Cluster_i = 0 Cluster_MetaDataDict = {} with open(BLASTProtClusterFile, 'r') as BLASTProtCluster_txt: for Cluster in BLASTProtCluster_txt.readlines(): HitList = [] TaxoLists = [] DescList = [] Cluster = Cluster.split("\n")[0].split("\t") for ProtID in Cluster: HitList.append(ProtList[np.where(ProtIDList == ProtID)[0][0]]) TaxoLists.append(HitList[-1].annotations['taxonomy']) DescList.append(HitList[-1].description.replace(", "," ").replace(","," ").replace(": ","_").replace(":","_").replace("; "," ").replace(";"," ").replace(" (","/").replace("(","/").replace(")","")) #Cluster file #------------------------------------------------------------------------------- UnAlnClusterFile = ClustersDir+"/Cluster_%s.fasta" %Cluster_i with open(UnAlnClusterFile, "w") as UnAlnClusterTXT: p = SeqIO.write(HitList, UnAlnClusterTXT, "fasta") #align cluster using muscle #------------------------------------------------------------------------------- AlnClusterFile = ClustersDir+"/Cluster_%s.fasta" %Cluster_i _ = subprocess.Popen("muscle -in %s -out %s 
-gapopen %s -gapextend %s" %( UnAlnClusterFile, AlnClusterFile, MUSCLE_GapOpenCost, MUSCLE_GapExtendCost), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) err, out = _.communicate() if err != "": print "Something is wrong with muscle (Cluster_%s):"%Cluster_i print "#"*50+"out"+"#"*50 print out print "#"*50+"err"+"#"*50 print err print "_"*100 while True: Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y") if Input == "N" or Input == "n": raise SystemExit("GRAViTy terminated.") elif Input == "Y" or Input == "y": print "Continue GRAViTy." break else: print "Input can only be 'Y' or 'N'." #Cluster annotations #------------------------------------------------------------------------------- Cluster_MetaDataDict[Cluster_i] = { "Cluster":Cluster, "DescList":DescList, "TaxoLists":TaxoLists, "AlignmentLength":AlignIO.read(AlnClusterFile, "fasta").get_alignment_length() } Cluster_i = Cluster_i+1 sys.stdout.write("\033[K" + "Make protein alignments: [%-20s] %d/%d alignments" % ('='*int(float(Cluster_i)/N_Clusters*20), Cluster_i, N_Clusters) + "\r") sys.stdout.flush() sys.stdout.write("\033[K") sys.stdout.flush() if N_AlignmentMerging != 0: ################################################################################ if N_AlignmentMerging > 0: print "- Merge protein alignments, %s rounds of merging" %N_AlignmentMerging elif N_AlignmentMerging < 0: print "- Merge protein alignments until exhausted" ################################################################################ print "\tMake HHsuite PPHMMs from protein alignments" #------------------------------------------------------------------------------- for Cluster_i in range(len(Cluster_MetaDataDict)): AlnClusterFile = ClustersDir+"/Cluster_%s.fasta" %Cluster_i HHsuite_PPHMMFile = HHsuite_PPHMMDir+"/PPHMM_%s.hhm" %Cluster_i _ = subprocess.Popen("hhmake -i %s -o %s -seq %s -name Cluster_%s -id 100 -M 50 -v 0" %( AlnClusterFile, 
HHsuite_PPHMMFile, len(Cluster_MetaDataDict[Cluster_i]["Cluster"])+1, Cluster_i), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() if err != "": print "Something is wrong with turning Cluster_%s into a PPHMM by hhmake." %Cluster_i print "#"*50+"out"+"#"*50 print out print "#"*50+"err"+"#"*50 print err print "_"*100 while True: Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y") if Input == "N" or Input == "n": raise SystemExit ("GRAViTy terminated.") elif Input == "Y" or Input == "y": print "Continue GRAViTy." break else: print "Input can only be 'Y' or 'N'." #Progress bar sys.stdout.write("\033[K" + "Make HHsuite PPHMMs: [%-20s] %d/%d PPHMMs" % ('='*int(float(Cluster_i+1)/len(Cluster_MetaDataDict)*20), Cluster_i+1, len(Cluster_MetaDataDict)) + "\r") sys.stdout.flush() sys.stdout.write("\033[K") sys.stdout.flush() print "\tMake a HHsuite PPHMM DB" #------------------------------------------------------------------------------- _ = subprocess.Popen("ffindex_build -s %s_hhm.ffdata %s_hhm.ffindex %s" %(HHsuite_PPHMMDB, HHsuite_PPHMMDB, HHsuite_PPHMMDir), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() print "\tMerge protein alignments" #------------------------------------------------------------------------------- AlignmentMerging_i_round = 0 while True: if AlignmentMerging_i_round >= N_AlignmentMerging and N_AlignmentMerging >= 0: print "Alignment merging complete" break if HMMER_PPHMMDB_ForEachRoundOfPPHMMMerging == True: print "\t\tHMMER_PPHMMDB_ForEachRoundOfPPHMMMerging == True. Make a HMMER PPHMM DB. 
(Round %s)" %AlignmentMerging_i_round #------------------------------------------------------------------------------- _ = Make_HMMER_PPHMM_DB( HMMER_PPHMMDir = HMMER_PPHMMDir, HMMER_PPHMMDB = HMMER_PPHMMDBDir+"/HMMER_PPHMMDB_%s" %AlignmentMerging_i_round, ClustersDir = ClustersDir, Cluster_MetaDataDict = Cluster_MetaDataDict) _ = subprocess.Popen("find %s -type f -name '*.hmm' -delete" %HMMER_PPHMMDir, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() print "\t\tRound %s"%(AlignmentMerging_i_round + 1) print "\t\t\tDetermine PPHMM-PPHMM similarity scores (ALL-VERSUS-ALL hhsearch)" #------------------------------------------------------------------------------- hhsearchDir = HHsuiteDir+"/hhsearch_"+"".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)); os.makedirs(hhsearchDir) hhsearchOutFile = hhsearchDir+"/hhsearch.stdout.hhr" N_PPHMMs = LineCount("%s_hhm.ffindex"%HHsuite_PPHMMDB) SeenPair = {} SeenPair_i = 0 PPHMMSimScoreCondensedMat= [] for PPHMM_i in range(0, N_PPHMMs): HHsuite_PPHMMFile = HHsuite_PPHMMDir + "/PPHMM_%s.hhm" %PPHMM_i _ = subprocess.Popen("hhsearch -i %s -d %s -o %s -e %s -E %s -z 1 -b 1 -id 100 -global -v 0 -cpu %s" %( HHsuite_PPHMMFile, HHsuite_PPHMMDB+"_hhm.ffdata", hhsearchOutFile, HHsuite_evalue_Cutoff, HHsuite_evalue_Cutoff, HHsuite_N_CPUs, ), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() if err != "": print "Something is wrong with hhsearching PPHMM %s againt the PPHMM database" %PPHMM_i print "#"*50+"out"+"#"*50 print out print "#"*50+"err"+"#"*50 print err print "_"*100 while True: Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y") if Input == "N" or Input == "n": raise SystemExit ("GRAViTy terminated.") elif Input == "Y" or Input == "y": print "Continue GRAViTy." break else: print "Input can only be 'Y' or 'N'." 
with open(hhsearchOutFile, 'r') as hhsearchOut_txt: Content = hhsearchOut_txt.readlines() QueryLength = int(Content[1].split()[1]) for Line in Content[9:]: if Line == "\n": break else: Line = Line.replace("("," ").replace(")"," ").split() PPHMM_j = int(Line[1].split("_")[1]) evalue = float(Line[3]) pvalue = float(Line[4]) PPHMMSimScore = float(Line[5]) Col = float(Line[7]) SubjectLength = int(Line[10]) qcovs = Col/QueryLength*100 scovs = Col/SubjectLength*100 if (evalue <= HHsuite_evalue_Cutoff and pvalue <= HHsuite_pvalue_Cutoff and qcovs >= HHsuite_QueryCoverage_Cutoff and scovs >= HHsuite_SubjectCoverage_Cutoff): Pair = ", ".join(sorted(map(str,[PPHMM_i, PPHMM_j]))) if Pair in SeenPair: #If the pair has already been seen... if PPHMMSimScore > PPHMMSimScoreCondensedMat[SeenPair[Pair]][2]: #and if the new PPHMMSimScore is higher... PPHMMSimScoreCondensedMat[SeenPair[Pair]][2] = PPHMMSimScore else: SeenPair[Pair] = SeenPair_i PPHMMSimScoreCondensedMat.append([PPHMM_i, PPHMM_j, PPHMMSimScore]) SeenPair_i = SeenPair_i+1 _ = subprocess.Popen("rm %s" %hhsearchOutFile, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() #Progress bar sys.stdout.write("\033[K" + "hhsearch: [%-20s] %d/%d PPHMMs" % ('='*int(float(PPHMM_i+1)/N_PPHMMs*20), PPHMM_i+1, N_PPHMMs) + "\r") sys.stdout.flush() sys.stdout.write("\033[K") sys.stdout.flush() PPHMMSimScoreCondensedMat = np.array(PPHMMSimScoreCondensedMat) PPHMMSimScoreMat = np.zeros((N_PPHMMs, N_PPHMMs)) PPHMMSimScoreMat[map(int, PPHMMSimScoreCondensedMat[:,0]), map(int, PPHMMSimScoreCondensedMat[:,1])] = map(float, PPHMMSimScoreCondensedMat[:,2]) PPHMMSimScoreMat[map(int, PPHMMSimScoreCondensedMat[:,1]), map(int, PPHMMSimScoreCondensedMat[:,0])] = map(float, PPHMMSimScoreCondensedMat[:,2]) PPHMMSimScoreCondensedMat = np.array([PPHMMSimScorePair for PPHMMSimScorePair in PPHMMSimScoreCondensedMat if PPHMMSimScorePair[0] < PPHMMSimScorePair[1]]) PPHMMSimScoreCondensedMatFile = 
hhsearchDir+"/PPHMMSimScoreCondensedMat.txt" np.savetxt( fname = PPHMMSimScoreCondensedMatFile, X = PPHMMSimScoreCondensedMat, fmt = '%s', delimiter= "\t", header = "PPHMM_i\tPPHMM_j\tPPHMMSimScore") print "\t\t\tCluster PPHMMs based on hhsearch scores, using the MCL algorithm" #------------------------------------------------------------------------------- PPHMMClustersFile = hhsearchDir+"/PPHMMClusters.txt" _ = subprocess.Popen("mcl %s --abc -o %s -I %s" %(PPHMMSimScoreCondensedMatFile, PPHMMClustersFile, PPHMMClustering_MCLInflation), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) _, out = _.communicate() SeenProtIDList = [] with open(PPHMMClustersFile, 'r') as PPHMMClusters_txt: for Cluster in PPHMMClusters_txt.readlines(): SeenProtIDList.extend(Cluster.split("\n")[0].split("\t")) with open(PPHMMClustersFile, 'a') as PPHMMClusters_txt: PPHMMClusters_txt.write("\n".join(list(set(map(str,map(float,range(0, N_PPHMMs))))-set(SeenProtIDList)))) print "\t\t\tCheck if there are alignments to be merged" #------------------------------------------------------------------------------- with open(PPHMMClustersFile, 'r') as PPHMMClusters_txt: N_PPHMMs_AfterMerging = len(PPHMMClusters_txt.readlines()) if N_PPHMMs_AfterMerging == N_PPHMMs: print "\t\t\t\tNo alignments to be merged. 
Stop alignment merging process" _ = subprocess.Popen("rm -rf %s" %hhsearchDir, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() break else: print "\t\t\t\tMerge %d alignments to make %d alignments" %(N_PPHMMs, N_PPHMMs_AfterMerging) print "\t\t\tMerge protein alignments and remake HHsuite PPHMMs" #------------------------------------------------------------------------------- SelfSimScoreList = PPHMMSimScoreMat.diagonal() PPHMMDissimScoreMat = 1 - np.transpose(PPHMMSimScoreMat**2/SelfSimScoreList)/SelfSimScoreList PPHMMDissimScoreMat[PPHMMDissimScoreMat<0] = 0 AfterMergingPPHMM_IndexList = [] AfterMergingPPHMM_i = 1.0 with open(PPHMMClustersFile, 'r') as PPHMMClusters_txt: for PPHMMCluster in PPHMMClusters_txt.readlines(): PPHMMCluster = map(int, map(float, PPHMMCluster.split("\n")[0].split("\t"))) AfterMergingPPHMM_IndexList.append(min(PPHMMCluster)) if len(PPHMMCluster) >= 2: PPHMMDissimScoreMat_Subset = PPHMMDissimScoreMat[PPHMMCluster][:,PPHMMCluster] PPHMMTreeNewick = DistMat2Tree (DistMat = PPHMMDissimScoreMat_Subset, LeafList= PPHMMCluster, Dendrogram_LinkageMethod = "average") PPHMMTreeNewick = Tree(PPHMMTreeNewick) _ = PPHMMTreeNewick.ladderize() PPHMMTreeNewick = PPHMMTreeNewick.write(format = 9) while True: m = re.search(r"\((\d+),(\d+)\)", PPHMMTreeNewick) if not m: _ = subprocess.Popen("muscle -in %s -out %s -refine" %( ClusterFile_i, ClusterFile_i), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) err, out = _.communicate() break PPHMM_i, PPHMM_j = sorted([int(m.group(1)),int(m.group(2))]) PPHMMTreeNewick = re.sub(r"\((\d+),(\d+)\)", str(PPHMM_i), PPHMMTreeNewick, count=1) ClusterFile_i = ClustersDir+"/Cluster_%s.fasta" %PPHMM_i ClusterFile_j = ClustersDir+"/Cluster_%s.fasta" %PPHMM_j HHsuite_PPHMMFile_j = HHsuite_PPHMMDir+"/PPHMM_%s.hhm" %PPHMM_j _ = subprocess.Popen("muscle -profile -in1 %s -in2 %s -out %s -gapopen %s -gapextend %s" %( ClusterFile_i, ClusterFile_j, ClusterFile_i, 
MUSCLE_GapOpenCost, MUSCLE_GapExtendCost), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) err, out = _.communicate() _ = subprocess.Popen("rm %s %s" %(ClusterFile_j, HHsuite_PPHMMFile_j), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() Cluster_MetaDataDict[PPHMM_i]["Cluster"] = Cluster_MetaDataDict[PPHMM_i]["Cluster"] + Cluster_MetaDataDict[PPHMM_j]["Cluster"] Cluster_MetaDataDict[PPHMM_i]["DescList"] = Cluster_MetaDataDict[PPHMM_i]["DescList"] + Cluster_MetaDataDict[PPHMM_j]["DescList"] Cluster_MetaDataDict[PPHMM_i]["TaxoLists"] = Cluster_MetaDataDict[PPHMM_i]["TaxoLists"] + Cluster_MetaDataDict[PPHMM_j]["TaxoLists"] del Cluster_MetaDataDict[PPHMM_j] HHsuite_PPHMMFile_i = HHsuite_PPHMMDir+"/PPHMM_%s.hhm" %PPHMM_i Cluster_MetaDataDict[PPHMM_i]["AlignmentLength"] = AlignIO.read(ClusterFile_i, "fasta").get_alignment_length() _ = subprocess.Popen("hhmake -i %s -o %s -v 0 -seq %s -name Cluster_%s -id 100 -M 50" %( ClusterFile_i, HHsuite_PPHMMFile_i, len(Cluster_MetaDataDict[PPHMM_i]["Cluster"])+1, PPHMM_i), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() if err != "": print "Something is wrong with constructing a PPHMM from cluster %s" %PPHMM_i print "#"*50+"out"+"#"*50 print out print "#"*50+"err"+"#"*50 print err print "_"*100 while True: Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y") if Input == "N" or Input == "n": raise SystemExit("GRAViTy terminated.") elif Input == "Y" or Input == "y": print "Continue GRAViTy." break else: print "Input can only be 'Y' or 'N'." 
elif len(PPHMMCluster) == 1: pass #Progress bar sys.stdout.write("\033[K" + "Merge alignments and make new PPHMMs: [%-20s] %d/%d PPHHMs" % ('='*int(AfterMergingPPHMM_i/N_PPHMMs_AfterMerging*20), AfterMergingPPHMM_i, N_PPHMMs_AfterMerging) + "\r") sys.stdout.flush() AfterMergingPPHMM_i = AfterMergingPPHMM_i + 1 sys.stdout.write("\033[K") sys.stdout.flush() print "\t\t\tRename protein alignments and their associated PPHMMs" #------------------------------------------------------------------------------- AfterMergingPPHMM_IndexList = sorted(AfterMergingPPHMM_IndexList) AfterMergingPPHMM_i = 0 for PPHMM_i in AfterMergingPPHMM_IndexList: Cluster_MetaDataDict[AfterMergingPPHMM_i] = Cluster_MetaDataDict.pop(PPHMM_i) ClusterFile_i = ClustersDir+"/Cluster_%s.fasta" %PPHMM_i ClusterFile_j = ClustersDir+"/Cluster_%s.fasta" %AfterMergingPPHMM_i _ = subprocess.Popen("mv %s %s" %(ClusterFile_i, ClusterFile_j), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() HHsuite_PPHMMFile_i = HHsuite_PPHMMDir+"/PPHMM_%s.hhm" %PPHMM_i HHsuite_PPHMMFile_j = HHsuite_PPHMMDir+"/PPHMM_%s.hhm" %AfterMergingPPHMM_i _ = subprocess.Popen("mv %s %s" %(HHsuite_PPHMMFile_i, HHsuite_PPHMMFile_j), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() with open(HHsuite_PPHMMFile_j, "r+") as HHsuite_PPHMM_txt: contents = HHsuite_PPHMM_txt.readlines() contents[1] = "NAME Cluster_%s\n" %AfterMergingPPHMM_i contents = "".join(contents) HHsuite_PPHMM_txt.seek(0) #Put cursor at the beginning of the file HHsuite_PPHMM_txt.write(contents) #Write the contents HHsuite_PPHMM_txt.truncate() #Delete everything after the cursor AfterMergingPPHMM_i = AfterMergingPPHMM_i + 1 print "\t\t\tRebuild the HHsuite PPHMM database\n" #------------------------------------------------------------------------------- _ = subprocess.Popen("rm %s_hhm.ffdata %s_hhm.ffindex" %(HHsuite_PPHMMDB, HHsuite_PPHMMDB), stdout = subprocess.PIPE, stderr = 
subprocess.PIPE, shell = True) out, err = _.communicate() if list(set(map(lambda f: f.split(".")[-1], os.listdir(HHsuite_PPHMMDir))))!=["hhm"]: print "There are some other files/folders other than HHsuite PPHMMs in the folder %s. Remove them first." %HHsuite_PPHMMDir _ = subprocess.Popen("ffindex_build -s %s_hhm.ffdata %s_hhm.ffindex %s" %(HHsuite_PPHMMDB, HHsuite_PPHMMDB, HHsuite_PPHMMDir), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() _ = subprocess.Popen("rm -rf %s" %hhsearchDir, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() AlignmentMerging_i_round = AlignmentMerging_i_round + 1 print "\tAlignment merging is done. Delete HHsuite shelve directory" #------------------------------------------------------------------------------- _ = subprocess.Popen("rm -rf %s" %HHsuiteDir, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) out, err = _.communicate() ################################################################################ print "- Make HMMER PPHMMDB and its summary file" ################################################################################ (ClusterIDList, ClusterDescList, ClusterSizeList, ClusterProtSeqIDList, ClusterSizeByTaxoGroupingList, ClusterSizeByProtList) = Make_HMMER_PPHMM_DB( HMMER_PPHMMDir = HMMER_PPHMMDir, HMMER_PPHMMDB = HMMER_PPHMMDB, ClustersDir = ClustersDir, Cluster_MetaDataDict = Cluster_MetaDataDict) ''' if IncludeIncompleteGenomes == True: ################################################################################ print "- Save variables to PPHMMDBConstruction.AllGenomes.shelve" ################################################################################ VariableShelveFile = VariableShelveDir+"/PPHMMDBConstruction.AllGenomes.shelve" elif IncludeIncompleteGenomes == False: ################################################################################ print "- Save variables to 
PPHMMDBConstruction.CompleteGenomes.shelve" ################################################################################ VariableShelveFile = VariableShelveDir+"/PPHMMDBConstruction.CompleteGenomes.shelve" ''' VariableShelveFile = VariableShelveDir+"/PPHMMDBConstruction.shelve" Parameters = shelve.open(VariableShelveFile,"n") for key in [ "ClusterIDList", "ClusterDescList", "ClusterSizeList", "ClusterProtSeqIDList", "ClusterSizeByTaxoGroupingList", "ClusterSizeByProtList", ]: try: Parameters[key] = locals()[key] print "\t"+key except TypeError: pass Parameters.close()
def write_AlignIO_dna():
    """Convert the Clustal alignment opuntia.aln to a PHYLIP file.

    Reads ``Clustalw/opuntia.aln`` in "clustal" format and writes
    ``Phylip/opuntia.phy`` in "phylip" format via ``AlignIO.convert``.
    Both paths are relative, so they resolve against the current working
    directory.

    Raises:
        AssertionError: if the value returned by ``AlignIO.convert`` is
            not exactly 1.
    """
    # Keep the conversion outside any ``assert``: assert statements are
    # removed under ``python -O``, which would silently skip writing the file.
    count = AlignIO.convert("Clustalw/opuntia.aln", "clustal",
                            "Phylip/opuntia.phy", "phylip")
    if count != 1:
        raise AssertionError(
            "Expected 1 alignment to be converted, got %r" % (count,))
taxa = args.taxa taxa = taxa.split(",") numfiles = len(glob.glob(files)) numtaxa = len(taxa) line1 = '{0} fasta files and {1} taxa found, alignments will be concatenated and written to {2}\n'.format( numfiles, numtaxa, outfile) print(line1) if numfiles > 0: cataln = MultipleSeqAlignment([]) for taxon in taxa: cataln.add_sequence(taxon, "") # make alignment with all required taxa for fasta in glob.glob(files): fastaname = fasta.split('/')[-1] # get fasta name without path aln = AlignIO.read(fasta, "fasta") # extract alignment from fasta seqLen = aln.get_alignment_length() newaln = MultipleSeqAlignment([]) seq = "X" for catrec in cataln: # for each taxon catid = str(catrec.id) for rec in aln: if str( rec.id ) == catid: # find sequence in fasta alignment if it's there seq = str(rec.seq) if seq == "X": seq = ("N" * seqLen ) # if not make a sequence of Ns of the correct length catseq = str(