def do_alnntree(pref, ndf, fasta, refs, congen, targetids, gaps=0.9, cpus=-1):
    """Align reference and target sequences, build a tree and annotate it.

    A FASTA-formatted string is assembled from three sources, in order:
    ``congen`` as-is, one reference record per ``saccver`` group of ``ndf``
    (looked up in ``refs``), and every record of the shelf at ``fasta`` whose
    header (minus its first character) is in ``targetids``. That string is
    aligned with ``mafft``, trimmed with ``trimaln`` and turned into a tree
    with ``fasttreeMP``.

    Files written: ``<pref>.aln`` (trimmed alignment), ``<pref>.tree`` and
    ``<pref>.treepickle`` (dill dump of the tree before NCBI annotation).

    :param pref: prefix for the output files.
    :param ndf: table with ``saccver`` and ``staxid`` columns; grouped by
        ``saccver`` (presumably a pandas DataFrame of BLAST hits -- confirm
        against the caller).
    :param fasta: path of a shelf mapping FASTA headers to sequences.
    :param refs: mapping from ``'>name'`` headers to reference sequences.
    :param congen: initial FASTA text the other records are appended to.
    :param targetids: container of target identifiers (headers without the
        leading character).
    :param gaps: forwarded to ``trimaln`` as ``gaps``.
    :param cpus: forwarded to ``mafft --thread``.
    :return: ``(tree, result of tree.annotate_ncbi_taxa())``.
    """
    # TODO: add checkpoint to avoid repeating
    records = [congen]

    # One reference record per subject accession.
    for name, hits in ndf.groupby('saccver'):
        taxid = hits.staxid.iloc[0]
        try:
            raw_seq = refs['>%s' % name]
        except KeyError:
            # Fall back to the part of the accession before the first '|'.
            name = name.split('|')[0]
            raw_seq = refs['>%s' % name]
        seq = raw_seq.replace('\n', '').strip()
        records.append('>%d.%s\n%s\n' % (taxid, name, seq))

    # Target records stored in the shelf.
    with shelve.open(fasta) as shelf:
        for header, sequence in shelf.items():
            if header.strip()[1:] not in targetids:
                print(header, 'not in')
                continue
            print(header)
            flat = sequence.strip().replace('\n', '')
            records.append('%s\n%s\n' % (header, flat))

    to_phy = ''.join(records)

    mafft_cmd = ['mafft', '--thread', str(cpus), '--auto', '-']
    aln, _ = stdin_run(mafft_cmd, to_phy)
    trm = trimaln(aln.decode('utf-8'), targetids, gaps=gaps)

    fasttree_cmd = ['fasttreeMP', '-nt', '-gtr', '-gamma']
    tre, _ = stdin_run(fasttree_cmd, trm)
    # Drop the last character of the stripped output, replace any remaining
    # ';' with '-', then re-append a single terminating ';'.
    newick_body = tre.strip()[:-1].replace(b';', b'-')
    tre = newick_body.decode('utf-8') + ';'

    t = PhyloTree(tre, sp_naming_function=lambda name: name.split('.')[0])

    with open('%s.aln' % pref, 'w') as al:
        with open('%s.treepickle' % pref, 'wb') as tp:
            al.write(trm)
            t.write(outfile='%s.tree' % pref)
            dill.dump(t, tp)

    tax2 = t.annotate_ncbi_taxa()
    fix_species(t)
    print(t)
    return t, tax2
def test_species(self):
    """Check that species-based and NCBI-taxonomy-based patterns match.

    Part one builds a tree whose species name is the second ``_``-separated
    token of each node name and asserts that ``pattern0`` matches only at the
    root. Part two annotates a tree of NCBI taxids and asserts that
    ``pattern1`` and ``pattern2`` each match only at the root.
    """
    # --- node.species ---------------------------------------------------
    species_tree = PhyloTree(
        """(Felis_catus_1:1, (Homo_sapiens_1:1, Pan_troglodytes_1:1), Saccharomyces_cerevisiae_1:1);""",
        format=1)
    species_tree.set_species_naming_function(
        lambda n: n.name.split("_")[1] if "_" in n.name else '')

    pattern0 = """('', (' len(set(["sapiens","pygmaeus"]) & species(@))>0', Pan_troglodytes_1) );"""
    pattern0 = TreePattern(pattern0)

    root = species_tree.get_tree_root()
    self.assertEqual(list(pattern0.find_match(species_tree)), [root])

    # --- NCBI taxonomy --------------------------------------------------
    # NOTE(review): this instance is not used below; kept because its
    # construction may have setup side effects -- confirm before removing.
    ncbi = NCBITaxa()
    taxonomy_tree = PhyloTree("((9598, 9606), 10090);",
                              sp_naming_function=lambda name: name)
    taxonomy_tree.annotate_ncbi_taxa()
    root = taxonomy_tree.get_tree_root()

    pattern1 = """ ' @.sci_name == "Euarchontoglires" ';"""
    # The scientific name of taxid 9606 is "Homo sapiens"; the literal was
    # previously masked as "H**o sapiens", which can never match.
    pattern2 = """ (( '@.sci_name=="Homo sapiens"' , '9526 in @.lineage ' )' @.rank=="subfamily" and @.taxid == 207598 ') ' @.sci_name == "Euarchontoglires" and "cellular organisms" in @.named_lineage'; """

    pattern1 = TreePattern(pattern1)
    pattern2 = TreePattern(pattern2)
    match1 = pattern1.find_match(taxonomy_tree)
    match2 = pattern2.find_match(taxonomy_tree)

    self.assertEqual(list(match1), [root])
    self.assertEqual(list(match2), [root])
from ete3 import PhyloTree from ete3 import NCBITaxa from Bio import SeqIO from Bio.SeqRecord import SeqRecord input_tree = sys.argv[1] input_fasta = sys.argv[2] output_fasta_ordered_select = sys.argv[3] output_fasta_ordered_all = sys.argv[4] # There's a way to save these extra attributes, but it's a bit awkward (not supported by newick format) # So we fetch them anew each time. ncbi = NCBITaxa() tree = PhyloTree(input_tree, sp_naming_function=lambda name: name.split('.', 1)[0]) tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa() print tree.get_ascii(attributes=["name", "sci_name", "taxid"]) record_dict = SeqIO.to_dict(SeqIO.parse(input_fasta, "fasta")) # H**o sapiens # Macaca mulatta # Canis lupus familiaris # Mus musculus # Gallus gallus # Anolis carolinensis # Danio rerio sorted_fasta_select = [] sorted_fasta_select.append( SeqRecord(record_dict["9606.ENSP00000261448"].seq, "H**o sapiens", '', '')) sorted_fasta_select.append(
def run(args):
    """Annotate every input tree and dump it with the annotated features.

    For each newick in ``args.src_tree_iterator`` a tree is loaded (a
    ``PhyloTree`` annotated with NCBI taxonomy when ``args.ncbi`` is set,
    otherwise a plain ``Tree``). Each entry of ``args.feature`` is a list of
    ``key:value`` fields describing one custom feature:

    * ``name``     -- feature name (required)
    * ``source``   -- path of a tab-separated file (required); each
      non-blank, non-``#`` line is ``node[,node...]<TAB>value``
    * ``type``     -- one of ``str``, ``int``, ``float``, ``set``, ``list``
      (default ``str``), used to cast the value
    * ``multiple`` -- accepted and parsed, but not acted on in this function

    When a line names several nodes, their common ancestor is the target;
    a single name is resolved with ``tree & name``.

    :param args: parsed command-line namespace providing
        ``src_tree_iterator``, ``ncbi``, ``taxid_attr`` and ``feature``.
    :raises ValueError: on a malformed/unknown field, an unknown ``type``,
        or when ``name`` or ``source`` is missing from an annotation.
    """
    from ete3 import Tree, PhyloTree

    # Loop-invariant: maps the "type" option to the cast applied to values.
    type2cast = {
        "str": str,
        "int": int,
        "float": float,
        "set": set,
        "list": list,
    }

    features = set()
    for nw in args.src_tree_iterator:
        if args.ncbi:
            tree = PhyloTree(nw)
            features.update([
                "taxid", "name", "rank", "bgcolor", "sci_name",
                "collapse_subspecies", "named_lineage", "lineage"
            ])
            tree.annotate_ncbi_taxa(args.taxid_attr)
        else:
            tree = Tree(nw)

        for annotation in args.feature:
            aname, asource, amultiple, acast = None, None, False, str
            for field in annotation:
                try:
                    key, value = list(map(str.strip, field.split(":")))
                except Exception:
                    raise ValueError("Invalid feature option [%s]" % field)

                if key == "name":
                    aname = value
                elif key == "source":
                    asource = value
                elif key == "multiple":
                    # append -- parsed for completeness; not used below.
                    amultiple = value
                elif key == "type":
                    try:
                        acast = type2cast[value]
                    except KeyError:
                        raise ValueError("Invalid feature type [%s]" % field)
                else:
                    raise ValueError("Unknown feature option [%s]" % field)

            # Both options are mandatory. Previously the exception was built
            # but never raised, and the check only fired when *both* were
            # missing, so a bad annotation crashed later with an unrelated
            # error.
            if not aname or not asource:
                raise ValueError(
                    'name and source are required when annotating a new feature [%s]'
                    % annotation)

            features.add(aname)
            # Default text mode already handles universal newlines; the
            # explicit 'U' flag was removed in Python 3.11. The context
            # manager closes the file, which the original never did.
            with open(asource) as source_file:
                for line in source_file:
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue

                    nodenames, attr_value = list(
                        map(str.strip, line.split('\t')))
                    nodenames = list(map(str.strip, nodenames.split(',')))

                    # A leading '!' on the first node name switches off
                    # relaxed grouping (currently without further effect).
                    relaxed_grouping = True
                    if nodenames[0].startswith('!'):
                        relaxed_grouping = False
                        nodenames[0] = nodenames[0][1:]

                    if len(nodenames) > 1:
                        target_node = tree.get_common_ancestor(nodenames)
                        if not relaxed_grouping:
                            pass  # do something
                    else:
                        target_node = tree & nodenames[0]

                    # NOTE(review): the warning says "Overwriting", but the
                    # existing value is left untouched -- confirm intent.
                    if hasattr(target_node, aname):
                        log.warning(
                            'Overwriting annotation for node" [%s]"' % nodenames)
                    else:
                        target_node.add_feature(aname, acast(attr_value))

        dump(tree, features=features)