def generate_ATT_from_phylesystem(aln, workdir, study_id, tree_id, phylesystem_loc='api'):
    """Build an AlignTreeTax object from an alignment and an OpenTree study tree.

    Gathers together tree, alignment, and study info, and relabels the
    alignment's taxa with the study's otu ids.

    Spaces are replaced by underscores in both the alignment labels and the
    newick string so that the two sides can be matched on the study's
    original labels.

    :param aln: alignment; must be a Dendropy ``DnaCharacterMatrix``
    :param workdir: working directory, handed on to ``AlignTreeTax``
    :param study_id: OpenTree study id, used to fetch the nexson
    :param tree_id: id of the tree inside that study
    :param phylesystem_loc: passed through to ``get_nexson`` (default ``'api'``)
    :return: an ``AlignTreeTax`` built from the newick string, the otu
        dictionary, the alignment, and the ingroup MRCA
    """
    # TODO CHECK ARGS
    assert isinstance(aln, datamodel.charmatrixmodel.DnaCharacterMatrix)

    # Forcing all spaces to underscore on the alignment side.
    for taxon in aln.taxon_namespace:
        taxon.label = taxon.label.replace(" ", "_")

    nexson = get_nexson(study_id, phylesystem_loc)

    # OTT ids of the tree's ingroup, reduced to a single MRCA identifier.
    ingroup_ott_ids = get_subtree_otus(nexson,
                                       tree_id=tree_id,
                                       subtree_id="ingroup",
                                       return_format="ottid")
    ott_mrca = get_mrca_ott(ingroup_ott_ids)

    # Tips are written with the study's original labels; the space -> underscore
    # replacement is very heavy handed and must mirror the alignment side.
    schema = PhyloSchema('newick',
                         output_nexml2json='1.2.1',
                         content="tree",
                         tip_label="ot:originalLabel")
    newick = extract_tree(nexson, tree_id, schema).replace(" ", "_")

    # The tree shares the alignment's taxon namespace, so relabelling a taxon
    # further down is visible from both the tree and the alignment.
    tre = Tree.get(data=newick,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)

    otu_dict = {}
    orig_lab_to_otu = {}
    treed_taxa = {}  # filled below but not read again within this function
    for otu_id in get_subtree_otus(nexson, tree_id=tree_id):
        record = extract_otu_nexson(nexson, otu_id)[otu_id]
        record['^physcraper:status'] = "original"
        record['^physcraper:last_blasted'] = "1900/01/01"
        otu_dict[otu_id] = record
        label = record.get(u'^ot:originalLabel').replace(" ", "_")
        orig_lab_to_otu[label] = otu_id
        treed_taxa[label] = record.get(u'^ot:ottId')

    # Swap each original label for its otu id. Labels without a match are
    # only reported on stderr; nothing is pruned here.
    for taxon in aln.taxon_namespace:
        try:
            taxon.label = orig_lab_to_otu[taxon.label].encode('ascii')
        except KeyError:
            sys.stderr.write(
                ("{} doesn't have an otu id. It is being removed from the alignement. "
                 "This may indicate a mismatch between tree and alignement\n")
                .format(taxon.label))

    # need to prune tree to seqs and seqs to tree...
    otu_newick = tre.as_string(schema="newick")

    # newick should be bare, but alignement should be DNACharacterMatrix
    return AlignTreeTax(otu_newick,
                        otu_dict,
                        aln,
                        ingroup_mrca=ott_mrca,
                        workdir=workdir)
def _get_mrca(self):
    """Query the OpenTree tree of life for the MRCA of this study tree.

    Reads ``self.nexson`` and ``self.tree_id``, stores the wrapped response
    on ``self.mrca_node``, writes the OTT id of its nearest taxon to stdout,
    and sets ``self._found_mrca`` to 1.
    """
    # NOTE(review): `_study_get` and `_get_study` are bare names here, not
    # attributes of `self`; they only resolve if defined at module level.
    # They may have been meant as `self._study_get` / `self._get_study()` --
    # confirm against the rest of the class before changing.
    if not _study_get:
        _get_study()

    # TODO are these ottids or OTUs?
    # NOTE(review): other callers in this file pass return_format="ottid"
    # when they need OTT ids and iterate the default result as otu ids, so
    # this call probably needs return_format="ottid" as well -- confirm.
    ott_ids = get_subtree_otus(self.nexson, tree_id=self.tree_id)

    # Pass the ids fetched above; the original referenced an undefined
    # `ottids` name here.
    self.mrca_node = tree_of_life.mrca(ott_ids=ott_ids, wrap_response=True)

    # Read the result back from the instance; the original used an undefined
    # bare `mrca_node` name.
    sys.stdout.write(
        "mrca_node found, {}\n".format(self.mrca_node.nearest_taxon.ott_id))
    self._found_mrca = 1
def generate_ATT_from_phylesystem(aln,
                                  workdir,
                                  config_obj,
                                  study_id,
                                  tree_id,
                                  phylesystem_loc='api',
                                  ingroup_mrca=None):
    """Gather tree, alignment, and study info - forces names to otu_ids.

    Study and tree ID's can be obtained by using
    ``python ./scripts/find_trees.py LINEAGE_NAME``

    Spaces vs underscores kept being an issue, so all spaces are coerced to
    underscores when data are read in.

    :param aln: dendropy :class:`DnaCharacterMatrix
        <dendropy.datamodel.charmatrixmodel.DnaCharacterMatrix>` alignment object
    :param workdir: path to working directory (made absolute before use)
    :param config_obj: config class containing the settings
    :param study_id: OToL study id of the corresponding phylogeny which shall
        be updated
    :param tree_id: OToL corresponding tree ID as some studies have several
        phylogenies
    :param phylesystem_loc: access the github version of the OpenTree data
        store, or a local clone
    :param ingroup_mrca: optional. Either a single OToL identifier of the mrca
        of the clade that shall be updated (can be subset of the phylogeny;
        converted with ``int()``), or a list/tuple/set of OTT ids whose mrca
        is looked up. If not given, the tree's ingroup is used, and if the
        tree has no ingroup, the mrca of all OTUs in the tree.
    :return: :class:`physcraper.aligntreetax.AlignTreeTax` object
    """
    assert isinstance(aln, datamodel.charmatrixmodel.DnaCharacterMatrix), \
        "your alignment `%s` ist not of type DnaCharacterMatrix" % aln

    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(" ", "_")  # Forcing all spaces to underscore

    nexson = get_nexson(study_id, phylesystem_loc)
    newick = extract_tree(
        nexson,
        tree_id,
        PhyloSchema('newick',
                    output_nexml2json='1.2.1',
                    content="tree",
                    tip_label="ot:originalLabel"))
    # UGH Very heavy handed, need to make sure happens on alignment side as well.
    newick = newick.replace(" ", "_")

    # The tree shares the alignment's taxon namespace, so the relabelling
    # below is visible from both the tree and the alignment.
    tre = Tree.get(data=newick,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)

    # this gets the taxa that are in the subtree with all of their info -
    # ott_id, original name, ...
    otus = get_subtree_otus(nexson, tree_id=tree_id)
    otu_dict = {}
    orig_lab_to_otu = {}
    for otu_id in otus:
        otu_dict[otu_id] = extract_otu_nexson(nexson, otu_id)[otu_id]
        otu_dict[otu_id]["^physcraper:status"] = "original"
        otu_dict[otu_id]["^physcraper:last_blasted"] = None
        orig = otu_dict[otu_id].get(u"^ot:originalLabel").replace(" ", "_")
        orig_lab_to_otu[orig] = otu_id

    for tax in aln.taxon_namespace:
        if tax.label in otu_dict:
            # Label is already an otu id; nothing to translate.
            sys.stdout.write("{} aligned\n".format(tax.label))
        else:
            try:
                # NOTE(review): on Python 3 `.encode` yields bytes, not str --
                # confirm which interpreter this module targets.
                tax.label = orig_lab_to_otu[tax.label].encode("ascii")
            except KeyError:
                # Only a warning is written here; no taxon is removed.
                sys.stderr.write(
                    "{} doesn't have an otu id. It is being removed from the alignment. "
                    "This may indicate a mismatch between tree and alignment\n"
                    .format(tax.label))

    # need to prune tree to seqs and seqs to tree...
    otu_newick = tre.as_string(schema="newick")

    ott_ids = get_subtree_otus(nexson,
                               tree_id=tree_id,
                               subtree_id="ingroup",
                               return_format="ottid")
    if ingroup_mrca:
        # isinstance (not an exact type() comparison) so that list subclasses,
        # tuples and sets are treated as collections of OTT ids instead of
        # falling through to int(), which would raise TypeError.
        if isinstance(ingroup_mrca, (list, tuple, set)):
            ott_ids = set(ingroup_mrca)
            ott_mrca = get_mrca_ott(ott_ids)
        else:
            ott_mrca = int(ingroup_mrca)
    elif ott_ids:  # if no ingroup is specified, ott_ids will be none
        ott_mrca = get_mrca_ott(ott_ids)
    else:  # just get the mrca for the whole tree
        ott_mrca = get_mrca_ott(
            [otu_dict[otu_id].get(u"^ot:ottId") for otu_id in otu_dict])

    workdir = os.path.abspath(workdir)
    return physcraper.aligntreetax.AlignTreeTax(otu_newick,
                                                otu_dict,
                                                aln,
                                                ingroup_mrca=ott_mrca,
                                                workdir=workdir,
                                                config_obj=config_obj)