def testNewickExport(self):
    """Check newick export of 'tree3' from the pg_10 fixture under three schemas.

    1. ``ot:ottTaxonName`` labels with ``bracket_ingroup=True``: the ingroup
       marker comments must be present.
    2. ``ot:ottTaxonName`` labels without bracketing: no ingroup markers.
    3. ``ot:originallabel`` labels: no ingroup markers and no
       "*tip #N not mapped" placeholders at all.

    For both ``ot:ottTaxonName`` exports the placeholders for tips #1 and #2
    are expected, and a placeholder for tip #3 must be absent.
    """
    n = pathmap.nexson_obj('10/pg_10.json')

    # Export 1: taxon-name labels, ingroup bracketed by marker comments.
    newick = extract_tree(
        n, 'tree3',
        PhyloSchema('newick', tip_label='ot:ottTaxonName', bracket_ingroup=True))
    # assertIn/assertNotIn report both operands on failure, unlike assertTrue.
    self.assertIn('[pre-ingroup-marker', newick)
    self.assertIn('[post-ingroup-marker', newick)
    self.assertTrue(newick.startswith('('))
    self.assertIn('*tip #1 not mapped', newick)
    self.assertIn('*tip #2 not mapped', newick)
    self.assertNotIn('*tip #3 not mapped', newick)

    # Export 2: taxon-name labels, no ingroup bracketing requested.
    newick = extract_tree(
        n, 'tree3',
        PhyloSchema('newick', tip_label='ot:ottTaxonName'))
    self.assertNotIn('[pre-ingroup-marker', newick)
    self.assertNotIn('[post-ingroup-marker', newick)
    self.assertIn('*tip #1 not mapped', newick)
    self.assertIn('*tip #2 not mapped', newick)
    self.assertNotIn('*tip #3 not mapped', newick)
    self.assertTrue(newick.startswith('('))

    # Export 3: original labels, so no "not mapped" placeholder is expected.
    newick = extract_tree(
        n, 'tree3',
        PhyloSchema('newick', tip_label='ot:originallabel'))
    self.assertNotIn('[pre-ingroup-marker', newick)
    self.assertNotIn('[post-ingroup-marker', newick)
    self.assertNotIn('*tip #', newick)
def testNewickExport(self):
    """Check that ingroup marker comments appear only when bracketing is requested.

    'tree3' of the pg_10 fixture is exported as newick twice with
    ``ot:ottTaxonName`` tip labels: once with ``bracket_ingroup=True`` (markers
    must be present) and once without (markers must be absent). Both exports
    must start with an opening parenthesis.
    """
    n = pathmap.nexson_obj('10/pg_10.json')

    # With bracketing: both marker comments are expected in the output.
    newick = extract_tree(
        n, 'tree3',
        PhyloSchema('newick', tip_label='ot:ottTaxonName', bracket_ingroup=True))
    # assertIn/assertNotIn report both operands on failure, unlike assertTrue.
    self.assertIn('[pre-ingroup-marker', newick)
    self.assertIn('[post-ingroup-marker', newick)
    self.assertTrue(newick.startswith('('))

    # Without bracketing: neither marker comment may appear.
    newick = extract_tree(
        n, 'tree3',
        PhyloSchema('newick', tip_label='ot:ottTaxonName'))
    self.assertNotIn('[pre-ingroup-marker', newick)
    self.assertNotIn('[post-ingroup-marker', newick)
    self.assertTrue(newick.startswith('('))
def generate_ATT_from_phylesystem(aln, workdir, study_id, tree_id, phylesystem_loc='api'):
    """Gather tree, alignment, and study info into an AlignTreeTax object.

    Taxon labels in the alignment are rewritten to OTU ids where the label
    matches an ``^ot:originalLabel`` of the tree's OTUs. Spaces in labels are
    coerced to underscores on both the alignment and the tree side.

    :param aln: alignment; must be a DendroPy ``DnaCharacterMatrix``
    :param workdir: working directory, passed through to ``AlignTreeTax``
    :param study_id: OpenTree study id used to fetch the nexson
    :param tree_id: id of the tree within that study
    :param phylesystem_loc: passed to ``get_nexson`` to select the data source
    :return: ``AlignTreeTax`` built from the newick string, the OTU dictionary,
        the alignment and the MRCA of the tree's ingroup
    """
    #TODO CHECK ARGS
    assert(isinstance(aln, datamodel.charmatrixmodel.DnaCharacterMatrix))
    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(" ", "_") #Forcing all spaces to underscore UGH

    nexson = get_nexson(study_id, phylesystem_loc)
    ott_ids = get_subtree_otus(nexson,
                               tree_id=tree_id,
                               subtree_id="ingroup",
                               return_format="ottid")
    ott_mrca = get_mrca_ott(ott_ids)
    newick = extract_tree(nexson,
                          tree_id,
                          PhyloSchema('newick',
                                      output_nexml2json='1.2.1',
                                      content="tree",
                                      tip_label="ot:originalLabel"))
    newick = newick.replace(" ", "_") #UGH Very heavy handed, need to make sure happens on alignment side as well.
    # The tree shares the alignment's taxon namespace, so relabelling the
    # namespace below also relabels the tips of the tree.
    tre = Tree.get(data=newick,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)

    otus = get_subtree_otus(nexson, tree_id=tree_id)
    otu_dict = {}
    orig_lab_to_otu = {}
    for otu_id in otus:
        otu_dict[otu_id] = extract_otu_nexson(nexson, otu_id)[otu_id]
        otu_dict[otu_id]['^physcraper:status'] = "original"
        otu_dict[otu_id]['^physcraper:last_blasted'] = "1900/01/01"
        orig = otu_dict[otu_id].get(u'^ot:originalLabel').replace(" ", "_")
        orig_lab_to_otu[orig] = otu_id

    for tax in aln.taxon_namespace:
        try:
            tax.label = orig_lab_to_otu[tax.label].encode('ascii')
        except KeyError:
            # NOTE(review): only a warning is written here; nothing is pruned
            # in this function despite the wording of the message.
            sys.stderr.write("{} doesn't have an otu id. It is being removed from the alignement. This may indicate a mismatch between tree and alignement\n".format(tax.label))
    #need to prune tree to seqs and seqs to tree...
    otu_newick = tre.as_string(schema="newick")
    #newick should be bare, but alignment should be DNACharacterMatrix
    return AlignTreeTax(otu_newick, otu_dict, aln, ingroup_mrca=ott_mrca, workdir=workdir)
) import dendropy configfi = "aws.config" study_id = "ot_350" tree_id = "Tr53297" workdir = "scrape_ot_350" # Read in the configuration information conf = physcraper.ConfigObj(configfi) #Get an existing tree from the Open Tree of life, and convert it to newick format nexson = physcraper.opentree_helpers.get_nexson(study_id, 'api') newick = extract_tree( nexson, tree_id, PhyloSchema('newick', output_nexml2json='1.2.1', content="tree", tip_label="ot:originalLabel")) tre = dendropy.Tree.get(data=newick, schema="newick", preserve_underscores=True) #Pull down an alignment from treebase. dataset = physcraper.opentree_helpers.get_dataset_from_treebase( study_id, phylesystem_loc='api') aln = None ##order of data matrices is arbitratry, so we choose one that matches the tree length for mat in dataset.char_matrices: if len(mat) == len(tre.taxon_namespace):
runname = sys.argv[5]

# Fixed values
E_VALUE_THRESH = 0.04
ott_ncbi = "../ott_ncbi"  # TODO config file
Entrez.email = "*****@*****.**"

phy = Phylesystem()
n = phy.return_study(study_id)[0]
api_wrapper.study.get(study_id, tree=tree_id)

## This is a weird way to get the ingroup node, but I need the OTT ids anyhow.
# `m` is the newick text of the ingroup subtree with OTT ids as tip labels.
m = extract_tree(n,
                 tree_id,
                 PhyloSchema('newick',
                             output_nexml2json='1.2.1',
                             content="tree",
                             tip_label="ot:ottId"),
                 subtree_id="ingroup")

otu_dict = gen_otu_dict(n)
ottids = []
for oid, o in otu_dict.items():
    # Only a missing '^ot:ottId' key is skipped; the original bare `except`
    # also hid every other error (including KeyboardInterrupt).
    try:
        ottid = o[u'^ot:ottId']
    except KeyError:
        continue
    # An id counts as part of the ingroup when it occurs in the newick text
    # directly followed by ':', ')' or ','.
    # NOTE(review): this is a substring test, so an id that is a suffix of
    # another id (e.g. 23 vs 123) would also match - confirm this is acceptable.
    if ("{}:".format(ottid) in m) or ("{})".format(ottid) in m) or ("{},".format(ottid) in m):
        ottids.append(ottid)
    else:
        print(o)
def testTreeExport(self):
    """Exporting 'tree3' of the pg_10 fixture as nexus yields text starting with '#'."""
    nexson_blob = pathmap.nexson_obj('10/pg_10.json')
    nexus_schema = PhyloSchema('nexus', tip_label='ot:ottTaxonName')
    exported = extract_tree(nexson_blob, 'tree3', nexus_schema)
    # Only the leading '#' of the export is checked here.
    self.assertTrue(exported.startswith('#'))
def generate_ATT_from_phylesystem(aln,
                                  workdir,
                                  config_obj,
                                  study_id,
                                  tree_id,
                                  phylesystem_loc='api',
                                  ingroup_mrca=None):
    """Gather tree, alignment, and study info - forces names to otu_ids.

    Study and tree ID's can be obtained by using
    python ./scripts/find_trees.py LINEAGE_NAME

    Spaces vs underscores kept being an issue, so all spaces are coerced to
    underscores when data are read in.

    :param aln: dendropy :class:`DnaCharacterMatrix <dendropy.datamodel.charmatrixmodel.DnaCharacterMatrix>` alignment object
    :param workdir: path to working directory
    :param config_obj: config class containing the settings
    :param study_id: OToL study id of the corresponding phylogeny which shall be updated
    :param tree_id: OToL corresponding tree ID as some studies have several phylogenies
    :param phylesystem_loc: access the github version of the OpenTree data store, or a local clone
    :param ingroup_mrca: optional. OToL identifier of the mrca of the clade that shall be updated
        (can be subset of the phylogeny). A list of OTT ids is reduced to their
        MRCA; any other truthy value is converted with ``int()``.
    :return: object of class ATT
    """
    assert isinstance(aln, datamodel.charmatrixmodel.DnaCharacterMatrix), \
        "your alignment `%s` ist not of type DnaCharacterMatrix" % aln
    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(" ", "_")  # Forcing all spaces to underscore

    nexson = get_nexson(study_id, phylesystem_loc)
    newick = extract_tree(
        nexson, tree_id,
        PhyloSchema('newick',
                    output_nexml2json='1.2.1',
                    content="tree",
                    tip_label="ot:originalLabel"))
    # UGH Very heavy handed, need to make sure happens on alignment side as well.
    newick = newick.replace(" ", "_")
    # The tree shares the alignment's taxon namespace, so relabelling the
    # namespace below also relabels the tips of the tree.
    tre = Tree.get(data=newick,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)

    # this gets the taxa that are in the subtree with all of their info - ott_id, original name,
    otus = get_subtree_otus(nexson, tree_id=tree_id)
    otu_dict = {}
    orig_lab_to_otu = {}
    for otu_id in otus:
        otu_dict[otu_id] = extract_otu_nexson(nexson, otu_id)[otu_id]
        otu_dict[otu_id]["^physcraper:status"] = "original"
        otu_dict[otu_id]["^physcraper:last_blasted"] = None
        orig = otu_dict[otu_id].get(u"^ot:originalLabel").replace(" ", "_")
        orig_lab_to_otu[orig] = otu_id

    for tax in aln.taxon_namespace:
        if tax.label in otu_dict:
            # Label is already an otu id; nothing to translate.
            sys.stdout.write("{} aligned\n".format(tax.label))
        else:
            try:
                tax.label = orig_lab_to_otu[tax.label].encode("ascii")
            except KeyError:
                sys.stderr.write(
                    "{} doesn't have an otu id. It is being removed from the alignment. "
                    "This may indicate a mismatch between tree and alignment\n"
                    .format(tax.label))
    # need to prune tree to seqs and seqs to tree...
    otu_newick = tre.as_string(schema="newick")

    ott_ids = get_subtree_otus(nexson,
                               tree_id=tree_id,
                               subtree_id="ingroup",
                               return_format="ottid")
    if ingroup_mrca:
        # isinstance (rather than an exact type comparison) also accepts
        # list subclasses.
        if isinstance(ingroup_mrca, list):
            ott_ids = set(ingroup_mrca)
            ott_mrca = get_mrca_ott(ott_ids)
        else:
            ott_mrca = int(ingroup_mrca)
    elif ott_ids:  # if no ingroup is specified, ott_ids will be none
        ott_mrca = get_mrca_ott(ott_ids)
    else:  # just get the mrca for the whole tree
        ott_mrca = get_mrca_ott(
            [otu_dict[otu_id].get(u"^ot:ottId") for otu_id in otu_dict])

    workdir = os.path.abspath(workdir)
    return physcraper.aligntreetax.AlignTreeTax(otu_newick,
                                                otu_dict,
                                                aln,
                                                ingroup_mrca=ott_mrca,
                                                workdir=workdir,
                                                config_obj=config_obj)
def testTreeExport(self):
    """Check that a nexus export of 'tree3' from the pg_10 fixture starts with '#'."""
    study = pathmap.nexson_obj('10/pg_10.json')
    schema = PhyloSchema('nexus', tip_label='ot:ottTaxonName')
    nexus_text = extract_tree(study, 'tree3', schema)
    # Only the first character of the export is verified.
    starts_with_hash = nexus_text.startswith('#')
    self.assertTrue(starts_with_hash)