def getOpenTreesFromOneZoom(OpenTreeFile, output_dir, include_var, phy_files, verbose=False):
    """Extract the OpenTree of Life (OToL) subtrees referenced from .phy files.

    Every newick tree in ``phy_files`` is scanned for nodes whose taxon label
    has the form ``<name>_ott<ids>@<depth>``. For each such node the external
    ``subtree_extract.pl`` script (expected next to this file) is run in
    ``output_dir`` to cut the requested subtree out of ``OpenTreeFile``; the
    result is pruned with ``prune_tree``, written to
    ``<output_dir>/<file stem>.phy`` and a ``$tree.substitute(...)`` line is
    printed to stdout.

    The ``<ids>`` part of a label is parsed as ``<A>~<B>-<C>-<D>...``:

    * ``<A>`` (optional digits) names the output file; when it is missing,
      ``<name>`` is used instead.
    * ``<B>`` is the OTT id of the subtree to extract; when it is missing,
      ``<A>`` is used instead.
    * ``<C>``, ``<D>``, ... are passed to ``prune_tree`` as ids to remove.

    Args:
        OpenTreeFile: path of the full OpenTree newick file. If it does not
            exist the user is asked interactively whether to download it.
        output_dir: directory in which the extraction script runs and into
            which the ``.nwk`` and ``.phy`` files are written.
        include_var: if a number, it is the default recursion depth passed to
            the extraction script (a negative number forces its absolute
            value for every node, overriding any ``@<depth>`` in the label).
            Otherwise it is handed to ``prune_tree`` as the collection of
            names to keep.
        phy_files: iterable of .phy / .PHY filenames; ``"-"`` reads from
            standard input.
        verbose: when true, progress messages are emitted through ``warn``.

    Returns:
        None. Results are files in ``output_dir`` plus lines on stdout.

    Raises:
        SystemExit: if the OpenTree file is missing and the user declines
            the download.
    """
    from numbers import Number

    ExtractionUtility = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "subtree_extract.pl")

    # <name>_ott<ids>@<depth>: group 1 is the name, group 2 the id string
    # (digits, '~' and '-'), group 3 the optional maximum depth.
    ottRE = re.compile(r"^(.*)_ott([-~\d]+)\@(\d*)$")
    # <A>~<B>-<C>-...: group 1 is <A>, group 2 is everything after the
    # optional '~' (see the docstring for how the pieces are used).
    id_pattern = re.compile(r"(\d*)~?([-\d]*)$")

    if not os.path.isfile(OpenTreeFile):
        OpenTreeURL = "http://files.opentreeoflife.org/synthesis/opentree9.1/output/labelled_supertree/labelled_supertree_simplified_ottnames.tre"
        warn("Could not find the OpenTree file {}. Do you want to download it from {}".format(OpenTreeFile, OpenTreeURL))
        # Accept 'n' as well as 'N' (and stray whitespace) as a refusal.
        if input("Press Enter to accept, or N to abort... ").strip().upper() == "N":
            sys.exit(0)
        if not get_species_level_tree(OpenTreeFile):
            warn("Could not get the Open Tree of Life newick file to save at {}".format(OpenTreeFile))

    # Two modes: a numeric include_var means "keep every species down to a
    # given depth" (no include list); anything else is an include list.
    depth_mode = isinstance(include_var, Number)
    if depth_mode:
        keep = True
        default_recursion_depth = include_var
    else:
        keep = include_var
        default_recursion_depth = float('nan')

    for file in phy_files:
        if file == "-":
            trees = TreeList.get_from_stream(sys.stdin, schema="newick", preserve_underscores=True, rooting='default-rooted')
            file = "<stdin>"
        else:
            # Best effort: an unreadable or unparsable file is reported and
            # skipped so that the remaining files are still processed.
            try:
                with open(file, 'r', encoding="utf8") as stream:
                    trees = TreeList.get_from_stream(stream, schema="newick", preserve_underscores=True, rooting='default-rooted')
            except Exception as e:
                trees = []
                warn("Problem reading tree from {}: {}".format(file, e))

        for tree in trees:
            header_printed = False
            for include_ott in tree.preorder_node_iter():
                taxon = getattr(include_ott, "taxon", None)
                if taxon is None or taxon.label is None:
                    continue
                label_match = ottRE.search(taxon.label)
                if not label_match:
                    continue

                # One header per tree, emitted just before its first include.
                if not header_printed:
                    print("\n//;# == {} ==, from file {}".format(tree.seed_node.label, file))
                    header_printed = True

                name = label_match.group(1)
                ottIDs = label_match.group(2)
                depth_text = label_match.group(3)

                # A negative default overrides whatever depth the label gives.
                if default_recursion_depth < 0:
                    recursion_depth = abs(default_recursion_depth)
                elif depth_text:
                    recursion_depth = float(depth_text)
                else:
                    recursion_depth = default_recursion_depth

                id_match = id_pattern.match(ottIDs)
                if not id_match:
                    warn("Could not parse the OTT ids in '{}', skipping".format(taxon.label))
                    continue

                subfile_name = id_match.group(1) or name
                del_otts = (id_match.group(2) or '').split('-')  # split by minus signs
                # The first id after the '~' is the tree to extract; fall
                # back to the id in front of it when there is none.
                base_ott = del_otts.pop(0) or id_match.group(1)
                if not base_ott:
                    warn("No OTT id to extract in '{}', skipping".format(taxon.label))
                    continue

                system_call = [ExtractionUtility]
                if depth_mode and math.isfinite(recursion_depth):
                    system_call.append("-d={}".format(int(recursion_depth)))
                # The script runs with cwd=output_dir, so give it the tree
                # file relative to that directory.
                system_call.append(os.path.relpath(OpenTreeFile, output_dir))
                system_call.append(base_ott)

                OpenSubTreeFile = os.path.join(output_dir, base_ott + ".nwk")
                if verbose:
                    warn("For " + taxon.label + ": extracting tree into " + OpenSubTreeFile, prefix='')
                call(system_call, cwd=output_dir)  # should create many ottID.nwk files

                if not os.path.isfile(OpenSubTreeFile):
                    warn("File " + OpenSubTreeFile + " does not exist, skipping\n")
                    continue

                OutputFilename = os.path.join(output_dir, subfile_name + ".phy")
                removed = " removed {}".format(del_otts) if del_otts else ""
                subtree = prune_tree(OpenSubTreeFile, keep, del_otts)
                if verbose:
                    if depth_mode:
                        warn("Found file {} with {} leaf taxa,{} and extracted to max depth: {}".format(
                            OpenSubTreeFile, len(subtree.taxon_namespace), removed, recursion_depth), prefix='')
                    else:
                        subtree_size = len(subtree.leaf_nodes())
                        warn("Found file with {} leaf taxa, {}, and simplified to only selected taxa ({} {})".format(
                            len(subtree.taxon_namespace), removed, subtree_size,
                            'leaf' if subtree_size == 1 else 'leaves'), prefix='')
                    warn("Now writing to {}".format(OutputFilename), prefix='')

                # NOTE: no ultrametricizing / stem-length calculation is done
                # here; that is not needed until the OpenTree has branch
                # lengths.
                with open(OutputFilename, 'w', encoding='UTF-8') as outputstream:
                    subtree.write_to_stream(outputstream, 'newick', unquoted_underscores=True, suppress_rooting=True)

                # OpenTrees are currently not dated, so the 'stem_length'
                # value is omitted and the node becomes 'date unknown'.
                print(r"$tree.substitute('{}_ott{}@\\d*', '{}');".format(name, ottIDs, OutputFilename))