def load_tab_delimited_trait_table(trait_table_fp, verbose=False): """Load a tab delimited trait table for picrust""" input_trait_table = open(trait_table_fp, "U") if verbose: print "Parsing trait table..." # Find which taxa are to be used in tests # (by default trait table taxa) trait_table_header, trait_table_fields = parse_trait_table(input_trait_table) label_conversion_fns = set_label_conversion_fns(verbose=verbose) trait_table_fields = convert_trait_table_entries( trait_table_fields, value_conversion_fns=[], label_conversion_fns=label_conversion_fns ) trait_table_fields = [t for t in trait_table_fields] if verbose: print "Number of trait table fields with single quotes:", len([t for t in trait_table_fields if "'" in t[0]]) return trait_table_header, trait_table_fields
def load_tab_delimited_trait_table(trait_table_fp, verbose=False): """Load a tab delimited trait table for picrust""" input_trait_table = open(trait_table_fp, "U") if verbose: print "Parsing trait table..." #Find which taxa are to be used in tests #(by default trait table taxa) trait_table_header,trait_table_fields = \ parse_trait_table(input_trait_table) label_conversion_fns = set_label_conversion_fns(verbose=verbose) trait_table_fields = convert_trait_table_entries(trait_table_fields,\ value_conversion_fns = [],\ label_conversion_fns = label_conversion_fns) trait_table_fields = [t for t in trait_table_fields] if verbose: print "Number of trait table fields with single quotes:",\ len([t for t in trait_table_fields if "'" in t[0]]) return trait_table_header, trait_table_fields
def reformat_tree_and_trait_table(tree,trait_table_lines,trait_to_tree_mapping,\ input_trait_table_delimiter="\t", output_trait_table_delimiter="\t",\ filter_table_by_tree_tips=True, convert_trait_floats_to_ints=False,\ filter_tree_by_table_entries=True,convert_to_bifurcating=False,\ add_branch_length_to_root=False, name_unnamed_nodes=True,\ remove_whitespace_from_labels = True,replace_ambiguous_states=True,\ replace_problematic_label_characters = True,min_branch_length=0.0001,\ verbose=True): """Return a full reformatted tree,pruned reformatted tree and set of trait table lines tree - a PyCogent PhyloNode tree object trait_table_lines -- the lines of a trait table, where the rows are organisms and the columns are traits (e.g. gene counts). trait_id_to_tree_mapping -- a dict keyed by trait table ids, with values of tree ids. If provided, trait table ids will be mapped to tree ids filter_table_by_tree_tips -- if True, remove trait table rows that don't map to ids on the tree convert_trait_floats_to_ints -- if True, convert floating point values in trait table cells to integers. filter_tree_by_table_entries -- if True, save only the subtree that encompasses organisms in the trait table. (equivalent to removing all tips in the tree that don't map to the trait table) convert_to_bifurcating -- if True, ensure that the tree is fully bifurcating by resolving polytomies with very short branches. add_branch_length_to_root -- if True, ensure that the root node has a minimum branch length name_unnamed_nodes -- if True, name unnamed nodes in the tree. (Useful for ensuring internal nodes can be consistently identified in both the reference and pruned trees) remove_whitespace_from_labels -- if True, replace whitespace in organism labels with underscores replace_ambiguous_states -- if True, replace various strings representing ambiguous character states, as well as '-1' or -1 (used by IMG to represent a lack of data) with 0 values. 
replace_problematic_table_chars -- if True, replace ':' and ';' in the results with '_', and remove double quotes. (AncSR methods like ace can't handle these characters in organism labels) min_branch_length -- set the minimum branch length for all edges in the tree. This function combines the various reformatting functions in the library into a catch-all reformatter. TODO: This function is monolithic, so despite the individual parts being tested seperately, it probably needs to be broken down into several modular parts. This would need to be done with care however, as the order of steps matters quite a bit. """ input_tree = tree #Parse lines to fields once if trait_table_lines: if verbose: print "Parsing trait table...." header_line,trait_table_fields =\ parse_trait_table(trait_table_lines,delimiter = input_trait_table_delimiter) else: if verbose: print "Found no trait table lines. Setting data and header to empty" trait_table_fields = [] header_line = '' # Tree reformatting if convert_to_bifurcating: if verbose: print "Converting tree to bifurcating...." #maximum recursion depth on large trees #Try working around this issue with a large #recursion depth limit old_recursion_limit = getrecursionlimit() setrecursionlimit(50000) input_tree = input_tree.bifurcating() # Required by most ancSR programs setrecursionlimit(old_recursion_limit) #input_tree = ensure_root_is_bifurcating(input_tree) # The below nutty-looking re-filtering step is necessary # When ensuring the root is bifurcating, internal nodes can #get moved to the tips so without additional filtering we #get unannotated tip nodes #if filter_tree_by_table_entries: # input_tree = filter_tree_tips_by_presence_in_table(input_tree,\ # trait_table_fields,delimiter=input_trait_table_delimiter) #Name unnamed nodes if name_unnamed_nodes: if verbose: print "Naming unnamed nodes in the reference tree...." 
input_tree=make_internal_nodes_unique(input_tree) #input_tree.nameUnnamedNodes() check_node_labels(input_tree,verbose=verbose) #Paranoid check for missing names: #if verbose: # print "Checking that all nodes were named..." #for i,n in enumerate(input_tree.preorder()): # if n.Name is None: # raise ValueError('Node #%s (in tree.preorder()) was not named!'%str(i)) #map trait table ids to tree ids if trait_to_tree_mapping: #if verbose: # print "Validating that trait --> tree mappings match tree ids..." # good,bad = validate_trait_table_to_tree_mappings(input_tree,\ # trait_to_tree_mapping.values(), verbose = True) # print "Found %i valid ids." %(len(good)) # print "Found %i invalid ids." %(len(bad)) # #if bad: # # raise RuntimeError("The following putative tree ids in mapping file aren't actually in the input tree: %s" % bad) if verbose: print "Remapping trait table ids to match tree ids...." trait_table_fields =\ remap_trait_table_organisms(trait_table_fields,trait_to_tree_mapping,\ verbose = verbose) label_conversion_fns =\ set_label_conversion_fns(remove_whitespace_from_labels=remove_whitespace_from_labels,\ replace_problematic_label_characters=replace_problematic_label_characters) value_conversion_fns = set_value_conversion_fns(replace_ambiguous_states=replace_ambiguous_states,\ convert_trait_floats_to_ints=convert_trait_floats_to_ints) #Apply both label and value converters to the trait table trait_table_fields = convert_trait_table_entries(\ trait_table_fields,\ value_conversion_fns = value_conversion_fns,\ label_conversion_fns = label_conversion_fns) #We now need to apply any formatting functions to the tree nodes as well, to ensure #that names are consistent between the two. if label_conversion_fns: input_tree = fix_tree_labels(input_tree, label_conversion_fns) #Then filter the trait table to include only tree tips if filter_table_by_tree_tips: if verbose: print "Filtering trait table ids to include only those that match tree ids...." 
trait_table_fields = filter_table_by_presence_in_tree(input_tree,\ trait_table_fields,delimiter=input_trait_table_delimiter) #if verbose: # print "Verifying that new trait table ids match tree:" # print "# of trait_table_lines: %i" %len(trait_table_lines) # all_tip_ids = [tip.Name for tip in input_tree.iterTips()] # print "example tree tip ids:",all_tip_ids[0:10] if filter_tree_by_table_entries: if verbose: print "filtering tree tips to match entries in trait table...." input_tree = filter_tree_tips_by_presence_in_table(input_tree,\ trait_table_fields,delimiter=input_trait_table_delimiter,\ verbose=verbose) if min_branch_length: if verbose: print "Setting a min branch length of %f throughout tree...." \ % min_branch_length input_tree = set_min_branch_length(input_tree,min_length = min_branch_length) if add_branch_length_to_root: if vebose: print "Adding a min branch length of %f to the root node...." \ % min_branch_length input_tree = add_branch_length_to_root(input_tree,root_name=input_tree.Name,\ root_length=min_branch_length) if verbose: print "Performing a final round of tree pruning to remove internal nodes with only one child...." input_tree.prune() #Format resulting trait table lines result_trait_table_lines = [header_line] result_trait_table_lines.extend([output_trait_table_delimiter.join(f) for f in trait_table_fields]) if verbose: print "Final reprocessing of trait table lines to remove trailing whitespace..." result_trait_table_lines =\ [line.strip() for line in result_trait_table_lines if line.strip()] if verbose: print "Done reformatting tree and trait table" return input_tree, result_trait_table_lines
def main(): # Parse input to get parameters option_parser, opts, args =\ parse_command_line_parameters(**script_info) tree_file = opts.input_tree trait_table_fp = opts.input_trait_table verbose = opts.verbose #Set output base file names trait_table_base = 'trait_table.tab' pruned_tree_base = 'pruned_tree.newick' reference_tree_base = 'reference_tree.newick' output_dir = make_output_dir(opts.output_dir,strict=False) output_table_fp = join(output_dir,trait_table_base) output_tree_fp = join(output_dir,pruned_tree_base) output_reference_tree_fp = join(output_dir,reference_tree_base) #Handle parameters with more complex defaults delimiter_map = {"space":" ","tab":"\t","comma":","} input_delimiter = delimiter_map[opts.input_table_delimiter] output_delimiter = delimiter_map[opts.output_table_delimiter] if verbose: print "Running with options:" print "\t%s:%s" %("Tree file",tree_file) print "\t%s:%s" %("Trait table",trait_table_fp) print "\t%s:%s" %("Output tree",output_tree_fp) print "\t%s:%s" %("Output reference tree",output_reference_tree_fp) print "\t%s:%s" %("Output trait table",output_table_fp) print "\t%s:%s" %("Add branch length to root",opts.add_branch_length_to_root) print "\t%s:%s" %("Convert to NEXUS?",opts.convert_to_nexus) print "\t%s:%s" %("Input trait table delimiter",opts.input_table_delimiter) print "\t%s:%s" %("Output trait table delimiter",opts.output_table_delimiter) # Begin reformatting root_name = "root" if opts.no_minimum_branch_length: min_branch_length = None else: min_branch_length = 0.0001 #Load inputs if verbose: print "Loading tree...." input_tree = DndParser(open(tree_file)) if verbose: print "Loading trait table..." trait_table = open(trait_table_fp,"U") trait_table_lines = trait_table.readlines() if not trait_table_lines: raise IOError("No lines could be loaded from file %s. Please check the input file." 
%trait_table_fp) #Get id mappings from mapping file if opts.tree_to_trait_mapping: if verbose: print "Loading tree to trait table mapping file..." mapping_file = open(opts.tree_to_trait_mapping,"U") trait_to_tree_mapping =\ make_id_mapping_dict(parse_id_mapping_file(mapping_file)) else: if verbose: print "No tree to trait mapping file specified. Assuming tree tip names and trait table names will match exactly." trait_to_tree_mapping = None # Call reformatting function using specified parameters # to get reference tree if opts.verbose: print """**BUILDING REFERENCE TREE (without respect to trait table)**""" new_reference_tree, not_useful_trait_table_lines =\ reformat_tree_and_trait_table(\ tree=input_tree,\ trait_table_lines = [],\ trait_to_tree_mapping = None,\ input_trait_table_delimiter= None,\ output_trait_table_delimiter= None,\ filter_table_by_tree_tips=False,\ convert_trait_floats_to_ints=False,\ filter_tree_by_table_entries=False,\ convert_to_bifurcating=True,\ add_branch_length_to_root=False,\ name_unnamed_nodes=True,\ min_branch_length=min_branch_length,\ verbose=opts.verbose) #Make a copy new_reference_tree_copy=new_reference_tree.deepcopy() if opts.verbose: print """**BUILDING PRUNED TREE AND TRAIT TABLE**""" # Call reformatting function using specified parameters new_tree, new_trait_table_lines = \ reformat_tree_and_trait_table(tree=new_reference_tree_copy,\ trait_table_lines = trait_table_lines,\ trait_to_tree_mapping = trait_to_tree_mapping,\ input_trait_table_delimiter= input_delimiter,\ output_trait_table_delimiter=output_delimiter,\ filter_table_by_tree_tips=True,\ convert_trait_floats_to_ints=False,\ filter_tree_by_table_entries=True,\ convert_to_bifurcating=False,\ add_branch_length_to_root=False,\ name_unnamed_nodes=False,\ min_branch_length=min_branch_length,\ verbose=opts.verbose) #Alter reference tree to only contain tips in OTU table (and of course trait table) if opts.limit_tree_to_otus_fp: if opts.verbose: print "Pruning reference tree to 
contain only tips in OTU table (and trait table)...." otu_table = open(opts.limit_tree_to_otus_fp,"U") otu_table_lines = otu_table.readlines() header_line,otu_table_fields =parse_trait_table(otu_table_lines,delimiter = input_delimiter,has_header=False) header_line,trait_table_fields =\ parse_trait_table(new_trait_table_lines,delimiter = input_delimiter) tips_to_keep = list(otu_table_fields) + list(trait_table_fields) tips_to_keep_in_tree = filter_table_by_presence_in_tree(new_reference_tree_copy,tips_to_keep) new_reference_tree = filter_tree_tips_by_presence_in_table(new_reference_tree_copy,\ tips_to_keep_in_tree,verbose=opts.verbose) if opts.verbose: print "Almost finished. Writing trees and trait table to files..." #Write results to files # Open output files output_trait_table_file = open(output_table_fp,"w+") output_tree_file = open(output_tree_fp,"w+") output_reference_tree_file = open(output_reference_tree_fp,"w+") #Output trait table file if opts.verbose: print "Writing trait table to:", output_table_fp output_trait_table_file.write("\n".join(new_trait_table_lines)) trait_table.close() output_trait_table_file.close() #Output tree file if opts.verbose: print "Writing pruned tree to:", output_tree_fp if opts.convert_to_nexus is True: lines = nexus_lines_from_tree(new_tree) output_tree_file.write("\n".join(map(str,lines))) else: output_tree_file.write(new_tree.getNewick(with_distances=True)) output_tree_file.close() if opts.verbose: print "Writing reference tree to:", output_reference_tree_fp #Output reference tree file output_reference_tree_file.write(new_reference_tree.getNewick(with_distances=True)) output_reference_tree_file.close()
def main(): """Generate test trees given parameters""" option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading trait table..." input_trait_table = open(opts.input_trait_table,"U") if opts.verbose: print "Loading tree..." #PicrustNode seems to run into very slow/memory intentsive perfromance... #tree = DndParser(open(opts.input_tree),constructor=PicrustNode) tree = DndParser(open(opts.input_tree)) if opts.verbose: print "Parsing trait table..." #Find which taxa are to be used in tests #(by default trait table taxa) trait_table_header,trait_table_fields = \ parse_trait_table(input_trait_table) if opts.verbose: print "Ensuring tree and trait table labels are formatted consistently..." label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose) fix_tree_labels(tree,label_conversion_fns) trait_table_fields = convert_trait_table_entries(trait_table_fields,\ value_conversion_fns = [],\ label_conversion_fns = label_conversion_fns) trait_table_fields = [t for t in trait_table_fields] print "Number of trait table fields with single quotes:",\ len([t for t in trait_table_fields if "'" in t[0]]) if opts.verbose: print "Making output directory..." 
make_output_dir(opts.output_dir) if opts.limit_to_tips: included_tips = opts.limit_to_tips.split(",") if opts.verbose: print "Limiting test datasets to %i tips: %s" %(len(included_tips),included_tips) else: included_tips = False method_fns =\ {"exclude_tips_by_distance":\ make_distance_based_exclusion_fn,\ "randomize_tip_labels_by_distance":\ make_distance_based_tip_label_randomizer } test_fn_factory = method_fns[opts.method] if opts.verbose: print "Setting tree modification method to:", opts.method print "(%s)" % test_fn_factory.__doc__ modify_tree = True if opts.suppress_tree_modification: if opts.verbose: print "Suppressing modification of tree when making test datasets" modify_tree = False if opts.verbose: print "Starting generation of test datsets" test_datasets = \ yield_genome_test_data_by_distance(tree,trait_table_fields,\ test_fn_factory,min_dist = opts.min_dist,\ max_dist=opts.max_dist,increment=opts.dist_increment,\ modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose) if opts.verbose: print "Writing files for test datasets" for curr_dist,test_tree,tip_to_predict,\ expected_traits,test_trait_table_fields in test_datasets: if included_tips is not False: if tip_to_predict not in included_tips: if opts.verbose: print "Skipping tip %s: limiting to tip(s): %s" %(tip_to_predict,included_tips) continue #Make a safe version of tip to predict # So odd characters like | don't mess up OS safe_tip_to_predict = "'%s'"%tip_to_predict #Write tree base_name = "--".join(map(str,["test_tree",opts.method,curr_dist])) curr_filepath = write_tree(opts.output_dir,base_name,test_tree,safe_tip_to_predict) if opts.verbose: print "Wrote test tree to: %s" % curr_filepath #Write expected trait table base_name = "--".join(map(str,["exp_traits",opts.method,curr_dist,safe_tip_to_predict])) exp_trait_table_lines = [trait_table_header] exp_trait_table_lines.append("\t".join(expected_traits)+"\n") #print "Expected_trait_table_lines:",exp_trait_table_lines 
filename=os.path.join(opts.output_dir,base_name) if opts.verbose: print "Writing expected trait table to:", filename f=open(filename,"w") f.write("".join(exp_trait_table_lines)) f.close() #Output a transposed, BIOM format expectation table for comparison with predict_traits output #NOTE: this is a clumsy way of getting the translated trait table # but more elegant, direct methods (directly feeding data to biom's table_factory) # weren't working for me readily. In the future, we should streamline this process # Leaving as is for now since this code is mostly for developers so speed/elegence # are probably not essential here. #Let the hackishness begin #Reload the tab-delimited trait table header, fields = parse_trait_table(open(filename,"U")) fields = [f for f in fields] #converts generator to list #Transpose table for .BIOM format so that Observation ids are KOs transposed_header, transposed_trait_table_lines =\ transpose_trait_table_fields(fields,header,\ id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t") #Eliminate newline in header trans_trait_table_lines = [transposed_header.strip()] trans_trait_table_lines.extend(["\t".join(r) for r in transposed_trait_table_lines]) trans_trait_table = '\n'.join(trans_trait_table_lines) #Write BIOM format expected trait table base_name = "--".join(map(str,["exp_biom_traits",opts.method,curr_dist,safe_tip_to_predict])) expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\ table_format = "tab-delimited") #print "Expected_trait_table_lines:",exp_trait_table_lines filename=os.path.join(opts.output_dir,base_name) if opts.verbose: print "Writing BIOM-format expected trait table to:", filename f=open(filename,"w") f.write(format_biom_table(expected_biom_table)) f.close() #Write test trait table test_trait_table_fields = test_trait_table_fields if expected_traits in test_trait_table_fields: test_trait_table_fields.remove(expected_traits) test_trait_table_lines = [trait_table_header] 
test_trait_table_lines.extend(["\t".join(r)+"\n" for r in test_trait_table_fields]) #print "Test_trait_table_lines:",test_trait_table_lines base_name = "--".join(map(str,["test_trait_table",opts.method,curr_dist,safe_tip_to_predict])) filename=os.path.join(opts.output_dir,base_name) if opts.verbose: print "Writing test trait table to:", filename f=open(filename,"w") f.write("".join(test_trait_table_lines)) f.close() if opts.verbose: print "Done generating test datasets"
def main(): """Generate test trees given parameters""" option_parser, opts, args =\ parse_command_line_parameters(**script_info) if opts.verbose: print "Loading trait table..." input_trait_table = open(opts.input_trait_table, "U") if opts.verbose: print "Loading tree..." #PicrustNode seems to run into very slow/memory intentsive perfromance... #tree = DndParser(open(opts.input_tree),constructor=PicrustNode) tree = DndParser(open(opts.input_tree)) if opts.verbose: print "Parsing trait table..." #Find which taxa are to be used in tests #(by default trait table taxa) trait_table_header,trait_table_fields = \ parse_trait_table(input_trait_table) if opts.verbose: print "Ensuring tree and trait table labels are formatted consistently..." label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose) fix_tree_labels(tree, label_conversion_fns) trait_table_fields = convert_trait_table_entries(trait_table_fields,\ value_conversion_fns = [],\ label_conversion_fns = label_conversion_fns) trait_table_fields = [t for t in trait_table_fields] print "Number of trait table fields with single quotes:",\ len([t for t in trait_table_fields if "'" in t[0]]) if opts.verbose: print "Making output directory..." 
make_output_dir(opts.output_dir) if opts.limit_to_tips: included_tips = opts.limit_to_tips.split(",") if opts.verbose: print "Limiting test datasets to %i tips: %s" % ( len(included_tips), included_tips) else: included_tips = False method_fns =\ {"exclude_tips_by_distance":\ make_distance_based_exclusion_fn,\ "randomize_tip_labels_by_distance":\ make_distance_based_tip_label_randomizer } test_fn_factory = method_fns[opts.method] if opts.verbose: print "Setting tree modification method to:", opts.method print "(%s)" % test_fn_factory.__doc__ modify_tree = True if opts.suppress_tree_modification: if opts.verbose: print "Suppressing modification of tree when making test datasets" modify_tree = False if opts.verbose: print "Starting generation of test datsets" test_datasets = \ yield_genome_test_data_by_distance(tree,trait_table_fields,\ test_fn_factory,min_dist = opts.min_dist,\ max_dist=opts.max_dist,increment=opts.dist_increment,\ modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose) if opts.verbose: print "Writing files for test datasets" for curr_dist,test_tree,tip_to_predict,\ expected_traits,test_trait_table_fields in test_datasets: if included_tips is not False: if tip_to_predict not in included_tips: if opts.verbose: print "Skipping tip %s: limiting to tip(s): %s" % ( tip_to_predict, included_tips) continue #Make a safe version of tip to predict # So odd characters like | don't mess up OS safe_tip_to_predict = "'%s'" % tip_to_predict #Write tree base_name = "--".join(map(str, ["test_tree", opts.method, curr_dist])) curr_filepath = write_tree(opts.output_dir, base_name, test_tree, safe_tip_to_predict) if opts.verbose: print "Wrote test tree to: %s" % curr_filepath #Write expected trait table base_name = "--".join( map(str, ["exp_traits", opts.method, curr_dist, safe_tip_to_predict])) exp_trait_table_lines = [trait_table_header] exp_trait_table_lines.append("\t".join(expected_traits) + "\n") #print 
"Expected_trait_table_lines:",exp_trait_table_lines filename = os.path.join(opts.output_dir, base_name) if opts.verbose: print "Writing expected trait table to:", filename f = open(filename, "w") f.write("".join(exp_trait_table_lines)) f.close() #Output a transposed, BIOM format expectation table for comparison with predict_traits output #NOTE: this is a clumsy way of getting the translated trait table # but more elegant, direct methods (directly feeding data to biom's table_factory) # weren't working for me readily. In the future, we should streamline this process # Leaving as is for now since this code is mostly for developers so speed/elegence # are probably not essential here. #Let the hackishness begin #Reload the tab-delimited trait table header, fields = parse_trait_table(open(filename, "U")) fields = [f for f in fields] #converts generator to list #Transpose table for .BIOM format so that Observation ids are KOs transposed_header, transposed_trait_table_lines =\ transpose_trait_table_fields(fields,header,\ id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t") #Eliminate newline in header trans_trait_table_lines = [transposed_header.strip()] trans_trait_table_lines.extend( ["\t".join(r) for r in transposed_trait_table_lines]) trans_trait_table = '\n'.join(trans_trait_table_lines) #Write BIOM format expected trait table base_name = "--".join( map(str, [ "exp_biom_traits", opts.method, curr_dist, safe_tip_to_predict ])) expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\ table_format = "tab-delimited") #print "Expected_trait_table_lines:",exp_trait_table_lines filename = os.path.join(opts.output_dir, base_name) if opts.verbose: print "Writing BIOM-format expected trait table to:", filename f = open(filename, "w") f.write(format_biom_table(expected_biom_table)) f.close() #Write test trait table test_trait_table_fields = test_trait_table_fields if expected_traits in test_trait_table_fields: 
test_trait_table_fields.remove(expected_traits) test_trait_table_lines = [trait_table_header] test_trait_table_lines.extend( ["\t".join(r) + "\n" for r in test_trait_table_fields]) #print "Test_trait_table_lines:",test_trait_table_lines base_name = "--".join( map(str, [ "test_trait_table", opts.method, curr_dist, safe_tip_to_predict ])) filename = os.path.join(opts.output_dir, base_name) if opts.verbose: print "Writing test trait table to:", filename f = open(filename, "w") f.write("".join(test_trait_table_lines)) f.close() if opts.verbose: print "Done generating test datasets"
def main(): # Parse input to get parameters option_parser, opts, args =\ parse_command_line_parameters(**script_info) tree_file = opts.input_tree trait_table_fp = opts.input_trait_table verbose = opts.verbose #Set output base file names trait_table_base = 'trait_table.tab' pruned_tree_base = 'pruned_tree.newick' reference_tree_base = 'reference_tree.newick' output_dir = make_output_dir(opts.output_dir, strict=False) output_table_fp = join(output_dir, trait_table_base) output_tree_fp = join(output_dir, pruned_tree_base) output_reference_tree_fp = join(output_dir, reference_tree_base) #Handle parameters with more complex defaults delimiter_map = {"space": " ", "tab": "\t", "comma": ","} input_delimiter = delimiter_map[opts.input_table_delimiter] output_delimiter = delimiter_map[opts.output_table_delimiter] if verbose: print "Running with options:" print "\t%s:%s" % ("Tree file", tree_file) print "\t%s:%s" % ("Trait table", trait_table_fp) print "\t%s:%s" % ("Output tree", output_tree_fp) print "\t%s:%s" % ("Output reference tree", output_reference_tree_fp) print "\t%s:%s" % ("Output trait table", output_table_fp) print "\t%s:%s" % ("Add branch length to root", opts.add_branch_length_to_root) print "\t%s:%s" % ("Convert to NEXUS?", opts.convert_to_nexus) print "\t%s:%s" % ("Input trait table delimiter", opts.input_table_delimiter) print "\t%s:%s" % ("Output trait table delimiter", opts.output_table_delimiter) # Begin reformatting root_name = "root" #format_for_bayestraits = True #TODO: this will become a new function in the bayestraits app controller #if format_for_bayestraits: # convert_to_nexus = True # convert_to_bifurcating = True # filter_table_by_tree_tips = True # filter_tree_by_table_entries = True # enforce_min_branch_length = True # convert_trait_floats_to_ints = True if opts.no_minimum_branch_length: min_branch_length = None else: min_branch_length = 0.0001 #Load inputs if verbose: print "Loading tree...." 
input_tree = DndParser(open(tree_file)) #input_tree =DndParser(open(tree_file), constructor=PicrustNode) #input_tree = load_picrust_tree(opts.input_tree,opts.verbose) if verbose: print "Loading trait table..." trait_table = open(trait_table_fp, "U") trait_table_lines = trait_table.readlines() if not trait_table_lines: raise IOError( "No lines could be loaded from file %s. Please check the input file." % trait_table_fp) #Get id mappings from mapping file if opts.tree_to_trait_mapping: if verbose: print "Loading tree to trait table mapping file..." mapping_file = open(opts.tree_to_trait_mapping, "U") trait_to_tree_mapping =\ make_id_mapping_dict(parse_id_mapping_file(mapping_file)) else: if verbose: print "No tree to trait mapping file specified. Assuming tree tip names and trait table names will match exactly." trait_to_tree_mapping = None # Call reformatting function using specified parameters # to get reference tree if opts.verbose: print """**BUILDING REFERENCE TREE (without respect to trait table)**""" new_reference_tree, not_useful_trait_table_lines =\ reformat_tree_and_trait_table(\ tree=input_tree,\ trait_table_lines = [],\ trait_to_tree_mapping = None,\ input_trait_table_delimiter= None,\ output_trait_table_delimiter= None,\ filter_table_by_tree_tips=False,\ convert_trait_floats_to_ints=False,\ filter_tree_by_table_entries=False,\ convert_to_bifurcating=True,\ add_branch_length_to_root=False,\ name_unnamed_nodes=True,\ min_branch_length=min_branch_length,\ verbose=opts.verbose) #Make a copy new_reference_tree_copy = new_reference_tree.deepcopy() if opts.verbose: print """**BUILDING PRUNED TREE AND TRAIT TABLE**""" # Call reformatting function using specified parameters new_tree, new_trait_table_lines = \ reformat_tree_and_trait_table(tree=new_reference_tree_copy,\ trait_table_lines = trait_table_lines,\ trait_to_tree_mapping = trait_to_tree_mapping,\ input_trait_table_delimiter= input_delimiter,\ output_trait_table_delimiter=output_delimiter,\ 
filter_table_by_tree_tips=True,\ convert_trait_floats_to_ints=False,\ filter_tree_by_table_entries=True,\ convert_to_bifurcating=False,\ add_branch_length_to_root=False,\ name_unnamed_nodes=False,\ min_branch_length=min_branch_length,\ verbose=opts.verbose) #Alter reference tree to only contain tips in OTU table (and of course trait table) if opts.limit_tree_to_otus_fp: if opts.verbose: print "Pruning reference tree to contain only tips in OTU table (and trait table)...." otu_table = open(opts.limit_tree_to_otus_fp, "U") otu_table_lines = otu_table.readlines() header_line, otu_table_fields = parse_trait_table( otu_table_lines, delimiter=input_delimiter, has_header=False) header_line,trait_table_fields =\ parse_trait_table(new_trait_table_lines,delimiter = input_delimiter) tips_to_keep = list(otu_table_fields) + list(trait_table_fields) tips_to_keep_in_tree = filter_table_by_presence_in_tree( new_reference_tree_copy, tips_to_keep) new_reference_tree = filter_tree_tips_by_presence_in_table(new_reference_tree_copy,\ tips_to_keep_in_tree,verbose=opts.verbose) if opts.verbose: print "Almost finished. Writing trees and trait table to files..." 
#Write results to files # Open output files output_trait_table_file = open(output_table_fp, "w+") output_tree_file = open(output_tree_fp, "w+") output_reference_tree_file = open(output_reference_tree_fp, "w+") #Output trait table file if opts.verbose: print "Writing trait table to:", output_table_fp output_trait_table_file.write("\n".join(new_trait_table_lines)) trait_table.close() output_trait_table_file.close() #Output tree file if opts.verbose: print "Writing pruned tree to:", output_tree_fp if opts.convert_to_nexus is True: lines = nexus_lines_from_tree(new_tree) output_tree_file.write("\n".join(map(str, lines))) else: output_tree_file.write(new_tree.getNewick(with_distances=True)) output_tree_file.close() if opts.verbose: print "Writing reference tree to:", output_reference_tree_fp #Output reference tree file output_reference_tree_file.write( new_reference_tree.getNewick(with_distances=True)) output_reference_tree_file.close()
def reformat_tree_and_trait_table(tree, trait_table_lines, trait_to_tree_mapping,
    input_trait_table_delimiter="\t", output_trait_table_delimiter="\t",
    filter_table_by_tree_tips=True, convert_trait_floats_to_ints=False,
    filter_tree_by_table_entries=True, convert_to_bifurcating=False,
    add_branch_length_to_root=False, name_unnamed_nodes=True,
    remove_whitespace_from_labels=True, replace_ambiguous_states=True,
    replace_problematic_label_characters=True, min_branch_length=0.0001,
    verbose=True):
    """Return a reformatted tree and the matching reformatted trait table lines

    tree - a PyCogent PhyloNode tree object

    trait_table_lines -- the lines of a trait table, where
      the rows are organisms and the columns are traits (e.g. gene counts).
      If empty, the header and the data are both treated as empty.

    trait_to_tree_mapping -- a dict keyed by trait table ids, with
      values of tree ids.  If provided (non-empty), trait table ids will be
      mapped to tree ids.

    input_trait_table_delimiter -- delimiter used to split the input
      trait table lines into fields.

    output_trait_table_delimiter -- delimiter used to join fields when
      building the output trait table lines.

    filter_table_by_tree_tips -- if True, remove trait table rows that
      don't map to ids on the tree.

    convert_trait_floats_to_ints -- if True, convert floating point
      values in trait table cells to integers.

    filter_tree_by_table_entries -- if True, save only the subtree that
      encompasses organisms in the trait table (equivalent to removing
      all tips in the tree that don't map to the trait table).

    convert_to_bifurcating -- if True, ensure that the tree is fully
      bifurcating by resolving polytomies with very short branches.

    add_branch_length_to_root -- if True, ensure that the root node has
      a minimum branch length (min_branch_length is used as that length).

    name_unnamed_nodes -- if True, name unnamed nodes in the tree.
      (Useful for ensuring internal nodes can be consistently identified
      in both the reference and pruned trees.)

    remove_whitespace_from_labels -- if True, replace whitespace in
      organism labels with underscores.

    replace_ambiguous_states -- if True, replace various strings
      representing ambiguous character states, as well as '-1' or -1
      (used by IMG to represent a lack of data) with 0 values.

    replace_problematic_label_characters -- if True, replace ':' and ';'
      in the results with '_', and remove double quotes.  (AncSR methods
      like ace can't handle these characters in organism labels.)

    min_branch_length -- set the minimum branch length for all edges in
      the tree.  A false value (0 or None) skips this step.

    verbose -- if True, print progress messages to stdout.

    Returns a two-item tuple: (reformatted tree, list of trait table
    lines).  The returned lines start with the header line, are stripped
    of surrounding whitespace, and exclude empty lines.

    This function combines the various reformatting functions in the
    library into a catch-all reformatter.

    TODO: This function is monolithic, so despite the individual
    parts being tested separately, it probably needs to be broken
    down into several modular parts.  This would need to be done
    with care however, as the order of steps matters quite a bit.
    """

    input_tree = tree

    # Parse lines to fields once
    if trait_table_lines:
        if verbose:
            print("Parsing trait table....")
        header_line, trait_table_fields = parse_trait_table(
            trait_table_lines, delimiter=input_trait_table_delimiter)
    else:
        if verbose:
            print("Found no trait table lines. Setting data and header to empty")
        trait_table_fields = []
        header_line = ''

    # Tree reformatting
    if convert_to_bifurcating:
        if verbose:
            print("Converting tree to bifurcating....")

        # Large trees can hit the maximum recursion depth here, so work
        # around the issue by temporarily raising the recursion limit.
        # The try/finally guarantees the previous limit is restored even
        # if bifurcating() raises.
        old_recursion_limit = getrecursionlimit()
        setrecursionlimit(50000)
        try:
            input_tree = input_tree.bifurcating()  # Required by most ancSR programs
        finally:
            setrecursionlimit(old_recursion_limit)

    # Name unnamed nodes
    if name_unnamed_nodes:
        if verbose:
            print("Naming unnamed nodes in the reference tree....")
        input_tree = make_internal_nodes_unique(input_tree)
        check_node_labels(input_tree, verbose=verbose)

    # Map trait table ids to tree ids
    if trait_to_tree_mapping:
        if verbose:
            print("Remapping trait table ids to match tree ids....")
        trait_table_fields = remap_trait_table_organisms(
            trait_table_fields, trait_to_tree_mapping, verbose=verbose)

    label_conversion_fns = set_label_conversion_fns(
        remove_whitespace_from_labels=remove_whitespace_from_labels,
        replace_problematic_label_characters=replace_problematic_label_characters)

    value_conversion_fns = set_value_conversion_fns(
        replace_ambiguous_states=replace_ambiguous_states,
        convert_trait_floats_to_ints=convert_trait_floats_to_ints)

    # Apply both label and value converters to the trait table
    trait_table_fields = convert_trait_table_entries(
        trait_table_fields,
        value_conversion_fns=value_conversion_fns,
        label_conversion_fns=label_conversion_fns)

    # We now need to apply any formatting functions to the tree nodes as
    # well, to ensure that names are consistent between the two.
    if label_conversion_fns:
        input_tree = fix_tree_labels(input_tree, label_conversion_fns)

    # Then filter the trait table to include only tree tips
    if filter_table_by_tree_tips:
        if verbose:
            print("Filtering trait table ids to include only those that match tree ids....")
        trait_table_fields = filter_table_by_presence_in_tree(
            input_tree, trait_table_fields,
            delimiter=input_trait_table_delimiter)

    if filter_tree_by_table_entries:
        if verbose:
            print("filtering tree tips to match entries in trait table....")
        input_tree = filter_tree_tips_by_presence_in_table(
            input_tree, trait_table_fields,
            delimiter=input_trait_table_delimiter,
            verbose=verbose)

    if min_branch_length:
        if verbose:
            print("Setting a min branch length of %f throughout tree...."
                  % min_branch_length)
        input_tree = set_min_branch_length(input_tree,
                                           min_length=min_branch_length)

    if add_branch_length_to_root:
        if verbose:
            print("Adding a min branch length of %f to the root node...."
                  % min_branch_length)
        # The boolean parameter `add_branch_length_to_root` shadows the
        # helper function of the same name inside this scope, so calling
        # the bare name would try to call a bool.  Fetch the helper from
        # the module namespace instead.
        # NOTE(review): presumes a module-level function named
        # add_branch_length_to_root exists in this file -- confirm.
        add_root_branch_length = globals()["add_branch_length_to_root"]
        input_tree = add_root_branch_length(input_tree,
                                            root_name=input_tree.Name,
                                            root_length=min_branch_length)

    if verbose:
        print("Performing a final round of tree pruning to remove internal nodes with only one child....")
    input_tree.prune()

    # Format resulting trait table lines
    result_trait_table_lines = [header_line]
    result_trait_table_lines.extend(
        [output_trait_table_delimiter.join(f) for f in trait_table_fields])

    if verbose:
        print("Final reprocessing of trait table lines to remove trailing whitespace...")
    # Drop empty lines (including an empty header) and strip the rest
    result_trait_table_lines = [
        line.strip() for line in result_trait_table_lines if line.strip()]

    if verbose:
        print("Done reformatting tree and trait table")

    return input_tree, result_trait_table_lines