def load_tab_delimited_trait_table(trait_table_fp, verbose=False):
    """Load a tab delimited trait table for picrust.

    trait_table_fp -- path to the tab-delimited trait table file.

    verbose -- if True, print progress messages, including how many rows
      have a single quote in their first field.

    Returns a (header, fields) tuple: the header as returned by
    parse_trait_table, and a list of the data rows after the label
    conversion functions from set_label_conversion_fns were applied.
    """
    # The file is kept open until the rows have been materialised into a
    # list: the parsing/conversion helpers may hand back rows lazily, in
    # which case closing the file earlier would truncate the table.
    with open(trait_table_fp, "U") as input_trait_table:
        if verbose:
            print("Parsing trait table...")
        # Find which taxa are to be used in tests
        # (by default trait table taxa)
        trait_table_header, raw_fields = parse_trait_table(input_trait_table)

        label_conversion_fns = set_label_conversion_fns(verbose=verbose)
        trait_table_fields = list(convert_trait_table_entries(
            raw_fields,
            value_conversion_fns=[],
            label_conversion_fns=label_conversion_fns))

    if verbose:
        n_quoted = sum(1 for row in trait_table_fields if "'" in row[0])
        print("Number of trait table fields with single quotes: %i" % n_quoted)

    return trait_table_header, trait_table_fields
def load_tab_delimited_trait_table(trait_table_fp, verbose=False):
    """Load a tab delimited trait table for picrust.

    trait_table_fp -- path to the tab-delimited trait table file.

    verbose -- if True, print progress messages, including how many rows
      have a single quote in their first field.

    Returns a (header, fields) tuple: the header as returned by
    parse_trait_table, and a list of the data rows after the label
    conversion functions from set_label_conversion_fns were applied.
    """
    # Keep the file open until every row has been pulled into a list; the
    # helpers below may produce rows lazily from the open handle.
    with open(trait_table_fp, "U") as input_trait_table:
        if verbose:
            print("Parsing trait table...")
        #Find which taxa are to be used in tests
        #(by default trait table taxa)
        trait_table_header, parsed_fields = \
            parse_trait_table(input_trait_table)

        label_conversion_fns = set_label_conversion_fns(verbose=verbose)
        converted_fields = convert_trait_table_entries(
            parsed_fields,
            value_conversion_fns=[],
            label_conversion_fns=label_conversion_fns)

        trait_table_fields = list(converted_fields)

    if verbose:
        quoted_count = sum(1 for row in trait_table_fields if "'" in row[0])
        print("Number of trait table fields with single quotes: %i"
              % quoted_count)

    return trait_table_header, trait_table_fields
def reformat_tree_and_trait_table(tree, trait_table_lines, trait_to_tree_mapping,
    input_trait_table_delimiter="\t", output_trait_table_delimiter="\t",
    filter_table_by_tree_tips=True, convert_trait_floats_to_ints=False,
    filter_tree_by_table_entries=True, convert_to_bifurcating=False,
    add_branch_length_to_root=False, name_unnamed_nodes=True,
    remove_whitespace_from_labels=True, replace_ambiguous_states=True,
    replace_problematic_label_characters=True, min_branch_length=0.0001,
    verbose=True):
    """Return a reformatted tree and the matching reformatted trait table lines

    tree - a PyCogent PhyloNode tree object

    trait_table_lines -- the lines of a trait table, where
      the rows are organisms and the columns are traits (e.g. gene counts).
      If empty, the header and data are treated as empty.

    trait_to_tree_mapping -- a dict keyed by trait table ids, with
      values of tree ids.   If provided, trait table ids will be mapped to
      tree ids

    input_trait_table_delimiter -- delimiter used to parse trait_table_lines.

    output_trait_table_delimiter -- delimiter used to join the fields of each
      output trait table row.

    filter_table_by_tree_tips -- if True, remove trait table rows that don't
      map to ids on the tree

    convert_trait_floats_to_ints -- if True, convert floating point values in
      trait table cells to integers.

    filter_tree_by_table_entries -- if True, save only the subtree that
      encompasses organisms in the trait table (equivalent to removing all
      tips in the tree that don't map to the trait table)

    convert_to_bifurcating -- if True, ensure that the tree is fully
      bifurcating by resolving polytomies with very short branches.

    add_branch_length_to_root -- if True, ensure that the root node has a
      minimum branch length (min_branch_length is used as the root length).

    name_unnamed_nodes -- if True, name unnamed nodes in the tree.   (Useful
      for ensuring internal nodes can be consistently identified in both the
      reference and pruned trees)

    remove_whitespace_from_labels -- if True, replace whitespace in organism
      labels with underscores

    replace_ambiguous_states -- if True, replace various strings representing
      ambiguous character states, as well as '-1' or -1 (used by IMG to
      represent a lack of data) with 0 values.

    replace_problematic_label_characters -- if True, replace ':' and ';' in
      the results with '_', and remove double quotes.  (AncSR methods like
      ace can't handle these characters in organism labels)

    min_branch_length -- set the minimum branch length for all edges in the
      tree.  A false value (None, 0) skips this step.

    verbose -- if True, print a progress message before each step.

    Returns a (tree, trait_table_lines) tuple.  trait_table_lines is a list
    of stripped, non-empty strings: the header line followed by one line per
    remaining trait table row.

    This function combines the various reformatting functions in the
    library into a catch-all reformatter.

    TODO: This function is monolithic, so despite the individual
    parts being tested separately, it probably needs to be broken
    down into several modular parts.  This would need to be done
    with care however, as the order of steps matters quite a bit.
    """
    input_tree = tree

    #Parse lines to fields once
    if trait_table_lines:
        if verbose:
            print("Parsing trait table....")
        header_line, trait_table_fields = parse_trait_table(
            trait_table_lines, delimiter=input_trait_table_delimiter)
    else:
        if verbose:
            print("Found no trait table lines. Setting data and header to empty")
        trait_table_fields = []
        header_line = ''

    # Tree reformatting
    if convert_to_bifurcating:
        if verbose:
            print("Converting tree to bifurcating....")

        #Large trees can hit the maximum recursion depth, so work around
        #this with a temporarily raised limit.  try/finally guarantees the
        #previous limit is restored even if bifurcating() raises.
        old_recursion_limit = getrecursionlimit()
        setrecursionlimit(50000)
        try:
            input_tree = input_tree.bifurcating() # Required by most ancSR programs
        finally:
            setrecursionlimit(old_recursion_limit)

    #Name unnamed nodes
    if name_unnamed_nodes:
        if verbose:
            print("Naming unnamed nodes in the reference tree....")
        input_tree = make_internal_nodes_unique(input_tree)
        check_node_labels(input_tree, verbose=verbose)

    #map trait table ids to tree ids
    if trait_to_tree_mapping:
        if verbose:
            print("Remapping trait table ids to match tree ids....")
        trait_table_fields = remap_trait_table_organisms(
            trait_table_fields, trait_to_tree_mapping, verbose=verbose)

    label_conversion_fns = set_label_conversion_fns(
        remove_whitespace_from_labels=remove_whitespace_from_labels,
        replace_problematic_label_characters=replace_problematic_label_characters)

    value_conversion_fns = set_value_conversion_fns(
        replace_ambiguous_states=replace_ambiguous_states,
        convert_trait_floats_to_ints=convert_trait_floats_to_ints)

    #Apply both label and value converters to the trait table
    trait_table_fields = convert_trait_table_entries(
        trait_table_fields,
        value_conversion_fns=value_conversion_fns,
        label_conversion_fns=label_conversion_fns)

    #We now need to apply any formatting functions to the tree nodes as well,
    #to ensure that names are consistent between the two.
    if label_conversion_fns:
        input_tree = fix_tree_labels(input_tree, label_conversion_fns)

    #Then filter the trait table to include only tree tips
    if filter_table_by_tree_tips:
        if verbose:
            print("Filtering trait table ids to include only those that match tree ids....")
        trait_table_fields = filter_table_by_presence_in_tree(
            input_tree, trait_table_fields,
            delimiter=input_trait_table_delimiter)

    #The rows are read by the tree filter below and again when the output
    #lines are built.  The helpers above may return one-shot iterators
    #(NOTE(review): not visible here -- confirm), so pin the rows in a list.
    trait_table_fields = list(trait_table_fields)

    if filter_tree_by_table_entries:
        if verbose:
            print("filtering tree tips to match entries in trait table....")
        input_tree = filter_tree_tips_by_presence_in_table(
            input_tree, trait_table_fields,
            delimiter=input_trait_table_delimiter,
            verbose=verbose)

    if min_branch_length:
        if verbose:
            print("Setting a min branch length of %f throughout tree...."
                  % min_branch_length)
        input_tree = set_min_branch_length(input_tree, min_length=min_branch_length)

    if add_branch_length_to_root:
        if verbose:
            print("Adding a min branch length of %f to the root node...."
                  % min_branch_length)
        #The boolean parameter shadows the helper function of the same name,
        #so calling the bare name would try to call a bool.  Fetch the helper
        #from the module namespace instead.
        #NOTE(review): assumes a module-level add_branch_length_to_root
        #function exists, as the original call implies -- confirm.
        add_root_length_fn = globals()["add_branch_length_to_root"]
        input_tree = add_root_length_fn(input_tree, root_name=input_tree.Name,
                                        root_length=min_branch_length)

    if verbose:
        print("Performing a final round of tree pruning to remove internal nodes with only one child....")

    input_tree.prune()

    #Format resulting trait table lines
    result_trait_table_lines = [header_line]
    result_trait_table_lines.extend(
        output_trait_table_delimiter.join(f) for f in trait_table_fields)

    if verbose:
        print("Final reprocessing of trait table lines to remove trailing whitespace...")
    #Strip every line and drop the ones that end up empty (this also removes
    #the placeholder header when there was no trait table).
    stripped_lines = [line.strip() for line in result_trait_table_lines]
    result_trait_table_lines = [line for line in stripped_lines if line]

    if verbose:
        print("Done reformatting tree and trait table")

    return input_tree, result_trait_table_lines
def main():
    """Reformat a tree and trait table and write the results to files.

    Reads the command line options via parse_command_line_parameters, then
    writes three files into the output directory: the reformatted trait
    table ('trait_table.tab'), the tree pruned to the trait table
    ('pruned_tree.newick', optionally as NEXUS lines) and the reference tree
    ('reference_tree.newick').

    Raises IOError if the trait table file contains no lines.
    """
    # Parse input to get parameters
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    tree_file = opts.input_tree
    trait_table_fp = opts.input_trait_table
    verbose = opts.verbose

    #Set output base file names
    trait_table_base = 'trait_table.tab'
    pruned_tree_base = 'pruned_tree.newick'
    reference_tree_base = 'reference_tree.newick'

    output_dir = make_output_dir(opts.output_dir, strict=False)
    output_table_fp = join(output_dir, trait_table_base)
    output_tree_fp = join(output_dir, pruned_tree_base)
    output_reference_tree_fp = join(output_dir, reference_tree_base)

    #Handle parameters with more complex defaults
    delimiter_map = {"space": " ", "tab": "\t", "comma": ","}
    input_delimiter = delimiter_map[opts.input_table_delimiter]
    output_delimiter = delimiter_map[opts.output_table_delimiter]

    if verbose:
        print("Running with options:")
        print("\t%s:%s" % ("Tree file", tree_file))
        print("\t%s:%s" % ("Trait table", trait_table_fp))
        print("\t%s:%s" % ("Output tree", output_tree_fp))
        print("\t%s:%s" % ("Output reference tree", output_reference_tree_fp))
        print("\t%s:%s" % ("Output trait table", output_table_fp))
        print("\t%s:%s" % ("Add branch length to root", opts.add_branch_length_to_root))
        print("\t%s:%s" % ("Convert to NEXUS?", opts.convert_to_nexus))
        print("\t%s:%s" % ("Input trait table delimiter", opts.input_table_delimiter))
        print("\t%s:%s" % ("Output trait table delimiter", opts.output_table_delimiter))

    # Begin reformatting

    if opts.no_minimum_branch_length:
        min_branch_length = None
    else:
        min_branch_length = 0.0001

    #Load inputs
    if verbose:
        print("Loading tree....")

    with open(tree_file) as tree_handle:
        input_tree = DndParser(tree_handle)

    if verbose:
        print("Loading trait table...")
    with open(trait_table_fp, "U") as trait_table:
        trait_table_lines = trait_table.readlines()
    if not trait_table_lines:
        raise IOError("No lines could be loaded from file %s. Please check the input file." % trait_table_fp)

    #Get id mappings from mapping file
    if opts.tree_to_trait_mapping:
        if verbose:
            print("Loading tree to trait table mapping file...")

        # Build the mapping while the file is still open, in case the parser
        # reads from the handle lazily.
        with open(opts.tree_to_trait_mapping, "U") as mapping_file:
            trait_to_tree_mapping = \
                make_id_mapping_dict(parse_id_mapping_file(mapping_file))
    else:
        if verbose:
            print("No tree to trait mapping file specified.  Assuming tree tip names and trait table names will match exactly.")
        trait_to_tree_mapping = None

    # Call reformatting function using specified parameters
    # to get reference tree
    if verbose:
        print("**BUILDING REFERENCE TREE (without respect to trait table)**")

    new_reference_tree, not_useful_trait_table_lines = \
        reformat_tree_and_trait_table(
            tree=input_tree,
            trait_table_lines=[],
            trait_to_tree_mapping=None,
            input_trait_table_delimiter=None,
            output_trait_table_delimiter=None,
            filter_table_by_tree_tips=False,
            convert_trait_floats_to_ints=False,
            filter_tree_by_table_entries=False,
            convert_to_bifurcating=True,
            add_branch_length_to_root=False,
            name_unnamed_nodes=True,
            min_branch_length=min_branch_length,
            verbose=verbose)

    #Make a copy
    new_reference_tree_copy = new_reference_tree.deepcopy()

    if verbose:
        print("**BUILDING PRUNED TREE AND TRAIT TABLE**")
    # Call reformatting function using specified parameters
    new_tree, new_trait_table_lines = \
        reformat_tree_and_trait_table(
            tree=new_reference_tree_copy,
            trait_table_lines=trait_table_lines,
            trait_to_tree_mapping=trait_to_tree_mapping,
            input_trait_table_delimiter=input_delimiter,
            output_trait_table_delimiter=output_delimiter,
            filter_table_by_tree_tips=True,
            convert_trait_floats_to_ints=False,
            filter_tree_by_table_entries=True,
            convert_to_bifurcating=False,
            add_branch_length_to_root=False,
            name_unnamed_nodes=False,
            min_branch_length=min_branch_length,
            verbose=verbose)

    #Alter reference tree to only contain tips in OTU table (and of course trait table)
    if opts.limit_tree_to_otus_fp:
        if verbose:
            print("Pruning reference tree to contain only tips in OTU table (and trait table)....")
        with open(opts.limit_tree_to_otus_fp, "U") as otu_table:
            otu_table_lines = otu_table.readlines()
        _otu_header, otu_table_fields = parse_trait_table(
            otu_table_lines, delimiter=input_delimiter, has_header=False)
        # new_trait_table_lines were passed to the reformatter with
        # output_delimiter as the output delimiter, so they must be split on
        # that delimiter rather than the input one.
        _trait_header, trait_table_fields = parse_trait_table(
            new_trait_table_lines, delimiter=output_delimiter)

        tips_to_keep = list(otu_table_fields) + list(trait_table_fields)
        tips_to_keep_in_tree = filter_table_by_presence_in_tree(
            new_reference_tree_copy, tips_to_keep)
        new_reference_tree = filter_tree_tips_by_presence_in_table(
            new_reference_tree_copy, tips_to_keep_in_tree, verbose=verbose)

    if verbose:
        print("Almost finished. Writing trees and trait table to files...")
    #Write results to files; each file is opened only for the duration of
    #its own write so that handles are closed even if a later step fails.

    #Output trait table file
    if verbose:
        print("Writing trait table to: %s" % output_table_fp)

    with open(output_table_fp, "w") as output_trait_table_file:
        output_trait_table_file.write("\n".join(new_trait_table_lines))

    #Output tree file
    if verbose:
        print("Writing pruned tree to: %s" % output_tree_fp)

    if opts.convert_to_nexus is True:
        lines = nexus_lines_from_tree(new_tree)
        pruned_tree_text = "\n".join(map(str, lines))
    else:
        pruned_tree_text = new_tree.getNewick(with_distances=True)

    with open(output_tree_fp, "w") as output_tree_file:
        output_tree_file.write(pruned_tree_text)

    #Output reference tree file
    if verbose:
        print("Writing reference tree to: %s" % output_reference_tree_fp)
    with open(output_reference_tree_fp, "w") as output_reference_tree_file:
        output_reference_tree_file.write(
            new_reference_tree.getNewick(with_distances=True))
# Example #5
# 0
def main():
    """Generate test trees given parameters

    Loads the tree and trait table named in the command line options, makes
    their labels consistent, and then, for every test dataset yielded by
    yield_genome_test_data_by_distance, writes into opts.output_dir:
      - the test tree (via write_tree)
      - the expected traits of the held-out tip (tab-delimited)
      - the same expectation transposed, in BIOM format
      - the test trait table with the held-out tip's row removed
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    if opts.verbose:
        print("Loading trait table...")

    # The trait table handle stays open until its rows have been collected
    # into a list, since the parser may read from the handle lazily.
    with open(opts.input_trait_table, "U") as input_trait_table:
        if opts.verbose:
            print("Loading tree...")
        #PicrustNode seems to run into very slow/memory intensive performance...
        with open(opts.input_tree) as tree_handle:
            tree = DndParser(tree_handle)

        if opts.verbose:
            print("Parsing trait table...")
        #Find which taxa are to be used in tests
        #(by default trait table taxa)
        trait_table_header, trait_table_fields = \
            parse_trait_table(input_trait_table)

        if opts.verbose:
            print("Ensuring tree and trait table labels are formatted consistently...")

        label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)

        # Keep the returned tree so the relabelled tree is used whether or
        # not fix_tree_labels works in place.
        tree = fix_tree_labels(tree, label_conversion_fns)

        trait_table_fields = list(convert_trait_table_entries(
            trait_table_fields,
            value_conversion_fns=[],
            label_conversion_fns=label_conversion_fns))

    n_quoted = sum(1 for row in trait_table_fields if "'" in row[0])
    print("Number of trait table fields with single quotes: %i" % n_quoted)

    if opts.verbose:
        print("Making output directory...")
    make_output_dir(opts.output_dir)

    if opts.limit_to_tips:
        included_tips = opts.limit_to_tips.split(",")
        if opts.verbose:
            print("Limiting test datasets to %i tips: %s" % (len(included_tips), included_tips))
    else:
        # False (not None) is the "no limit" marker handed to the generator
        included_tips = False

    method_fns = {
        "exclude_tips_by_distance": make_distance_based_exclusion_fn,
        "randomize_tip_labels_by_distance": make_distance_based_tip_label_randomizer,
    }

    test_fn_factory = method_fns[opts.method]

    if opts.verbose:
        print("Setting tree modification method to: %s" % opts.method)
        print("(%s)" % test_fn_factory.__doc__)

    modify_tree = True
    if opts.suppress_tree_modification:
        if opts.verbose:
            print("Suppressing modification of tree when making test datasets")
        modify_tree = False

    if opts.verbose:
        print("Starting generation of test datsets")

    test_datasets = yield_genome_test_data_by_distance(
        tree, trait_table_fields, test_fn_factory,
        min_dist=opts.min_dist, max_dist=opts.max_dist,
        increment=opts.dist_increment, modify_tree=modify_tree,
        limit_to_tips=included_tips, verbose=opts.verbose)

    if opts.verbose:
        print("Writing files for test  datasets")

    for curr_dist, test_tree, tip_to_predict, \
            expected_traits, test_trait_table_fields in test_datasets:

        if included_tips is not False and tip_to_predict not in included_tips:
            if opts.verbose:
                print("Skipping tip %s: limiting to tip(s): %s" % (tip_to_predict, included_tips))
            continue

        #Make a safe version of tip to predict
        # So odd characters like | don't mess up OS
        safe_tip_to_predict = "'%s'" % tip_to_predict

        #Write tree
        base_name = "--".join(map(str, ["test_tree", opts.method, curr_dist]))
        curr_filepath = write_tree(opts.output_dir, base_name, test_tree, safe_tip_to_predict)
        if opts.verbose:
            print("Wrote test tree to: %s" % curr_filepath)

        #Write expected trait table
        base_name = "--".join(map(str, ["exp_traits", opts.method, curr_dist, safe_tip_to_predict]))

        exp_trait_table_lines = [trait_table_header,
                                 "\t".join(expected_traits) + "\n"]
        filename = os.path.join(opts.output_dir, base_name)
        if opts.verbose:
            print("Writing expected trait table to: %s" % filename)

        with open(filename, "w") as exp_out:
            exp_out.write("".join(exp_trait_table_lines))

        #Output a transposed, BIOM format expectation table for comparison with predict_traits output

        #NOTE: this is a clumsy way of getting the translated trait table
        # but more elegant, direct methods (directly feeding data to biom's table_factory)
        # weren't working for me readily.   In the future, we should streamline this process
        # Leaving as is for now since this code is mostly for developers so speed/elegance
        # are probably not essential here.

        #Reload the tab-delimited trait table that was just written
        with open(filename, "U") as exp_in:
            header, fields = parse_trait_table(exp_in)
            fields = list(fields)  #converts generator to list

        #Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines = \
            transpose_trait_table_fields(
                fields, header, id_row_idx=0,
                input_header_delimiter="\t", output_delimiter="\t")

        #Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend("\t".join(r) for r in transposed_trait_table_lines)
        trans_trait_table = '\n'.join(trans_trait_table_lines)

        #Write BIOM format expected trait table
        base_name = "--".join(map(str, ["exp_biom_traits", opts.method, curr_dist, safe_tip_to_predict]))

        expected_biom_table = parse_table_to_biom(
            trans_trait_table.split('\n'), table_format="tab-delimited")

        filename = os.path.join(opts.output_dir, base_name)
        if opts.verbose:
            print("Writing BIOM-format expected trait table to: %s" % filename)

        with open(filename, "w") as biom_out:
            biom_out.write(format_biom_table(expected_biom_table))

        #Write test trait table (without the row of the tip being predicted)
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend("\t".join(r) + "\n" for r in test_trait_table_fields)

        base_name = "--".join(map(str, ["test_trait_table", opts.method, curr_dist, safe_tip_to_predict]))
        filename = os.path.join(opts.output_dir, base_name)

        if opts.verbose:
            print("Writing test trait table to: %s" % filename)

        with open(filename, "w") as test_out:
            test_out.write("".join(test_trait_table_lines))

    if opts.verbose:
        print("Done generating test datasets")
# Example #6
# 0
def main():
    """Generate test trees given parameters

    Loads the tree and trait table named in the command line options, makes
    their labels consistent, and then, for every test dataset yielded by
    yield_genome_test_data_by_distance, writes into opts.output_dir:
      - the test tree (via write_tree)
      - the expected traits of the held-out tip (tab-delimited)
      - the same expectation transposed, in BIOM format
      - the test trait table with the held-out tip's row removed
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    if opts.verbose:
        print("Loading trait table...")

    # Hold the trait table open until its rows are in a list; the parser may
    # read rows from the handle lazily.
    with open(opts.input_trait_table, "U") as input_trait_table:
        if opts.verbose:
            print("Loading tree...")
        #PicrustNode seems to run into very slow/memory intensive performance...
        with open(opts.input_tree) as tree_handle:
            tree = DndParser(tree_handle)

        if opts.verbose:
            print("Parsing trait table...")
        #Find which taxa are to be used in tests
        #(by default trait table taxa)
        trait_table_header, trait_table_fields = \
            parse_trait_table(input_trait_table)

        if opts.verbose:
            print("Ensuring tree and trait table labels are formatted consistently...")

        label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)

        # Rebind to the returned tree so the relabelled tree is used whether
        # or not fix_tree_labels modifies its argument in place.
        tree = fix_tree_labels(tree, label_conversion_fns)

        trait_table_fields = list(convert_trait_table_entries(
            trait_table_fields,
            value_conversion_fns=[],
            label_conversion_fns=label_conversion_fns))

    quoted_count = sum(1 for row in trait_table_fields if "'" in row[0])
    print("Number of trait table fields with single quotes: %i" % quoted_count)

    if opts.verbose:
        print("Making output directory...")
    make_output_dir(opts.output_dir)

    if opts.limit_to_tips:
        included_tips = opts.limit_to_tips.split(",")
        if opts.verbose:
            print("Limiting test datasets to %i tips: %s" % (
                len(included_tips), included_tips))
    else:
        # False (not None) is the "no limit" marker handed to the generator
        included_tips = False

    method_fns = {
        "exclude_tips_by_distance": make_distance_based_exclusion_fn,
        "randomize_tip_labels_by_distance": make_distance_based_tip_label_randomizer,
    }

    test_fn_factory = method_fns[opts.method]

    if opts.verbose:
        print("Setting tree modification method to: %s" % opts.method)
        print("(%s)" % test_fn_factory.__doc__)

    modify_tree = True
    if opts.suppress_tree_modification:
        if opts.verbose:
            print("Suppressing modification of tree when making test datasets")
        modify_tree = False

    if opts.verbose:
        print("Starting generation of test datsets")

    test_datasets = yield_genome_test_data_by_distance(
        tree, trait_table_fields, test_fn_factory,
        min_dist=opts.min_dist, max_dist=opts.max_dist,
        increment=opts.dist_increment, modify_tree=modify_tree,
        limit_to_tips=included_tips, verbose=opts.verbose)

    if opts.verbose:
        print("Writing files for test  datasets")

    for curr_dist, test_tree, tip_to_predict, \
            expected_traits, test_trait_table_fields in test_datasets:

        if included_tips is not False and tip_to_predict not in included_tips:
            if opts.verbose:
                print("Skipping tip %s: limiting to tip(s): %s" % (
                    tip_to_predict, included_tips))
            continue

        #Make a safe version of tip to predict
        # So odd characters like | don't mess up OS
        safe_tip_to_predict = "'%s'" % tip_to_predict

        #Write tree
        base_name = "--".join(map(str, ["test_tree", opts.method, curr_dist]))
        curr_filepath = write_tree(opts.output_dir, base_name, test_tree,
                                   safe_tip_to_predict)
        if opts.verbose:
            print("Wrote test tree to: %s" % curr_filepath)

        #Write expected trait table
        base_name = "--".join(
            map(str,
                ["exp_traits", opts.method, curr_dist, safe_tip_to_predict]))

        exp_trait_table_lines = [trait_table_header,
                                 "\t".join(expected_traits) + "\n"]
        filename = os.path.join(opts.output_dir, base_name)
        if opts.verbose:
            print("Writing expected trait table to: %s" % filename)

        with open(filename, "w") as exp_out:
            exp_out.write("".join(exp_trait_table_lines))

        #Output a transposed, BIOM format expectation table for comparison with predict_traits output

        #NOTE: this is a clumsy way of getting the translated trait table
        # but more elegant, direct methods (directly feeding data to biom's table_factory)
        # weren't working for me readily.   In the future, we should streamline this process
        # Leaving as is for now since this code is mostly for developers so speed/elegance
        # are probably not essential here.

        #Reload the tab-delimited trait table that was just written
        with open(filename, "U") as exp_in:
            header, fields = parse_trait_table(exp_in)
            fields = list(fields)  #converts generator to list

        #Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines = \
            transpose_trait_table_fields(
                fields, header, id_row_idx=0,
                input_header_delimiter="\t", output_delimiter="\t")

        #Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend(
            "\t".join(r) for r in transposed_trait_table_lines)
        trans_trait_table = '\n'.join(trans_trait_table_lines)

        #Write BIOM format expected trait table
        base_name = "--".join(
            map(str, [
                "exp_biom_traits", opts.method, curr_dist, safe_tip_to_predict
            ]))

        expected_biom_table = parse_table_to_biom(
            trans_trait_table.split('\n'), table_format="tab-delimited")

        filename = os.path.join(opts.output_dir, base_name)
        if opts.verbose:
            print("Writing BIOM-format expected trait table to: %s" % filename)

        with open(filename, "w") as biom_out:
            biom_out.write(format_biom_table(expected_biom_table))

        #Write test trait table (without the row of the tip being predicted)
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend(
            "\t".join(r) + "\n" for r in test_trait_table_fields)

        base_name = "--".join(
            map(str, [
                "test_trait_table", opts.method, curr_dist, safe_tip_to_predict
            ]))
        filename = os.path.join(opts.output_dir, base_name)

        if opts.verbose:
            print("Writing test trait table to: %s" % filename)

        with open(filename, "w") as test_out:
            test_out.write("".join(test_trait_table_lines))

    if opts.verbose:
        print("Done generating test datasets")
def main():
    """Reformat a tree and trait table per the command-line options.

    Steps, in order:
      1. Parse the command-line options and derive the three output paths
         inside the output directory (``trait_table.tab``,
         ``pruned_tree.newick`` and ``reference_tree.newick``).
      2. Load the input tree, the trait table lines and, if given, the
         tree-to-trait id mapping file.
      3. Build a reference tree from the input tree alone (bifurcating,
         with unnamed nodes named, no trait table involved).
      4. Build the pruned tree and the filtered trait table from a copy of
         that reference tree.
      5. If an OTU table was supplied, restrict the reference tree to tips
         found in the OTU table or the trait table.
      6. Write the trait table, the pruned tree (NEXUS or Newick) and the
         reference tree (Newick).

    Raises:
        IOError: if no lines could be read from the trait table file.
        KeyError: if a delimiter option is not "space", "tab" or "comma".
    """
    # Parse input to get parameters
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    tree_file = opts.input_tree
    trait_table_fp = opts.input_trait_table
    verbose = opts.verbose

    # Set output file paths
    output_dir = make_output_dir(opts.output_dir, strict=False)
    output_table_fp = join(output_dir, 'trait_table.tab')
    output_tree_fp = join(output_dir, 'pruned_tree.newick')
    output_reference_tree_fp = join(output_dir, 'reference_tree.newick')

    # Handle parameters with more complex defaults
    delimiter_map = {"space": " ", "tab": "\t", "comma": ","}
    input_delimiter = delimiter_map[opts.input_table_delimiter]
    output_delimiter = delimiter_map[opts.output_table_delimiter]

    if verbose:
        print("Running with options:")
        print("\t%s:%s" % ("Tree file", tree_file))
        print("\t%s:%s" % ("Trait table", trait_table_fp))
        print("\t%s:%s" % ("Output tree", output_tree_fp))
        print("\t%s:%s" % ("Output reference tree", output_reference_tree_fp))
        print("\t%s:%s" % ("Output trait table", output_table_fp))
        print("\t%s:%s" % ("Add branch length to root",
                           opts.add_branch_length_to_root))
        print("\t%s:%s" % ("Convert to NEXUS?", opts.convert_to_nexus))
        print("\t%s:%s" % ("Input trait table delimiter",
                           opts.input_table_delimiter))
        print("\t%s:%s" % ("Output trait table delimiter",
                           opts.output_table_delimiter))

    # Begin reformatting
    # TODO: BayesTraits-specific formatting will become a new function in
    # the bayestraits app controller.

    if opts.no_minimum_branch_length:
        min_branch_length = None
    else:
        min_branch_length = 0.0001

    # Load inputs
    if verbose:
        print("Loading tree....")

    # The handle is only needed while the tree is being parsed.
    with open(tree_file) as tree_handle:
        input_tree = DndParser(tree_handle)

    if verbose:
        print("Loading trait table...")
    with open(trait_table_fp, "U") as trait_table:
        trait_table_lines = trait_table.readlines()
    if not trait_table_lines:
        raise IOError(
            "No lines could be loaded from file %s. Please check the input file."
            % trait_table_fp)

    # Get id mappings from mapping file
    if opts.tree_to_trait_mapping:
        if verbose:
            print("Loading tree to trait table mapping file...")

        # Build the dict while the file is still open, in case the parser
        # reads from the handle lazily.
        with open(opts.tree_to_trait_mapping, "U") as mapping_file:
            trait_to_tree_mapping = make_id_mapping_dict(
                parse_id_mapping_file(mapping_file))
    else:
        if verbose:
            print("No tree to trait mapping file specified.  Assuming tree tip names and trait table names will match exactly.")
        trait_to_tree_mapping = None

    # Call reformatting function using specified parameters
    # to get reference tree
    if verbose:
        print("**BUILDING REFERENCE TREE (without respect to trait table)**")

    new_reference_tree, not_useful_trait_table_lines = \
        reformat_tree_and_trait_table(
            tree=input_tree,
            trait_table_lines=[],
            trait_to_tree_mapping=None,
            input_trait_table_delimiter=None,
            output_trait_table_delimiter=None,
            filter_table_by_tree_tips=False,
            convert_trait_floats_to_ints=False,
            filter_tree_by_table_entries=False,
            convert_to_bifurcating=True,
            add_branch_length_to_root=False,
            name_unnamed_nodes=True,
            min_branch_length=min_branch_length,
            verbose=verbose)

    # Work on a copy so the reference tree itself is left as built above.
    new_reference_tree_copy = new_reference_tree.deepcopy()

    if verbose:
        print("**BUILDING PRUNED TREE AND TRAIT TABLE**")
    # Call reformatting function using specified parameters
    new_tree, new_trait_table_lines = \
        reformat_tree_and_trait_table(
            tree=new_reference_tree_copy,
            trait_table_lines=trait_table_lines,
            trait_to_tree_mapping=trait_to_tree_mapping,
            input_trait_table_delimiter=input_delimiter,
            output_trait_table_delimiter=output_delimiter,
            filter_table_by_tree_tips=True,
            convert_trait_floats_to_ints=False,
            filter_tree_by_table_entries=True,
            convert_to_bifurcating=False,
            add_branch_length_to_root=False,
            name_unnamed_nodes=False,
            min_branch_length=min_branch_length,
            verbose=verbose)

    # Alter reference tree to only contain tips in OTU table
    # (and of course trait table)
    if opts.limit_tree_to_otus_fp:
        if verbose:
            print("Pruning reference tree to contain only tips in OTU table (and trait table)....")
        with open(opts.limit_tree_to_otus_fp, "U") as otu_table:
            otu_table_lines = otu_table.readlines()
        header_line, otu_table_fields = parse_trait_table(
            otu_table_lines, delimiter=input_delimiter, has_header=False)
        # The reformatted rows were joined with the *output* delimiter, so
        # they must be split on that delimiter, not the input one.
        header_line, trait_table_fields = parse_trait_table(
            new_trait_table_lines, delimiter=output_delimiter)

        tips_to_keep = list(otu_table_fields) + list(trait_table_fields)
        tips_to_keep_in_tree = filter_table_by_presence_in_tree(
            new_reference_tree_copy, tips_to_keep)
        new_reference_tree = filter_tree_tips_by_presence_in_table(
            new_reference_tree_copy, tips_to_keep_in_tree, verbose=verbose)

    if verbose:
        print("Almost finished. Writing trees and trait table to files...")

    # Write results to files; each file is opened only for the duration of
    # its own write so that it is closed even if the write fails.
    if verbose:
        print("Writing trait table to: %s" % (output_table_fp,))
    with open(output_table_fp, "w") as output_trait_table_file:
        output_trait_table_file.write("\n".join(new_trait_table_lines))

    if verbose:
        print("Writing pruned tree to: %s" % (output_tree_fp,))
    with open(output_tree_fp, "w") as output_tree_file:
        if opts.convert_to_nexus is True:
            lines = nexus_lines_from_tree(new_tree)
            output_tree_file.write("\n".join(map(str, lines)))
        else:
            output_tree_file.write(new_tree.getNewick(with_distances=True))

    if verbose:
        print("Writing reference tree to: %s" % (output_reference_tree_fp,))
    with open(output_reference_tree_fp, "w") as output_reference_tree_file:
        output_reference_tree_file.write(
            new_reference_tree.getNewick(with_distances=True))
Example #8
0
def reformat_tree_and_trait_table(
        tree, trait_table_lines, trait_to_tree_mapping,
        input_trait_table_delimiter="\t", output_trait_table_delimiter="\t",
        filter_table_by_tree_tips=True, convert_trait_floats_to_ints=False,
        filter_tree_by_table_entries=True, convert_to_bifurcating=False,
        add_branch_length_to_root=False, name_unnamed_nodes=True,
        remove_whitespace_from_labels=True, replace_ambiguous_states=True,
        replace_problematic_label_characters=True, min_branch_length=0.0001,
        verbose=True):
    """Return a reformatted tree and the matching reformatted trait table lines

    tree - a PyCogent PhyloNode tree object

    trait_table_lines -- the lines of a trait table, where
      the rows are organisms and the columns are traits (e.g. gene counts).
      If empty, the table is treated as having no header and no rows.

    trait_to_tree_mapping -- a dict keyed by trait table ids, with
      values of tree ids.   If provided, trait table ids will be mapped to
      tree ids

    input_trait_table_delimiter -- delimiter used to split the input
      trait table lines into fields

    output_trait_table_delimiter -- delimiter used to join the fields of
      each returned trait table row

    filter_table_by_tree_tips -- if True, remove trait table rows that
      don't map to ids on the tree

    convert_trait_floats_to_ints -- if True, convert floating point values
      in trait table cells to integers.

    filter_tree_by_table_entries -- if True, save only the subtree that
      encompasses organisms in the trait table. (equivalent to removing all
      tips in the tree that don't map to the trait table)

    convert_to_bifurcating -- if True, ensure that the tree is fully
      bifurcating by resolving polytomies with very short branches.

    add_branch_length_to_root -- if True, ensure that the root node has a
      minimum branch length

    name_unnamed_nodes -- if True, name unnamed nodes in the tree.
      (Useful for ensuring internal nodes can be consistently identified
      in both the reference and pruned trees)

    remove_whitespace_from_labels -- if True, replace whitespace in
      organism labels with underscores

    replace_ambiguous_states -- if True, replace various strings
      representing ambiguous character states, as well as '-1' or -1
      (used by IMG to represent a lack of data) with 0 values.

    replace_problematic_label_characters -- if True, replace ':' and ';'
      in the results with '_', and remove double quotes. (AncSR methods
      like ace can't handle these characters in organism labels)

    min_branch_length -- set the minimum branch length for all edges in
      the tree.  A false value (None or 0) skips this step.

    verbose -- if True, print progress messages.

    Returns a two-item tuple: (reformatted tree, list of trait table lines).
    The lines start with the header line; each is stripped of surrounding
    whitespace and lines that are empty after stripping are dropped.

    This function combines the various reformatting functions in the
    library into a catch-all reformatter.

    TODO: This function is monolithic, so despite the individual
    parts being tested separately, it probably needs to be broken
    down into several modular parts.  This would need to be done
    with care however, as the order of steps matters quite a bit.
    """

    input_tree = tree

    # Parse lines to fields once
    if trait_table_lines:
        if verbose:
            print("Parsing trait table....")
        header_line, trait_table_fields = parse_trait_table(
            trait_table_lines, delimiter=input_trait_table_delimiter)
    else:
        if verbose:
            print("Found no trait table lines. Setting data and header to empty")
        trait_table_fields = []
        header_line = ''

    # Tree reformatting
    if convert_to_bifurcating:
        if verbose:
            print("Converting tree to bifurcating....")

        # Large trees hit the maximum recursion depth, so raise the limit
        # temporarily; try/finally guarantees the old limit is restored
        # even if the conversion fails.
        old_recursion_limit = getrecursionlimit()
        setrecursionlimit(50000)
        try:
            # Required by most ancSR programs
            input_tree = input_tree.bifurcating()
        finally:
            setrecursionlimit(old_recursion_limit)

    # Name unnamed nodes
    if name_unnamed_nodes:
        if verbose:
            print("Naming unnamed nodes in the reference tree....")
        input_tree = make_internal_nodes_unique(input_tree)
        check_node_labels(input_tree, verbose=verbose)

    # Map trait table ids to tree ids
    if trait_to_tree_mapping:
        if verbose:
            print("Remapping trait table ids to match tree ids....")

        trait_table_fields = remap_trait_table_organisms(
            trait_table_fields, trait_to_tree_mapping, verbose=verbose)

    label_conversion_fns = set_label_conversion_fns(
        remove_whitespace_from_labels=remove_whitespace_from_labels,
        replace_problematic_label_characters=replace_problematic_label_characters)

    value_conversion_fns = set_value_conversion_fns(
        replace_ambiguous_states=replace_ambiguous_states,
        convert_trait_floats_to_ints=convert_trait_floats_to_ints)

    # Apply both label and value converters to the trait table.  The rows
    # are materialised into a list because they may be traversed more than
    # once below (by the filters and again when formatting the output),
    # which would silently yield nothing on a second pass if the converter
    # returned a one-shot iterator.
    trait_table_fields = list(convert_trait_table_entries(
        trait_table_fields,
        value_conversion_fns=value_conversion_fns,
        label_conversion_fns=label_conversion_fns))

    # We now need to apply any formatting functions to the tree nodes as
    # well, to ensure that names are consistent between the two.
    if label_conversion_fns:
        input_tree = fix_tree_labels(input_tree, label_conversion_fns)

    # Then filter the trait table to include only tree tips
    if filter_table_by_tree_tips:
        if verbose:
            print("Filtering trait table ids to include only those that match tree ids....")
        trait_table_fields = filter_table_by_presence_in_tree(
            input_tree, trait_table_fields,
            delimiter=input_trait_table_delimiter)

    if filter_tree_by_table_entries:
        if verbose:
            print("filtering tree tips to match entries in trait table....")
        input_tree = filter_tree_tips_by_presence_in_table(
            input_tree, trait_table_fields,
            delimiter=input_trait_table_delimiter, verbose=verbose)

    if min_branch_length:
        if verbose:
            print("Setting a min branch length of %f throughout tree...."
                  % min_branch_length)
        input_tree = set_min_branch_length(input_tree,
                                           min_length=min_branch_length)

    if add_branch_length_to_root:
        if verbose:
            print("Adding a min branch length of %f to the root node...."
                  % min_branch_length)
        # Inside this function the name `add_branch_length_to_root` is the
        # boolean flag, which hides the helper function of the same name.
        # Fetch the helper from the module namespace instead of calling
        # the flag.
        root_length_helper = globals().get("add_branch_length_to_root")
        if not callable(root_length_helper):
            raise NameError(
                "add_branch_length_to_root helper function is not defined")
        input_tree = root_length_helper(input_tree,
                                        root_name=input_tree.Name,
                                        root_length=min_branch_length)

    if verbose:
        print("Performing a final round of tree pruning to remove internal nodes with only one child....")

    input_tree.prune()

    # Format resulting trait table lines
    result_trait_table_lines = [header_line]
    result_trait_table_lines.extend(
        [output_trait_table_delimiter.join(f) for f in trait_table_fields])

    if verbose:
        print("Final reprocessing of trait table lines to remove trailing whitespace...")
    result_trait_table_lines = \
        [line.strip() for line in result_trait_table_lines if line.strip()]

    if verbose:
        print("Done reformatting tree and trait table")

    return input_tree, result_trait_table_lines