Example #1
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        option_parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    table = load_table(opts.input_fp)

    if h5py.is_hdf5(opts.input_fp):
        # metadata are not deserializing correctly. Duct tape it.
        update_d = {}
        for i, md in zip(table.ids(axis='observation'),
                         table.metadata(axis='observation')):
            update_d[i] = {k: json.loads(v[0]) for k, v in md.items()}
        table.add_metadata(update_d, axis='observation')

    result = table.collapse(collapse_f,
                            axis='observation',
                            one_to_many=True,
                            norm=False,
                            one_to_many_md_key=opts.metadata_category)

    if (opts.format_tab_delimited):
        f = open(opts.output_fp, 'w')
        f.write(
            result.to_tsv(header_key=opts.metadata_category,
                          header_value=opts.metadata_category,
                          metadata_formatter=lambda s: '; '.join(s)))
        f.close()
    else:
        format_fs = {opts.metadata_category: vlen_list_of_str_formatter}
        write_biom_table(result, opts.output_fp, format_fs=format_fs)
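
# A hedged, pure-Python sketch of the idea behind make_collapse_f and the
# table.collapse(..., one_to_many=True, norm=False) call above: every
# observation may map to several values of a metadata category, and its counts
# are added to each matching group. Names below are illustrative only and do
# not reproduce the biom-format collapse API.
def collapse_counts_by_category_sketch(counts, obs_metadata, category):
    """counts: {obs_id: [per-sample counts]}; obs_metadata: {obs_id: {category: [values]}}."""
    collapsed = {}
    for obs_id, row in counts.items():
        for value in obs_metadata[obs_id].get(category, []):
            bucket = collapsed.setdefault(value, [0] * len(row))
            for i, c in enumerate(row):
                bucket[i] += c
    return collapsed

# e.g. an observation annotated with two KEGG pathways contributes its counts
# to both collapsed rows:
# collapse_counts_by_category_sketch({'OTU1': [3, 0]},
#                                    {'OTU1': {'KEGG_Pathways': ['PathA', 'PathB']}},
#                                    'KEGG_Pathways')
# -> {'PathA': [3, 0], 'PathB': [3, 0]}
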
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        option_parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    table = load_table(opts.input_fp)

    if h5py.is_hdf5(opts.input_fp):
        # metadata are not deserializing correctly. Duct tape it.
        update_d = {}
        for i, md in zip(table.ids(axis='observation'),
                         table.metadata(axis='observation')):
            update_d[i] = {k: json.loads(v[0]) for k, v in md.items()}
        table.add_metadata(update_d, axis='observation')

    result = table.collapse(collapse_f, axis='observation', one_to_many=True,
                            norm=False,
                            one_to_many_md_key=opts.metadata_category)

    if opts.format_tab_delimited:
        f = open(opts.output_fp, 'w')
        f.write(result.to_tsv(header_key=opts.metadata_category,
                              header_value=opts.metadata_category,
                              metadata_formatter=lambda s: '; '.join(s)))
        f.close()
    else:
        format_fs = {opts.metadata_category: vlen_list_of_str_formatter}
        write_biom_table(result, opts.output_fp, format_fs=format_fs)
Example #3
def write_metagenome_to_file(
    predicted_metagenome,
    output_fp,
    tab_delimited=False,
    verbose_filetype_message="metagenome prediction",
    verbose=False,
):
    """Write a BIOM Table object to a file, creating the directory if needed
    predicted_metagenome -- a BIOM table object
    output_fp -- the filepath to write the output
    tab_delimited -- if False, write in BIOM format, otherwise write as a tab-delimited file
    verbose_filetype_message -- description of the output used in verbose messages
    verbose -- if True, output verbose info to stdout
    """

    if verbose:
        print "Writing %s results to output file: %s" % (verbose_filetype_message, output_fp)

    make_output_dir_for_file(output_fp)
    if tab_delimited:
        # peek at the first observation to decide which observation metadata
        # to output in tab-delimited format
        (obs_val, obs_id, obs_metadata) = predicted_metagenome.iter(axis="observation").next()

        # see if there is a metadata field that contains the "Description"
        # (e.g. KEGG_Description or COG_Description)
        h = re.compile(".*Description")
        metadata_names = filter(h.search, obs_metadata.keys())
        if metadata_names:
            # use the "Description" field we found
            metadata_name = metadata_names[0]
        elif obs_metadata.keys():
            # if no "Description" metadata then just output the first
            # observation metadata
            metadata_name = (obs_metadata.keys())[0]
        else:
            # if no observation metadata then don't output any
            metadata_name = None

        open(output_fp, "w").write(
            predicted_metagenome.to_tsv(
                header_key=metadata_name, header_value=metadata_name, metadata_formatter=biom_meta_to_string
            )
        )
    else:
        # output in BIOM format
        format_fs = {
            "KEGG_Description": picrust_formatter,
            "COG_Description": picrust_formatter,
            "KEGG_Pathways": picrust_formatter,
            "COG_Category": picrust_formatter,
        }
        write_biom_table(predicted_metagenome, output_fp, format_fs=format_fs)
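
# biom_meta_to_string is not shown in this snippet. As a hedged sketch of its
# role: to_tsv calls metadata_formatter with each observation's value for
# header_key, which may be a list (e.g. several pathway strings) or a plain
# string, and it must return a single string for the extra TSV column. The
# stand-in below is illustrative, not PICRUSt's implementation.
def biom_meta_to_string_sketch(md_value):
    if isinstance(md_value, (list, tuple)):
        return '; '.join(str(v) for v in md_value)
    return str(md_value)
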
Example #4
def write_metagenome_to_file(predicted_metagenome,output_fp,\
    tab_delimited=False,verbose_filetype_message="metagenome prediction",\
    verbose=False):
    """Write a BIOM Table object to a file, creating the directory if needed
    predicted_metagenome -- a BIOM table object
    output_fp -- the filepath to write the output
    tab_delimited -- if False, write in BIOM format, otherwise write as a tab-delimited file
    verbose_filetype_message -- description of the output used in verbose messages
    verbose -- if True, output verbose info to stdout
    """

    if verbose:
        print "Writing %s results to output file: %s"\
          %(verbose_filetype_message,output_fp)

    make_output_dir_for_file(output_fp)
    if tab_delimited:
        #peek at the first observation to decide which observation metadata
        #to output in tab-delimited format
        (obs_val,obs_id,obs_metadata)=\
          predicted_metagenome.iter(axis='observation').next()

        #see if there is a metadata field that contains the "Description"
        #(e.g. KEGG_Description or COG_Description)
        h = re.compile('.*Description')
        metadata_names = filter(h.search, obs_metadata.keys())
        if metadata_names:
            #use the "Description" field we found
            metadata_name = metadata_names[0]
        elif (obs_metadata.keys()):
            #if no "Description" metadata then just output the first
            #observation metadata
            metadata_name = (obs_metadata.keys())[0]
        else:
            #if no observation metadata then don't output any
            metadata_name = None

        open(output_fp,'w').write(predicted_metagenome.to_tsv(\
          header_key=metadata_name,header_value=metadata_name,metadata_formatter=biom_meta_to_string))
    else:
        #output in BIOM format
        format_fs = {
            'KEGG_Description': picrust_formatter,
            'COG_Description': picrust_formatter,
            'KEGG_Pathways': picrust_formatter,
            'COG_Category': picrust_formatter
        }
        write_biom_table(predicted_metagenome, output_fp, format_fs=format_fs)
Example #5
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading sequencing depth table: ", opts.input_seq_depth_file
    scaling_factors = {}
    for sample_id, depth in parse_seq_count_file(open(opts.input_seq_depth_file, "U")):
        scaling_factors[sample_id] = depth

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    genome_table = load_table(opts.input_count_table)

    if opts.verbose:
        print "Scaling the metagenome..."

    scaled_metagenomes = scale_metagenomes(genome_table, scaling_factors)

    if opts.verbose:
        print "Writing results to output file: ", opts.output_metagenome_table

    make_output_dir_for_file(opts.output_metagenome_table)
    write_biom_table(scaled_metagenomes, opts.output_metagenome_table)
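
# parse_seq_count_file is not shown; the loop above only relies on it yielding
# (sample_id, depth) pairs. A minimal sketch, assuming a two-column,
# tab-delimited depth file (an assumption for illustration, not PICRUSt's
# documented format):
def parse_seq_count_file_sketch(lines):
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        sample_id, depth = line.split('\t')[:2]
        yield sample_id, float(depth)
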
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        option_parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    table = load_table(opts.input_fp)
    result = table.collapse(collapse_f, axis='observation', one_to_many=True,
                            norm=False,
                            one_to_many_md_key=opts.metadata_category)


    if opts.format_tab_delimited:
        f = open(opts.output_fp,'w')
        f.write(result.to_tsv(header_key=opts.metadata_category,
                              header_value=opts.metadata_category,
                              metadata_formatter=lambda s: '; '.join(s)))
        f.close()
    else:
        format_fs = {opts.metadata_category: vlen_list_of_str_formatter}
        write_biom_table(result, opts.output_fp, format_fs=format_fs)
Example #7
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading sequencing depth table: ",opts.input_seq_depth_file
    scaling_factors = {}
    for sample_id,depth in parse_seq_count_file(open(opts.input_seq_depth_file,'U')):
        scaling_factors[sample_id]=depth

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    genome_table = load_table(opts.input_count_table)

    if opts.verbose:
        print "Scaling the metagenome..."

    scaled_metagenomes = scale_metagenomes(genome_table,scaling_factors)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table

    make_output_dir_for_file(opts.output_metagenome_table)
    write_biom_table(scaled_metagenomes, opts.output_metagenome_table)
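
# A hedged numpy sketch of the per-sample scaling step that scale_metagenomes
# performs above. The real PICRUSt function operates on a biom Table and may
# normalize differently; this only illustrates applying one scaling factor per
# sample column.
import numpy as np

def scale_columns_by_factor_sketch(counts, sample_ids, scaling_factors):
    """counts: genes x samples array; scaling_factors: {sample_id: factor}."""
    factors = np.array([scaling_factors[s] for s in sample_ids], dtype=float)
    return counts * factors  # broadcasts across the sample (last) axis

# e.g. scale_columns_by_factor_sketch(np.array([[1., 2.], [0., 3.]]),
#                                     ['S1', 'S2'], {'S1': 100, 'S2': 200})
# -> array([[ 100.,  400.], [   0.,  600.]])
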
Example #8
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    #if we specify we want NSTI only then we have to calculate it first
    if opts.output_accuracy_metrics_only:
        opts.calculate_accuracy_metrics = True

    if opts.verbose:
        print "Loading tree from file:", opts.tree

    # Load Tree
    #tree = LoadTree(opts.tree)
    tree = load_picrust_tree(opts.tree, opts.verbose)

    table_headers = []
    traits = {}
    #load the asr trait table using the previous list of functions to order the arrays
    if opts.reconstructed_trait_table:
        table_headers,traits =\
                update_trait_dict_from_file(opts.reconstructed_trait_table)

        #Only load confidence intervals on the reconstruction
        #If we actually have ASR values in the analysis
        if opts.reconstruction_confidence:
            if opts.verbose:
                print "Loading ASR confidence data from file:",\
                opts.reconstruction_confidence
                print "Assuming confidence data is of type:", opts.confidence_format

            asr_confidence_output = open(opts.reconstruction_confidence)
            asr_min_vals,asr_max_vals, params,column_mapping =\
              parse_asr_confidence_output(asr_confidence_output,format=opts.confidence_format)
            if 'sigma' in params:
                brownian_motion_parameter = params['sigma'][0]
            else:
                brownian_motion_parameter = None

            if opts.verbose:
                print "Done. Loaded %i confidence interval values." % (
                    len(asr_max_vals))
                print "Brownian motion parameter:", brownian_motion_parameter
        else:
            brownian_motion_parameter = None

    #load the trait table into a dict with organism names as keys and trait arrays as values
    table_headers,genome_traits =\
            update_trait_dict_from_file(opts.observed_trait_table,table_headers)

    #Combine the trait tables overwriting the asr ones if they exist in the genome trait table.
    traits.update(genome_traits)

    # Specify the attribute where we'll store the reconstructions
    trait_label = "Reconstruction"

    if opts.verbose:
        print "Assigning traits to tree..."

    # Decorate tree using the traits
    tree = assign_traits_to_tree(traits, tree, trait_label=trait_label)

    if opts.reconstruction_confidence:
        if opts.verbose:
            print "Assigning trait confidence intervals to tree..."
        tree = assign_traits_to_tree(asr_min_vals,tree,\
            trait_label="lower_bound")

        tree = assign_traits_to_tree(asr_max_vals,tree,\
            trait_label="upper_bound")

        if brownian_motion_parameter is None:

            if opts.verbose:
                print "No Brownian motion parameters loaded. Inferring these from 95% confidence intervals..."
            brownian_motion_parameter = get_brownian_motion_param_from_confidence_intervals(tree,\
                     upper_bound_trait_label="upper_bound",\
                     lower_bound_trait_label="lower_bound",\
                     trait_label=trait_label,\
                     confidence=0.95)
            if opts.verbose:
                print "Inferred the following rate parameters:", brownian_motion_parameter
    if opts.verbose:
        print "Collecting list of nodes to predict..."

    #Start by predicting all tip nodes.
    nodes_to_predict = [tip.Name for tip in tree.tips()]

    if opts.verbose:
        print "Found %i nodes to predict." % len(nodes_to_predict)

    if opts.limit_predictions_to_organisms:
        organism_id_str = opts.limit_predictions_to_organisms
        ok_organism_ids = organism_id_str.split(',')
        ok_organism_ids = [n.strip() for n in ok_organism_ids]
        for f in set_label_conversion_fns(True, True):
            ok_organism_ids = [f(i) for i in ok_organism_ids]

        if opts.verbose:
            print "Limiting predictions to user-specified ids:",\
              ",".join(ok_organism_ids)

        if not ok_organism_ids:
            raise RuntimeError(\
              "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\
              % opts.limit_predictions_to_organisms)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in ok_organism_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the command line and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0]))

        if opts.verbose:
            print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" % (
                len(nodes_to_predict))

    if opts.limit_predictions_by_otu_table:
        if opts.verbose:
            print "Limiting predictions to ids in user-specified OTU table:",\
              opts.limit_predictions_by_otu_table
        otu_table = open(opts.limit_predictions_by_otu_table, "U")
        #Parse OTU table for ids

        otu_ids =\
          extract_ids_from_table(otu_table.readlines(),delimiter="\t")

        if not otu_ids:
            raise RuntimeError(\
              "Found no valid ids in input OTU table: %s.  Is the path correct?"\
              % opts.limit_predictions_by_otu_table)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in otu_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by OTU table resulted in an empty set of nodes to predict.   Are the OTU ids and tree ids in the same format?  Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0]))

        if opts.verbose:
            print "After filtering by OTU table, %i nodes remain to be predicted" % (
                len(nodes_to_predict))

    # Calculate accuracy of PICRUSt for the given tree, sequenced genomes
    # and set of nodes to predict
    accuracy_metrics = ['NSTI']
    accuracy_metric_results = None
    if opts.calculate_accuracy_metrics:
        if opts.verbose:
            print "Calculating accuracy metrics: %s" % (
                [",".join(accuracy_metrics)])
        accuracy_metric_results = {}
        if 'NSTI' in accuracy_metrics:

            nsti_result,min_distances =\
                calc_nearest_sequenced_taxon_index(tree,\
                limit_to_tips = nodes_to_predict,\
                trait_label = trait_label, verbose=opts.verbose)

            #accuracy_metric_results['NSTI'] = nsti_result
            for organism in min_distances.keys():
                accuracy_metric_results[organism] = {
                    'NSTI': min_distances[organism]
                }

            if opts.verbose:
                print "NSTI:", nsti_result

        if opts.output_accuracy_metrics_only:
            #Write accuracy metrics to file
            if opts.verbose:
                print "Writing accuracy metrics to file:", opts.output_accuracy_metrics_only

            f = open(opts.output_accuracy_metrics_only, 'w+')
            f.write("metric\torganism\tvalue\n")
            lines = []
            for organism in accuracy_metric_results.keys():
                for metric in accuracy_metric_results[organism].keys():
                    lines.append('\t'.join([metric,organism,\
                      str(accuracy_metric_results[organism][metric])])+'\n')
            f.writelines(sorted(lines))
            f.close()
            exit()

    if opts.verbose:
        print "Generating predictions using method:", opts.prediction_method

    if opts.weighting_method == 'exponential':
        #For now, use exponential weighting
        weight_fn = make_neg_exponential_weight_fn(e)

    variances = None  #Overwritten by methods that calc variance
    confidence_intervals = None  #Overwritten by methods that calc variance

    if opts.prediction_method == 'asr_and_weighting':
        # Perform predictions using reconstructed ancestral states

        if opts.reconstruction_confidence:
            predictions,variances,confidence_intervals =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              lower_bound_trait_label="lower_bound",\
              upper_bound_trait_label="upper_bound",\
              calc_confidence_intervals = True,\
              brownian_motion_parameter=brownian_motion_parameter,\
              weight_fn =weight_fn,verbose=opts.verbose)

        else:
            predictions =\
             predict_traits_from_ancestors(tree,nodes_to_predict,\
             trait_label=trait_label,\
             weight_fn =weight_fn,verbose=opts.verbose)

    elif opts.prediction_method == 'weighting_only':
        #Ignore ancestral information
        predictions =\
          weighted_average_tip_prediction(tree,nodes_to_predict,\
          trait_label=trait_label,\
          weight_fn =weight_fn,verbose=opts.verbose)

    elif opts.prediction_method == 'nearest_neighbor':

        predictions = predict_nearest_neighbor(tree,nodes_to_predict,\
          trait_label=trait_label,tips_only = True)

    elif opts.prediction_method == 'random_neighbor':

        predictions = predict_random_neighbor(tree,\
          nodes_to_predict,trait_label=trait_label)

    if opts.verbose:
        print "Done making predictions."

    make_output_dir_for_file(opts.output_trait_table)

    out_fh = open(opts.output_trait_table, 'w')
    #Generate the table of biom predictions
    if opts.verbose:
        print "Converting results to .biom format for output..."

    biom_predictions=biom_table_from_predictions(predictions,table_headers,\
                                                         observation_metadata=None,\
                                                         sample_metadata=accuracy_metric_results,convert_to_int=False)
    if opts.verbose:
        print "Writing prediction results to file: ", opts.output_trait_table

    if opts.output_precalc_file_in_biom:

        #write biom table to file
        write_biom_table(biom_predictions, opts.output_trait_table)

    else:
        #convert to precalc (tab-delimited) format

        out_fh = open(opts.output_trait_table, 'w')
        out_fh.write(convert_biom_to_precalc(biom_predictions))
        out_fh.close()

    # Determine the output suffix up front; it is also needed for the
    # confidence interval files below even when no variances were calculated.
    if opts.output_precalc_file_in_biom:
        suffix = '.biom'
    else:
        suffix = '.tab'

    #Write out variance information to file
    if variances:

        if opts.verbose:
            print "Converting variances to BIOM format"

        biom_prediction_variances=biom_table_from_predictions({k:v['variance'] for k,v in variances.iteritems()},table_headers,\
        observation_metadata=None,\
        sample_metadata=None,convert_to_int=False)
        outfile_base, extension = splitext(opts.output_trait_table)
        variance_outfile = outfile_base + "_variances" + suffix
        make_output_dir_for_file(variance_outfile)

        if opts.verbose:
            print "Writing variance information to file:", variance_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_variances, variance_outfile)
        else:
            open(variance_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_variances))

    if confidence_intervals:

        if opts.verbose:
            print "Converting upper confidence interval values to BIOM format"

        biom_prediction_upper_CI=biom_table_from_predictions({k:v['upper_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base, extension = splitext(opts.output_trait_table)
        upper_CI_outfile = outfile_base + "_upper_CI" + suffix
        make_output_dir_for_file(upper_CI_outfile)

        if opts.verbose:
            print "Writing upper confidence limit information to file:", upper_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_upper_CI, upper_CI_outfile)
        else:
            open(upper_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_upper_CI))

        biom_prediction_lower_CI=biom_table_from_predictions({k:v['lower_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base, extension = splitext(opts.output_trait_table)
        lower_CI_outfile = outfile_base + "_lower_CI" + suffix
        make_output_dir_for_file(lower_CI_outfile)

        if opts.verbose:
            print "Writing lower confidence limit information to file", lower_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_lower_CI, lower_CI_outfile)
        else:
            open(lower_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_lower_CI))
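
# make_neg_exponential_weight_fn is not shown above. As the name and the call
# make_neg_exponential_weight_fn(e) suggest, a hedged sketch is a factory for a
# weight that decays exponentially with branch-length distance; the exact form
# used by PICRUSt may differ.
from math import exp

def make_neg_exponential_weight_fn_sketch(exponent):
    def weight(distance):
        return exp(-exponent * distance)
    return weight

# With such a weight function, nearer relatives dominate the distance-weighted
# averages used by the prediction methods above:
# weight_fn = make_neg_exponential_weight_fn_sketch(2.718)
# weight_fn(0.0) -> 1.0, and the weight shrinks toward 0 as distance grows.
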
Example #9
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    #if we specify we want NSTI only then we have to calculate it first
    if opts.output_accuracy_metrics_only:
        opts.calculate_accuracy_metrics=True

    if opts.verbose:
        print "Loading tree from file:", opts.tree

    if opts.no_round:
        round_opt = False 
    else:
        round_opt = True

    # Load Tree
    tree = load_picrust_tree(opts.tree, opts.verbose)

    table_headers=[]
    traits={}
    #load the asr trait table using the previous list of functions to order the arrays
    if opts.reconstructed_trait_table:
        table_headers,traits =\
                update_trait_dict_from_file(opts.reconstructed_trait_table)

        #Only load confidence intervals on the reconstruction
        #If we actually have ASR values in the analysis
        if opts.reconstruction_confidence:
            if opts.verbose:
                print "Loading ASR confidence data from file:",\
                opts.reconstruction_confidence
                print "Assuming confidence data is of type:",opts.confidence_format

            asr_confidence_output = open(opts.reconstruction_confidence)
            asr_min_vals,asr_max_vals, params,column_mapping =\
              parse_asr_confidence_output(asr_confidence_output,format=opts.confidence_format)
            if 'sigma' in params:
                brownian_motion_parameter = params['sigma'][0]
            else:
                brownian_motion_parameter = None

            if opts.verbose:
                print "Done. Loaded %i confidence interval values." %(len(asr_max_vals))
                print "Brownian motion parameter:",brownian_motion_parameter
        else:
            brownian_motion_parameter = None

    #load the trait table into a dict with organism names as keys and trait arrays as values
    table_headers,genome_traits =\
            update_trait_dict_from_file(opts.observed_trait_table,table_headers)


    #Combine the trait tables overwriting the asr ones if they exist in the genome trait table.
    traits.update(genome_traits)

    # Specify the attribute where we'll store the reconstructions
    trait_label = "Reconstruction"

    if opts.verbose:
        print "Assigning traits to tree..."

    # Decorate tree using the traits
    tree = assign_traits_to_tree(traits,tree, trait_label=trait_label)


    if opts.reconstruction_confidence:
        if opts.verbose:
            print "Assigning trait confidence intervals to tree..."
        tree = assign_traits_to_tree(asr_min_vals,tree,\
            trait_label="lower_bound")

        tree = assign_traits_to_tree(asr_max_vals,tree,\
            trait_label="upper_bound")

        if brownian_motion_parameter is None:

            if opts.verbose:
                print "No Brownian motion parameters loaded. Inferring these from 95% confidence intervals..."
            brownian_motion_parameter = get_brownian_motion_param_from_confidence_intervals(tree,\
                     upper_bound_trait_label="upper_bound",\
                     lower_bound_trait_label="lower_bound",\
                     trait_label=trait_label,\
                     confidence=0.95)
            if opts.verbose:
                print "Inferred the following rate parameters:",brownian_motion_parameter
    if opts.verbose:
        print "Collecting list of nodes to predict..."

    #Start by predicting all tip nodes.
    nodes_to_predict = [tip.Name for tip in tree.tips()]

    if opts.verbose:
        print "Found %i nodes to predict." % len(nodes_to_predict)

    if opts.limit_predictions_to_organisms:
        organism_id_str = opts.limit_predictions_to_organisms
        ok_organism_ids = organism_id_str.split(',')
        ok_organism_ids = [n.strip() for n in ok_organism_ids]
        for f in set_label_conversion_fns(True,True):
            ok_organism_ids = [f(i) for i in ok_organism_ids]

        if opts.verbose:
            print "Limiting predictions to user-specified ids:",\
              ",".join(ok_organism_ids)


        if not ok_organism_ids:
            raise RuntimeError(\
              "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\
              % opts.limit_predictions_to_organisms)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in ok_organism_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the command line and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0]))

        if opts.verbose:
            print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" %(len(nodes_to_predict))

    if opts.limit_predictions_by_otu_table:
        if opts.verbose:
            print "Limiting predictions to ids in user-specified OTU table:",\
              opts.limit_predictions_by_otu_table
        otu_table = open(opts.limit_predictions_by_otu_table,"U")
        #Parse OTU table for ids

        otu_ids =\
          extract_ids_from_table(otu_table.readlines(),delimiter="\t")

        if not otu_ids:
            raise RuntimeError(\
              "Found no valid ids in input OTU table: %s.  Is the path correct?"\
              % opts.limit_predictions_by_otu_table)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in otu_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by OTU table resulted in an empty set of nodes to predict.   Are the OTU ids and tree ids in the same format?  Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0]))

        if opts.verbose:
            print "After filtering by OTU table, %i nodes remain to be predicted" %(len(nodes_to_predict))

    # Calculate accuracy of PICRUSt for the given tree, sequenced genomes
    # and set of nodes to predict
    accuracy_metrics = ['NSTI']
    accuracy_metric_results = None
    if opts.calculate_accuracy_metrics:
        if opts.verbose:
            print "Calculating accuracy metrics: %s" %([",".join(accuracy_metrics)])
        accuracy_metric_results = {}
        if 'NSTI' in accuracy_metrics:

            nsti_result,min_distances =\
                calc_nearest_sequenced_taxon_index(tree,\
                limit_to_tips = nodes_to_predict,\
                trait_label = trait_label, verbose=opts.verbose)

            #accuracy_metric_results['NSTI'] = nsti_result
            for organism in min_distances.keys():
                accuracy_metric_results[organism] = {'NSTI': min_distances[organism]}

            if opts.verbose:
                print "NSTI:", nsti_result

        if opts.output_accuracy_metrics_only:
            #Write accuracy metrics to file
            if opts.verbose:
                print "Writing accuracy metrics to file:",opts.output_accuracy_metrics_only

            f = open(opts.output_accuracy_metrics_only,'w+')
            f.write("metric\torganism\tvalue\n")
            lines =[]
            for organism in accuracy_metric_results.keys():
                for metric in accuracy_metric_results[organism].keys():
                    lines.append('\t'.join([metric,organism,\
                      str(accuracy_metric_results[organism][metric])])+'\n')
            f.writelines(sorted(lines))
            f.close()
            exit()


    if opts.verbose:
        print "Generating predictions using method:",opts.prediction_method

    if opts.weighting_method == 'exponential':
        #For now, use exponential weighting
        weight_fn = make_neg_exponential_weight_fn(e)

    variances=None #Overwritten by methods that calc variance
    confidence_intervals=None #Overwritten by methods that calc variance

    if opts.prediction_method == 'asr_and_weighting':
        # Perform predictions using reconstructed ancestral states

        if opts.reconstruction_confidence:
            predictions,variances,confidence_intervals =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              lower_bound_trait_label="lower_bound",\
              upper_bound_trait_label="upper_bound",\
              calc_confidence_intervals = True,\
              brownian_motion_parameter=brownian_motion_parameter,\
              weight_fn=weight_fn,verbose=opts.verbose,
              round_predictions=round_opt)

        else:
            predictions =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              weight_fn =weight_fn,verbose=opts.verbose,
              round_predictions=round_opt)

    elif opts.prediction_method == 'weighting_only':
        #Ignore ancestral information
        predictions =\
          weighted_average_tip_prediction(tree,nodes_to_predict,\
          trait_label=trait_label,\
          weight_fn =weight_fn,verbose=opts.verbose)



    elif opts.prediction_method == 'nearest_neighbor':

        predictions = predict_nearest_neighbor(tree,nodes_to_predict,\
          trait_label=trait_label,tips_only = True)

    elif opts.prediction_method == 'random_neighbor':

        predictions = predict_random_neighbor(tree,\
          nodes_to_predict,trait_label=trait_label)

    if opts.verbose:
        print "Done making predictions."

    make_output_dir_for_file(opts.output_trait_table)

    out_fh=open(opts.output_trait_table,'w')
    #Generate the table of biom predictions
    if opts.verbose:
        print "Converting results to .biom format for output..."

    biom_predictions=biom_table_from_predictions(predictions,table_headers,\
                                                         observation_metadata=None,\
                                                         sample_metadata=accuracy_metric_results,convert_to_int=False)
    if opts.verbose:
        print "Writing prediction results to file: ",opts.output_trait_table

    if opts.output_precalc_file_in_biom:

        #write biom table to file
        write_biom_table(biom_predictions, opts.output_trait_table)

    else:
        #convert to precalc (tab-delimited) format

        out_fh = open(opts.output_trait_table, 'w')
        out_fh.write(convert_biom_to_precalc(biom_predictions))
        out_fh.close()

    # Determine the output suffix up front; it is also needed for the
    # confidence interval files below even when no variances were calculated.
    if opts.output_precalc_file_in_biom:
        suffix='.biom'
    else:
        suffix='.tab'

    #Write out variance information to file
    if variances:

        if opts.verbose:
            print "Converting variances to BIOM format"

        biom_prediction_variances=biom_table_from_predictions({k:v['variance'] for k,v in variances.iteritems()},table_headers,\
        observation_metadata=None,\
        sample_metadata=None,convert_to_int=False)
        outfile_base,extension = splitext(opts.output_trait_table)
        variance_outfile = outfile_base+"_variances"+suffix
        make_output_dir_for_file(variance_outfile)

        if opts.verbose:
            print "Writing variance information to file:",variance_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_variances, variance_outfile)
        else:
            open(variance_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_variances))


    if confidence_intervals:

        if opts.verbose:
            print "Converting upper confidence interval values to BIOM format"

        biom_prediction_upper_CI=biom_table_from_predictions({k:v['upper_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base,extension = splitext(opts.output_trait_table)
        upper_CI_outfile = outfile_base+"_upper_CI"+suffix
        make_output_dir_for_file(upper_CI_outfile)

        if opts.verbose:
            print "Writing upper confidence limit information to file:",upper_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_upper_CI, upper_CI_outfile)
        else:
            open(upper_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_upper_CI))

        biom_prediction_lower_CI=biom_table_from_predictions({k:v['lower_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base,extension = splitext(opts.output_trait_table)
        lower_CI_outfile = outfile_base+"_lower_CI"+suffix
        make_output_dir_for_file(lower_CI_outfile)

        if opts.verbose:
            print "Writing lower confidence limit information to file",lower_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_lower_CI, lower_CI_outfile)
        else:
            open(lower_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_lower_CI))
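
# A hedged sketch of the arithmetic behind weighted_average_tip_prediction and
# predict_traits_from_ancestors above: an unknown tip's trait vector is
# estimated as a weighted average of its relatives' trait vectors, with weights
# derived from phylogenetic distance. Illustrative only; the PICRUSt functions
# also handle ancestral states, confidence intervals and rounding.
def weighted_average_prediction_sketch(neighbor_traits, neighbor_distances, weight_fn):
    """neighbor_traits: equal-length trait vectors; neighbor_distances: matching distances."""
    weights = [weight_fn(d) for d in neighbor_distances]
    total = float(sum(weights))
    n_traits = len(neighbor_traits[0])
    return [sum(w * traits[i] for w, traits in zip(weights, neighbor_traits)) / total
            for i in range(n_traits)]
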
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis='observation')

    if opts.input_count_fp is None:
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if ext == '.gz':
        count_table_fh = gzip.open(input_count_table,'rb')
    else:
        count_table_fh = open(input_count_table,'U')

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    #Need to only keep data relevant to our otu list
    ids=[]
    for x in otu_table.iter(axis='observation'):
        ids.append(str(x[1]))

    ob_id=count_table.ids(axis='observation')[0]

    filtered_otus=[]
    filtered_values=[]
    for x in ids:
        if count_table.exists(x, axis='sample'):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis='observation'))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered={}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id,x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError,\
                  "Invalid copy number '%s' for OTU ID %s. Must be int-able." % (value, x)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x]={opts.metadata_identifer:value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis='observation')

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])
    normalized_table = filtered_otu_table.transform(metadata_norm, axis='observation')

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'observation')

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
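
# A hedged sketch of the copy-number normalization that metadata_norm performs
# above: each OTU's per-sample counts are divided by that OTU's 16S copy
# number, so multi-copy organisms are not over-counted. Plain-Python
# illustration only, not the biom Table.transform call itself.
def normalize_by_copy_number_sketch(otu_counts, copy_numbers):
    """otu_counts: {otu_id: [counts per sample]}; copy_numbers: {otu_id: int >= 1}."""
    return dict((otu_id, [c / float(copy_numbers[otu_id]) for c in counts])
                for otu_id, counts in otu_counts.items())

# e.g. normalize_by_copy_number_sketch({'OTU1': [4, 8]}, {'OTU1': 4})
# -> {'OTU1': [1.0, 2.0]}
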
Example #11
def main():
    """Generate test trees given parameters"""
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading trait table..."
    input_trait_table = open(opts.input_trait_table,"U")

    if opts.verbose:
        print "Loading tree..."
    #PicrustNode seems to run into very slow/memory intensive performance...
    #tree = DndParser(open(opts.input_tree),constructor=PicrustNode)
    tree = DndParser(open(opts.input_tree))

    if opts.verbose:
        print "Parsing trait table..."
    #Find which taxa are to be used in tests
    #(by default trait table taxa)
    trait_table_header,trait_table_fields = \
            parse_trait_table(input_trait_table)

    if opts.verbose:
       print "Ensuring tree and trait table labels are formatted consistently..."

    label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)

    fix_tree_labels(tree,label_conversion_fns)

    trait_table_fields = convert_trait_table_entries(trait_table_fields,\
      value_conversion_fns = [],\
      label_conversion_fns = label_conversion_fns)

    trait_table_fields = [t for t in trait_table_fields]
    print "Number of trait table fields with single quotes:",\
     len([t for t in trait_table_fields if "'" in t[0]])

    if opts.verbose:
        print "Making output directory..."
    make_output_dir(opts.output_dir)


    if opts.limit_to_tips:

        included_tips = opts.limit_to_tips.split(",")
        if opts.verbose:
            print "Limiting test datasets to %i tips: %s" %(len(included_tips),included_tips)
    else:
        included_tips = False

    method_fns =\
      {"exclude_tips_by_distance":\
         make_distance_based_exclusion_fn,\
       "randomize_tip_labels_by_distance":\
         make_distance_based_tip_label_randomizer
       }

    test_fn_factory = method_fns[opts.method]

    if opts.verbose:
        print "Setting tree modification method to:", opts.method
        print "(%s)" % test_fn_factory.__doc__

    modify_tree = True
    if opts.suppress_tree_modification:
        if opts.verbose:
            print "Suppressing modification of tree when making test datasets"
        modify_tree = False

    if opts.verbose:
        print "Starting generation of test datasets"

    test_datasets = \
      yield_genome_test_data_by_distance(tree,trait_table_fields,\
      test_fn_factory,min_dist = opts.min_dist,\
      max_dist=opts.max_dist,increment=opts.dist_increment,\
      modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose)

    if opts.verbose:
        print "Writing files for test datasets"

    for curr_dist,test_tree,tip_to_predict,\
        expected_traits,test_trait_table_fields in test_datasets:

        if included_tips is not False:
            if tip_to_predict not in included_tips:
                if opts.verbose:
                    print "Skipping tip %s: limiting to tip(s): %s" %(tip_to_predict,included_tips)
                continue


        #Make a safe version of tip to predict
        # So odd characters like | don't mess up OS

        safe_tip_to_predict = "'%s'"%tip_to_predict

        #Write tree
        base_name = "--".join(map(str,["test_tree",opts.method,curr_dist]))
        curr_filepath = write_tree(opts.output_dir,base_name,test_tree,safe_tip_to_predict)
        if opts.verbose:
            print "Wrote test tree to: %s" % curr_filepath

        #Write expected trait table
        base_name = "--".join(map(str,["exp_traits",opts.method,curr_dist,safe_tip_to_predict]))

        exp_trait_table_lines = [trait_table_header]
        exp_trait_table_lines.append("\t".join(expected_traits)+"\n")
        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename=os.path.join(opts.output_dir,base_name)
        if opts.verbose:
            print "Writing expected trait table to:", filename

        f=open(filename,"w")
        f.write("".join(exp_trait_table_lines))
        f.close()

        #Output a transposed, BIOM format expectation table for comparison with predict_traits output

        #NOTE: this is a clumsy way of getting the translated trait table
        # but more elegant, direct methods (directly feeding data to biom's table_factory)
        # weren't working for me readily.   In the future, we should streamline this process
        # Leaving as is for now since this code is mostly for developers so speed/elegance
        # are probably not essential here.

        #Let the hackishness begin

        #Reload the tab-delimited trait table
        header, fields = parse_trait_table(open(filename,"U"))
        fields = [f for f in fields] #converts generator to list

        #Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines =\
          transpose_trait_table_fields(fields,header,\
          id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t")

        #Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend(["\t".join(r) for r in transposed_trait_table_lines])
        trans_trait_table = '\n'.join(trans_trait_table_lines)

        #Write BIOM format expected trait table
        base_name = "--".join(map(str,["exp_biom_traits",opts.method,curr_dist,safe_tip_to_predict]))

        expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\
            table_format = "tab-delimited")

        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename=os.path.join(opts.output_dir,base_name)
        if opts.verbose:
            print "Writing BIOM-format expected trait table to:", filename

        write_biom_table(expected_biom_table, filename)


        #Write test trait table
        test_trait_table_fields = test_trait_table_fields
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend(["\t".join(r)+"\n" for r in test_trait_table_fields])

        #print "Test_trait_table_lines:",test_trait_table_lines
        base_name = "--".join(map(str,["test_trait_table",opts.method,curr_dist,safe_tip_to_predict]))
        filename=os.path.join(opts.output_dir,base_name)

        if opts.verbose:
            print "Writing test trait table to:", filename

        f=open(filename,"w")
        f.write("".join(test_trait_table_lines))
        f.close()

    if opts.verbose:
        print "Done generating test datasets"
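
# transpose_trait_table_fields is not shown here. A hedged sketch of the
# transpose step used above, in which the tab-delimited trait table (organisms
# as rows, traits as columns) is flipped so that the trait ids become row ids
# for the BIOM observation axis. Illustrative only.
def transpose_table_sketch(header_fields, rows):
    """header_fields: e.g. ['OTU_ID', 'K00001', ...]; rows: [[organism, v1, ...], ...]."""
    columns = list(zip(*([header_fields] + rows)))
    new_header = list(columns[0])              # the id column becomes the new header
    new_rows = [list(col) for col in columns[1:]]
    return new_header, new_rows

# e.g. transpose_table_sketch(['OTU_ID', 'K00001', 'K00002'],
#                             [['org1', '1', '0'], ['org2', '2', '3']])
# -> (['OTU_ID', 'org1', 'org2'], [['K00001', '1', '2'], ['K00002', '0', '3']])
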
Example #12
def main():
    """Generate test trees given parameters"""
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading trait table..."
    input_trait_table = open(opts.input_trait_table,"U")

    if opts.verbose:
        print "Loading tree..."
    #PicrustNode seems to run into very slow/memory intensive performance...
    #tree = DndParser(open(opts.input_tree),constructor=PicrustNode)
    tree = DndParser(open(opts.input_tree))

    if opts.verbose:
        print "Parsing trait table..."
    #Find which taxa are to be used in tests
    #(by default trait table taxa)
    trait_table_header,trait_table_fields = \
            parse_trait_table(input_trait_table)

    if opts.verbose:
       print "Ensuring tree and trait table labels are formatted consistently..."

    label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)

    fix_tree_labels(tree,label_conversion_fns)

    trait_table_fields = convert_trait_table_entries(trait_table_fields,\
      value_conversion_fns = [],\
      label_conversion_fns = label_conversion_fns)

    trait_table_fields = [t for t in trait_table_fields]
    print "Number of trait table fields with single quotes:",\
     len([t for t in trait_table_fields if "'" in t[0]])

    if opts.verbose:
        print "Making output directory..."
    make_output_dir(opts.output_dir)


    if opts.limit_to_tips:

        included_tips = opts.limit_to_tips.split(",")
        if opts.verbose:
            print "Limiting test datasets to %i tips: %s" %(len(included_tips),included_tips)
    else:
        included_tips = False

    method_fns =\
      {"exclude_tips_by_distance":\
         make_distance_based_exclusion_fn,\
       "randomize_tip_labels_by_distance":\
         make_distance_based_tip_label_randomizer
       }

    test_fn_factory = method_fns[opts.method]

    if opts.verbose:
        print "Setting tree modification method to:", opts.method
        print "(%s)" % test_fn_factory.__doc__

    modify_tree = True
    if opts.suppress_tree_modification:
        if opts.verbose:
            print "Suppressing modification of tree when making test datasets"
        modify_tree = False

    if opts.verbose:
        print "Starting generation of test datasets"

    test_datasets = \
      yield_genome_test_data_by_distance(tree,trait_table_fields,\
      test_fn_factory,min_dist = opts.min_dist,\
      max_dist=opts.max_dist,increment=opts.dist_increment,\
      modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose)

    if opts.verbose:
        print "Writing files for test datasets"

    for curr_dist,test_tree,tip_to_predict,\
        expected_traits,test_trait_table_fields in test_datasets:

        if included_tips is not False:
            if tip_to_predict not in included_tips:
                if opts.verbose:
                    print "Skipping tip %s: limiting to tip(s): %s" %(tip_to_predict,included_tips)
                continue


        #Make a safe version of tip to predict
        # So odd characters like | don't mess up OS

        safe_tip_to_predict = "'%s'"%tip_to_predict

        #Write tree
        base_name = "--".join(map(str,["test_tree",opts.method,curr_dist]))
        curr_filepath = write_tree(opts.output_dir,base_name,test_tree,safe_tip_to_predict)
        if opts.verbose:
            print "Wrote test tree to: %s" % curr_filepath

        #Write expected trait table
        base_name = "--".join(map(str,["exp_traits",opts.method,curr_dist,safe_tip_to_predict]))

        exp_trait_table_lines = [trait_table_header]
        exp_trait_table_lines.append("\t".join(expected_traits)+"\n")
        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename=os.path.join(opts.output_dir,base_name)
        if opts.verbose:
            print "Writing expected trait table to:", filename

        f=open(filename,"w")
        f.write("".join(exp_trait_table_lines))
        f.close()

        #Output a transposed, BIOM format expectation table for comparison with predict_traits output

        #NOTE: this is a clumsy way of getting the translated trait table
        # but more elegant, direct methods (directly feeding data to biom's table_factory)
        # weren't working for me readily.   In the future, we should streamline this process
        # Leaving as is for now since this code is mostly for developers so speed/elegance
        # are probably not essential here.

        #Let the hackishness begin

        #Reload the tab-delimited trait table
        header, fields = parse_trait_table(open(filename,"U"))
        fields = [f for f in fields] #converts generator to list

        #Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines =\
          transpose_trait_table_fields(fields,header,\
          id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t")

        #Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend(["\t".join(r) for r in transposed_trait_table_lines])
        trans_trait_table = '\n'.join(trans_trait_table_lines)

        #Write BIOM format expected trait table
        base_name = "--".join(map(str,["exp_biom_traits",opts.method,curr_dist,safe_tip_to_predict]))

        expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\
            table_format = "tab-delimited")

        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename=os.path.join(opts.output_dir,base_name)
        if opts.verbose:
            print "Writing BIOM-format expected trait table to:", filename

        write_biom_table(expected_biom_table, filename)


        #Write test trait table
        test_trait_table_fields = test_trait_table_fields
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend(["\t".join(r)+"\n" for r in test_trait_table_fields])

        #print "Test_trait_table_lines:",test_trait_table_lines
        base_name = "--".join(map(str,["test_trait_table",opts.method,curr_dist,safe_tip_to_predict]))
        filename=os.path.join(opts.output_dir,base_name)

        if opts.verbose:
            print "Writing test trait table to:", filename

        f=open(filename,"w")
        f.write("".join(test_trait_table_lines))
        f.close()

    if opts.verbose:
        print "Done generating test datasets"
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis="observation")

    if opts.input_count_fp is None:
        # precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = "_".join(["16S", opts.gg_version, "precalculated.tab.gz"])
        input_count_table = join(get_picrust_project_dir(), "picrust", "data", precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if ext == ".gz":
        count_table_fh = gzip.open(input_count_table, "rb")
    else:
        count_table_fh = open(input_count_table, "U")

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    # Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iter(axis="observation"):
        ids.append(str(x[1]))

    ob_id = count_table.ids(axis="observation")[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.exists(x, axis="sample"):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis="observation"))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id, x)
        try:
            # data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError, "Invalid copy number '%s' for OTU ID %s. Must be int-able." % (value, x)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis="observation")

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])

    normalized_table = filtered_otu_table.transform(metadata_norm, axis="observation")

    # move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, "observation")

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)