Example #1
0
def load_data_table(
    data_table_fp,
    load_data_table_in_biom=False,
    suppress_subset_loading=False,
    ids_to_load=None,
    transpose=False,
    verbose=False,
):
    """Load a data table, detecting gzipped files and subset loading.

    data_table_fp -- path to the input data table

    load_data_table_in_biom -- if True, load the data table as a BIOM table rather
    than as tab-delimited

    suppress_subset_loading -- if True, load the entire table, rather than just
    ids_to_load (only consulted when load_data_table_in_biom is True)

    ids_to_load -- a list of OTU ids for which data should be loaded. It is
    forwarded to load_subset_from_biom_str / convert_precalc_to_biom, and its
    length is reported when verbose is True on the BIOM subset path, so it
    must not be None there.

    transpose -- forwarded unchanged to convert_precalc_to_biom when the
    table is loaded as tab-delimited

    verbose -- if True, print progress messages to stdout

    gzipped files are detected based on the '.gz' suffix.

    Returns the loaded table object.
    Raises IOError if data_table_fp does not exist.
    """
    if not path.exists(data_table_fp):
        raise IOError("File " + data_table_fp + " doesn't exist! Did you forget to download it?")

    if load_data_table_in_biom and suppress_subset_loading:
        # load_table reads straight from the path, so no file handle is
        # opened (and none can be leaked) on this branch.
        if verbose:
            print("Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage")
        genome_table = load_table(data_table_fp)
    else:
        if path.splitext(data_table_fp)[1] == ".gz":
            genome_table_fh = gzip.open(data_table_fp, "rb")
        else:
            genome_table_fh = open(data_table_fp, "U")

        # try/finally guarantees the handle is released even when parsing
        # fails part-way through.
        try:
            if load_data_table_in_biom:
                # Now we want to use the OTU table information
                # to load only rows in the count table corresponding
                # to relevant OTUs
                if verbose:
                    print("Loading traits for %i organisms from the trait table" % len(ids_to_load))

                genome_table = load_subset_from_biom_str(genome_table_fh.read(), ids_to_load, axis="samples")
            else:
                # NOTE(review): assumes convert_precalc_to_biom consumes the
                # handle completely before returning -- confirm.
                genome_table = convert_precalc_to_biom(genome_table_fh, ids_to_load, transpose=transpose)
        finally:
            genome_table_fh.close()

    if verbose:
        print("Done loading trait table containing %i functions for %i organisms." % (
            len(genome_table.ids(axis="observation")),
            len(genome_table.ids()),
        ))

    return genome_table
Example #2
0
def load_data_table(data_table_fp,
                    load_data_table_in_biom=False,
                    suppress_subset_loading=False,
                    ids_to_load=None,
                    transpose=False,
                    verbose=False):
    """Load a data table, detecting gzipped files and subset loading.

    data_table_fp -- path to the input data table

    load_data_table_in_biom -- if True, load the data table as a BIOM table rather
    than as tab-delimited

    suppress_subset_loading -- if True, load the entire table, rather than just
    ids_to_load (only consulted when load_data_table_in_biom is True)

    ids_to_load -- a list of OTU ids for which data should be loaded. It is
    passed on to load_subset_from_biom_str / convert_precalc_to_biom; when
    verbose is True on the BIOM subset path its length is printed, so it
    must not be None in that case.

    transpose -- passed through to convert_precalc_to_biom for tab-delimited
    input

    verbose -- if True, print progress messages to stdout

    gzipped files are detected based on the '.gz' suffix.

    Returns the loaded table object.
    Raises IOError if data_table_fp does not exist.
    """
    if not path.exists(data_table_fp):
        raise IOError("File " + data_table_fp +
                      " doesn't exist! Did you forget to download it?")

    load_full_biom = load_data_table_in_biom and suppress_subset_loading
    if load_full_biom:
        # The full-table loader is given the path itself, so opening a
        # handle here would only create something to leak.
        if verbose:
            print("Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage")
        return_table = load_table(data_table_fp)
    else:
        ext = path.splitext(data_table_fp)[1]
        if ext == '.gz':
            genome_table_fh = gzip.open(data_table_fp, 'rb')
        else:
            genome_table_fh = open(data_table_fp, 'U')

        # Close the handle no matter how parsing ends.
        try:
            if load_data_table_in_biom:
                #Now we want to use the OTU table information
                #to load only rows in the count table corresponding
                #to relevant OTUs
                if verbose:
                    print("Loading traits for %i organisms from the trait table" % len(
                        ids_to_load))

                return_table = load_subset_from_biom_str(
                    genome_table_fh.read(), ids_to_load, axis='samples')
            else:
                # NOTE(review): relies on convert_precalc_to_biom reading the
                # whole handle before it returns -- confirm.
                return_table = convert_precalc_to_biom(genome_table_fh,
                                                       ids_to_load,
                                                       transpose=transpose)
        finally:
            genome_table_fh.close()

    if verbose:
        print("Done loading trait table containing %i functions for %i organisms." % (
            len(return_table.ids(axis='observation')), len(return_table.ids())))

    return return_table
Example #3
0
def load_data_table(data_table_fp,
                    load_data_table_in_biom=False,
                    suppress_subset_loading=False,
                    ids_to_load=None,
                    transpose=False,
                    verbose=False):
    """Load a data table, detecting gzipped files and subset loading.

    data_table_fp -- path to the input data table

    load_data_table_in_biom -- if True, load the data table as a BIOM table rather
    than as tab-delimited

    suppress_subset_loading -- if True, load the entire table, rather than just
    ids_to_load (only consulted when load_data_table_in_biom is True)

    ids_to_load -- a list of OTU ids for which data should be loaded. It is
    passed on to load_subset_from_biom_str / convert_precalc_to_biom; when
    verbose is True on the BIOM subset path its length is printed, so it
    must not be None in that case.

    transpose -- passed through to convert_precalc_to_biom for tab-delimited
    input

    verbose -- if True, print progress messages to stdout

    gzipped files are detected based on the '.gz' suffix.

    Returns the loaded table object. Errors raised while opening the file
    (e.g. IOError for a missing path) propagate to the caller.
    """
    ext = path.splitext(data_table_fp)[1]
    if ext == '.gz':
        genome_table_fh = gzip.open(data_table_fp, 'rb')
    else:
        genome_table_fh = open(data_table_fp, 'U')

    # Every branch below reads from the handle; try/finally makes sure it is
    # closed whether parsing succeeds or raises.
    try:
        if load_data_table_in_biom:
            if not suppress_subset_loading:
                #Now we want to use the OTU table information
                #to load only rows in the count table corresponding
                #to relevant OTUs
                if verbose:
                    print("Loading traits for %i organisms from the trait table" % len(ids_to_load))

                genome_table = load_subset_from_biom_str(
                    genome_table_fh.read(), ids_to_load, axis='samples')
            else:
                if verbose:
                    print("Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage")
                genome_table = parse_biom_table(genome_table_fh.read())
        else:
            # NOTE(review): relies on convert_precalc_to_biom reading the
            # whole handle before it returns -- confirm.
            genome_table = convert_precalc_to_biom(
                genome_table_fh, ids_to_load, transpose=transpose)
    finally:
        genome_table_fh.close()

    if verbose:
        print("Done loading trait table containing %i functions for %i organisms." % (
            len(genome_table.ObservationIds), len(genome_table.SampleIds)))

    return genome_table
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading OTU table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))

    if opts.verbose:
        print "Done loading OTU table containing %i samples and %i OTUs." %(len(otu_table.SampleIds),len(otu_table.ObservationIds))
    if(opts.input_count_table is None):
        if(opts.type_of_prediction == 'KO'):
            input_count_table=join(get_picrust_project_dir(),'picrust','data','ko_precalculated.biom.gz')
        elif(opts.type_of_prediction == 'COG'):
            input_count_table=join(get_picrust_project_dir(),'picrust','data','cog_precalculated.biom.gz')
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    
    ext=path.splitext(input_count_table)[1]
    
    if (ext == '.gz'):
        genome_table_str = gzip.open(input_count_table,'rb').read()
    else:
        genome_table_str = open(input_count_table,'U').read()
    
    #In the genome/trait table genomes are the samples and 
    #genes are the observations
    
    if not opts.suppress_subset_loading:
        #Now we want to use the OTU table information
        #to load only rows in the count table corresponding
        #to relevant OTUs
        ids_to_load = otu_table.ObservationIds

        if opts.verbose:
            print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

        genome_table = load_subset_from_biom_str(genome_table_str,ids_to_load,axis='samples')
    else:
        if opts.verbose:
            print "Loading *full* trait table because --suppress_subset_loading was passed. This may result in high memory usage."
        genome_table = parse_biom_table(genome_table_str)
    
    if opts.verbose:
        print "Done loading trait table containing %i functions for %i organisms." %(len(genome_table.ObservationIds),len(genome_table.SampleIds))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        #unweighted_nsti = calc_nsti(otu_table,genome_table,weighted=False)
        #print "Unweighted NSTI:", unweighted_nsti
        
        weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True)
        samples= weighted_nsti[0]
        nstis = list(weighted_nsti[1])
        #print "Samples:",samples
        #print "NSTIs:",nstis
        samples_and_nstis = zip(samples,nstis)
        #print "Samples and NSTIs:",samples_and_nstis
        lines = ["#Sample\tMetric\tValue\n"]
        #print weighted_nsti
        for sample,nsti in samples_and_nstis:
            line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti))
            lines.append(line)

        if opts.verbose:
            for l in sorted(lines):
                print l
        if opts.verbose:
            print "Writing accuracy information to file:", opts.accuracy_metrics
        open(opts.accuracy_metrics,'w').writelines(sorted(lines))

    if opts.verbose:
        print "Predicting the metagenome..."
        
    predicted_metagenomes = predict_metagenomes(otu_table,genome_table)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    if(opts.format_tab_delimited):
        open(opts.output_metagenome_table,'w').write(predicted_metagenomes.delimitedSelf(header_key="KEGG Pathways",header_value="KEGG Pathways",metadata_formatter=lambda s: '|'.join(['; '.join(l) for l in s])))
    else:
        open(opts.output_metagenome_table,'w').write(format_biom_table(predicted_metagenomes))