コード例 #1
0
ファイル: test_util.py プロジェクト: zaneveld/picrust
    def test_convert_precalc_to_biom(self):
        """ convert_precalc_to_biom as expected with valid input """
        # test if table conversion works (and the ablity to handle a string as input)
        result_table = convert_precalc_to_biom(precalc_in_tab)
        self.assertEqual(result_table,self.precalc_in_biom)

        # test partial loading (and the ability to handle a file handle as input)
        ids_to_load = ['OTU_1','OTU_2']
        two_taxon_table = convert_precalc_to_biom(StringIO.StringIO(precalc_in_tab),ids_to_load)
        self.assertEqualItems(two_taxon_table.ids(), ids_to_load)
コード例 #2
0
    def test_convert_precalc_to_biom(self):
        """ convert_precalc_to_biom as expected with valid input """
        # test if table conversion works (and the ablity to handle a string as input)
        result_table = convert_precalc_to_biom(precalc_in_tab)
        self.assertEqual(result_table, self.precalc_in_biom)

        # test partial loading (and the ability to handle a file handle as input)
        ids_to_load = ['OTU_1', 'OTU_2']
        two_taxon_table = convert_precalc_to_biom(
            StringIO.StringIO(precalc_in_tab), ids_to_load)
        self.assertEqualItems(two_taxon_table.ids(), ids_to_load)
コード例 #3
0
def load_data_table(
    data_table_fp,
    load_data_table_in_biom=False,
    suppress_subset_loading=False,
    ids_to_load=None,
    transpose=False,
    verbose=False,
):
    """Load a data table, detecting gziiped files and subset loading
    data_table_fp -- path to the input data table

    load_data_table_in_biom -- if True, load the data table as a BIOM table rather
    than as tab-delimited

    suppress_subset_loading -- if True, load the entire table, rather than just
    ids_of_interest

    ids_to_load -- a list of OTU ids for which data should be loaded

    gzipped files are detected based on the '.gz' suffix.
    """
    if not path.exists(data_table_fp):
        raise IOError("File " + data_table_fp + " doesn't exist! Did you forget to download it?")

    ext = path.splitext(data_table_fp)[1]
    if ext == ".gz":
        genome_table_fh = gzip.open(data_table_fp, "rb")
    else:
        genome_table_fh = open(data_table_fp, "U")

    if load_data_table_in_biom:
        if not suppress_subset_loading:
            # Now we want to use the OTU table information
            # to load only rows in the count table corresponding
            # to relevant OTUs

            if verbose:
                print "Loading traits for %i organisms from the trait table" % len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(), ids_to_load, axis="samples")
        else:
            if verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = load_table(data_table_fp)
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh, ids_to_load, transpose=transpose)

    if verbose:
        print "Done loading trait table containing %i functions for %i organisms." % (
            len(genome_table.ids(axis="observation")),
            len(genome_table.ids()),
        )

    return genome_table
コード例 #4
0
def load_data_table(data_table_fp,\
  load_data_table_in_biom=False,suppress_subset_loading=False,ids_to_load=None,\
  transpose=False,verbose=False):
    """Load a data table, detecting gziiped files and subset loading
    data_table_fp -- path to the input data table

    load_data_table_in_biom -- if True, load the data table as a BIOM table rather
    than as tab-delimited

    suppress_subset_loading -- if True, load the entire table, rather than just
    ids_of_interest

    ids_to_load -- a list of OTU ids for which data should be loaded

    gzipped files are detected based on the '.gz' suffix.
    """
    if not path.exists(data_table_fp):
        raise IOError("File " + data_table_fp +
                      " doesn't exist! Did you forget to download it?")

    ext = path.splitext(data_table_fp)[1]
    if (ext == '.gz'):
        genome_table_fh = gzip.open(data_table_fp, 'rb')
    else:
        genome_table_fh = open(data_table_fp, 'U')

    if load_data_table_in_biom:
        if not suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs

            if verbose:
                print "Loading traits for %i organisms from the trait table" % len(
                    ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),
                                                     ids_to_load,
                                                     axis='samples')
        else:
            if verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = load_table(data_table_fp)
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,
                                               ids_to_load,
                                               transpose=transpose)

    if verbose:
        print "Done loading trait table containing %i functions for %i organisms." % (
            len(genome_table.ids(axis='observation')), len(genome_table.ids()))

    return genome_table
コード例 #5
0
ファイル: predict_metagenomes.py プロジェクト: adamrp/picrust
def load_data_table(data_table_fp,\
  load_data_table_in_biom=False,suppress_subset_loading=False,ids_to_load=None,\
  transpose=False,verbose=False):
    """Load a data table, detecting gziiped files and subset loading
    data_table_fp -- path to the input data table
    
    load_data_table_in_biom -- if True, load the data table as a BIOM table rather
    than as tab-delimited

    suppress_subset_loading -- if True, load the entire table, rather than just
    ids_of_interest

    ids_to_load -- a list of OTU ids for which data should be loaded

    gzipped files are detected based on the '.gz' suffix.
    """
    ext=path.splitext(data_table_fp)[1]
    if (ext == '.gz'):
        genome_table_fh = gzip.open(data_table_fp,'rb')
    else:
        genome_table_fh = open(data_table_fp,'U')

    if load_data_table_in_biom:
        if not suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs
           
            if verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = parse_biom_table(genome_table_fh.read())
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load,transpose=transpose)
    
    if verbose:
        print "Done loading trait table containing %i functions for %i organisms." %(len(genome_table.ObservationIds),len(genome_table.SampleIds))

    return genome_table
コード例 #6
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table,'rb')
    else:
        count_table_fh = open(input_count_table,'U')

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    #Need to only keep data relevant to our otu list
    ids=[]
    for x in otu_table.iter(axis='observation'):
        ids.append(str(x[1]))

    ob_id=count_table.ids(axis='observation')[0]

    filtered_otus=[]
    filtered_values=[]
    for x in ids:
        if count_table.exists(x, axis='sample'):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis='observation'))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered={}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id,x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError,\
                  "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x]={opts.metadata_identifer:value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis='observation')

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])
    normalized_table = filtered_otu_table.transform(metadata_norm, axis='observation')

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'observation')

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
コード例 #7
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)


    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", input_count_table

    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table,'rb')
    else:
        genome_table_fh = open(input_count_table,'U')

    #In the genome/trait table genomes are the samples and
    #genes are the observations


    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs

            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = load_table(genome_table_fh)
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load)
    ok_functional_categories = None

    metadata_type = None
    if opts.limit_to_functional_categories:
        ok_functional_categories = opts.limit_to_functional_categories.split("|")
        if opts.verbose:
            print "Limiting to functional categories: %s" %(str(ok_functional_categories))

        # Either KEGG_Pathways or COG_Category needs
        # to be assigned to metadata_key to limit to
        # functional categories (not needed for 
        # individual functions) 

        if opts.type_of_prediction == "ko":
            metadata_type = "KEGG_Pathways"
        elif opts.type_of_prediction == "cog":
            metadata_type = "COG_Category"
        elif opts.type_of_prediction == "rfam":
            exit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)")
              
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions,\
      limit_to_functional_categories = ok_functional_categories ,  metadata_key = metadata_type )

    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_fp

    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp,'w').write(output_text)
コード例 #8
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
  
    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ids_to_load = otu_table.ObservationIds

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", input_count_table
    
    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table,'rb')
    else:
        genome_table_fh = open(input_count_table,'U')
    
    #In the genome/trait table genomes are the samples and 
    #genes are the observations

    
    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs
           
            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = parse_biom_table(genome_table_fh.read())
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load)
    
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions)
    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_fp
        
    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp,'w').write(output_text)
コード例 #9
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_ext = path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table = parse_classic_table_to_rich_table(
            open(opts.input_otu_fp, 'U'), None, None, None, DenseOTUTable)
    else:
        try:
            otu_table = parse_biom_table(open(opts.input_otu_fp, 'U'))
        except ValueError:
            raise ValueError(
                "Error loading OTU table! If not in BIOM format use '-f' option.\n"
            )

    ids_to_load = otu_table.ObservationIds

    if (opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(
            ['16S', opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data',
                                 precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table, 'rb')
    else:
        count_table_fh = open(input_count_table, 'U')

    if opts.load_precalc_file_in_biom:
        count_table = parse_biom_table(count_table_fh.read())
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    #Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iterObservations():
        ids.append(str(x[1]))

    ob_id = count_table.ObservationIds[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    #filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table = table_factory(filtered_values,
                                       otu_table.SampleIds,
                                       filtered_otus,
                                       constructor=DenseOTUTable)

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id, x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError,\
                  "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)

    normalized_table = filtered_otu_table.normObservationByMetadata(
        opts.metadata_identifer)

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table,
                                                     normalized_table,
                                                     'ObservationMetadata')
    normalized_otu_table = transfer_sample_metadata(otu_table,
                                                    normalized_table,
                                                    'SampleMetadata')

    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp, 'w').write(format_biom_table(normalized_table))
コード例 #10
0
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    tmp_dir='jobs/'
    make_output_dir(tmp_dir)

    #Run the jobs
    script_fp = join(get_picrust_project_dir(),'scripts','predict_traits.py')

    if(opts.parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(opts.parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(opts.parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        raise RuntimeError

    if(opts.verbose):
        print "Loading tree..."
        
    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]
    
    if(opts.verbose):
        print "Total number of possible tips to predict: {0}".format(len(all_tips))

    created_tmp_files=[]
    output_files={}
    output_files['counts']=[]
    if opts.reconstruction_confidence:
        output_files['variances']=[]
        output_files['upper_CI']=[]
        output_files['lower_CI']=[]

    if opts.already_calculated:
        all_tips=get_tips_not_in_precalc(all_tips,opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    if(opts.verbose):
        print "Creating temporary input files in: ",tmp_dir
    
    num_tips_per_job=1000
    for tips_to_predict in [all_tips[i:i+num_tips_per_job] for i in range(0, len(all_tips), num_tips_per_job)]:
        
        #create tmp output files
        tmp_output_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str=','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            outfile_base,extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base+"_variances.tab")
            output_files['upper_CI'].append(outfile_base+"_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base+"_lower_CI.tab")
            
            #create the job command
            cmd= "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, opts.reconstruction_confidence, tip_to_predict_str, tmp_output_fp)

        else:
            cmd= "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, tip_to_predict_str, tmp_output_fp)
            

        #NOTE: Calculating NSTI this way is convenient, 
        #but would probably be faster if we ran the NSTI calculation separate (using the --output_accuracy_metrics_only) and added it to the output file later on.
        if opts.calculate_accuracy_metrics:
            cmd=cmd+" -a"

        #add job command to the the jobs file
        jobs.write(cmd+"\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])
    if(opts.verbose):
        print "Launching parallel jobs."
        
    #run the job command
    job_prefix='picrust'
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=opts.num_jobs,delay=opts.delay)

    if(opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if(opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base,extension = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
       #Combine output files
        if opts.verbose:
            print "Combining all output files for "+ predict_type

        combined_predictions=combine_predict_trait_output(output_files[predict_type])
        
        if opts.verbose:
            print "Writing combined file for "+predict_type

        if predict_type == 'counts':
        #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table,'w').write(format_biom_table(convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table,'w').write(combined_predictions)
        else:
            if opts.output_precalc_file_in_biom:
                open(outfile_base+"_"+predict_type+".biom",'w').write(format_biom_table(convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base+"_"+predict_type+".tab",'w').write(combined_predictions)    
        
    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)
コード例 #11
0
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    tmp_dir = 'jobs/'
    make_output_dir(tmp_dir)

    #Run the jobs
    script_fp = join(get_picrust_project_dir(), 'scripts', 'predict_traits.py')

    if (opts.parallel_method == 'sge'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_sge.py')
    elif (opts.parallel_method == 'multithreaded'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs.py')
    elif (opts.parallel_method == 'torque'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_torque.py')
    else:
        raise RuntimeError

    if (opts.verbose):
        print "Loading tree..."

    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]

    if (opts.verbose):
        print "Total number of possible tips to predict: {0}".format(
            len(all_tips))

    created_tmp_files = []
    output_files = {}
    output_files['counts'] = []
    if opts.reconstruction_confidence:
        output_files['variances'] = []
        output_files['upper_CI'] = []
        output_files['lower_CI'] = []

    if opts.already_calculated:
        all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(
                len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    if (opts.verbose):
        print "Creating temporary input files in: ", tmp_dir

    num_tips_per_job = 1000
    for tips_to_predict in [
            all_tips[i:i + num_tips_per_job]
            for i in range(0, len(all_tips), num_tips_per_job)
    ]:

        #create tmp output files
        tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str = ','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            outfile_base, extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base + "_variances.tab")
            output_files['upper_CI'].append(outfile_base + "_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base + "_lower_CI.tab")

            #create the job command
            cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, opts.reconstruction_confidence,
                tip_to_predict_str, tmp_output_fp)

        else:
            cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, tip_to_predict_str,
                tmp_output_fp)

        #NOTE: Calculating NSTI this way is convenient,
        #but would probably be faster if we ran the NSTI calculation separate (using the --output_accuracy_metrics_only) and added it to the output file later on.
        if opts.calculate_accuracy_metrics:
            cmd = cmd + " -a"

        #add job command to the the jobs file
        jobs.write(cmd + "\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])
    if (opts.verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix = 'picrust'
    submit_jobs(cluster_jobs_fp,
                jobs_fp,
                job_prefix,
                num_jobs=opts.num_jobs,
                delay=opts.delay)

    if (opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if (opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base, extension = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
        #Combine output files
        if opts.verbose:
            print "Combining all output files for " + predict_type

        combined_predictions = combine_predict_trait_output(
            output_files[predict_type])

        if opts.verbose:
            print "Writing combined file for " + predict_type

        if predict_type == 'counts':
            #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table, 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table, 'w').write(combined_predictions)
        else:
            if opts.output_precalc_file_in_biom:
                open(outfile_base + "_" + predict_type + ".biom", 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base + "_" + predict_type + ".tab",
                     'w').write(combined_predictions)

    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)
コード例 #12
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis="observation")

    if opts.input_count_fp is None:
        # precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = "_".join(["16S", opts.gg_version, "precalculated.tab.gz"])
        input_count_table = join(get_picrust_project_dir(), "picrust", "data", precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if ext == ".gz":
        count_table_fh = gzip.open(input_count_table, "rb")
    else:
        count_table_fh = open(input_count_table, "U")

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    # Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iter(axis="observation"):
        ids.append(str(x[1]))

    ob_id = count_table.ids(axis="observation")[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.exists(x, axis="sample"):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis="observation"))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id, x)
        try:
            # data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError, "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis="observation")

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])

    normalized_table = filtered_otu_table.transform(metadata_norm, axis="observation")

    # move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, "observation")

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
コード例 #13
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_ext=path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table=parse_classic_table_to_rich_table(open(opts.input_otu_fp,'U'),None,None,None,DenseOTUTable)
    else:
        try:
            otu_table = parse_biom_table(open(opts.input_otu_fp,'U'))
        except ValueError:
            raise ValueError("Error loading OTU table! If not in BIOM format use '-f' option.\n")

    ids_to_load = otu_table.ObservationIds
    
    if(opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]
    
    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table,'rb')
    else:
        count_table_fh = open(input_count_table,'U')
       
    if opts.load_precalc_file_in_biom:
        count_table = parse_biom_table(count_table_fh.read())
    else:
        count_table = convert_precalc_to_biom(count_table_fh,ids_to_load)

    #Need to only keep data relevant to our otu list
    ids=[]
    for x in otu_table.iterObservations():
        ids.append(str(x[1]))

    ob_id=count_table.ObservationIds[0]

    filtered_otus=[]
    filtered_values=[]
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    #filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table=table_factory(filtered_values,otu_table.SampleIds,filtered_otus, constructor=DenseOTUTable)

    copy_numbers_filtered={}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id,x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))
            
        except ValueError:
            raise ValueError,\
                  "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x]={opts.metadata_identifer:value}
        
    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)
            

    normalized_table = filtered_otu_table.normObservationByMetadata(opts.metadata_identifer)
    
    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table,normalized_table,'ObservationMetadata')
    normalized_otu_table = transfer_sample_metadata(otu_table,normalized_table,'SampleMetadata')

    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp,'w').write(format_biom_table(normalized_table))
コード例 #14
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)


    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", input_count_table

    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table,'rb')
    else:
        genome_table_fh = open(input_count_table,'U')

    #In the genome/trait table genomes are the samples and
    #genes are the observations


    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs

            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = load_table(genome_table_fh)
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load)
    ok_functional_categories = None

    metadata_type = None
    if opts.limit_to_functional_categories:
        ok_functional_categories = opts.limit_to_functional_categories.split("|")
        if opts.verbose:
            print "Limiting to functional categories: %s" %(str(ok_functional_categories))

        # Either KEGG_Pathways or COG_Category needs
        # to be assigned to metadata_key to limit to
        # functional categories (not needed for 
        # individual functions) 

        if opts.type_of_prediction == "ko":
            metadata_type = "KEGG_Pathways"
        elif opts.type_of_prediction == "cog":
            metadata_type = "COG_Category"
        elif opts.type_of_prediction == "rfam":
            exit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)")
              
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions,\
      limit_to_functional_categories = ok_functional_categories ,  metadata_key = metadata_type )

    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_fp

    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp,'w').write(output_text)