Example #1
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        option_parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    table = load_table(opts.input_fp)

    if h5py.is_hdf5(opts.input_fp):
        # metadata are not deserializing correctly. Duct tape it.
        update_d = {}
        for i, md in zip(table.ids(axis='observation'),
                         table.metadata(axis='observation')):
            update_d[i] = {k: json.loads(v[0]) for k, v in md.items()}
        table.add_metadata(update_d, axis='observation')

    result = table.collapse(collapse_f, axis='observation', one_to_many=True,
                            norm=False,
                            one_to_many_md_key=opts.metadata_category)

    if(opts.format_tab_delimited):
        f = open(opts.output_fp, 'w')
        f.write(result.to_tsv(header_key=opts.metadata_category,
                              header_value=opts.metadata_category,
                              metadata_formatter=lambda s: '; '.join(s)))
        f.close()
    else:
        format_fs = {opts.metadata_category: vlen_list_of_str_formatter}
        write_biom_table(result, opts.output_fp, format_fs=format_fs)
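
A note on the "duct tape" block above: each HDF5 metadata value is assumed to arrive as a one-element list holding a JSON string, so json.loads(v[0]) recovers the original Python list. A minimal sketch with hypothetical values:

import json

md = {'KEGG_Pathways': ['["Metabolism", "Transport"]']}
fixed = {k: json.loads(v[0]) for k, v in md.items()}
# fixed == {'KEGG_Pathways': [u'Metabolism', u'Transport']}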
Example #2
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading sequencing depth table: ",opts.input_seq_depth_file
    scaling_factors = {}
    for sample_id,depth in parse_seq_count_file(open(opts.input_seq_depth_file,'U')):
        scaling_factors[sample_id]=depth    
    
    ext=path.splitext(opts.input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    if (ext == '.gz'):
        genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb'))
    else:
        genome_table = parse_biom_table(open(opts.input_count_table,'U'))
    
    if opts.verbose:
        print "Scaling the metagenome..."
        
    scaled_metagenomes = scale_metagenomes(genome_table,scaling_factors)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    open(opts.output_metagenome_table,'w').write(format_biom_table(scaled_metagenomes))
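
The extension-based open pattern above recurs throughout these scripts. A small helper sketch (not part of PICRUSt itself) that captures it, assuming a '.gz' extension always means gzip compression:

import gzip
from os import path

def open_maybe_gzipped(fp):
    #Return a gzip handle for '.gz' files, a plain text handle otherwise.
    if path.splitext(fp)[1] == '.gz':
        return gzip.open(fp, 'rb')
    return open(fp, 'U')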
Example #3
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
       
    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error('Must pass -m if passing -s. (Sorry about this, '+\
        'it\'s for backwards-compatibility.)') 

    min_args = 2
    if len(args) != min_args:
        option_parser.error('Program requires <commands file> and <job prefix>')

    if (len(args[1])>10 or len(args[1])==0):
        option_parser.error('job prefix must be 1-10 characters long')
 
    commands = list(open(args[0]))
    job_prefix = args[1]

    if(not exists(opts.job_dir)):
        try:
            makedirs(opts.job_dir)
        except OSError:
            exit(" Jobs directory can not be created. "
                 +"Check for permissions or file with the same name: %s\n"
                 % opts.job_dir)

    if (opts.make_jobs):
        filenames = make_torque_jobs(commands, job_prefix, opts.queue, opts.job_dir,opts.num_jobs)
    else:
        exit("Should we ever get here???")
    if (opts.submit_jobs):
        submit_cluster_jobs(filenames, opts.verbose)
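
make_torque_jobs is not shown here; a hypothetical sketch of the command-splitting step such a job maker would need, spreading command lines round-robin across at most num_jobs buckets:

def chunk_commands(commands, num_jobs):
    #Round-robin the command lines into num_jobs buckets, dropping empties.
    buckets = [[] for _ in range(num_jobs)]
    for i, cmd in enumerate(commands):
        buckets[i % num_jobs].append(cmd)
    return [b for b in buckets if b]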
Example #4
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error('Must pass -m if passing -s. (Sorry about this, '+\
        'it\'s for backwards-compatibility.)')

    min_args = 2
    if len(args) != min_args:
        option_parser.error(
            'Program requires <commands file> and <job prefix>')

    if (len(args[1]) > 10 or len(args[1]) == 0):
        option_parser.error('job prefix must be 1-10 characters long')

    commands = list(open(args[0]))
    job_prefix = args[1]

    if (not exists(opts.job_dir)):
        try:
            makedirs(opts.job_dir)
        except OSError:
            exit(" Jobs directory can not be created. " +
                 "Check for permissions or file with the same name: %s\n" %
                 opts.job_dir)

    if (opts.make_jobs):
        filenames = make_sge_jobs(commands, job_prefix, opts.queue,
                                  opts.job_dir, opts.num_jobs)
    else:
        exit("Should we ever get here???")
    if (opts.submit_jobs):
        submit_cluster_jobs(filenames, opts.verbose, delay=opts.delay)
Example #5
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
  
    
    metadata_file_delimiter = ','
    mapping_file_delimiter = '\t'

    #New strategy:
    #Open output file
    print "Opening output file:",opts.output_file
    outfile = open(opts.output_file,'w+')

    
    #Load old QIIME mapping file.  Parse header line, then for each data line,
    #insert new fields just before description
    print "Loading input QIIME mapping file:",opts.input_mapping_file
    mapping_file = open(opts.input_mapping_file,'U')
    
    event_column,event_state = opts.event.split(':')
    time_column = opts.time_column
    
    result = relative_date_info_from_mapping(mapping_file, time_column,
                                             event_column, event_state,
                                             individual_column="Individual")

    
    #print "Result:", result
    for l in result:
        line_to_print = "\t".join(map(str,l))+"\n"
        outfile.write(line_to_print)
        print line_to_print.strip()
    
    print "Done. Output saved to:",opts.output_file
    outfile.close()
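
For reference, the --event value split above is assumed to take the form 'Column:State'; the value here is hypothetical:

event_column, event_state = 'Treatment:Antibiotic'.split(':')
# event_column == 'Treatment', event_state == 'Antibiotic'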
Example #6
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)
    
    if(opts.parallel):
        tmp_dir='jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table = run_asr_in_parallel(tree=opts.input_tree_fp,
            table=opts.input_trait_table_fp, asr_method=opts.asr_method,
            parallel_method=opts.parallel_method, num_jobs=opts.num_jobs,
            tmp_dir=tmp_dir, verbose=opts.verbose)
    else:
        #call the appropriate ASR app controller
        if(opts.asr_method == 'wagner'):
            asr_table = wagner_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'bayestraits'):
            #bayestraits is not yet supported here; asr_table stays undefined
            pass
        elif(opts.asr_method == 'ace_ml'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'ML',HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'ace_pic'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'pic',HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'ace_reml'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'REML',HALT_EXEC=opts.debug)


    #output the table to file
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp,sep='\t')

    #output the CI file (unless the method is wagner)
    if not (opts.asr_method == 'wagner'):
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp,sep='\t')
Example #7
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
  
    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ext=path.splitext(opts.input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    if (ext == '.gz'):
        genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb'))
    else:
        genome_table = parse_biom_table(open(opts.input_count_table,'U'))
    if opts.verbose:
        print "Predicting the metagenome..."
    
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,
        genome_table, limit_to_functions=limit_to_functions)
    output_text = "\n".join(["\t".join(map(str, i))
                             for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    open(opts.output_metagenome_table,'w').write(output_text)
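
The tab-delimited assembly above in miniature, with hypothetical rows; each inner sequence becomes one tab-separated line:

rows = [['OTU', 'Gene', 'Sample'], ['otu1', 'K00001', 'S1']]
output_text = "\n".join(["\t".join(map(str, i)) for i in rows])
# 'OTU\tGene\tSample\notu1\tK00001\tS1'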
Example #8
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)
    
    if(opts.parallel):
        tmp_dir='jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table = run_asr_in_parallel(tree=opts.input_tree_fp,
            table=opts.input_trait_table_fp, asr_method=opts.asr_method,
            parallel_method=opts.parallel_method, num_jobs=opts.num_jobs,
            tmp_dir=tmp_dir, verbose=opts.verbose)
    else:
        #call the appropriate ASR app controller
        if(opts.asr_method == 'wagner'):
            asr_table = wagner_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'bayestraits'):
            #bayestraits is not yet supported here; asr_table stays undefined
            pass
        elif(opts.asr_method == 'ace_ml'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'ML',HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'ace_pic'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'pic',HALT_EXEC=opts.debug)
        elif(opts.asr_method == 'ace_reml'):
            asr_table,ci_table = ace_for_picrust(opts.input_tree_fp,opts.input_trait_table_fp,'REML',HALT_EXEC=opts.debug)


    #output the table to file
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp,sep='\t')

    #output the CI file (unless the method is wagner)
    if not (opts.asr_method == 'wagner'):
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp,sep='\t')
Example #9
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        option_parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    table = load_table(opts.input_fp)

    if h5py.is_hdf5(opts.input_fp):
        # metadata are not deserializing correctly. Duct tape it.
        update_d = {}
        for i, md in zip(table.ids(axis='observation'),
                         table.metadata(axis='observation')):
            update_d[i] = {k: json.loads(v[0]) for k, v in md.items()}
        table.add_metadata(update_d, axis='observation')

    result = table.collapse(collapse_f,
                            axis='observation',
                            one_to_many=True,
                            norm=False,
                            one_to_many_md_key=opts.metadata_category)

    if (opts.format_tab_delimited):
        f = open(opts.output_fp, 'w')
        f.write(
            result.to_tsv(header_key=opts.metadata_category,
                          header_value=opts.metadata_category,
                          metadata_formatter=lambda s: '; '.join(s)))
        f.close()
    else:
        format_fs = {opts.metadata_category: vlen_list_of_str_formatter}
        write_biom_table(result, opts.output_fp, format_fs=format_fs)
Example #10
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    table = parse_biom_table(open(opts.input_fp))
    result = table.collapseObservationsByMetadata(
        collapse_f,
        one_to_many=True,
        norm=False,
        one_to_many_md_key=opts.metadata_category)

    f = open(opts.output_fp, 'w')

    if (opts.format_tab_delimited):
        f.write(
            result.delimitedSelf(header_key=opts.metadata_category,
                                 header_value=opts.metadata_category,
                                 metadata_formatter=lambda s: '; '.join(s)))
    else:
        f.write(result.getBiomFormatJsonString('picrust %s - categorize_by_function'\
                                           % __version__))
    f.close()
Example #11
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_ext = path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table = parse_classic_table_to_rich_table(
            open(opts.input_otu_fp, 'U'), None, None, None, DenseOTUTable)
    else:
        if input_ext != '.biom':
            sys.stderr.write("\nOTU table does not have '.biom' extension! "
                             "If loading causes an error, consider using the "
                             "'-f' option to load a tab-delimited OTU table!\n\n")
        otu_table = parse_biom_table(open(opts.input_otu_fp, 'U'))

    ext=path.splitext(opts.input_count_fp)[1]
    if (ext == '.gz'):
        count_table = parse_biom_table(gzip.open(opts.input_count_fp,'rb'))
    else:
        count_table = parse_biom_table(open(opts.input_count_fp,'U'))
        
    #Need to only keep data relevant to our otu list
    ids=[]
    for x in otu_table.iterObservations():
        ids.append(str(x[1]))

    ob_id=count_table.ObservationIds[0]
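    #(the count table is assumed to hold copy numbers in a single observation
    # row, so that first observation id serves for all lookups below)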

    filtered_otus=[]
    filtered_values=[]
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    #filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table = table_factory(filtered_values, otu_table.SampleIds,
                                       filtered_otus,
                                       constructor=DenseOTUTable)

    copy_numbers_filtered={}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id,x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))
            
        except ValueError:
            raise ValueError("Invalid copy number '%s' for OTU ID %s. "
                             "Must be convertible to an integer." % (value, x))
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x]={opts.metadata_identifer:value}
        
    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)
            

    normalized_table = filtered_otu_table.normObservationByMetadata(opts.metadata_identifer)

    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp,'w').write(\
     normalized_table.getBiomFormatJsonString('PICRUST'))
Example #12
def main():
    _, opts, _ = parse_command_line_parameters(**script_info)
    download_picrust_files(
        output_path=DATA_DIR,
        with_confidence=opts.with_confidence,
        gg_version=opts.gg_version,
        type_of_prediction=opts.type_of_prediction,
        force=opts.force,
    )
Example #13
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    pool_by = opts.pool_by.split(',')

    #Construct a dict from user specified field order
    file_name_field_order = {}
    for i, field in enumerate(opts.field_order.split(',')):
        file_name_field_order[field] = i
    if opts.verbose:
        print "Assuming file names are in this order:", file_name_field_order
    for k in pool_by:
        #Check that we're only pooling by values that exist
        if k not in file_name_field_order:
            err_text = ("Bad value for option '--pool_by'. Can't pool by "
                        "'%s'. Valid categories are: %s"
                        % (k, ",".join(file_name_field_order.keys())))
            raise ValueError(err_text)

    if opts.verbose:
        print "Pooling results by:", pool_by

    file_name_delimiter = '--'
    pooled_observations, pooled_expectations = pool_test_dataset_dir(
        opts.trait_table_dir, opts.exp_trait_table_dir,
        file_name_delimiter=file_name_delimiter,
        file_name_field_order=file_name_field_order, pool_by=pool_by,
        verbose=opts.verbose)

    #prediction_prefix = 'predict_traits'
    #expectation_prefix = 'exp_biom_traits'

    for tag in pooled_observations.keys():
        obs_table = pooled_observations[tag]
        exp_table = pooled_expectations[tag]

        #obs_table_filename = file_name_delimiter.join([prediction_prefix]+[t for t in tag.split()])
        #exp_table_filename = file_name_delimiter.join([expectation_prefix]+[t for t in tag.split()])

        obs_table_filename = file_name_delimiter.join(['predict_traits'] +
                                                      [t for t in tag.split()])
        exp_table_filename = file_name_delimiter.join(['exp_biom_table'] +
                                                      [t for t in tag.split()])

        obs_outpath = join(opts.output_dir, obs_table_filename)
        exp_outpath = join(opts.output_dir, exp_table_filename)

        print obs_outpath
        print exp_outpath

        f = open(obs_outpath, 'w')
        f.write(obs_table.delimitedSelf())
        f.close()

        f = open(exp_outpath, 'w')
        f.write(exp_table.delimitedSelf())
        f.close()
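
The output filename scheme above in miniature (the tag value is hypothetical):

file_name_delimiter = '--'
tag = 'wagner 13_5'
name = file_name_delimiter.join(['predict_traits'] + tag.split())
# name == 'predict_traits--wagner--13_5'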
Example #14
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ext=path.splitext(opts.input_count_table)[1]

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    if (ext == '.gz'):
        genome_table = parse_biom_table(gzip.open(opts.input_count_table,'rb'))
    else:
        genome_table = parse_biom_table(open(opts.input_count_table,'U'))

    make_output_dir_for_file(opts.output_metagenome_table)

    if opts.accuracy_metrics:
        # Calculate accuracy metrics
        #unweighted_nsti = calc_nsti(otu_table,genome_table,weighted=False)
        #print "Unweighted NSTI:", unweighted_nsti
        
        weighted_nsti = calc_nsti(otu_table,genome_table,weighted=True)
        samples= weighted_nsti[0]
        nstis = list(weighted_nsti[1])
        #print "Samples:",samples
        #print "NSTIs:",nstis
        samples_and_nstis = zip(samples,nstis)
        #print "Samples and NSTIs:",samples_and_nstis
        lines = ["#Sample\tMetric\tValue\n"]
        #print weighted_nsti
        for sample,nsti in samples_and_nstis:
            line = "%s\tWeighted NSTI\t%s\n" %(sample,str(nsti))
            lines.append(line)

        if opts.verbose:
            for l in sorted(lines):
                print l
        if opts.verbose:
            print "Writing accuracy information to file:", opts.accuracy_metrics
        open(opts.accuracy_metrics,'w').writelines(sorted(lines))

    if opts.verbose:
        print "Predicting the metagenome..."
        
    predicted_metagenomes = predict_metagenomes(otu_table,genome_table)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table
        
    make_output_dir_for_file(opts.output_metagenome_table)
    if opts.format_tab_delimited:
        open(opts.output_metagenome_table, 'w').write(
            predicted_metagenomes.delimitedSelf())
    else:
        open(opts.output_metagenome_table, 'w').write(
            format_biom_table(predicted_metagenomes))
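
The accuracy-metric lines built above, in miniature with made-up NSTI values:

lines = ["#Sample\tMetric\tValue\n"]
for sample, nsti in [('S1', 0.03), ('S2', 0.12)]:
    lines.append("%s\tWeighted NSTI\t%s\n" % (sample, str(nsti)))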
Example #15
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    min_args = 1
    if len(args) < min_args:
        option_parser.error('A BIOM file must be provided.')

    file_name = args[0]

    #allow file to be optionally gzipped (must use extension '.gz')
    ext=splitext(file_name)[1]
    if (ext == '.gz'):
        table = parse_biom_table(gzip.open(file_name,'rb'))
    else:
        table = parse_biom_table(open(file_name,'U'))

    metadata_name=opts.metadata

    if metadata_name is None:
        max_len_metadata=0
    elif table.ObservationMetadata and metadata_name in table.ObservationMetadata[0]:
        #figure out the longest list within the given metadata
        max_len_metadata = max(len(p[metadata_name])
                               for p in table.ObservationMetadata)
    else:
        raise ValueError("'"+metadata_name+"' was not found in the BIOM table. Please try changing --metadata to a valid metadata field.")

    #make the header line
    header=[]
    #make simple labels for each level in the metadata (e.g. 'Level_1',
    #'Level_2', etc.); "+1" for the observation id as well
    for i in range(max_len_metadata+1):
        header.append('Level_'+ str(i+1))
    
    #add the sample ids to the header line
    header.extend(table.SampleIds)
    
    print "\t".join(header)

    #now process each observation (row in the table)
    for obs_vals,obs_id,obs_metadata in table.iterObservations():
        row=[]
        if max_len_metadata >0:
            row=obs_metadata[metadata_name]
        
        #Add blanks if the metadata doesn't fill each level
        if len(row) < max_len_metadata:
            for i in range(max_len_metadata - len(row)):
                row.append('')

        #Add the observation id as the last "Level"
        row.append(obs_id)

        #Add count data to the row
        row.extend(map(str,obs_vals))
        print "\t".join(row)
Example #16
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    pool_by = opts.pool_by.split(',') 
    
    #Construct a dict from user specified field order
    file_name_field_order = {}
    for i, field in enumerate(opts.field_order.split(',')):
        file_name_field_order[field] = i
    if opts.verbose:
        print "Assuming file names are in this order:", file_name_field_order
    for k in pool_by:
        #Check that we're only pooling by values that exist
        if k not in file_name_field_order:
            err_text = ("Bad value for option '--pool_by'. Can't pool by "
                        "'%s'. Valid categories are: %s"
                        % (k, ",".join(file_name_field_order.keys())))
            raise ValueError(err_text)

    if opts.verbose:
        print "Pooling results by:", pool_by
    
    file_name_delimiter = '--'
    pooled_observations, pooled_expectations = pool_test_dataset_dir(
        opts.trait_table_dir, opts.exp_trait_table_dir,
        file_name_delimiter=file_name_delimiter,
        file_name_field_order=file_name_field_order, pool_by=pool_by,
        verbose=opts.verbose)
    
    #prediction_prefix = 'predict_traits'
    #expectation_prefix = 'exp_biom_traits'

    for tag in pooled_observations.keys():
        obs_table = pooled_observations[tag]
        exp_table = pooled_expectations[tag]

        #obs_table_filename = file_name_delimiter.join([prediction_prefix]+[t for t in tag.split()])
        #exp_table_filename = file_name_delimiter.join([expectation_prefix]+[t for t in tag.split()])
        
        obs_table_filename = file_name_delimiter.join(
            ['predict_traits'] + [t for t in tag.split()])
        exp_table_filename = file_name_delimiter.join(
            ['exp_biom_table'] + [t for t in tag.split()])
        
        obs_outpath = join(opts.output_dir,obs_table_filename)
        exp_outpath = join(opts.output_dir,exp_table_filename)

        print obs_outpath
        print exp_outpath
        
        f=open(obs_outpath,'w')
        f.write(obs_table.delimitedSelf())
        f.close()

        f=open(exp_outpath,'w')
        f.write(exp_table.delimitedSelf())
        f.close()
Example #17
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    fastaInput = opts.input_dir
    fastaOut = opts.output_dir
    out = open(fastaOut, 'w')
    onlyVars = getVariable(fastaInput)
    out.write(str(onlyVars))
    out.close()
Example #18
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    start_time = datetime.now()

    t = LoadTree(opts.input_tree)
    translation_dict = {}
    for i, tip in enumerate(t.iterTips()):
        translation_dict[tip.Name] = i

    single_rate = False

    #Generate commands telling BayesTraits which nodes to reconstruct
    bayestraits_commands = make_bayestraits_script(t,
                                                   translation_dict,
                                                   comments=False,
                                                   single_rate=single_rate)

    #TODO: make this dynamic
    #Temporarily assuming there is a nexus file available
    nexus_fp = opts.input_tree.rsplit(".", 1)[0] + ".nexus"
    command_fp = "./bayestraits_commands.txt"
    path_to_bayestraits = "../"
    outfile = "./bayestrait_reconstruction.trait_table"
    command_file = open(command_fp, "w+")
    command_file.writelines(bayestraits_commands)
    command_file.close()

    command_file = open(command_fp, "U")

    bayestraits = BayesTraits()
    bayestraits_result = bayestraits(data=(nexus_fp, opts.input_trait_data,
                                           command_fp))
    #print "StdOut:",result["StdOut"].read()
    print "StdErr:", bayestraits_result["StdErr"].read()
    print "Return code:", bayestraits_result["ExitStatus"]

    results = parse_reconstruction_output(
        bayestraits_result['StdOut'].readlines())
    #print "Reconstructions:",results

    #Reconstruction results
    f = open(outfile, "w+")
    f.writelines(results)
    f.close()

    end_time = datetime.now()
    print "Start time:", start_time
    print "End time:", end_time
    print "Time to reconstruct:", end_time - start_time
    bayestraits_result.cleanUp()
Example #19
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    otu_table = parse_biom_table(open(opts.input_path,'U'))
    tree = DndParser(open(opts.tree_path),UniFracTreeNode)
    dic = otu_table._data
    #A = dict_to_csmat(dic)
    A = dic
    otus_id = otu_table.ObservationIds
    if opts.metrics=='unweighted':
        print unifrac_mix(A,otus_id,tree)
    if opts.metrics=='weighted':
        s = sum_dict(dic)
        print unifrac_mix_weighted(A,otus_id,tree,s)
Example #20
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    otu_table = parse_biom_table(open(opts.input_path, 'U'))
    tree = DndParser(open(opts.tree_path), UniFracTreeNode)
    dic = otu_table._data
    #A = dict_to_csmat(dic)
    A = dic
    otus_id = otu_table.ObservationIds
    if opts.metrics == 'unweighted':
        print unifrac_mix(A, otus_id, tree)
    if opts.metrics == 'weighted':
        s = sum_dict(dic)
        print unifrac_mix_weighted(A, otus_id, tree, s)
Example #21
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error('Must pass -m if passing -s. (Sorry about this, '+\
        'it\'s for backwards-compatibility.)')

    min_args = 2
    if len(args) != min_args:
        option_parser.error('Exactly two arguments are required.')

    output_dir = './'
    run_commands(output_dir, open(args[0]).readlines(), args[1],
                 submit_jobs=opts.submit_jobs,
                 keep_temp=True, num_jobs=opts.num_jobs)
Example #22
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
       
    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error('Must pass -m if passing -s. (Sorry about this, '+\
        'it\'s for backwards-compatibility.)') 

    min_args = 2
    if len(args) != min_args:
        option_parser.error('Exactly two arguments are required.')
       
    output_dir = './'
    run_commands(output_dir, open(args[0]).readlines(), args[1],
                 submit_jobs=opts.submit_jobs,
                 keep_temp=True, num_jobs=opts.num_jobs)
Example #23
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    start_time = datetime.now()

    t = LoadTree(opts.input_tree)
    translation_dict = {}
    for i,tip in enumerate(t.iterTips()):
        translation_dict[tip.Name] = i

    single_rate = False

    #Generate commands telling BayesTraits which nodes to reconstruct
    bayestraits_commands = make_bayestraits_script(t, translation_dict,
                                                   comments=False,
                                                   single_rate=single_rate)


    #TODO: make this dynamic
    #Temporarily assuming there is a nexus file available
    nexus_fp = opts.input_tree.rsplit(".",1)[0] +".nexus"
    command_fp = "./bayestraits_commands.txt"
    path_to_bayestraits = "../"
    outfile = "./bayestrait_reconstruction.trait_table"
    command_file = open(command_fp,"w+")
    command_file.writelines(bayestraits_commands)
    command_file.close()

    command_file = open(command_fp,"U")

    bayestraits=BayesTraits()
    bayestraits_result = bayestraits(data=(nexus_fp,opts.input_trait_data,command_fp))
    #print "StdOut:",result["StdOut"].read()
    print "StdErr:",bayestraits_result["StdErr"].read()
    print "Return code:",bayestraits_result["ExitStatus"]

    results = parse_reconstruction_output(bayestraits_result['StdOut'].readlines())
    #print "Reconstructions:",results

    #Reconstruction results
    f = open(outfile,"w+")
    f.writelines(results)
    f.close()

    end_time = datetime.now()
    print "Start time:", start_time
    print "End time:",end_time
    print "Time to reconstruct:", end_time - start_time
    bayestraits_result.cleanUp()
Example #24
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    tr = parse_newick(open(opts.tree_fp),PhyloNode)
    tip_dists, all_nodes = tr.tipToTipDistances() # tipTo returns a list of actual node objects
    all_ids = [node.Name for node in all_nodes]
    

    o = open(opts.taxa_fp)
    group_ids = [i.strip() for i in o.readline().split(',')]
    o.close()
    # check that there are at least 2 ids in the group, otherwise the math fails
    if len(group_ids) < 2:
        option_parser.error('you must have at least 2 taxa specified' +\
         ' in the taxa file or the math will fail.')

    # make sure specified taxa are in the tree, break at first failure
    for i in group_ids:
        try:
            all_ids.index(i)
        except ValueError:
            option_parser.error('Taxon ' + i + ' not found in the tree. You may'
                                ' have specified an internal node.')

    if len(all_ids)==len(group_ids): # must be the same set of ids if above check passes
        option_parser.error('The taxa_ids you specified contain every tip'+\
            ' in the tree. The NRI and NTI formulas will fail with these values'+\
            ' because there is no standard deviation of mpd or mntd, and thus'+\
            ' division by zero will occur. In addition, the concept of over/under'+\
            ' dispersion of a group of taxa (what NRI/NTI measure) is done in'+\
            ' reference to the tree they are a part of. If the group being tested'+\
            ' is the entire tree, the idea of over/under dispersion does not make'+\
            ' much sense.')

    # mapping from string of method name to function handle
    method_lookup = {'nri':nri, 'nti':nti}

    methods = opts.methods.split(',')
    for method in methods:
        if method not in method_lookup:
            option_parser.error("unknown method: %s; valid methods are: %s"
                                % (method, ', '.join(method_lookup.keys())))
    
    for method in methods:
        print method + ':', method_lookup[method](tip_dists, all_ids,
                                                  group_ids, iters=opts.iters)
Example #25
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    tr = parse_newick(open(opts.tree_fp),PhyloNode)
    tip_dists, all_nodes = tr.tipToTipDistances() #all_nodes is list node objs
    all_ids = [node.Name for node in all_nodes]
    
    o = open(opts.taxa_fp)
    group_ids = [i.strip() for i in o.readline().split(',')]
    o.close()
    # check that there are at least 2 ids in the group, otherwise the math fails
    if len(group_ids) < 2:
        option_parser.error('Not enough taxa in the taxa file. You must have '
            'at least 2 taxa specified in the taxa file or the standard '
            'deviation of the distance will be zero, causing both NRI and '
            'NTI to fail.')
    # check that all_ids contains every group_id
    if not set(group_ids).issubset(all_ids):
        option_parser.error('There are taxa in the taxa file which are '
            'not found in the tree. You may have specified an internal node.')
    # check that all_ids != group_ids
    if len(all_ids) == len(group_ids): #must be same set if above passes
        option_parser.error('The taxa_ids you specified contain every tip'
            ' in the tree. The NRI and NTI formulas will fail'
            ' because there is no standard deviation of mpd or mntd, and thus'
            ' division by zero. In addition, the concept of over/under'
            ' dispersion of a group of taxa (what NRI/NTI measure) is done in'
            ' reference to the tree they are a part of. If the group being'
            ' tested is the entire tree, the idea of over/under dispersion'
            ' makes little sense.')

    # mapping from string of method name to function handle
    method_lookup = {'nri':nri, 'nti':nti}

    methods = opts.methods.split(',')
    for method in methods:
        if method not in method_lookup:
            option_parser.error("Unknown method: %s; valid methods are: %s" % \
                (method, ', '.join(method_lookup.keys())))
    
    for method in methods:
        print method+':', method_lookup[method](tip_dists, all_ids, group_ids, 
            iters=opts.iters)
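
The subset check above in miniature, with hypothetical tip ids; the set difference names exactly the taxa that would trigger the error:

all_ids = ['t1', 't2', 't3']
group_ids = ['t1', 't4']
missing = set(group_ids) - set(all_ids)
# missing == set(['t4']), so the error path fires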
Example #26
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    suppress_errors = opts.suppress_errors

    input_fps = []
    for input_fp in opts.input_fps.split(','):
        input_fps.extend(glob(input_fp))

    for input_fp in input_fps:
        i = 0
        try:
            input_f = open(input_fp, 'U')
        except IOError, e:
            if not suppress_errors:
                print input_fp, e
            continue
        for s in MinimalFastaParser(input_f):
            i += 1
        print input_fp, i
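
Since opts.input_fps is comma-split and each piece is passed through glob, shell-style patterns work; a small illustration with hypothetical paths:

from glob import glob

input_fps = []
for pattern in 'seqs.fna,data/*.fasta'.split(','):
    input_fps.extend(glob(pattern))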
Example #27
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level, 
                                 opts.ignore)
    table = parse_biom_table(open(opts.input_fp))
    result = table.collapseObservationsByMetadata(
        collapse_f, one_to_many=True, norm=False,
        one_to_many_md_key=opts.metadata_category)

    f = open(opts.output_fp,'w')

    if opts.format_tab_delimited:
        f.write(result.delimitedSelf(header_key="KEGG Pathways",
                                     header_value="KEGG Pathways",
                                     metadata_formatter=lambda s: '; '.join(s)))
    else:
        f.write(result.getBiomFormatJsonString('picrust %s - categorize_by_function'\
                                           % __version__))
    f.close()
Example #28
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    suppress_errors = opts.suppress_errors

    input_fps = []
    for input_fp in opts.input_fps.split(','):
        input_fps.extend(glob(input_fp))

    for input_fp in input_fps:
        i = 0
        try:
            input_f = open(input_fp, 'U')
        except IOError, e:
            if not suppress_errors:
                print input_fp, e
            continue
        for s in MinimalFastaParser(input_f):
            i += 1
        print input_fp, i
Example #29
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading sequencing depth table: ", opts.input_seq_depth_file
    scaling_factors = {}
    for sample_id, depth in parse_seq_count_file(open(opts.input_seq_depth_file, "U")):
        scaling_factors[sample_id] = depth

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    genome_table = load_table(opts.input_count_table)

    if opts.verbose:
        print "Scaling the metagenome..."

    scaled_metagenomes = scale_metagenomes(genome_table, scaling_factors)

    if opts.verbose:
        print "Writing results to output file: ", opts.output_metagenome_table

    make_output_dir_for_file(opts.output_metagenome_table)
    write_biom_table(scaled_metagenomes, opts.output_metagenome_table)
Example #30
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    table = load_table(opts.input_fp)
    result = table.collapse(collapse_f, axis='observation', one_to_many=True,
                            norm=False,
                            one_to_many_md_key=opts.metadata_category)


    if(opts.format_tab_delimited):
        f = open(opts.output_fp,'w')
        f.write(result.to_tsv(header_key=opts.metadata_category,
                              header_value=opts.metadata_category,
                              metadata_formatter=lambda s: '; '.join(s)))
        f.close()
    else:
        format_fs = {opts.metadata_category: vlen_list_of_str_formatter}
        write_biom_table(result, opts.output_fp, format_fs=format_fs)
Example #31
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    if opts.verbose:
        print "Loading sequencing depth table: ",opts.input_seq_depth_file
    scaling_factors = {}
    for sample_id,depth in parse_seq_count_file(open(opts.input_seq_depth_file,'U')):
        scaling_factors[sample_id]=depth

    if opts.verbose:
        print "Loading count table: ", opts.input_count_table
    genome_table = load_table(opts.input_count_table)

    if opts.verbose:
        print "Scaling the metagenome..."

    scaled_metagenomes = scale_metagenomes(genome_table,scaling_factors)

    if opts.verbose:
        print "Writing results to output file: ",opts.output_metagenome_table

    make_output_dir_for_file(opts.output_metagenome_table)
    write_biom_table(scaled_metagenomes, opts.output_metagenome_table)
Example #32
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
   
    
    count(opts)
Example #33
def main():

    # Parse input to get parameters
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    tree_file = opts.input_tree
    trait_table_fp = opts.input_trait_table
    verbose = opts.verbose

    #Set output base file names
    trait_table_base = 'trait_table.tab'
    pruned_tree_base = 'pruned_tree.newick'
    reference_tree_base = 'reference_tree.newick'

    output_dir = make_output_dir(opts.output_dir,strict=False)
    output_table_fp = join(output_dir,trait_table_base)
    output_tree_fp = join(output_dir,pruned_tree_base)
    output_reference_tree_fp = join(output_dir,reference_tree_base)

    #Handle parameters with more complex defaults
    delimiter_map = {"space":" ","tab":"\t","comma":","}
    input_delimiter = delimiter_map[opts.input_table_delimiter]
    output_delimiter = delimiter_map[opts.output_table_delimiter]

    if verbose:
        print "Running with options:"
        print "\t%s:%s" %("Tree file",tree_file)
        print "\t%s:%s" %("Trait table",trait_table_fp)
        print "\t%s:%s" %("Output tree",output_tree_fp)
        print "\t%s:%s" %("Output reference tree",output_reference_tree_fp)
        print "\t%s:%s" %("Output trait table",output_table_fp)
        print "\t%s:%s" %("Add branch length to root",opts.add_branch_length_to_root)
        print "\t%s:%s" %("Convert to NEXUS?",opts.convert_to_nexus)
        print "\t%s:%s" %("Input trait table delimiter",opts.input_table_delimiter)
        print "\t%s:%s" %("Output trait table delimiter",opts.output_table_delimiter)

    # Begin reformatting

    root_name = "root"

    if opts.no_minimum_branch_length:
        min_branch_length = None
    else:
        min_branch_length = 0.0001

    #Load inputs
    if verbose:
        print "Loading tree...."

    input_tree = DndParser(open(tree_file))

    if verbose:
        print "Loading trait table..."
    trait_table = open(trait_table_fp,"U")
    trait_table_lines = trait_table.readlines()
    if not trait_table_lines:
        raise IOError("No lines could be loaded from file %s. Please check the input file." %trait_table_fp)

    #Get id mappings from mapping file
    if opts.tree_to_trait_mapping:
        if verbose:
            print "Loading tree to trait table mapping file..."

        mapping_file = open(opts.tree_to_trait_mapping,"U")

        trait_to_tree_mapping =\
          make_id_mapping_dict(parse_id_mapping_file(mapping_file))

    else:
        if verbose:
            print "No tree to trait mapping file specified.  Assuming tree tip names and trait table names will match exactly."
        trait_to_tree_mapping = None

    # Call reformatting function using specified parameters
    # to get reference tree
    if opts.verbose:
        print """**BUILDING REFERENCE TREE (without respect to trait table)**"""

    new_reference_tree, not_useful_trait_table_lines =\
      reformat_tree_and_trait_table(
          tree=input_tree,
          trait_table_lines=[],
          trait_to_tree_mapping=None,
          input_trait_table_delimiter=None,
          output_trait_table_delimiter=None,
          filter_table_by_tree_tips=False,
          convert_trait_floats_to_ints=False,
          filter_tree_by_table_entries=False,
          convert_to_bifurcating=True,
          add_branch_length_to_root=False,
          name_unnamed_nodes=True,
          min_branch_length=min_branch_length,
          verbose=opts.verbose)

    #Make a copy
    new_reference_tree_copy=new_reference_tree.deepcopy()

    if opts.verbose:
        print """**BUILDING PRUNED TREE AND TRAIT TABLE**"""
    # Call reformatting function using specified parameters
    new_tree, new_trait_table_lines = \
       reformat_tree_and_trait_table(
           tree=new_reference_tree_copy,
           trait_table_lines=trait_table_lines,
           trait_to_tree_mapping=trait_to_tree_mapping,
           input_trait_table_delimiter=input_delimiter,
           output_trait_table_delimiter=output_delimiter,
           filter_table_by_tree_tips=True,
           convert_trait_floats_to_ints=False,
           filter_tree_by_table_entries=True,
           convert_to_bifurcating=False,
           add_branch_length_to_root=False,
           name_unnamed_nodes=False,
           min_branch_length=min_branch_length,
           verbose=opts.verbose)



    #Alter reference tree to only contain tips in OTU table (and of course trait table)
    if opts.limit_tree_to_otus_fp:
        if opts.verbose:
            print "Pruning reference tree to contain only tips in OTU table (and trait table)...."
        otu_table = open(opts.limit_tree_to_otus_fp,"U")
        otu_table_lines = otu_table.readlines()
        header_line, otu_table_fields = parse_trait_table(
            otu_table_lines, delimiter=input_delimiter, has_header=False)
        header_line, trait_table_fields = parse_trait_table(
            new_trait_table_lines, delimiter=input_delimiter)


        tips_to_keep = list(otu_table_fields) + list(trait_table_fields)
        tips_to_keep_in_tree = filter_table_by_presence_in_tree(
            new_reference_tree_copy, tips_to_keep)
        new_reference_tree = filter_tree_tips_by_presence_in_table(
            new_reference_tree_copy, tips_to_keep_in_tree,
            verbose=opts.verbose)


    if opts.verbose:
        print "Almost finished. Writing trees and trait table to files..."
    #Write results to files

    # Open output files
    output_trait_table_file = open(output_table_fp,"w+")
    output_tree_file  = open(output_tree_fp,"w+")
    output_reference_tree_file  = open(output_reference_tree_fp,"w+")


    #Output trait table file

    if opts.verbose:
        print "Writing trait table to:", output_table_fp

    output_trait_table_file.write("\n".join(new_trait_table_lines))
    trait_table.close()
    output_trait_table_file.close()

    #Output tree file
    if opts.verbose:
        print "Writing pruned tree to:", output_tree_fp

    if opts.convert_to_nexus is True:
        lines = nexus_lines_from_tree(new_tree)
        output_tree_file.write("\n".join(map(str,lines)))
    else:
        output_tree_file.write(new_tree.getNewick(with_distances=True))

    output_tree_file.close()


    if opts.verbose:
        print "Writing reference tree to:", output_reference_tree_fp
    #Output reference tree file
    output_reference_tree_file.write(new_reference_tree.getNewick(with_distances=True))
    output_reference_tree_file.close()
Example #34
script_info['script_description'] = """If input_tgz has one file: extract it and rename it as output_path.
If input_tgz has multiple files: extract them in a directory named output_path.
If input_tgz is not a tgz file (must be a file, not a directory): rename the input file as output_path"""
script_info['script_usage'] = [("Example:", "Extract the content of the tgz file named 'in.tgz' into the directory 'out_dir'", "%prog -i in.tgz -o out_dir")]
script_info['output_description'] = ""
script_info['required_options'] = [
	make_option('-i', '--input_tgz', type="existing_filepath",
				help='File path for the tgz file to uncompress'),
	make_option('-o', '--output_path', type="new_path",
				help='Path where to extract the contents of the tgz file')
]
script_info['optional_options'] = []
script_info['version'] = __version__

def extract_if_is_tgz(tgz_fp, output_path):
	try:
		extract_from_tgz(tgz_fp, output_path)
	except ValueError, e:
		# The input was not a tgz file; fall back to copying it as-is
		if str(e) == ERROR_MSG:
			copyfile(tgz_fp, output_path)
		else:
			raise ValueError, e


if __name__ == '__main__':
	option_parser, opts, args = parse_command_line_parameters(**script_info)
	tgz_fp = opts.input_tgz
	output_path = opts.output_path

	extract_if_is_tgz(tgz_fp, output_path)
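
extract_from_tgz itself is not shown above; a hypothetical sketch of such a helper using the standard tarfile module (this is not the project's actual implementation, and it assumes ERROR_MSG is the message raised for non-tgz input):

import tarfile

def extract_from_tgz_sketch(tgz_fp, output_path):
    #Raise the sentinel error for non-tar input so the caller can fall
    #back to a plain copy, as extract_if_is_tgz does above.
    if not tarfile.is_tarfile(tgz_fp):
        raise ValueError(ERROR_MSG)
    tar = tarfile.open(tgz_fp, 'r:*')
    try:
        tar.extractall(output_path)
    finally:
        tar.close()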
Example #35
def main():

    # Parse input to get parameters
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    tree_file = opts.input_tree
    trait_table_fp = opts.input_trait_table
    verbose = opts.verbose

    #Set output base file names
    trait_table_base = 'trait_table.tab'
    pruned_tree_base = 'pruned_tree.newick'
    reference_tree_base = 'reference_tree.newick'

    output_dir = make_output_dir(opts.output_dir, strict=False)
    output_table_fp = join(output_dir, trait_table_base)
    output_tree_fp = join(output_dir, pruned_tree_base)
    output_reference_tree_fp = join(output_dir, reference_tree_base)

    #Handle parameters with more complex defaults
    delimiter_map = {"space": " ", "tab": "\t", "comma": ","}
    input_delimiter = delimiter_map[opts.input_table_delimiter]
    output_delimiter = delimiter_map[opts.output_table_delimiter]

    if verbose:
        print "Running with options:"
        print "\t%s:%s" % ("Tree file", tree_file)
        print "\t%s:%s" % ("Trait table", trait_table_fp)
        print "\t%s:%s" % ("Output tree", output_tree_fp)
        print "\t%s:%s" % ("Output reference tree", output_reference_tree_fp)
        print "\t%s:%s" % ("Output trait table", output_table_fp)
        print "\t%s:%s" % ("Add branch length to root",
                           opts.add_branch_length_to_root)
        print "\t%s:%s" % ("Convert to NEXUS?", opts.convert_to_nexus)
        print "\t%s:%s" % ("Input trait table delimiter",
                           opts.input_table_delimiter)
        print "\t%s:%s" % ("Output trait table delimiter",
                           opts.output_table_delimiter)

    # Begin reformatting

    root_name = "root"
    #format_for_bayestraits = True
    #TODO: this will become a new function in the bayestraits app controller
    #if format_for_bayestraits:
    #    convert_to_nexus = True
    #    convert_to_bifurcating = True
    #    filter_table_by_tree_tips = True
    #    filter_tree_by_table_entries = True
    #    enforce_min_branch_length = True
    #    convert_trait_floats_to_ints = True

    if opts.no_minimum_branch_length:
        min_branch_length = None
    else:
        min_branch_length = 0.0001

    #Load inputs
    if verbose:
        print "Loading tree...."

    input_tree = DndParser(open(tree_file))
    #input_tree =DndParser(open(tree_file), constructor=PicrustNode)

    #input_tree = load_picrust_tree(opts.input_tree,opts.verbose)

    if verbose:
        print "Loading trait table..."
    trait_table = open(trait_table_fp, "U")
    trait_table_lines = trait_table.readlines()
    if not trait_table_lines:
        raise IOError(
            "No lines could be loaded from file %s. Please check the input file."
            % trait_table_fp)

    #Get id mappings from mapping file
    if opts.tree_to_trait_mapping:
        if verbose:
            print "Loading tree to trait table mapping file..."

        mapping_file = open(opts.tree_to_trait_mapping, "U")

        trait_to_tree_mapping =\
          make_id_mapping_dict(parse_id_mapping_file(mapping_file))

    else:
        if verbose:
            print "No tree to trait mapping file specified.  Assuming tree tip names and trait table names will match exactly."
        trait_to_tree_mapping = None

    # Call reformatting function using specified parameters
    # to get reference tree
    if opts.verbose:
        print """**BUILDING REFERENCE TREE (without respect to trait table)**"""

    new_reference_tree, not_useful_trait_table_lines =\
      reformat_tree_and_trait_table(\
      tree=input_tree,\
      trait_table_lines = [],\
      trait_to_tree_mapping = None,\
      input_trait_table_delimiter= None,\
      output_trait_table_delimiter= None,\
      filter_table_by_tree_tips=False,\
      convert_trait_floats_to_ints=False,\
      filter_tree_by_table_entries=False,\
      convert_to_bifurcating=True,\
      add_branch_length_to_root=False,\
      name_unnamed_nodes=True,\
      min_branch_length=min_branch_length,\
      verbose=opts.verbose)

    #Make a copy
    new_reference_tree_copy = new_reference_tree.deepcopy()

    if opts.verbose:
        print """**BUILDING PRUNED TREE AND TRAIT TABLE**"""
    # Call reformatting function using specified parameters
    new_tree, new_trait_table_lines = \
       reformat_tree_and_trait_table(tree=new_reference_tree_copy,\
       trait_table_lines = trait_table_lines,\
       trait_to_tree_mapping = trait_to_tree_mapping,\
       input_trait_table_delimiter= input_delimiter,\
       output_trait_table_delimiter=output_delimiter,\
       filter_table_by_tree_tips=True,\
       convert_trait_floats_to_ints=False,\
       filter_tree_by_table_entries=True,\
       convert_to_bifurcating=False,\
       add_branch_length_to_root=False,\
       name_unnamed_nodes=False,\
       min_branch_length=min_branch_length,\
       verbose=opts.verbose)

    #Alter reference tree to only contain tips in OTU table (and of course trait table)
    if opts.limit_tree_to_otus_fp:
        if opts.verbose:
            print "Pruning reference tree to contain only tips in OTU table (and trait table)...."
        otu_table = open(opts.limit_tree_to_otus_fp, "U")
        otu_table_lines = otu_table.readlines()
        header_line, otu_table_fields = parse_trait_table(
            otu_table_lines, delimiter=input_delimiter, has_header=False)
        header_line, trait_table_fields =\
            parse_trait_table(new_trait_table_lines, delimiter=input_delimiter)

        tips_to_keep = list(otu_table_fields) + list(trait_table_fields)
        tips_to_keep_in_tree = filter_table_by_presence_in_tree(
            new_reference_tree_copy, tips_to_keep)
        new_reference_tree = filter_tree_tips_by_presence_in_table(
            new_reference_tree_copy, tips_to_keep_in_tree,
            verbose=opts.verbose)

    if opts.verbose:
        print "Almost finished. Writing trees and trait table to files..."
    #Write results to files

    # Open output files
    output_trait_table_file = open(output_table_fp, "w+")
    output_tree_file = open(output_tree_fp, "w+")
    output_reference_tree_file = open(output_reference_tree_fp, "w+")

    #Output trait table file

    if opts.verbose:
        print "Writing trait table to:", output_table_fp

    output_trait_table_file.write("\n".join(new_trait_table_lines))
    trait_table.close()
    output_trait_table_file.close()

    #Output tree file
    if opts.verbose:
        print "Writing pruned tree to:", output_tree_fp

    if opts.convert_to_nexus is True:
        lines = nexus_lines_from_tree(new_tree)
        output_tree_file.write("\n".join(map(str, lines)))
    else:
        output_tree_file.write(new_tree.getNewick(with_distances=True))

    output_tree_file.close()

    if opts.verbose:
        print "Writing reference tree to:", output_reference_tree_fp
    #Output reference tree file
    output_reference_tree_file.write(
        new_reference_tree.getNewick(with_distances=True))
    output_reference_tree_file.close()
Example #36
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    tmp_dir = 'jobs/'
    make_output_dir(tmp_dir)

    #Run the jobs
    script_fp = join(get_picrust_project_dir(), 'scripts', 'predict_traits.py')

    if (opts.parallel_method == 'sge'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_sge.py')
    elif (opts.parallel_method == 'multithreaded'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs.py')
    elif (opts.parallel_method == 'torque'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_torque.py')
    else:
        raise RuntimeError("Unknown parallel method: %s" % opts.parallel_method)

    if (opts.verbose):
        print "Loading tree..."

    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]

    if (opts.verbose):
        print "Total number of possible tips to predict: {0}".format(
            len(all_tips))

    created_tmp_files = []
    output_files = {}
    output_files['counts'] = []
    if opts.reconstruction_confidence:
        output_files['variances'] = []
        output_files['upper_CI'] = []
        output_files['lower_CI'] = []

    if opts.already_calculated:
        all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(
                len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    if (opts.verbose):
        print "Creating temporary input files in: ", tmp_dir

    num_tips_per_job = 1000
    for tips_to_predict in [
            all_tips[i:i + num_tips_per_job]
            for i in range(0, len(all_tips), num_tips_per_job)
    ]:

        #create tmp output files
        tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str = ','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            outfile_base, extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base + "_variances.tab")
            output_files['upper_CI'].append(outfile_base + "_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base + "_lower_CI.tab")

            #create the job command
            cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, opts.reconstruction_confidence,
                tip_to_predict_str, tmp_output_fp)

        else:
            cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, tip_to_predict_str,
                tmp_output_fp)

        #NOTE: Calculating NSTI this way is convenient, but it would probably
        #be faster to run the NSTI calculation separately (using
        #--output_accuracy_metrics_only) and add it to the output file later on.
        if opts.calculate_accuracy_metrics:
            cmd = cmd + " -a"

        #add job command to the jobs file
        jobs.write(cmd + "\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])
    if (opts.verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix = 'picrust'
    submit_jobs(cluster_jobs_fp,
                jobs_fp,
                job_prefix,
                num_jobs=opts.num_jobs,
                delay=opts.delay)

    if (opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if (opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base, extension = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
        #Combine output files
        if opts.verbose:
            print "Combining all output files for " + predict_type

        combined_predictions = combine_predict_trait_output(
            output_files[predict_type])

        if opts.verbose:
            print "Writing combined file for " + predict_type

        if predict_type == 'counts':
            #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table, 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table, 'w').write(combined_predictions)
        else:
            if opts.output_precalc_file_in_biom:
                open(outfile_base + "_" + predict_type + ".biom", 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base + "_" + predict_type + ".tab",
                     'w').write(combined_predictions)

    #clean up all tmp files
    for tmp_fp in created_tmp_files:
        remove(tmp_fp)
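The example above splits the full tip list into fixed-size batches (one predict_traits.py job per 1000 tips) and then blocks on a simple filesystem poller. A minimal sketch of both idioms, assuming wait_for_output_files simply polls for the expected files to appear (the real PICRUSt helper may differ):

import os
import time

def chunk(items, size):
    #Yield successive fixed-size batches, mirroring the slicing idiom above.
    for i in range(0, len(items), size):
        yield items[i:i + size]

def wait_for_files(paths, poll_seconds=60):
    #Hypothetical stand-in for wait_for_output_files: block until every
    #expected output file exists on disk.
    while not all(os.path.exists(p) for p in paths):
        time.sleep(poll_seconds)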
Example #37
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ids_to_load = otu_table.ObservationIds

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading count table: ", input_count_table

    ext=path.splitext(input_count_table)[1]
    
    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table,'rb')
    else:
        genome_table_fh = open(input_count_table,'U')
    
    #In the genome/trait table genomes are the samples and 
    #genes are the observations

    
    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs
           
            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = parse_biom_table(genome_table_fh.read())
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load)
    
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,limit_to_functions=limit_to_functions)
    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_fp
        
    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp,'w').write(output_text)
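Several of these examples repeat the same extension check to decide between gzip.open and a plain open. A small helper that encapsulates the pattern, sketched here (not part of PICRUSt's API):

import gzip
from os import path

def open_maybe_gzipped(fp):
    #Transparently handle .gz inputs, as the examples above do with an
    #explicit splitext check before choosing gzip.open or open.
    if path.splitext(fp)[1] == '.gz':
        return gzip.open(fp, 'rb')
    return open(fp, 'U')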
Example #38
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    if opts.verbose:
        print "Loading tree from file:", opts.tree
    
    # Load Tree
    #tree = LoadTree(opts.tree)
    tree = load_picrust_tree(opts.tree, opts.verbose)

    table_headers =[]
    traits={}
    #load the asr trait table using the previous list of functions to order the arrays
    if opts.reconstructed_trait_table:
        table_headers,traits =\
                update_trait_dict_from_file(opts.reconstructed_trait_table)

        #Only load confidence intervals on the reconstruction
        #If we actually have ASR values in the analysis
        if opts.reconstruction_confidence:
            if opts.verbose:
                print "Loading ASR confidence data from file:",\
                opts.reconstruction_confidence
            
            asr_confidence_output = open(opts.reconstruction_confidence)
            asr_min_vals,asr_max_vals, params,column_mapping =\
              parse_asr_confidence_output(asr_confidence_output)
            brownian_motion_parameter = params['sigma'][0]
            brownian_motion_error = params['sigma'][1]
            if opts.verbose:
                print "Done. Loaded %i confidence interval values." %(len(asr_max_vals))
                print "Brownian motion parameter:",brownian_motion_parameter
        else:
            brownian_motion_parameter = None

    #load the trait table into a dict with organism names as keys and arrays of trait values as values
    table_headers,genome_traits =\
            update_trait_dict_from_file(opts.observed_trait_table,table_headers)


    #Combine the trait tables overwriting the asr ones if they exist in the genome trait table.
    traits.update(genome_traits)
        
    # Specify the attribute where we'll store the reconstructions
    trait_label = "Reconstruction"
   
    if opts.verbose:
        print "Assigning traits to tree..."

    # Decorate tree using the traits
    tree = assign_traits_to_tree(traits,tree, trait_label=trait_label)

    
    if opts.reconstruction_confidence: 
        if opts.verbose:
            print "Assigning trait confidence intervals to tree..."
        tree = assign_traits_to_tree(asr_min_vals,tree,\
            trait_label="lower_bound")

        tree = assign_traits_to_tree(asr_max_vals,tree,\
            trait_label="upper_bound")


    if opts.verbose:
        print "Collecting list of nodes to predict..."

    #Start by predicting all tip nodes.
    nodes_to_predict = [tip.Name for tip in tree.tips()]
    
    if opts.verbose:
        print "Found %i nodes to predict." % len(nodes_to_predict)

    if opts.limit_predictions_to_organisms:
        organism_id_str = opts.limit_predictions_to_organisms
        ok_organism_ids = organism_id_str.split(',')
        ok_organism_ids = [n.strip() for n in ok_organism_ids]
        for f in set_label_conversion_fns(True,True):
            ok_organism_ids = [f(i) for i in ok_organism_ids]
        
        if opts.verbose:
            print "Limiting predictions to user-specified ids:",\
              ",".join(ok_organism_ids)
        
        
        if not ok_organism_ids:
            raise RuntimeError(\
              "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\
              % opts.limit_predictions_to_organisms)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in ok_organism_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the command line and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0]))
        
        if opts.verbose:
            print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" %(len(nodes_to_predict))
    
    if opts.limit_predictions_by_otu_table:
        if opts.verbose:
            print "Limiting predictions to ids in user-specified OTU table:",\
              opts.limit_predictions_by_otu_table
        otu_table = open(opts.limit_predictions_by_otu_table,"U")
        #Parse OTU table for ids
        
        otu_ids =\
          extract_ids_from_table(otu_table.readlines(),delimiter="\t")
        
        if not otu_ids:
            raise RuntimeError(\
              "Found no valid ids in input OTU table: %s.  Is the path correct?"\
              % opts.limit_predictions_by_otu_table)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in otu_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by OTU table resulted in an empty set of nodes to predict. Are the OTU ids and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0]))
        
        if opts.verbose:
            print "After filtering by OTU table, %i nodes remain to be predicted" %(len(nodes_to_predict))

    # Calculate accuracy of PICRUSt for the given tree, sequenced genomes,
    # and set of nodes to predict
    accuracy_metrics = ['NSTI']
    accuracy_metric_results = None
    if opts.output_accuracy_metrics:
        if opts.verbose:
            print "Calculating accuracy metrics: %s" %([",".join(accuracy_metrics)])
        accuracy_metric_results = {}
        if 'NSTI' in accuracy_metrics:

            nsti_result,min_distances =\
                calc_nearest_sequenced_taxon_index(tree,\
                limit_to_tips = nodes_to_predict,\
                trait_label = trait_label, verbose=opts.verbose)
            
            #accuracy_metric_results['NSTI'] = nsti_result
            for organism in min_distances.keys():
                accuracy_metric_results[organism] = {'NSTI': min_distances[organism]}
        
            if opts.verbose:
                print "NSTI:", nsti_result
   
        #Write accuracy metrics to file
        if opts.verbose:
            print "Writing accuracy metrics to file:",opts.output_accuracy_metrics
   
        f = open(opts.output_accuracy_metrics,'w+')
        f.write("metric\torganism\tvalue\n")
        lines = []
        for organism in accuracy_metric_results.keys():
            for metric in accuracy_metric_results[organism].keys():
                lines.append('\t'.join([metric,organism,\
                  str(accuracy_metric_results[organism][metric])])+'\n')
        f.writelines(sorted(lines))
        f.close()


    if opts.verbose:
        print "Generating predictions using method:",opts.prediction_method

    if opts.weighting_method == 'exponential':
        #For now, use exponential weighting
        weight_fn = make_neg_exponential_weight_fn(e)
    elif opts.weighting_method == 'linear':
        #Linear weight function
        weight_fn = linear_weight
    elif opts.weighting_method == 'equal_weight':
        weight_fn = equal_weight

    variances=None #Overwritten by methods that calc variance

    if opts.prediction_method == 'asr_and_weighting': 
  
        if opts.reconstruction_confidence:
        # Perform predictions using reconstructed ancestral states
            predictions,variances =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              lower_bound_trait_label="lower_bound",\
              upper_bound_trait_label="upper_bound",\
              calc_confidence_intervals = True,\
              brownian_motion_parameter=brownian_motion_parameter,\
              use_self_in_prediction = True,\
              weight_fn =weight_fn,verbose=opts.verbose)
    
        else:
             predictions =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              use_self_in_prediction = True,\
              weight_fn =weight_fn,verbose=opts.verbose)
    
    elif opts.prediction_method == 'weighting_only':
        #Ignore ancestral information
        predictions =\
          weighted_average_tip_prediction(tree,nodes_to_predict,\
          trait_label=trait_label,\
          use_self_in_prediction = True,\
          weight_fn =weight_fn,verbose=opts.verbose)
        


    elif opts.prediction_method == 'nearest_neighbor':
        
        predictions = predict_nearest_neighbor(tree,nodes_to_predict,\
          trait_label=trait_label,\
          use_self_in_prediction = True, tips_only = True)

    elif opts.prediction_method == 'random_neighbor':
        
        predictions = predict_random_neighbor(tree,\
          nodes_to_predict,trait_label=trait_label,\
          use_self_in_prediction = True)
    else:
        error_template =\
          "Prediction method '%s' is not supported.  Valid methods are: %s"

        error_text = error_template %(opts.prediction_method,\
          ", ".join(METHOD_CHOICES))
        raise ValueError(error_text)

    if opts.verbose:
        print "Converting results to .biom format for output..."
    #convert to biom format (and transpose)
    biom_predictions=biom_table_from_predictions(predictions,table_headers)
    #In the .biom table, organisms are 'samples' and traits are 'observations'
    #(by analogy with a metagenomic sample)
    
    #Therefore, we associate the trait variances with the per-sample metadata
    
    #print "variances:",variances
    #print "BIOM observations:", [o for o in biom_predictions.iterObservations()] 
    #print "BIOM samples:", [s for s in biom_predictions.iterSamples()] 
    
    if variances is not None:
        if opts.verbose:
            print "Adding variance information to output .biom table, as per-sample metadata with key 'variance'..."
        biom_predictions.addSampleMetadata(variances)
    
    if accuracy_metric_results is not None:
        if opts.verbose:
            print "Adding accuracy metrics (%s) to biom table as per-sample metadata..." %(",".join(accuracy_metrics))
        biom_predictions.addSampleMetadata(accuracy_metric_results)
    
    if opts.verbose:
        print "Writing biom format prediction results to file: ",opts.output_trait_table
    #write biom table to file
    make_output_dir_for_file(opts.output_trait_table)
    open(opts.output_trait_table,'w').write(\
     format_biom_table(biom_predictions))
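Example #38 selects among three distance-weighting schemes when averaging traits over relatives. The real helpers live in PICRUSt; conceptually each maps a branch-length distance to a relative weight. A hedged sketch of what such functions might look like (the names and exact forms below are illustrative, not PICRUSt's implementations):

from math import e

def make_neg_exponential_weight_sketch(base=e):
    #Closer relatives (smaller distance d) get exponentially more weight.
    return lambda d: base ** (-d)

def linear_weight_sketch(d, max_d=1.0):
    #Weight falls off linearly with distance, floored at zero.
    return max(max_d - d, 0.0) / max_d

def equal_weight_sketch(d):
    #Every contributing neighbor counts the same, regardless of distance.
    return 1.0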
Example #39
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    pool_by = opts.pool_by.split(',')


    #create output directory
    make_output_dir(opts.output_dir)

    #Construct a dict from user specified field order
    file_name_field_order = {}
    for i,field in enumerate(opts.field_order.split(',')):
        file_name_field_order[field]=i
    if opts.verbose:
        print "Assuming file names are in this order:",file_name_field_order

    for k in pool_by:
        #Check that we're only pooling by values that exist
        if k not in file_name_field_order.keys():
            err_text=\
              "Bad value for option '--pool_by'.  Can't pool by '%s'.   Valid categories are: %s" %(k,\
              ",".join(file_name_field_order.keys()))
            raise ValueError(err_text)

    if opts.verbose:
        print "Pooling results by:",pool_by

    roc_success_criteria = ['binary','exact','int_exact']

    scatter_lines,correlation_lines,roc_result_lines,roc_auc_lines =\
      evaluate_test_dataset_dir(opts.trait_table_dir,\
      opts.exp_trait_table_dir,file_name_delimiter="--",\
      file_name_field_order=file_name_field_order,pool_by=pool_by,\
      roc_success_criteria=roc_success_criteria,verbose=opts.verbose)

    #Output scatter data

    output_fp = join(opts.output_dir,'evaluation_scatter_data.tab')
    if opts.verbose:
        print "Writing scatter plot data to:",output_fp
    file_lines = scatter_lines

    f = open(output_fp,"w+")
    f.writelines(file_lines)
    f.close()

    #Output correlation data

    output_fp = join(opts.output_dir,'evaluation_correlation_data.tab')

    if opts.verbose:
        print "Writing correlation data to:",output_fp

    file_lines = correlation_lines

    f = open(output_fp,"w+")
    f.writelines(file_lines)
    f.close()

    #Output raw ROC plot data
    if opts.verbose:
        print "Writing ROC data..."
    for c in roc_result_lines.keys():
        output_fp = join(opts.output_dir,'evaluation_roc_data_%s.tab' %c)
        if opts.verbose:
            print "Outputting ROC data for success criterion %s to: %s" %(c,output_fp)
        file_lines = roc_result_lines[c]

        f = open(output_fp,"w+")
        f.writelines(file_lines)
        f.close()

    #Output summary ROC AUC data
    if opts.verbose:
        print "Writing ROC AUC data..."

    for c in roc_auc_lines.keys():
        output_fp = join(opts.output_dir,'evaluation_roc_auc_data_%s.tab' %c)
        file_lines = roc_auc_lines[c]

        if opts.verbose:
            print "Outputting ROC AUC data for success criterion %s to: %s" %(c,output_fp)
        f = open(output_fp,"w+")
        f.writelines(file_lines)
        f.close()
Example #40
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    #set some defaults for the options
    input_dir = opts.input_dir
    output_dir = opts.output_dir or input_dir
    tmp_dir = opts.tmp_dir or output_dir
    parallel_method = opts.parallel_method
    asr_method = opts.asr_method
    predict_traits_method = opts.prediction_method

    if opts.num_jobs > 20 and parallel_method == 'multithreaded':
        raise ValueError(
            "You probably don't want to run multithreaded evaluations with a large num_jobs. Please adjust the num_jobs and/or parallel_method options."
        )

    if opts.with_confidence and asr_method not in ['ace_ml', 'ace_reml']:
        raise ValueError(
            "PICRUSt currently only supports confidence intervals with the ace_ml and ace_reml ASR methods"
        )

    if opts.verbose:
        print "Reconstruction method:", asr_method
        print "Prediction method:", predict_traits_method
        print "Parallel method:", parallel_method
        print "num_jobs:", opts.num_jobs
        print "\nOutput will be saved here:'%s'" % output_dir

    #create the output directory unless it already exists
    make_output_dir(output_dir)

    if (parallel_method == 'sge'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_sge.py')
    elif (parallel_method == 'multithreaded'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs.py')
    elif (parallel_method == 'torque'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_torque.py')
    else:
        raise RuntimeError("Unknown parallel method: %s" % parallel_method)

    #get the test datasets to run in the input directory (based on exp_traits files)
    expect_test_files = glob(join(input_dir, 'exp_traits--*'))

    test_datasets = {}
    for file_name in expect_test_files:
        test_id = file_name.replace(join(input_dir, 'exp_traits--'), '', 1)
        #create a dict with the test files as values in the ref list
        test_datasets[test_id] = [
            join(input_dir, 'test_trait_table--' + test_id),
            join(input_dir, 'test_tree--' + test_id),
            join(input_dir, 'exp_traits--' + test_id)
        ]

    created_tmp_files = []
    output_files = []

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    #get location of scripts we need to run
    asr_script_fp = join(get_picrust_project_dir(), 'scripts',
                         'ancestral_state_reconstruction.py')
    predict_traits_script_fp = join(get_picrust_project_dir(), 'scripts',
                                    'predict_traits.py')

    #run each test dataset through the pipeline
    for test_id in test_datasets:

        asr_out_fp = join(output_dir, 'asr--' + asr_method + '--' + test_id)
        asr_params_out_fp = join(
            output_dir, '--'.join(['asr', asr_method, 'asr_params', test_id]))
        created_tmp_files.append(asr_out_fp)

        if opts.check_for_null_files and exists(
                asr_out_fp) and file_contains_nulls(asr_out_fp):
            #remove file
            if opts.verbose:
                print "Existing ASR file contains null characters. Will run ASR again after removing: " + asr_out_fp
            remove(asr_out_fp)

        if exists(asr_out_fp) and not opts.force:
            if opts.verbose:
                print "Output file: {0} already exists, so we will skip it.".format(
                    asr_out_fp)
            asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" % (
                test_id, asr_out_fp)
        else:
            #create the asr command
            asr_cmd = """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format(
                asr_script_fp, test_datasets[test_id][0],
                test_datasets[test_id][1], asr_method, asr_out_fp,
                asr_params_out_fp)

        predict_traits_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
          opts.weighting_method,test_id]))

        if opts.with_accuracy:
            predict_traits_accuracy_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
              opts.weighting_method,'accuracy_metrics',test_id]))

        if opts.check_for_null_files and exists(
                predict_traits_out_fp) and file_contains_nulls(
                    predict_traits_out_fp):
            if opts.verbose:
                print "Existing trait predictions file contains null characters. Will run it again after removing: " + predict_traits_out_fp
            remove(predict_traits_out_fp)

        if exists(predict_traits_out_fp) and not opts.force:
            if opts.verbose:
                print "Prediction file: {0} already exists. Skipping ASR and prediction for this organism".format(
                    predict_traits_out_fp)
            continue

        output_files.append(predict_traits_out_fp)

        genome_id = split('--', test_id)[2]

        if predict_traits_method == 'nearest_neighbor':
            #don't do asr step
            predict_traits_cmd = """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format(
                predict_traits_script_fp, test_datasets[test_id][0],
                opts.ref_tree, genome_id, predict_traits_out_fp,
                predict_traits_method)
            jobs.write(predict_traits_cmd + "\n")
        else:

            #create the predict traits command
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(predict_traits_script_fp,\
            test_datasets[test_id][0], opts.ref_tree, asr_out_fp,genome_id, predict_traits_out_fp,predict_traits_method,opts.weighting_method)

            #Instruct predict_traits to use confidence intervals output by ASR
            if opts.with_confidence:
                confidence_param = ' -c "%s"' % (asr_params_out_fp)
                predict_traits_cmd = predict_traits_cmd + confidence_param

            #Instruct predict_traits to output the NSTI measure of distance to
            #nearby sequences.

            if opts.with_accuracy:
                accuracy_param = ' -a "%s"' % (predict_traits_accuracy_out_fp)
                predict_traits_cmd = predict_traits_cmd + accuracy_param

            #add job command to the jobs file
            jobs.write(asr_cmd + ';' + predict_traits_cmd + "\n")

    jobs.close()

    #created_tmp_files.extend(output_files)

    #submit the jobs
    job_prefix = 'eval_'

    if opts.verbose:
        print "Submitting jobs:", cluster_jobs_fp, jobs_fp, job_prefix, opts.num_jobs
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=opts.num_jobs)
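Each line written to the jobs file above is one independent shell job; note how the ASR command and its dependent prediction command are chained with ';' so they run sequentially inside a single job. A minimal serial stand-in for the cluster submission scripts, as a sketch:

import subprocess

def run_jobs_file_locally(jobs_fp):
    #Execute each line of a jobs file as its own shell command; a
    #cluster runner would instead submit these lines as separate jobs.
    with open(jobs_fp) as jobs:
        for line in jobs:
            line = line.strip()
            if line:
                subprocess.call(line, shell=True)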
Example #41
def main():
    """Generate test trees given parameters"""
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading trait table..."
    input_trait_table = open(opts.input_trait_table, "U")

    if opts.verbose:
        print "Loading tree..."
    #PicrustNode seems to run into very slow/memory-intensive performance...
    #tree = DndParser(open(opts.input_tree),constructor=PicrustNode)
    tree = DndParser(open(opts.input_tree))

    if opts.verbose:
        print "Parsing trait table..."
    #Find which taxa are to be used in tests
    #(by default trait table taxa)
    trait_table_header,trait_table_fields = \
            parse_trait_table(input_trait_table)

    if opts.verbose:
        print "Ensuring tree and trait table labels are formatted consistently..."

    label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)

    fix_tree_labels(tree, label_conversion_fns)

    trait_table_fields = convert_trait_table_entries(trait_table_fields,\
      value_conversion_fns = [],\
      label_conversion_fns = label_conversion_fns)

    trait_table_fields = [t for t in trait_table_fields]
    if opts.verbose:
        print "Number of trait table fields with single quotes:",\
         len([t for t in trait_table_fields if "'" in t[0]])

    if opts.verbose:
        print "Making output directory..."
    make_output_dir(opts.output_dir)

    if opts.limit_to_tips:

        included_tips = opts.limit_to_tips.split(",")
        if opts.verbose:
            print "Limiting test datasets to %i tips: %s" % (
                len(included_tips), included_tips)
    else:
        included_tips = False

    method_fns =\
      {"exclude_tips_by_distance":\
         make_distance_based_exclusion_fn,\
       "randomize_tip_labels_by_distance":\
         make_distance_based_tip_label_randomizer
       }

    test_fn_factory = method_fns[opts.method]

    if opts.verbose:
        print "Setting tree modification method to:", opts.method
        print "(%s)" % test_fn_factory.__doc__

    modify_tree = True
    if opts.suppress_tree_modification:
        if opts.verbose:
            print "Suppressing modification of tree when making test datasets"
        modify_tree = False

    if opts.verbose:
        print "Starting generation of test datasets"

    test_datasets = \
      yield_genome_test_data_by_distance(tree,trait_table_fields,\
      test_fn_factory,min_dist = opts.min_dist,\
      max_dist=opts.max_dist,increment=opts.dist_increment,\
      modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose)

    if opts.verbose:
        print "Writing files for test datasets"

    for curr_dist,test_tree,tip_to_predict,\
        expected_traits,test_trait_table_fields in test_datasets:

        if included_tips is not False:
            if tip_to_predict not in included_tips:
                if opts.verbose:
                    print "Skipping tip %s: limiting to tip(s): %s" % (
                        tip_to_predict, included_tips)
                continue

        #Make a safe version of tip to predict
        # So odd characters like | don't mess up OS

        safe_tip_to_predict = "'%s'" % tip_to_predict

        #Write tree
        base_name = "--".join(map(str, ["test_tree", opts.method, curr_dist]))
        curr_filepath = write_tree(opts.output_dir, base_name, test_tree,
                                   safe_tip_to_predict)
        if opts.verbose:
            print "Wrote test tree to: %s" % curr_filepath

        #Write expected trait table
        base_name = "--".join(
            map(str,
                ["exp_traits", opts.method, curr_dist, safe_tip_to_predict]))

        exp_trait_table_lines = [trait_table_header]
        exp_trait_table_lines.append("\t".join(expected_traits) + "\n")
        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename = os.path.join(opts.output_dir, base_name)
        if opts.verbose:
            print "Writing expected trait table to:", filename

        f = open(filename, "w")
        f.write("".join(exp_trait_table_lines))
        f.close()

        #Output a transposed, BIOM format expectation table for comparison with predict_traits output

        #NOTE: this is a clumsy way of getting the translated trait table,
        # but more elegant, direct methods (directly feeding data to biom's table_factory)
        # weren't working readily. In the future, we should streamline this process.
        # Leaving as is for now since this code is mostly for developers, so speed/elegance
        # are probably not essential here.

        #Let the hackishness begin

        #Reload the tab-delimited trait table
        header, fields = parse_trait_table(open(filename, "U"))
        fields = [f for f in fields]  #converts generator to list

        #Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines =\
          transpose_trait_table_fields(fields,header,\
          id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t")

        #Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend(
            ["\t".join(r) for r in transposed_trait_table_lines])
        trans_trait_table = '\n'.join(trans_trait_table_lines)

        #Write BIOM format expected trait table
        base_name = "--".join(
            map(str, [
                "exp_biom_traits", opts.method, curr_dist, safe_tip_to_predict
            ]))

        expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\
            table_format = "tab-delimited")

        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename = os.path.join(opts.output_dir, base_name)
        if opts.verbose:
            print "Writing BIOM-format expected trait table to:", filename

        f = open(filename, "w")
        f.write(format_biom_table(expected_biom_table))
        f.close()

        #Write test trait table
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend(
            ["\t".join(r) + "\n" for r in test_trait_table_fields])

        #print "Test_trait_table_lines:",test_trait_table_lines
        base_name = "--".join(
            map(str, [
                "test_trait_table", opts.method, curr_dist, safe_tip_to_predict
            ]))
        filename = os.path.join(opts.output_dir, base_name)

        if opts.verbose:
            print "Writing test trait table to:", filename

        f = open(filename, "w")
        f.write("".join(test_trait_table_lines))
        f.close()

    if opts.verbose:
        print "Done generating test datasets"
Example #42
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_ext = path.splitext(opts.input_otu_fp)[1]
    if opts.input_format_classic:
        otu_table = parse_classic_table_to_rich_table(
            open(opts.input_otu_fp, 'U'), None, None, None, DenseOTUTable)
    else:
        try:
            otu_table = parse_biom_table(open(opts.input_otu_fp, 'U'))
        except ValueError:
            raise ValueError(
                "Error loading OTU table! If not in BIOM format use '-f' option.\n"
            )

    ids_to_load = otu_table.ObservationIds

    if (opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(
            ['16S', opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data',
                                 precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table, 'rb')
    else:
        count_table_fh = open(input_count_table, 'U')

    if opts.load_precalc_file_in_biom:
        count_table = parse_biom_table(count_table_fh.read())
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    #Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iterObservations():
        ids.append(str(x[1]))

    ob_id = count_table.ObservationIds[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.sampleExists(x):
            filtered_otus.append(x)
            filtered_values.append(otu_table.observationData(x))

    #filtered_values = map(list,zip(*filtered_values))
    filtered_otu_table = table_factory(filtered_values,
                                       otu_table.SampleIds,
                                       filtered_otus,
                                       constructor=DenseOTUTable)

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.getValueByIds(ob_id, x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError(
                "Invalid copy number (%s) for OTU ID %s. Must be int-able." % (value, x))
        if value < 1:
            raise ValueError("Copy numbers must be greater than or equal to 1.")

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.addObservationMetadata(copy_numbers_filtered)

    normalized_table = filtered_otu_table.normObservationByMetadata(
        opts.metadata_identifer)

    #move observation and sample metadata from the original to the normalized OTU table
    normalized_table = transfer_observation_metadata(otu_table,
                                                     normalized_table,
                                                     'ObservationMetadata')
    normalized_table = transfer_sample_metadata(otu_table,
                                                normalized_table,
                                                'SampleMetadata')

    make_output_dir_for_file(opts.output_otu_fp)
    open(opts.output_otu_fp, 'w').write(format_biom_table(normalized_table))
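Example #42's core arithmetic is simple: divide each OTU's abundances by its 16S copy number, so an OTU carrying four 16S copies contributes a quarter as many organisms as its raw read count suggests. A sketch with plain dicts, assuming the integer copy numbers validated above:

def normalize_by_copy_number(otu_counts, copy_numbers):
    #otu_counts: {otu_id: [count_per_sample, ...]}
    #copy_numbers: {otu_id: integer >= 1}
    normalized = {}
    for otu_id, counts in otu_counts.items():
        n = float(copy_numbers[otu_id])
        normalized[otu_id] = [c / n for c in counts]
    return normalized

#e.g. normalize_by_copy_number({'OTU1': [8, 4]}, {'OTU1': 4}) -> {'OTU1': [2.0, 1.0]}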
Example #43
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    print_picrust_config()
Example #44
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if (opts.suppress_unit_tests and opts.suppress_script_usage_tests):
        option_parser.error("You're suppressing both test types. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    unittest_good_pattern = re.compile('OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of PICRUSt's unit tests, and keep track of any files which
    # fail unit tests.
    if not opts.suppress_unit_tests:
        unittest_names = []
        if not opts.unittest_glob:
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root,name))
        else:
            for fp in glob(opts.unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = system_call(command)
            print stderr
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    if not opts.suppress_script_usage_tests:
        try:
            from qiime.test import run_script_usage_tests
        except ImportError:
            print "QIIME not installed so not running script tests."
            opts.suppress_script_usage_tests=True
        else:
            test_data_dir = join(get_picrust_project_dir(),'picrust_test_data')
            scripts_dir  = join(get_picrust_project_dir(),'scripts')
            if opts.script_usage_tests is not None:
                script_usage_tests = opts.script_usage_tests.split(',')
            else:
                script_usage_tests = None

            # Run the script usage testing functionality
            script_usage_result_summary, num_script_usage_example_failures = \
                    run_script_usage_tests(
                    test_data_dir=test_data_dir,
                    scripts_dir=scripts_dir,
                    working_dir='/tmp/',
                    verbose=True,
                    tests=script_usage_tests,
                    force_overwrite=True,
                    timeout=300)

    print "==============\nResult summary\n=============="

    if not opts.suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print "\nFailed the following unit tests.\n%s" % '\n'.join(bad_tests)

        if missing_application_tests:
            print "\nFailed the following unit tests, in part or whole due "+\
            "to missing external applications.\nDepending on the PICRUSt features "+\
            "you plan to use, this may not be critical.\n%s"\
             % '\n'.join(missing_application_tests)

        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n\n"

    if not opts.suppress_script_usage_tests:
        print "\nScript usage test result summary\n------------------------------------\n"
        print script_usage_result_summary
        print ""

    # Script usage tests count as passing if they were suppressed;
    # otherwise they must have had zero failures.
    script_usage_tests_success = (opts.suppress_script_usage_tests or
                                  num_script_usage_example_failures == 0)

    # If any of the unit tests or script usage tests fail, or if we have any
    # missing application errors, use return code 1 (as python's unittest
    # module does to indicate one or more failures).
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and
        script_usage_tests_success):
        return_code = 0
    return return_code
Example #45
def main():

    # INPUT COMMAND LINE OPTIONS
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    ########### VOCAL ###########
    if opts.verbose:
        print script_info['script_description']
        print "####################################### INITIALIZATION #####################################"
        print date_print()
        print "Verbose: " + str(opts.verbose)
        print "PWD:    " + os.getcwd()

    ########## OUTPUT ###########
    # If output folder specified: make output folder
    if opts.output_fp:
        out_bool, out_path = dir_pipe(opts.output_fp, overwrite=opts.overwrite)
        if (out_bool == True) and (opts.verbose == True):
            print "Output: " + dir_fullpath(out_path)  # print output
        elif opts.verbose == True:
            print "Output: Could not be created: " + out_path  # if not found then

    ### PRINT MODULES ###
    #ref_modules(justModuleIn=True, justFunctionIn=True)
    ### PRINT FUNCTIONS ###
    #ref_modules(justModuleIn=False, justFunctionIn=True)
    ### PRINT FUNCTIONS & DESCRIPTIONS ###
    #ref_modules(justModuleIn=False, justFunctionIn=False)

    ########## INPUT ############
    if opts.verbose:
        print "########################################## INPUT [-i] ######################################"
    input_input_fps = input_pipe(opts.input_fps,
                                 optTitle="[-i --input_fps]",
                                 verboseIn=opts.verbose)

    ######### CUSTOM ############
    if opts.verbose:
        print "########################################### CUSTOM #########################################"
    if input_input_fps: print_enumlist(input_input_fps)

    # GET PANDAS FILES IMPORTED #
    pdInputs = []
    for inF in input_input_fps:
        pdInputs.append(pd_ui(inF))

    # STOCHASTIC FIRST, THEN HOST-MICROBE TREE COMPARISON #
    pd_summarize(pdInputs[0], locIn="Stochastic Comparison")
    pd_summarize(pdInputs[1], locIn="Host-Microbe Comparison")

    metricIn = "MatchingCluster"

    hostMicrobeScore = pdInputs[1][metricIn][0]
    print
    print "Host-Microbe Score:   " + str(hostMicrobeScore)
    print
    print "Better Score:         " + str(
        len(pdInputs[0][pdInputs[0][metricIn] < hostMicrobeScore]))
    print "Worse Score:          " + str(
        len(pdInputs[0][pdInputs[0][metricIn] > hostMicrobeScore]))
    print "Equiv Score:          " + str(
        len(pdInputs[0][pdInputs[0][metricIn] == hostMicrobeScore]))
    print "P-value better:       " + str(
        float(len(pdInputs[0][pdInputs[0][metricIn] < hostMicrobeScore])) /
        100000.0)
    print
    print "Better\Equal Score:   " + str(
        len(pdInputs[0][pdInputs[0][metricIn] <= hostMicrobeScore]))
    print "Worse Score:          " + str(
        len(pdInputs[0][pdInputs[0][metricIn] > hostMicrobeScore]))
    print "P-value Better/Equal: " + str(
        float(len(pdInputs[0][pdInputs[0][metricIn] <= hostMicrobeScore])) /
        100000.0)
    print
    print "Max Stochastic Metric:  " + str(max(pdInputs[0][metricIn]))

    #############################
    if opts.verbose:
        print "############################################ END ###########################################"
        print "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\//////////////////////////////////////////////"
Example #46
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    #if we ask for NSTI output only, we still need to calculate it first
    if opts.output_accuracy_metrics_only:
        opts.calculate_accuracy_metrics=True

    if opts.verbose:
        print "Loading tree from file:", opts.tree

    round_opt = not opts.no_round

    # Load Tree
    tree = load_picrust_tree(opts.tree, opts.verbose)

    table_headers=[]
    traits={}
    #load the asr trait table using the previous list of functions to order the arrays
    if opts.reconstructed_trait_table:
        table_headers,traits =\
                update_trait_dict_from_file(opts.reconstructed_trait_table)

        #Only load confidence intervals on the reconstruction
        #If we actually have ASR values in the analysis
        if opts.reconstruction_confidence:
            if opts.verbose:
                print "Loading ASR confidence data from file:",\
                opts.reconstruction_confidence
                print "Assuming confidence data is of type:",opts.confidence_format

            asr_confidence_output = open(opts.reconstruction_confidence)
            asr_min_vals,asr_max_vals, params,column_mapping =\
              parse_asr_confidence_output(asr_confidence_output,format=opts.confidence_format)
            if 'sigma' in params:
                brownian_motion_parameter = params['sigma'][0]
            else:
                brownian_motion_parameter = None

            if opts.verbose:
                print "Done. Loaded %i confidence interval values." %(len(asr_max_vals))
                print "Brownian motion parameter:",brownian_motion_parameter
        else:
            brownian_motion_parameter = None

    #load the trait table into a dict with organism names as keys and arrays of trait values as values
    table_headers,genome_traits =\
            update_trait_dict_from_file(opts.observed_trait_table,table_headers)


    #Combine the trait tables overwriting the asr ones if they exist in the genome trait table.
    traits.update(genome_traits)

    # Specify the attribute where we'll store the reconstructions
    trait_label = "Reconstruction"

    if opts.verbose:
        print "Assigning traits to tree..."

    # Decorate tree using the traits
    tree = assign_traits_to_tree(traits,tree, trait_label=trait_label)


    if opts.reconstruction_confidence:
        if opts.verbose:
            print "Assigning trait confidence intervals to tree..."
        tree = assign_traits_to_tree(asr_min_vals,tree,\
            trait_label="lower_bound")

        tree = assign_traits_to_tree(asr_max_vals,tree,\
            trait_label="upper_bound")

        if brownian_motion_parameter is None:

            if opts.verbose:
                print "No Brownian motion parameters loaded. Inferring these from 95% confidence intervals..."
            brownian_motion_parameter = get_brownian_motion_param_from_confidence_intervals(tree,\
                     upper_bound_trait_label="upper_bound",\
                     lower_bound_trait_label="lower_bound",\
                     trait_label=trait_label,\
                     confidence=0.95)
            if opts.verbose:
                print "Inferred the following rate parameters:",brownian_motion_parameter
    if opts.verbose:
        print "Collecting list of nodes to predict..."

    #Start by predicting all tip nodes.
    nodes_to_predict = [tip.Name for tip in tree.tips()]

    if opts.verbose:
        print "Found %i nodes to predict." % len(nodes_to_predict)

    if opts.limit_predictions_to_organisms:
        organism_id_str = opts.limit_predictions_to_organisms
        ok_organism_ids = organism_id_str.split(',')
        ok_organism_ids = [n.strip() for n in ok_organism_ids]
        for f in set_label_conversion_fns(True,True):
            ok_organism_ids = [f(i) for i in ok_organism_ids]

        if opts.verbose:
            print "Limiting predictions to user-specified ids:",\
              ",".join(ok_organism_ids)


        if not ok_organism_ids:
            raise RuntimeError(\
              "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\
              % opts.limit_predictions_to_organisms)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in ok_organism_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by user-specified ids resulted in an empty set of nodes to predict. Are the ids on the command line and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0]))

        if opts.verbose:
            print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" %(len(nodes_to_predict))

    if opts.limit_predictions_by_otu_table:
        if opts.verbose:
            print "Limiting predictions to ids in user-specified OTU table:",\
              opts.limit_predictions_by_otu_table
        otu_table = open(opts.limit_predictions_by_otu_table,"U")
        #Parse OTU table for ids

        otu_ids =\
          extract_ids_from_table(otu_table.readlines(),delimiter="\t")

        if not otu_ids:
            raise RuntimeError(\
              "Found no valid ids in input OTU table: %s.  Is the path correct?"\
              % opts.limit_predictions_by_otu_table)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in otu_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by OTU table resulted in an empty set of nodes to predict. Are the OTU ids and tree ids in the same format? Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0]))

        if opts.verbose:
            print "After filtering by OTU table, %i nodes remain to be predicted" %(len(nodes_to_predict))

    # Calculate accuracy of PICRUSt for the given tree, sequenced genomes,
    # and set of nodes to predict
    accuracy_metrics = ['NSTI']
    accuracy_metric_results = None
    if opts.calculate_accuracy_metrics:
        if opts.verbose:
            print "Calculating accuracy metrics: %s" %([",".join(accuracy_metrics)])
        accuracy_metric_results = {}
        if 'NSTI' in accuracy_metrics:

            nsti_result,min_distances =\
                calc_nearest_sequenced_taxon_index(tree,\
                limit_to_tips = nodes_to_predict,\
                trait_label = trait_label, verbose=opts.verbose)

            #accuracy_metric_results['NSTI'] = nsti_result
            for organism in min_distances.keys():
                accuracy_metric_results[organism] = {'NSTI': min_distances[organism]}

            if opts.verbose:
                print "NSTI:", nsti_result

        if opts.output_accuracy_metrics_only:
            #Write accuracy metrics to file
            if opts.verbose:
                print "Writing accuracy metrics to file:",opts.output_accuracy_metrics_only

            f = open(opts.output_accuracy_metrics_only,'w+')
            f.write("metric\torganism\tvalue\n")
            lines =[]
            for organism in accuracy_metric_results.keys():
                for metric in accuracy_metric_results[organism].keys():
                    lines.append('\t'.join([metric,organism,\
                      str(accuracy_metric_results[organism][metric])])+'\n')
            f.writelines(sorted(lines))
            f.close()
            exit()


    if opts.verbose:
        print "Generating predictions using method:",opts.prediction_method

    if opts.weighting_method == 'exponential':
        #For now, use exponential weighting
        weight_fn = make_neg_exponential_weight_fn(e)

    variances=None #Overwritten by methods that calc variance
    confidence_intervals=None #Overwritten by methods that calc confidence intervals

    if opts.prediction_method == 'asr_and_weighting':
        # Perform predictions using reconstructed ancestral states

        if opts.reconstruction_confidence:
            predictions,variances,confidence_intervals =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              lower_bound_trait_label="lower_bound",\
              upper_bound_trait_label="upper_bound",\
              calc_confidence_intervals = True,\
              brownian_motion_parameter=brownian_motion_parameter,\
              weight_fn=weight_fn,verbose=opts.verbose,
              round_predictions=round_opt)

        else:
             predictions =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              weight_fn =weight_fn,verbose=opts.verbose,
              round_predictions=round_opt)

    elif opts.prediction_method == 'weighting_only':
        #Ignore ancestral information
        predictions =\
          weighted_average_tip_prediction(tree,nodes_to_predict,\
          trait_label=trait_label,\
          weight_fn =weight_fn,verbose=opts.verbose)



    elif opts.prediction_method == 'nearest_neighbor':

        predictions = predict_nearest_neighbor(tree,nodes_to_predict,\
          trait_label=trait_label,tips_only = True)

    elif opts.prediction_method == 'random_neighbor':

        predictions = predict_random_neighbor(tree,\
          nodes_to_predict,trait_label=trait_label)

    if opts.verbose:
        print "Done making predictions."

    make_output_dir_for_file(opts.output_trait_table)

    #Generate the table of biom predictions
    if opts.verbose:
        print "Converting results to .biom format for output..."

    biom_predictions=biom_table_from_predictions(predictions,table_headers,\
                                                         observation_metadata=None,\
                                                         sample_metadata=accuracy_metric_results,convert_to_int=False)
    if opts.verbose:
        print "Writing prediction results to file: ",opts.output_trait_table

    if opts.output_precalc_file_in_biom:

        #write biom table to file
        write_biom_table(biom_predictions, opts.output_trait_table)

    else:
        #convert to precalc (tab-delimited) format

        out_fh = open(opts.output_trait_table, 'w')
        out_fh.write(convert_biom_to_precalc(biom_predictions))
        out_fh.close()

    #Write out variance information to file
    if variances:

        if opts.verbose:
            print "Converting variances to BIOM format"

        if opts.output_precalc_file_in_biom:
            suffix='.biom'
        else:
            suffix='.tab'

        biom_prediction_variances=biom_table_from_predictions({k:v['variance'] for k,v in variances.iteritems()},table_headers,\
        observation_metadata=None,\
        sample_metadata=None,convert_to_int=False)
        outfile_base,extension = splitext(opts.output_trait_table)
        variance_outfile = outfile_base+"_variances"+suffix
        make_output_dir_for_file(variance_outfile)

        if opts.verbose:
            print "Writing variance information to file:",variance_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_variances, variance_outfile)
        else:
            open(variance_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_variances))


    if confidence_intervals:

        if opts.verbose:
            print "Converting upper confidence interval values to BIOM format"

        biom_prediction_upper_CI=biom_table_from_predictions({k:v['upper_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base,extension = splitext(opts.output_trait_table)
        upper_CI_outfile = outfile_base+"_upper_CI"+suffix
        make_output_dir_for_file(upper_CI_outfile)

        if opts.verbose:
            print "Writing upper confidence limit information to file:",upper_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_upper_CI, upper_CI_outfile)
        else:
            open(upper_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_upper_CI))

        biom_prediction_lower_CI=biom_table_from_predictions({k:v['lower_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base,extension = splitext(opts.output_trait_table)
        lower_CI_outfile = outfile_base+"_lower_CI"+suffix
        make_output_dir_for_file(lower_CI_outfile)

        if opts.verbose:
            print "Writing lower confidence limit information to file",lower_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_lower_CI, lower_CI_outfile)
        else:
            open(lower_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_lower_CI))
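# Illustrative sketch (not part of the original script): the companion
# output paths above are derived from the -o path with splitext, so for
# "-o predictions.biom" in BIOM mode the script also writes
# predictions_variances.biom, predictions_upper_CI.biom and
# predictions_lower_CI.biom. Names below are assumed for illustration.
from os.path import splitext

def companion_outfile(output_trait_table, label, suffix):
    outfile_base, extension = splitext(output_trait_table)
    return outfile_base + "_" + label + suffix

assert companion_outfile("predictions.biom", "variances", ".biom") == \
    "predictions_variances.biom"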
Ejemplo n.º 47
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading OTU table: ", opts.input_otu_table

    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation').tolist()

    # Determine whether the user wants predictions rounded to the
    # nearest whole number or not.
    round_flag = not opts.no_round

    if opts.verbose:
        print "Done loading OTU table containing %i samples and %i OTUs." \
          %(len(otu_table.ids()),len(otu_table.ids(axis='observation')))

    #Hardcoded location of the precalculated datasets for PICRUSt,
    #relative to the project directory
    precalc_data_dir = join(get_picrust_project_dir(), 'picrust', 'data')

    # Load a table of gene counts by OTUs.
    #This can be either user-specified or precalculated
    genome_table_fp = determine_data_table_fp(precalc_data_dir,\
      opts.type_of_prediction,opts.gg_version,\
      user_specified_table=opts.input_count_table,verbose=opts.verbose)

    if opts.verbose:
        print "Loading gene count data from file: %s" % genome_table_fp

    genome_table= load_data_table(genome_table_fp,\
      load_data_table_in_biom=opts.load_precalc_file_in_biom,\
      suppress_subset_loading=opts.suppress_subset_loading,\
      ids_to_load=ids_to_load,verbose=opts.verbose,transpose=True)

    if opts.verbose:
        print "Loaded %i genes across %i OTUs from gene count table" \
          %(len(genome_table.ids(axis='observation')),len(genome_table.ids()))

    if opts.with_confidence:
        if opts.input_variance_table:
            variance_table_fp = opts.input_variance_table
        else:
            variance_table_fp = determine_data_table_fp(precalc_data_dir,\
              opts.type_of_prediction,opts.gg_version,\
              precalc_file_suffix='precalculated_variances.tab.gz',\
              user_specified_table=opts.input_count_table)

        if opts.verbose:
            print "Loading variance information from table: %s" \
            %variance_table_fp

        variance_table= load_data_table(variance_table_fp,\
          load_data_table_in_biom=opts.load_precalc_file_in_biom,\
          suppress_subset_loading=opts.suppress_subset_loading,\
          ids_to_load=ids_to_load,transpose=True)

        if opts.verbose:
            print "Loaded %i genes across %i OTUs from variance table" \
              %(len(variance_table.ids(axis='observation')),len(variance_table.ids()))
        #Raise an error if the genome table and variance table differ
        #in the genomes they contain.
        #better to find out now than have something obscure happen later on
        if opts.verbose:
            print "Checking that genome table and variance table are consistent"
        try:
            assert set(variance_table.ids(axis='observation')) == set(
                genome_table.ids(axis='observation'))
        except AssertionError:
            for var_id in variance_table.ids(axis='observation'):
                if var_id not in genome_table.ids(axis='observation'):
                    print "Variance table ObsId %s not in genome_table ObsIds" % var_id
            raise AssertionError(
                "Variance table and genome table contain different gene ids")
        try:
            assert set(variance_table.ids()) == set(genome_table.ids())
        except AssertionError:
            for var_id in variance_table.ids():
                if var_id not in genome_table.ids():
                    print "Variance table SampleId %s not in genome_table SampleIds" % var_id
            raise AssertionError(
                "Variance table and genome table contain different OTU ids")
Ejemplo n.º 48
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if (opts.suppress_unit_tests and opts.suppress_script_usage_tests):
       option_parser.error("You're suppressing both test types. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    unittest_good_pattern = re.compile(r'OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of PICRUSt's unit tests, and keep track of any files which
    # fail unit tests.
    if not opts.suppress_unit_tests:
        unittest_names = []
        if not opts.unittest_glob:
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root,name))
        else:
            for fp in glob(opts.unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = system_call(command)
            print stderr
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    if not opts.suppress_script_usage_tests:  
        try:
            from qiime.test import run_script_usage_tests
        except ImportError:
            print "QIIME not installed so not running script tests."
            opts.suppress_script_usage_tests=True
        else:
            test_data_dir = join(get_picrust_project_dir(),'picrust_test_data')
            scripts_dir  = join(get_picrust_project_dir(),'scripts')
            if opts.script_usage_tests is not None:
                script_usage_tests = opts.script_usage_tests.split(',')
            else:
                script_usage_tests = None

            # Run the script usage testing functionality
            script_usage_result_summary, num_script_usage_example_failures = \
                run_script_usage_tests(
                    qiime_test_data_dir=test_data_dir,
                    qiime_scripts_dir=scripts_dir,
                    working_dir='/tmp/',
                    verbose=True,
                    tests=script_usage_tests,
                    failure_log_fp=None,
                    force_overwrite=True)

    print "==============\nResult summary\n=============="

    if not opts.suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print "\nFailed the following unit tests.\n%s" % '\n'.join(bad_tests)
    
        if missing_application_tests:
            print "\nFailed the following unit tests, in part or whole due "+\
            "to missing external applications.\nDepending on the PICRUSt features "+\
            "you plan to use, this may not be critical.\n%s"\
             % '\n'.join(missing_application_tests)
        
        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n\n"

    if not opts.suppress_script_usage_tests:
        print "\nScript usage test result summary\n------------------------------------\n"
        print script_usage_result_summary
        print ""

    # If script usage tests were run, they must have had zero failures.
    script_usage_tests_success = (opts.suppress_script_usage_tests or
                                  num_script_usage_example_failures == 0)

    # If any of the unit tests or script usage tests fail, or if we have any
    # missing application errors, use return code 1 (as python's unittest
    # module does to indicate one or more failures).
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and
        script_usage_tests_success):
        return_code = 0
    return return_code
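# Quick self-check of the success pattern used above (example output
# assumed): Python's unittest writes its summary to stderr and ends
# with "OK" when every test passes, which is what the regex keys on.
import re
unittest_good_pattern = re.compile(r'OK\s*$')
assert unittest_good_pattern.search("Ran 12 tests in 0.034s\n\nOK\n")
assert not unittest_good_pattern.search("FAILED (failures=1)\n")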
Ejemplo n.º 49
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    verbose = opts.verbose

    min_args = 1
    if len(args) < min_args:
        option_parser.error(
            'One or more predicted biom files must be provided.')
    observed_files = args

    make_output_dir_for_file(opts.output_fp)
    out_fh = open(opts.output_fp, 'w')

    if verbose:
        print "Loading expected trait table file:", opts.exp_trait_table_fp

    exp_table = load_table(opts.exp_trait_table_fp)

    header_printed = False
    header_keys = []
    delimiter = "\t"

    for observed_file in observed_files:
        observed_file_name = basename(observed_file)

        if verbose:
            print "Loading predicted trait table file:", observed_file_name

        obs_table = load_table(observed_file)

        if opts.compare_observations:
            if verbose:
                print "Transposing tables to allow evaluation of observations (instead of samples)..."
            obs_table = obs_table.transpose()
            exp_table = exp_table.transpose()

        if verbose:
            print "Matching predicted and expected tables..."

        obs, exp = match_biom_tables(
            obs_table,
            exp_table,
            verbose=verbose,
            limit_to_expected_observations=opts.limit_to_expected_observations,
            limit_to_observed_observations=opts.limit_to_observed_observations,
            normalize=opts.normalize,
            shuffle_samples=opts.shuffle_samples)

        if verbose:
            print "Calculating accuracy stats for all observations..."

        for i in obs:
            if verbose:
                print "Calculating stats for: ", i
            if opts.not_relative_abundance_scores:
                results = calculate_accuracy_stats_from_observations(
                    obs[i], exp[i], success_criterion='binary')
            else:
                results = calculate_accuracy_stats_from_observations(
                    obs[i], exp[i], success_criterion='ra_exact')

            #If first pass then print out header
            if not header_printed:
                header_printed = True
                header_keys = sorted(results.keys())
                out_fh.write(
                    delimiter.join(['file', 'label'] + header_keys) + "\n")

            #print results using same order as header
            values = [observed_file_name, i] + \
                ['{0:.3g}'.format(results[x]) for x in header_keys]
            out_str = delimiter.join(map(str, values)) + "\n"
            out_fh.write(out_str)
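# Sketch of the per-observation output format produced above (stat names
# and values assumed): one tab-separated row per observation, with each
# statistic rendered to three significant figures by '{0:.3g}'.
results = {'pearson_r': 0.987654, 'spearman_r': 0.91234}
header_keys = sorted(results.keys())
values = ["pred_table.biom", "Sample1"] + \
    ['{0:.3g}'.format(results[x]) for x in header_keys]
print "\t".join(map(str, values))  # -> pred_table.biom<TAB>Sample1<TAB>0.988<TAB>0.912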
Ejemplo n.º 50
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading OTU table: ",opts.input_otu_table

    otu_table = parse_biom_table(open(opts.input_otu_table,'U'))
    ids_to_load = otu_table.ObservationIds
    
    if opts.verbose:
        print "Done loading OTU table containing %i samples and %i OTUs." \
          %(len(otu_table.SampleIds),len(otu_table.ObservationIds))
    
    #Hardcoded location of the precalculated datasets for PICRUSt,
    #relative to the project directory
    precalc_data_dir=join(get_picrust_project_dir(),'picrust','data')

    # Load a table of gene counts by OTUs.
    #This can be either user-specified or precalculated
    genome_table_fp = determine_data_table_fp(precalc_data_dir,\
      opts.type_of_prediction,opts.gg_version,\
      user_specified_table=opts.input_count_table,verbose=opts.verbose)

    if opts.verbose:
        print "Loading gene count data from file: %s" %genome_table_fp
    
    genome_table= load_data_table(genome_table_fp,\
      load_data_table_in_biom=opts.load_precalc_file_in_biom,\
      suppress_subset_loading=opts.suppress_subset_loading,\
      ids_to_load=ids_to_load,verbose=opts.verbose,transpose=True)
  
    if opts.verbose:
        print "Loaded %i genes across %i OTUs from gene count table" \
          %(len(genome_table.ObservationIds),len(genome_table.SampleIds))
    
    if opts.with_confidence:
        if opts.input_variance_table:
            variance_table_fp = opts.input_variance_table
        else:
            variance_table_fp = determine_data_table_fp(precalc_data_dir,\
              opts.type_of_prediction,opts.gg_version,\
              precalc_file_suffix='precalculated_variances.tab.gz',\
              user_specified_table=opts.input_count_table)

        if opts.verbose:
            print "Loading variance information from table: %s" \
            %variance_table_fp
        
        variance_table= load_data_table(variance_table_fp,\
          load_data_table_in_biom=opts.load_precalc_file_in_biom,\
          suppress_subset_loading=opts.suppress_subset_loading,\
          ids_to_load=ids_to_load,transpose=True)
        
        if opts.verbose:
            print "Loaded %i genes across %i OTUs from variance table" \
              %(len(variance_table.ObservationIds),len(variance_table.SampleIds))
        #Raise an error if the genome table and variance table differ
        #in the genomes they contain.
        #better to find out now than have something obscure happen later on
        if opts.verbose:
            print "Checking that genome table and variance table are consistent"
        try:
            assert set(variance_table.ObservationIds) == set(genome_table.ObservationIds) 
        except AssertionError:
            for var_id in variance_table.ObservationIds:
                if var_id not in genome_table.ObservationIds:
                    print "Variance table ObsId %s not in genome_table ObsIds" %var_id
            raise AssertionError("Variance table and genome table contain different gene ids")
        try:
            assert set(variance_table.SampleIds) == set(genome_table.SampleIds)
        except AssertionError:
            for var_id in variance_table.SampleIds:
                if var_id not in genome_table.SampleIds:
                    print "Variance table SampleId %s not in genome_table SampleIds" %var_id
            raise AssertionError("Variance table and genome table contain different OTU ids")
Ejemplo n.º 51
0
try:
    # Reconstructed guard (assumed; the header of this example is missing
    # from the source): fall back to plain optparse when the PyCogent-style
    # option parser is unavailable.
    from cogent.util.option_parsing import parse_command_line_parameters
    cogent_cl_parsing = True
except ImportError:
    cogent_cl_parsing = False
    from optparse import OptionParser, make_option
    options = [
     make_option('-i','--biom_fp',type="string",
                 help='the BIological Observation Matrix filepath'),
     make_option('-a','--axis', type='string',
                  help="The axis to subset over, either 'samples' or 'observations'"),
     make_option('-s','--ids_fp',type="string",
                 help="A file containing a single column of IDs to retain"),
     make_option('-o','--output_fp',type="string",
                 help="A file to write the result to")
    ]
    
if __name__ == '__main__':
    if cogent_cl_parsing:
        option_parser, opts, args =\
                     parse_command_line_parameters(**script_info)
    else:
        parser = OptionParser(option_list=options)
        opts, args = parser.parse_args()

    ids = [l.strip() for l in open(opts.ids_fp)]
    biom_str = open(opts.biom_fp).read()

    idxs, new_axis_md = get_axis_indices(biom_str, ids, opts.axis)
    new_data = direct_slice_data(biom_str, idxs, opts.axis)
    output = open(opts.output_fp,'w')

    # multiple walks over the file. bad form, but easy right now
    # ...should add a yield_and_ignore parser or something.
    output.write('{')
    output.write(direct_parse_key(biom_str, "id"))
Ejemplo n.º 52
0
def main():
    """Generate test trees given parameters"""
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    if opts.verbose:
        print "Loading trait table..."
    input_trait_table = open(opts.input_trait_table,"U")

    if opts.verbose:
        print "Loading tree..."
    #PicrustNode seems to run into very slow/memory-intensive performance...
    #tree = DndParser(open(opts.input_tree),constructor=PicrustNode)
    tree = DndParser(open(opts.input_tree))
   
    if opts.verbose:
        print "Parsing trait table..."
    #Find which taxa are to be used in tests 
    #(by default trait table taxa)
    trait_table_header,trait_table_fields = \
            parse_trait_table(input_trait_table)

    if opts.verbose:
       print "Ensuring tree and trait table labels are formatted consistently..."
   
    label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)
    
    fix_tree_labels(tree,label_conversion_fns)
    
    trait_table_fields = convert_trait_table_entries(trait_table_fields,\
      value_conversion_fns = [],\
      label_conversion_fns = label_conversion_fns)

    trait_table_fields = [t for t in trait_table_fields]
    print "Number of trait table fields with single quotes:",\
     len([t for t in trait_table_fields if "'" in t[0]])

    if opts.verbose:
        print "Making output directory..."
    make_output_dir(opts.output_dir)

    
    if opts.limit_to_tips:
        
        included_tips = opts.limit_to_tips.split(",")
        if opts.verbose:
            print "Limiting test datasets to %i tips: %s" %(len(included_tips),included_tips)
    else:
        included_tips = False

    method_fns =\
      {"exclude_tips_by_distance":\
         make_distance_based_exclusion_fn,\
       "randomize_tip_labels_by_distance":\
         make_distance_based_tip_label_randomizer
       }

    test_fn_factory = method_fns[opts.method]
     
    if opts.verbose:
        print "Setting tree modification method to:", opts.method
        print "(%s)" % test_fn_factory.__doc__

    modify_tree = True
    if opts.suppress_tree_modification:
        if opts.verbose:
            print "Suppressing modification of tree when making test datasets"
        modify_tree = False
    
    if opts.verbose:
        print "Starting generation of test datsets"

    test_datasets = \
      yield_genome_test_data_by_distance(tree,trait_table_fields,\
      test_fn_factory,min_dist = opts.min_dist,\
      max_dist=opts.max_dist,increment=opts.dist_increment,\
      modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose)
    
    if opts.verbose:
        print "Writing files for test  datasets"
    
    for curr_dist,test_tree,tip_to_predict,\
        expected_traits,test_trait_table_fields in test_datasets:    
        
        if included_tips is not False:
            if tip_to_predict not in included_tips:
                if opts.verbose:
                    print "Skipping tip %s: limiting to tip(s): %s" %(tip_to_predict,included_tips)
                continue


        #Make a safe version of tip to predict
        # So odd characters like | don't mess up OS

        safe_tip_to_predict = "'%s'"%tip_to_predict

        #Write tree
        base_name = "--".join(map(str,["test_tree",opts.method,curr_dist]))
        curr_filepath = write_tree(opts.output_dir,base_name,test_tree,safe_tip_to_predict)
        if opts.verbose:
            print "Wrote test tree to: %s" % curr_filepath
        
        #Write expected trait table
        base_name = "--".join(map(str,["exp_traits",opts.method,curr_dist,safe_tip_to_predict]))
                
        exp_trait_table_lines = [trait_table_header]
        exp_trait_table_lines.append("\t".join(expected_traits)+"\n")
        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename=os.path.join(opts.output_dir,base_name)
        if opts.verbose:
            print "Writing expected trait table to:", filename
        
        f=open(filename,"w")
        f.write("".join(exp_trait_table_lines))
        f.close()
        
        #Output a transposed, BIOM format expectation table for comparison with predict_traits output

        #NOTE: this is a clumsy way of getting the translated trait table,
        # but more elegant, direct methods (directly feeding data to biom's
        # table_factory) weren't working for me readily. In the future, we
        # should streamline this process. Leaving as is for now since this
        # code is mostly for developers, so speed/elegance are probably not
        # essential here.

        #Let the hackishness begin

        #Reload the tab-delimited trait table
        header, fields = parse_trait_table(open(filename,"U"))
        fields = [f for f in fields] #converts generator to list    
        
        #Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines =\
          transpose_trait_table_fields(fields,header,\
          id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t")
       
        #Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend(["\t".join(r) for r in transposed_trait_table_lines])
        trans_trait_table = '\n'.join(trans_trait_table_lines)
        
        #Write BIOM format expected trait table
        base_name = "--".join(map(str,["exp_biom_traits",opts.method,curr_dist,safe_tip_to_predict]))
        
        expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\
            table_format = "tab-delimited")
                
        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename=os.path.join(opts.output_dir,base_name)
        if opts.verbose:
            print "Writing BIOM-format expected trait table to:", filename
        
        f=open(filename,"w")
        f.write(format_biom_table(expected_biom_table))
        f.close()

       
        #Write test trait table
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend(["\t".join(r)+"\n" for r in test_trait_table_fields])
        
        #print "Test_trait_table_lines:",test_trait_table_lines
        base_name = "--".join(map(str,["test_trait_table",opts.method,curr_dist,safe_tip_to_predict]))
        filename=os.path.join(opts.output_dir,base_name)
        
        if opts.verbose:
            print "Writing test trait table to:", filename
        
        f=open(filename,"w")
        f.write("".join(test_trait_table_lines))
        f.close()

    if opts.verbose:
        print "Done generating test datasets"
Ejemplo n.º 53
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    #if we specify we want NSTI only then we have to calculate it first
    if opts.output_accuracy_metrics_only:
        opts.calculate_accuracy_metrics = True

    if opts.verbose:
        print "Loading tree from file:", opts.tree

    # Load Tree
    #tree = LoadTree(opts.tree)
    tree = load_picrust_tree(opts.tree, opts.verbose)

    table_headers = []
    traits = {}
    #load the asr trait table using the previous list of functions to order the arrays
    if opts.reconstructed_trait_table:
        table_headers,traits =\
                update_trait_dict_from_file(opts.reconstructed_trait_table)

        #Only load confidence intervals on the reconstruction
        #If we actually have ASR values in the analysis
        if opts.reconstruction_confidence:
            if opts.verbose:
                print "Loading ASR confidence data from file:",\
                opts.reconstruction_confidence
                print "Assuming confidence data is of type:", opts.confidence_format

            asr_confidence_output = open(opts.reconstruction_confidence)
            asr_min_vals,asr_max_vals, params,column_mapping =\
              parse_asr_confidence_output(asr_confidence_output,format=opts.confidence_format)
            if 'sigma' in params:
                brownian_motion_parameter = params['sigma'][0]
            else:
                brownian_motion_parameter = None

            if opts.verbose:
                print "Done. Loaded %i confidence interval values." % (
                    len(asr_max_vals))
                print "Brownian motion parameter:", brownian_motion_parameter
        else:
            brownian_motion_parameter = None

    #load the trait table into a dict with organism names as keys and trait arrays as values
    table_headers,genome_traits =\
            update_trait_dict_from_file(opts.observed_trait_table,table_headers)

    #Combine the trait tables overwriting the asr ones if they exist in the genome trait table.
    traits.update(genome_traits)

    # Specify the attribute where we'll store the reconstructions
    trait_label = "Reconstruction"

    if opts.verbose:
        print "Assigning traits to tree..."

    # Decorate tree using the traits
    tree = assign_traits_to_tree(traits, tree, trait_label=trait_label)

    if opts.reconstruction_confidence:
        if opts.verbose:
            print "Assigning trait confidence intervals to tree..."
        tree = assign_traits_to_tree(asr_min_vals,tree,\
            trait_label="lower_bound")

        tree = assign_traits_to_tree(asr_max_vals,tree,\
            trait_label="upper_bound")

        if brownian_motion_parameter is None:

            if opts.verbose:
                print "No Brownian motion parameters loaded. Inferring these from 95% confidence intervals..."
            brownian_motion_parameter = get_brownian_motion_param_from_confidence_intervals(tree,\
                     upper_bound_trait_label="upper_bound",\
                     lower_bound_trait_label="lower_bound",\
                     trait_label=trait_label,\
                     confidence=0.95)
            if opts.verbose:
                print "Inferred the following rate parameters:", brownian_motion_parameter
    if opts.verbose:
        print "Collecting list of nodes to predict..."

    #Start by predicting all tip nodes.
    nodes_to_predict = [tip.Name for tip in tree.tips()]

    if opts.verbose:
        print "Found %i nodes to predict." % len(nodes_to_predict)

    if opts.limit_predictions_to_organisms:
        organism_id_str = opts.limit_predictions_to_organisms
        ok_organism_ids = organism_id_str.split(',')
        ok_organism_ids = [n.strip() for n in ok_organism_ids]
        for f in set_label_conversion_fns(True, True):
            ok_organism_ids = [f(i) for i in ok_organism_ids]

        if opts.verbose:
            print "Limiting predictions to user-specified ids:",\
              ",".join(ok_organism_ids)

        if not ok_organism_ids:
            raise RuntimeError(\
              "Found no valid ids in input: %s. Were comma-separated ids specified on the command line?"\
              % opts.limit_predictions_to_organisms)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in ok_organism_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by user-specified ids resulted in an empty set of nodes to predict.   Are the ids on the commmand-line and tree ids in the same format?  Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],ok_organism_ids[0]))

        if opts.verbose:
            print "After filtering organisms to predict by the ids specified on the commandline, %i nodes remain to be predicted" % (
                len(nodes_to_predict))

    if opts.limit_predictions_by_otu_table:
        if opts.verbose:
            print "Limiting predictions to ids in user-specified OTU table:",\
              opts.limit_predictions_by_otu_table
        otu_table = open(opts.limit_predictions_by_otu_table, "U")
        #Parse OTU table for ids

        otu_ids =\
          extract_ids_from_table(otu_table.readlines(),delimiter="\t")

        if not otu_ids:
            raise RuntimeError(\
              "Found no valid ids in input OTU table: %s.  Is the path correct?"\
              % opts.limit_predictions_by_otu_table)

        nodes_to_predict =\
          [n for n in nodes_to_predict if n in otu_ids]

        if not nodes_to_predict:
            raise RuntimeError(\
              "Filtering by OTU table resulted in an empty set of nodes to predict.   Are the OTU ids and tree ids in the same format?  Example tree tip name: %s, example OTU id name: %s" %([tip.Name for tip in tree.tips()][0],otu_ids[0]))

        if opts.verbose:
            print "After filtering by OTU table, %i nodes remain to be predicted" % (
                len(nodes_to_predict))

    # Calculate accuracy of PICRUSt for the given tree, sequenced genomes
    # and set of nodes to predict
    accuracy_metrics = ['NSTI']
    accuracy_metric_results = None
    if opts.calculate_accuracy_metrics:
        if opts.verbose:
            print "Calculating accuracy metrics: %s" % (
                [",".join(accuracy_metrics)])
        accuracy_metric_results = {}
        if 'NSTI' in accuracy_metrics:

            nsti_result,min_distances =\
                calc_nearest_sequenced_taxon_index(tree,\
                limit_to_tips = nodes_to_predict,\
                trait_label = trait_label, verbose=opts.verbose)

            #accuracy_metric_results['NSTI'] = nsti_result
            for organism in min_distances.keys():
                accuracy_metric_results[organism] = {
                    'NSTI': min_distances[organism]
                }

            if opts.verbose:
                print "NSTI:", nsti_result

        if opts.output_accuracy_metrics_only:
            #Write accuracy metrics to file
            if opts.verbose:
                print "Writing accuracy metrics to file:", opts.output_accuracy_metrics

            f = open(opts.output_accuracy_metrics_only, 'w+')
            f.write("metric\torganism\tvalue\n")
            lines = []
            for organism in accuracy_metric_results.keys():
                for metric in accuracy_metric_results[organism].keys():
                    lines.append('\t'.join([metric,organism,\
                      str(accuracy_metric_results[organism][metric])])+'\n')
            f.writelines(sorted(lines))
            f.close()
            exit()

    if opts.verbose:
        print "Generating predictions using method:", opts.prediction_method

    if opts.weighting_method == 'exponential':
        #For now, use exponential weighting
        weight_fn = make_neg_exponential_weight_fn(e)

    variances = None  #Overwritten by methods that calc variance
    confidence_intervals = None  #Overwritten by methods that calc confidence intervals

    if opts.prediction_method == 'asr_and_weighting':
        # Perform predictions using reconstructed ancestral states

        if opts.reconstruction_confidence:
            predictions,variances,confidence_intervals =\
              predict_traits_from_ancestors(tree,nodes_to_predict,\
              trait_label=trait_label,\
              lower_bound_trait_label="lower_bound",\
              upper_bound_trait_label="upper_bound",\
              calc_confidence_intervals = True,\
              brownian_motion_parameter=brownian_motion_parameter,\
              weight_fn =weight_fn,verbose=opts.verbose)

        else:
            predictions =\
             predict_traits_from_ancestors(tree,nodes_to_predict,\
             trait_label=trait_label,\
             weight_fn =weight_fn,verbose=opts.verbose)

    elif opts.prediction_method == 'weighting_only':
        #Ignore ancestral information
        predictions =\
          weighted_average_tip_prediction(tree,nodes_to_predict,\
          trait_label=trait_label,\
          weight_fn =weight_fn,verbose=opts.verbose)

    elif opts.prediction_method == 'nearest_neighbor':

        predictions = predict_nearest_neighbor(tree,nodes_to_predict,\
          trait_label=trait_label,tips_only = True)

    elif opts.prediction_method == 'random_neighbor':

        predictions = predict_random_neighbor(tree,\
          nodes_to_predict,trait_label=trait_label)

    if opts.verbose:
        print "Done making predictions."

    make_output_dir_for_file(opts.output_trait_table)

    #Generate the table of biom predictions
    if opts.verbose:
        print "Converting results to .biom format for output..."

    biom_predictions=biom_table_from_predictions(predictions,table_headers,\
                                                         observation_metadata=None,\
                                                         sample_metadata=accuracy_metric_results,convert_to_int=False)
    if opts.verbose:
        print "Writing prediction results to file: ", opts.output_trait_table

    if opts.output_precalc_file_in_biom:

        #write biom table to file
        write_biom_table(biom_predictions, opts.output_trait_table)

    else:
        #convert to precalc (tab-delimited) format

        out_fh = open(opts.output_trait_table, 'w')
        out_fh.write(convert_biom_to_precalc(biom_predictions))
        out_fh.close()

    #Pick the suffix for the companion output files up front: both the
    #variance and confidence-interval blocks below rely on it.
    if opts.output_precalc_file_in_biom:
        suffix = '.biom'
    else:
        suffix = '.tab'

    #Write out variance information to file
    if variances:

        if opts.verbose:
            print "Converting variances to BIOM format"

        biom_prediction_variances=biom_table_from_predictions({k:v['variance'] for k,v in variances.iteritems()},table_headers,\
        observation_metadata=None,\
        sample_metadata=None,convert_to_int=False)
        outfile_base, extension = splitext(opts.output_trait_table)
        variance_outfile = outfile_base + "_variances" + suffix
        make_output_dir_for_file(variance_outfile)

        if opts.verbose:
            print "Writing variance information to file:", variance_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_variances, variance_outfile)
        else:
            open(variance_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_variances))

    if confidence_intervals:

        if opts.verbose:
            print "Converting upper confidence interval values to BIOM format"

        biom_prediction_upper_CI=biom_table_from_predictions({k:v['upper_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base, extension = splitext(opts.output_trait_table)
        upper_CI_outfile = outfile_base + "_upper_CI" + suffix
        make_output_dir_for_file(upper_CI_outfile)

        if opts.verbose:
            print "Writing upper confidence limit information to file:", upper_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_upper_CI, upper_CI_outfile)
        else:
            open(upper_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_upper_CI))

        biom_prediction_lower_CI=biom_table_from_predictions({k:v['lower_CI'] for k,v in confidence_intervals.iteritems()},table_headers,\
          observation_metadata=None,\
          sample_metadata=None,convert_to_int=False)

        outfile_base, extension = splitext(opts.output_trait_table)
        lower_CI_outfile = outfile_base + "_lower_CI" + suffix
        make_output_dir_for_file(lower_CI_outfile)

        if opts.verbose:
            print "Writing lower confidence limit information to file", lower_CI_outfile

        if opts.output_precalc_file_in_biom:
            write_biom_table(biom_prediction_lower_CI, lower_CI_outfile)
        else:
            open(lower_CI_outfile,'w').write(\
                convert_biom_to_precalc(biom_prediction_lower_CI))
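# Sketch (toy values assumed) of the accuracy-metrics file written when
# --output_accuracy_metrics_only is passed: a header line, then sorted
# tab-separated metric/organism/value rows, as in the code above.
accuracy_metric_results = {'OTU_1': {'NSTI': 0.12}, 'OTU_2': {'NSTI': 0.03}}
lines = []
for organism in accuracy_metric_results.keys():
    for metric in accuracy_metric_results[organism].keys():
        lines.append('\t'.join([metric, organism,
            str(accuracy_metric_results[organism][metric])]) + '\n')
print "metric\torganism\tvalue\n" + "".join(sorted(lines))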
Ejemplo n.º 54
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_fp is None):
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join(['16S',opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if (ext == '.gz'):
        count_table_fh = gzip.open(input_count_table,'rb')
    else:
        count_table_fh = open(input_count_table,'U')

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    #Need to only keep data relevant to our otu list
    ids=[]
    for x in otu_table.iter(axis='observation'):
        ids.append(str(x[1]))

    ob_id=count_table.ids(axis='observation')[0]

    filtered_otus=[]
    filtered_values=[]
    for x in ids:
        if count_table.exists(x, axis='sample'):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis='observation'))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered={}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id,x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))
        except ValueError:
            raise ValueError(
                "Invalid copy number %s for OTU ID %s. Must be int-able."
                % (value, x))
        if value < 1:
            raise ValueError("Copy numbers must be greater than or equal to 1.")

        copy_numbers_filtered[x]={opts.metadata_identifer:value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis='observation')

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])
    normalized_table = filtered_otu_table.transform(metadata_norm, axis='observation')

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'observation')

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
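# Minimal sketch of the copy-number normalisation above on a toy table
# (ids, values and metadata key assumed), using the same biom-format
# 2.x API: each OTU row is divided by its copy-number metadata entry.
import numpy as np
from biom.table import Table

toy = Table(np.array([[4.0, 8.0], [3.0, 3.0]]),
            ['OTU_1', 'OTU_2'], ['S1', 'S2'],
            observation_metadata=[{'CopyNumber': 2}, {'CopyNumber': 3}])
normed = toy.transform(
    lambda v, i, md: v / float(md['CopyNumber']), axis='observation')
# OTU_1 becomes [2.0, 4.0] and OTU_2 becomes [1.0, 1.0].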
Ejemplo n.º 55
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    verbose=opts.verbose

    min_args = 1
    if len(args) < min_args:
       option_parser.error('One or more predicted biom files must be provided.')
    observed_files=args
   

    make_output_dir_for_file(opts.output_fp)
    out_fh=open(opts.output_fp,'w')

    if verbose:
        print "Loading expected trait table file:",opts.exp_trait_table_fp

    exp_table =parse_biom_table(open(opts.exp_trait_table_fp,'U'))

    header_printed=False
    header_keys=[]
    delimiter="\t"


    for observed_file in observed_files:
        observed_file_name=basename(observed_file)

        if verbose:
            print "Loading predicted trait table file:",observed_file_name

        obs_table =parse_biom_table(open(observed_file,'U'))

        if opts.compare_observations:
            if verbose:
                print "Transposing tables to allow evaluation of observations (instead of samples)..."
            obs_table=transpose_biom(obs_table)
            exp_table=transpose_biom(exp_table)

        if verbose:
           print "Matching predicted and expected tables..."    

        obs,exp=match_biom_tables(obs_table,exp_table,verbose=verbose,
            limit_to_expected_observations=opts.limit_to_expected_observations,
            limit_to_observed_observations=opts.limit_to_observed_observations,
            normalize=opts.normalize,shuffle_samples=opts.shuffle_samples)
           
        if verbose:
            print "Calculating accuracy stats for all observations..."

        for i in obs:
            if verbose:
                print "Calculating stats for: ",i
            if opts.not_relative_abundance_scores:
                results=calculate_accuracy_stats_from_observations(obs[i],exp[i],success_criterion='binary')
            else:
                results=calculate_accuracy_stats_from_observations(obs[i],exp[i],success_criterion='ra_exact')

            #If first pass then print out header
            if not header_printed:
                header_printed=True
                header_keys=sorted(results.keys())
                out_fh.write(delimiter.join(['file','label']+header_keys)+"\n")

            #print results using same order as header
            values=[observed_file_name,i]+['{0:.3g}'.format(results[x]) for x in header_keys]
            out_str=delimiter.join(map(str,values))+"\n"
            out_fh.write(out_str)
Ejemplo n.º 56
0
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    #set some defaults for the options
    input_dir=opts.input_dir
    output_dir=opts.output_dir or input_dir
    tmp_dir=opts.tmp_dir or output_dir
    parallel_method=opts.parallel_method
    asr_method = opts.asr_method
    predict_traits_method = opts.prediction_method
    
    if opts.num_jobs > 20 and parallel_method == 'multithreaded':
        raise ValueError("You probably don't want to run multithreaded evaluations with a large num_jobs. Please adjust the num_jobs and/or parallel_method options.")

    if opts.with_confidence and asr_method not in ['ace_ml','ace_reml']:
        raise ValueError("PICRUSt currently only supports confidence intervals with the ace_ml and ace_reml ASR methods")

    if opts.verbose:
        print "Reconstruction method:",asr_method
        print "Prediction method:",predict_traits_method
        print "Parallel method:",parallel_method
        print "num_jobs:",opts.num_jobs
        print "\nOutput will be saved here:'%s'" %output_dir 
    
    #create the output directory unless it already exists
    make_output_dir(output_dir)

    if(parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        raise RuntimeError("Unknown parallel method: %s" % parallel_method)


    #get the test datasets to run in the input directory (based on exp_traits files)
    expect_test_files=glob(join(input_dir,'exp_traits--*')) 

    test_datasets={}
    for file_name in expect_test_files:
        test_id=file_name.replace(join(input_dir,'exp_traits--'),'',1)
        #create a dict with the test files as values in the ref list
        test_datasets[test_id]=[ join(input_dir,'test_trait_table--'+test_id),join(input_dir,'test_tree--'+test_id),join(input_dir,'exp_traits--'+test_id)]
    
    created_tmp_files=[]    
    output_files=[]

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    #get location of scripts we need to run
    asr_script_fp = join(get_picrust_project_dir(),'scripts','ancestral_state_reconstruction.py')
    predict_traits_script_fp = join(get_picrust_project_dir(),'scripts','predict_traits.py')

    #run each test dataset through the pipeline
    for test_id in test_datasets:

        asr_out_fp=join(output_dir,'asr--'+asr_method+'--'+test_id)
        asr_params_out_fp=join(output_dir,'--'.join(['asr',asr_method,'asr_params',test_id]))
        created_tmp_files.append(asr_out_fp)

        if opts.check_for_null_files and exists(asr_out_fp) and file_contains_nulls(asr_out_fp):
            #remove file
            if opts.verbose:
                print "Existing ASR file contains null characters. Will run ASR again after removing: "+asr_out_fp
            remove(asr_out_fp)
        

        if exists(asr_out_fp) and not opts.force:
            if opts.verbose:
                print "Output file: {0} already exists, so we will skip it.".format(asr_out_fp)
            asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" %(test_id,asr_out_fp)
        else:
            #create the asr command
            asr_cmd= """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format(asr_script_fp, test_datasets[test_id][0], test_datasets[test_id][1], asr_method, asr_out_fp, asr_params_out_fp)

        predict_traits_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
          opts.weighting_method,test_id]))
        
        if opts.with_accuracy:
            predict_traits_accuracy_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
              opts.weighting_method,'accuracy_metrics',test_id]))

        if opts.check_for_null_files and exists(predict_traits_out_fp) and file_contains_nulls(predict_traits_out_fp):
            if opts.verbose:
                print "Existing trait predictions file contains null characters. Will run it again after removing: "+predict_traits_out_fp
            remove(predict_traits_out_fp)

        if exists(predict_traits_out_fp) and not opts.force:
            if opts.verbose:
                print "Prediction file: {0} already exists. Skipping ASR and prediction for this organism".format(predict_traits_out_fp)
            continue
        
        output_files.append(predict_traits_out_fp)

        #test_id has the form "<method>--<distance>--<tip>", so field 2 is the target genome/tip id
        genome_id=split('--',test_id)[2]
        
        if predict_traits_method == 'nearest_neighbor':
            #don't do asr step
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format(predict_traits_script_fp, test_datasets[test_id][0], opts.ref_tree, genome_id, predict_traits_out_fp,predict_traits_method)
            jobs.write(predict_traits_cmd+"\n")
        else:

            #create the predict traits command
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(predict_traits_script_fp,\
            test_datasets[test_id][0], opts.ref_tree, asr_out_fp,genome_id, predict_traits_out_fp,predict_traits_method,opts.weighting_method)

            #Instruct predict_traits to use confidence intervals output by ASR
            if opts.with_confidence:
                confidence_param = ' -c "%s"' %(asr_params_out_fp)
                predict_traits_cmd = predict_traits_cmd + confidence_param

            #Instruct predict_traits to output the NSTI measure of distance
            #to nearby sequenced genomes.
            if opts.with_accuracy:
                accuracy_param = ' -a "%s"' %(predict_traits_accuracy_out_fp)
                predict_traits_cmd = predict_traits_cmd + accuracy_param

            #add the chained job command to the jobs file
            jobs.write(asr_cmd+';'+predict_traits_cmd+"\n")

    jobs.close()

    #created_tmp_files.extend(output_files)

    #submit the jobs
    job_prefix='eval_'
    
    if opts.verbose:
        print "Submitting jobs:",cluster_jobs_fp,jobs_fp,job_prefix,opts.num_jobs
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=opts.num_jobs)
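# Sketch of one line in the jobs file assembled above (paths and ids
# assumed): the ASR and prediction commands are chained with ';' so
# each line runs as a single independent shell job.
asr_cmd = ('python ancestral_state_reconstruction.py -i "test_trait_table--X" '
           '-t "test_tree--X" -m ace_ml -o "asr--ace_ml--X" -c "asr_params--X"')
predict_traits_cmd = ('python predict_traits.py -i "test_trait_table--X" '
                      '-t "ref_tree" -r "asr--ace_ml--X" -g "G1" '
                      '-o "predict_traits--X" -m asr_and_weighting -w exponential')
print asr_cmd + ';' + predict_traits_cmd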
Ejemplo n.º 57
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)


    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        if opts.verbose:
            print "Limiting output to only functions:",limit_to_functions
    else:
        limit_to_functions = []

    if opts.verbose:
        print "Loading otu table: ",opts.input_otu_table

    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation')

    if(opts.input_count_table is None):
        #precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name='_'.join([opts.type_of_prediction,opts.gg_version,'precalculated.tab.gz'])
        input_count_table=join(get_picrust_project_dir(),'picrust','data',precalc_file_name)
    else:
        input_count_table=opts.input_count_table

    if opts.verbose:
        print "Loading count table: ", input_count_table

    ext=path.splitext(input_count_table)[1]

    if (ext == '.gz'):
        genome_table_fh = gzip.open(input_count_table,'rb')
    else:
        genome_table_fh = open(input_count_table,'U')

    #In the genome/trait table genomes are the samples and
    #genes are the observations


    if opts.load_precalc_file_in_biom:
        if not opts.suppress_subset_loading:
            #Now we want to use the OTU table information
            #to load only rows in the count table corresponding
            #to relevant OTUs

            if opts.verbose:
                print "Loading traits for %i organisms from the trait table" %len(ids_to_load)

            genome_table = load_subset_from_biom_str(genome_table_fh.read(),ids_to_load,axis='samples')
        else:
            if opts.verbose:
                print "Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage"
            genome_table = load_table(genome_table_fh)
    else:
        genome_table = convert_precalc_to_biom(genome_table_fh,ids_to_load)
    ok_functional_categories = None

    metadata_type = None
    if opts.limit_to_functional_categories:
        ok_functional_categories = opts.limit_to_functional_categories.split("|")
        if opts.verbose:
            print "Limiting to functional categories: %s" %(str(ok_functional_categories))

        # Either KEGG_Pathways or COG_Category needs
        # to be assigned to metadata_key to limit to
        # functional categories (not needed for 
        # individual functions) 

        if opts.type_of_prediction == "ko":
            metadata_type = "KEGG_Pathways"
        elif opts.type_of_prediction == "cog":
            metadata_type = "COG_Category"
        elif opts.type_of_prediction == "rfam":
            exit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)")
              
    partitioned_metagenomes = partition_metagenome_contributions(otu_table,genome_table,
      limit_to_functions=limit_to_functions,
      limit_to_functional_categories=ok_functional_categories,
      metadata_key=metadata_type)

    output_text = "\n".join(["\t".join(map(str,i)) for i in partitioned_metagenomes])
    if opts.verbose:
        print "Writing results to output file: ",opts.output_fp

    make_output_dir_for_file(opts.output_fp)
    open(opts.output_fp,'w').write(output_text)
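# Sketch of the serialisation above (row contents assumed; the real
# header comes from partition_metagenome_contributions): each record is
# a list of fields that gets tab-joined, one record per line.
partitioned_metagenomes = [
    ['Gene', 'Sample', 'OTU', 'GeneCountPerGenome'],
    ['K00001', 'Sample1', 'OTU_1', 2],
]
output_text = "\n".join(["\t".join(map(str, i)) for i in partitioned_metagenomes])
print output_text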