def main():
    """Run ancestral state reconstruction (ASR) and write the result tables.

    Dispatches either to a parallel runner or to the single-process app
    controller matching opts.asr_method, then writes the ASR table (and,
    for the 'ace_*' methods, the confidence-interval table) to file.
    """
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    if opts.parallel:
        # Parallel execution: farm ASR out via the chosen parallel method.
        tmp_dir = 'jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table = run_asr_in_parallel(
            tree=opts.input_tree_fp,
            table=opts.input_trait_table_fp,
            asr_method=opts.asr_method,
            parallel_method=opts.parallel_method,
            num_jobs=opts.num_jobs,
            tmp_dir=tmp_dir,
            verbose=opts.verbose)
    else:
        # Call the appropriate ASR app controller.
        if opts.asr_method == 'wagner':
            # Wagner parsimony produces no confidence intervals.
            asr_table = wagner_for_picrust(opts.input_tree_fp,
                                           opts.input_trait_table_fp,
                                           HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_ml':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'ML', HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_pic':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'pic', HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_reml':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'REML', HALT_EXEC=opts.debug)
        else:
            # Previously 'bayestraits' silently fell through ('pass') and any
            # unknown method left asr_table unbound, causing a NameError at
            # the writeToFile call below. Fail fast with a clear message.
            raise NotImplementedError(
                "ASR method '%s' is not supported." % opts.asr_method)

    #output the table to file
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp, sep='\t')

    #output the CI file (unless the method is wagner, which produces none)
    if not (opts.asr_method == 'wagner'):
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp, sep='\t')
# Exemple #2
def main():
    """Run ancestral state reconstruction (ASR) and write the result tables.

    Dispatches either to a parallel runner or to the single-process app
    controller matching opts.asr_method, then writes the ASR table (and,
    for the 'ace_*' methods, the confidence-interval table) to file.
    """
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    if opts.parallel:
        # Parallel execution: farm ASR out via the chosen parallel method.
        tmp_dir = 'jobs/'
        make_output_dir(tmp_dir)
        asr_table, ci_table = run_asr_in_parallel(
            tree=opts.input_tree_fp,
            table=opts.input_trait_table_fp,
            asr_method=opts.asr_method,
            parallel_method=opts.parallel_method,
            num_jobs=opts.num_jobs,
            tmp_dir=tmp_dir,
            verbose=opts.verbose)
    else:
        # Call the appropriate ASR app controller.
        if opts.asr_method == 'wagner':
            # Wagner parsimony produces no confidence intervals.
            asr_table = wagner_for_picrust(opts.input_tree_fp,
                                           opts.input_trait_table_fp,
                                           HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_ml':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'ML', HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_pic':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'pic', HALT_EXEC=opts.debug)
        elif opts.asr_method == 'ace_reml':
            asr_table, ci_table = ace_for_picrust(opts.input_tree_fp,
                                                  opts.input_trait_table_fp,
                                                  'REML', HALT_EXEC=opts.debug)
        else:
            # Previously 'bayestraits' silently fell through ('pass') and any
            # unknown method left asr_table unbound, causing a NameError at
            # the writeToFile call below. Fail fast with a clear message.
            raise NotImplementedError(
                "ASR method '%s' is not supported." % opts.asr_method)

    #output the table to file
    make_output_dir_for_file(opts.output_fp)
    asr_table.writeToFile(opts.output_fp, sep='\t')

    #output the CI file (unless the method is wagner, which produces none)
    if not (opts.asr_method == 'wagner'):
        make_output_dir_for_file(opts.output_ci_fp)
        ci_table.writeToFile(opts.output_ci_fp, sep='\t')
def main():
    """Run every holdout test dataset in input_dir through the ASR +
    trait-prediction pipeline.

    Discovers datasets via 'exp_traits--*' files, builds one shell command
    line per dataset in a temporary jobs file, and submits that file to the
    selected parallel backend (sge, torque, or multithreaded).
    """
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    #set some defaults for the options
    input_dir = opts.input_dir
    output_dir = opts.output_dir or input_dir
    tmp_dir = opts.tmp_dir or output_dir
    parallel_method = opts.parallel_method
    asr_method = opts.asr_method
    predict_traits_method = opts.prediction_method

    # Guard against oversubscribing a single machine when running locally.
    if opts.num_jobs > 20 and parallel_method == 'multithreaded':
        raise ValueError(
            'You probably dont want to run multithreaded evaluations with a large num_jobs. Please adjust options num_jobs and or parallel_method'
        )

    # Confidence intervals are only produced by the ace ML/REML methods.
    if opts.with_confidence and asr_method not in ['ace_ml', 'ace_reml']:
        raise ValueError(
            "PICRUST currently only supports confidence intervals with the ace_ml and ace_reml ASR methods"
        )

    if opts.verbose:
        print "Reconstruction method:", asr_method
        print "Prediction method:", predict_traits_method
        print "Parallel method:", parallel_method
        print "num_jobs:", opts.num_jobs
        print "\nOutput will be saved here:'%s'" % output_dir

    #create the output directory unless it already exists
    make_output_dir(output_dir)

    # Select the job-submission script matching the requested backend.
    if (parallel_method == 'sge'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_sge.py')
    elif (parallel_method == 'multithreaded'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs.py')
    elif (parallel_method == 'torque'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_torque.py')
    else:
        # Option parsing should prevent this; fail loudly if it doesn't.
        raise RuntimeError

    #get the test datasets to run in the input directory (based on exp_traits files)
    expect_test_files = glob(join(input_dir, 'exp_traits--*'))

    test_datasets = {}
    for file_name in expect_test_files:
        test_id = file_name.replace(join(input_dir, 'exp_traits--'), '', 1)
        #create a dict with the test files as values in the ref list
        # [0] = test trait table, [1] = test tree, [2] = expected traits
        test_datasets[test_id] = [
            join(input_dir, 'test_trait_table--' + test_id),
            join(input_dir, 'test_tree--' + test_id),
            join(input_dir, 'exp_traits--' + test_id)
        ]

    created_tmp_files = []
    output_files = []

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    #get location of scripts we need to run
    asr_script_fp = join(get_picrust_project_dir(), 'scripts',
                         'ancestral_state_reconstruction.py')
    predict_traits_script_fp = join(get_picrust_project_dir(), 'scripts',
                                    'predict_traits.py')

    #run each test dataset through the pipeline
    for test_id in test_datasets:

        asr_out_fp = join(output_dir, 'asr--' + asr_method + '--' + test_id)
        asr_params_out_fp = join(
            output_dir, '--'.join(['asr', asr_method, 'asr_params', test_id]))
        created_tmp_files.append(asr_out_fp)

        # Null bytes indicate a truncated/corrupt previous run; redo ASR.
        if opts.check_for_null_files and exists(
                asr_out_fp) and file_contains_nulls(asr_out_fp):
            #remove file
            if opts.verbose:
                print "Existing ASR file contains null characters. Will run ASR again after removing: " + asr_out_fp
            remove(asr_out_fp)

        if exists(asr_out_fp) and not opts.force:
            if opts.verbose:
                print "Output file: {0} already exists, so we will skip it.".format(
                    asr_out_fp)
            # Substitute a no-op command so the job line stays well-formed.
            asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" % (
                test_id, asr_out_fp)
        else:
            #create the asr command
            asr_cmd = """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format(
                asr_script_fp, test_datasets[test_id][0],
                test_datasets[test_id][1], asr_method, asr_out_fp,
                asr_params_out_fp)

        predict_traits_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
          opts.weighting_method,test_id]))

        if opts.with_accuracy:
            predict_traits_accuracy_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
              opts.weighting_method,'accuracy_metrics',test_id]))

        if opts.check_for_null_files and exists(
                predict_traits_out_fp) and file_contains_nulls(
                    predict_traits_out_fp):
            if opts.verbose:
                print "Existing trait predictions file contains null characters. Will run it again after removing: " + predict_traits_out_fp
            remove(predict_traits_out_fp)

        if exists(predict_traits_out_fp) and not opts.force:
            if opts.verbose:
                print "Prediction file: {0} already exists. Skipping ASR and prediction for this organism".format(
                    predict_traits_out_fp)
            continue

        output_files.append(predict_traits_out_fp)

        # test_id looks like '<method>--<dist>--<organism>'; take the organism.
        # NOTE(review): 'split' here is presumably re.split imported at module
        # level -- confirm against this file's imports.
        genome_id = split('--', test_id)[2]

        if predict_traits_method == 'nearest_neighbor':
            #don't do asr step
            predict_traits_cmd = """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format(
                predict_traits_script_fp, test_datasets[test_id][0],
                opts.ref_tree, genome_id, predict_traits_out_fp,
                predict_traits_method)
            jobs.write(predict_traits_cmd + "\n")
        else:

            #create the predict traits command
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(predict_traits_script_fp,\
            test_datasets[test_id][0], opts.ref_tree, asr_out_fp,genome_id, predict_traits_out_fp,predict_traits_method,opts.weighting_method)

            #Instruct predict_traits to use confidence intervals output by ASR
            if opts.with_confidence:
                confidence_param = ' -c "%s"' % (asr_params_out_fp)
                predict_traits_cmd = predict_traits_cmd + confidence_param

            #Instruct predict traits to output the NTSI measure of distance to
            #nearby sequences.

            if opts.with_accuracy:
                accuracy_param = ' -a "%s"' % (predict_traits_accuracy_out_fp)
                predict_traits_cmd = predict_traits_cmd + accuracy_param

            #add job command to the the jobs file
            # ';' chains ASR then prediction sequentially within one job.
            jobs.write(asr_cmd + ';' + predict_traits_cmd + "\n")

    jobs.close()

    #created_tmp_files.extend(output_files)

    #submit the jobs
    job_prefix = 'eval_'

    if opts.verbose:
        print "Submitting jobs:", cluster_jobs_fp, jobs_fp, job_prefix, opts.num_jobs
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=opts.num_jobs)
def main():
    """Run every holdout test dataset in input_dir through the ASR +
    trait-prediction pipeline.

    Discovers datasets via 'exp_traits--*' files, builds one shell command
    line per dataset in a temporary jobs file, and submits that file to the
    selected parallel backend (sge, torque, or multithreaded).
    """
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    #set some defaults for the options
    input_dir=opts.input_dir
    output_dir=opts.output_dir or input_dir
    tmp_dir=opts.tmp_dir or output_dir
    parallel_method=opts.parallel_method
    asr_method = opts.asr_method
    predict_traits_method = opts.prediction_method

    # Guard against oversubscribing a single machine when running locally.
    if opts.num_jobs > 20 and parallel_method == 'multithreaded':
        raise ValueError('You probably dont want to run multithreaded evaluations with a large num_jobs. Please adjust options num_jobs and or parallel_method')

    # Confidence intervals are only produced by the ace ML/REML methods.
    if opts.with_confidence and asr_method not in ['ace_ml','ace_reml']:
        raise ValueError("PICRUST currently only supports confidence intervals with the ace_ml and ace_reml ASR methods")

    if opts.verbose:
        print "Reconstruction method:",asr_method
        print "Prediction method:",predict_traits_method
        print "Parallel method:",parallel_method
        print "num_jobs:",opts.num_jobs
        print "\nOutput will be saved here:'%s'" %output_dir

    #create the output directory unless it already exists
    make_output_dir(output_dir)

    # Select the job-submission script matching the requested backend.
    if(parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        # Option parsing should prevent this; fail loudly if it doesn't.
        raise RuntimeError


    #get the test datasets to run in the input directory (based on exp_traits files)
    expect_test_files=glob(join(input_dir,'exp_traits--*'))

    test_datasets={}
    for file_name in expect_test_files:
        test_id=file_name.replace(join(input_dir,'exp_traits--'),'',1)
        #create a dict with the test files as values in the ref list
        # [0] = test trait table, [1] = test tree, [2] = expected traits
        test_datasets[test_id]=[ join(input_dir,'test_trait_table--'+test_id),join(input_dir,'test_tree--'+test_id),join(input_dir,'exp_traits--'+test_id)]

    created_tmp_files=[]
    output_files=[]

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    #get location of scripts we need to run
    asr_script_fp = join(get_picrust_project_dir(),'scripts','ancestral_state_reconstruction.py')
    predict_traits_script_fp = join(get_picrust_project_dir(),'scripts','predict_traits.py')

    #run each test dataset through the pipeline
    for test_id in test_datasets:

        asr_out_fp=join(output_dir,'asr--'+asr_method+'--'+test_id)
        asr_params_out_fp=join(output_dir,'--'.join(['asr',asr_method,'asr_params',test_id]))
        created_tmp_files.append(asr_out_fp)

        # Null bytes indicate a truncated/corrupt previous run; redo ASR.
        if opts.check_for_null_files and exists(asr_out_fp) and file_contains_nulls(asr_out_fp):
            #remove file
            if opts.verbose:
                print "Existing ASR file contains null characters. Will run ASR again after removing: "+asr_out_fp
            remove(asr_out_fp)


        if exists(asr_out_fp) and not opts.force:
            if opts.verbose:
                print "Output file: {0} already exists, so we will skip it.".format(asr_out_fp)
            # Substitute a no-op command so the job line stays well-formed.
            asr_cmd = "echo 'Skipping ASR for %s, file %s exists already'" %(test_id,asr_out_fp)
        else:
            #create the asr command
            asr_cmd= """python {0} -i "{1}" -t "{2}" -m {3} -o "{4}" -c "{5}" """.format(asr_script_fp, test_datasets[test_id][0], test_datasets[test_id][1], asr_method, asr_out_fp, asr_params_out_fp)

        predict_traits_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
          opts.weighting_method,test_id]))

        if opts.with_accuracy:
            predict_traits_accuracy_out_fp=join(output_dir,'--'.join(['predict_traits',predict_traits_method,\
              opts.weighting_method,'accuracy_metrics',test_id]))

        if opts.check_for_null_files and exists(predict_traits_out_fp) and file_contains_nulls(predict_traits_out_fp):
            if opts.verbose:
                print "Existing trait predictions file contains null characters. Will run it again after removing: "+predict_traits_out_fp
            remove(predict_traits_out_fp)

        if exists(predict_traits_out_fp) and not opts.force:
            if opts.verbose:
                print "Prediction file: {0} already exists. Skipping ASR and prediction for this organism".format(predict_traits_out_fp)
            continue

        output_files.append(predict_traits_out_fp)

        # test_id looks like '<method>--<dist>--<organism>'; take the organism.
        # NOTE(review): 'split' here is presumably re.split imported at module
        # level -- confirm against this file's imports.
        genome_id=split('--',test_id)[2]

        if predict_traits_method == 'nearest_neighbor':
            #don't do asr step
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -g "{3}" -o "{4}" -m "{5}" """.format(predict_traits_script_fp, test_datasets[test_id][0], opts.ref_tree, genome_id, predict_traits_out_fp,predict_traits_method)
            jobs.write(predict_traits_cmd+"\n")
        else:

            #create the predict traits command
            predict_traits_cmd= """python {0} -i "{1}" -t "{2}" -r "{3}" -g "{4}" -o "{5}" -m "{6}" -w {7} """.format(predict_traits_script_fp,\
            test_datasets[test_id][0], opts.ref_tree, asr_out_fp,genome_id, predict_traits_out_fp,predict_traits_method,opts.weighting_method)

            #Instruct predict_traits to use confidence intervals output by ASR
            if opts.with_confidence:
                confidence_param = ' -c "%s"' %(asr_params_out_fp)
                predict_traits_cmd = predict_traits_cmd + confidence_param

            #Instruct predict traits to output the NTSI measure of distance to
            #nearby sequences.

            if opts.with_accuracy:
                accuracy_param = ' -a "%s"' %(predict_traits_accuracy_out_fp)
                predict_traits_cmd = predict_traits_cmd + accuracy_param




            #add job command to the the jobs file
            # ';' chains ASR then prediction sequentially within one job.
            jobs.write(asr_cmd+';'+predict_traits_cmd+"\n")

    jobs.close()

    #created_tmp_files.extend(output_files)

    #submit the jobs
    job_prefix='eval_'

    if opts.verbose:
        print "Submitting jobs:",cluster_jobs_fp,jobs_fp,job_prefix,opts.num_jobs
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=opts.num_jobs)
def main():
    """Reformat an input tree and trait table for downstream PICRUSt steps.

    Writes three files to the output directory: a cleaned trait table, a
    tree pruned against the trait table entries, and a full reference tree.
    """

    # Parse input to get parameters
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    tree_file = opts.input_tree
    trait_table_fp = opts.input_trait_table
    verbose = opts.verbose

    #Set output base file names
    trait_table_base = 'trait_table.tab'
    pruned_tree_base = 'pruned_tree.newick'
    reference_tree_base = 'reference_tree.newick'

    output_dir = make_output_dir(opts.output_dir,strict=False)
    output_table_fp = join(output_dir,trait_table_base)
    output_tree_fp = join(output_dir,pruned_tree_base)
    output_reference_tree_fp = join(output_dir,reference_tree_base)

    #Handle parameters with more complex defaults
    # Map user-facing delimiter names to the literal delimiter characters.
    delimiter_map = {"space":" ","tab":"\t","comma":","}
    input_delimiter = delimiter_map[opts.input_table_delimiter]
    output_delimiter = delimiter_map[opts.output_table_delimiter]

    if verbose:
        print "Running with options:"
        print "\t%s:%s" %("Tree file",tree_file)
        print "\t%s:%s" %("Trait table",trait_table_fp)
        print "\t%s:%s" %("Output tree",output_tree_fp)
        print "\t%s:%s" %("Output reference tree",output_reference_tree_fp)
        print "\t%s:%s" %("Output trait table",output_table_fp)
        print "\t%s:%s" %("Add branch length to root",opts.add_branch_length_to_root)
        print "\t%s:%s" %("Convert to NEXUS?",opts.convert_to_nexus)
        print "\t%s:%s" %("Input trait table delimiter",opts.input_table_delimiter)
        print "\t%s:%s" %("Output trait table delimiter",opts.output_table_delimiter)

    # Begin reformatting

    # NOTE(review): root_name appears unused in this function.
    root_name = "root"

    # Enforce a small minimum branch length unless explicitly disabled.
    if opts.no_minimum_branch_length:
        min_branch_length = None
    else:
        min_branch_length = 0.0001

    #Load inputs
    if verbose:
        print "Loading tree...."

    input_tree = DndParser(open(tree_file))

    if verbose:
        print "Loading trait table..."
    trait_table = open(trait_table_fp,"U")
    trait_table_lines = trait_table.readlines()
    if not trait_table_lines:
        raise IOError("No lines could be loaded from file %s. Please check the input file." %trait_table_fp)

    #Get id mappings from mapping file
    if opts.tree_to_trait_mapping:
        if verbose:
            print "Loading tree to trait table mapping file..."

        mapping_file = open(opts.tree_to_trait_mapping,"U")

        trait_to_tree_mapping =\
          make_id_mapping_dict(parse_id_mapping_file(mapping_file))

    else:
        if verbose:
            print "No tree to trait mapping file specified.  Assuming tree tip names and trait table names will match exactly."
        trait_to_tree_mapping = None

    # Call reformatting function using specified parameters
    # to get reference tree
    # The reference tree keeps all tips (no table-based filtering); it is
    # made bifurcating, unnamed nodes are named, and min_branch_length is
    # applied.
    if opts.verbose:
        print """**BUILDING REFERENCE TREE (without respect to trait table)**"""

    new_reference_tree, not_useful_trait_table_lines =\
      reformat_tree_and_trait_table(\
      tree=input_tree,\
      trait_table_lines = [],\
      trait_to_tree_mapping = None,\
      input_trait_table_delimiter= None,\
      output_trait_table_delimiter= None,\
      filter_table_by_tree_tips=False,\
      convert_trait_floats_to_ints=False,\
      filter_tree_by_table_entries=False,\
      convert_to_bifurcating=True,\
      add_branch_length_to_root=False,\
      name_unnamed_nodes=True,\
      min_branch_length=min_branch_length,\
      verbose=opts.verbose)

    #Make a copy
    # (deepcopy so pruning for the trait table doesn't alter the reference)
    new_reference_tree_copy=new_reference_tree.deepcopy()

    if opts.verbose:
        print """**BUILDING PRUNED TREE AND TRAIT TABLE**"""
    # Call reformatting function using specified parameters
    # Here tree and table are filtered against each other so they agree.
    new_tree, new_trait_table_lines = \
       reformat_tree_and_trait_table(tree=new_reference_tree_copy,\
       trait_table_lines = trait_table_lines,\
       trait_to_tree_mapping = trait_to_tree_mapping,\
       input_trait_table_delimiter= input_delimiter,\
       output_trait_table_delimiter=output_delimiter,\
       filter_table_by_tree_tips=True,\
       convert_trait_floats_to_ints=False,\
       filter_tree_by_table_entries=True,\
       convert_to_bifurcating=False,\
       add_branch_length_to_root=False,\
       name_unnamed_nodes=False,\
       min_branch_length=min_branch_length,\
       verbose=opts.verbose)



    #Alter reference tree to only contain tips in OTU table (and of course trait table)
    if opts.limit_tree_to_otus_fp:
        if opts.verbose:
            print "Pruning reference tree to contain only tips in OTU table (and trait table)...."
        otu_table = open(opts.limit_tree_to_otus_fp,"U")
        otu_table_lines = otu_table.readlines()
        # OTU table is parsed without a header row, unlike the trait table.
        header_line,otu_table_fields =parse_trait_table(otu_table_lines,delimiter = input_delimiter,has_header=False)
        header_line,trait_table_fields =\
         parse_trait_table(new_trait_table_lines,delimiter = input_delimiter)


        tips_to_keep = list(otu_table_fields) + list(trait_table_fields)
        tips_to_keep_in_tree = filter_table_by_presence_in_tree(new_reference_tree_copy,tips_to_keep)
        new_reference_tree = filter_tree_tips_by_presence_in_table(new_reference_tree_copy,\
          tips_to_keep_in_tree,verbose=opts.verbose)


    if opts.verbose:
        print "Almost finished. Writing trees and trait table to files..."
    #Write results to files

    # Open output files
    output_trait_table_file = open(output_table_fp,"w+")
    output_tree_file  = open(output_tree_fp,"w+")
    output_reference_tree_file  = open(output_reference_tree_fp,"w+")


    #Output trait table file

    if opts.verbose:
        print "Writing trait table to:", output_table_fp

    output_trait_table_file.write("\n".join(new_trait_table_lines))
    trait_table.close()
    output_trait_table_file.close()

    #Output tree file
    if opts.verbose:
        print "Writing pruned tree to:", output_tree_fp

    # Pruned tree is written either as NEXUS or as plain Newick.
    if opts.convert_to_nexus is True:
        lines = nexus_lines_from_tree(new_tree)
        output_tree_file.write("\n".join(map(str,lines)))
    else:
        output_tree_file.write(new_tree.getNewick(with_distances=True))

    output_tree_file.close()


    if opts.verbose:
        print "Writing reference tree to:", output_reference_tree_fp
    #Output reference tree file
    # Reference tree is always Newick, regardless of convert_to_nexus.
    output_reference_tree_file.write(new_reference_tree.getNewick(with_distances=True))
    output_reference_tree_file.close()
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    pool_by = opts.pool_by.split(',') 

    
    #create output directory
    make_output_dir(opts.output_dir)
    
    #file_name_field_order={'file_type':0,"prediction_method":1,\
    #  "weighting_method":2,"holdout_method":3,"distance":4,"organism":5}
    
    #Construct a dict from user specified field order
    file_name_field_order = {}
    for i,field in enumerate(opts.field_order.split(',')):
        file_name_field_order[field]=i
        if opts.verbose:
            print "Assuming file names are in this order:",file_name_field_order

    for k in pool_by:
        #Check that we're only pooling by values that exist 
        if k not in file_name_field_order.keys():
            err_text=\
              "Bad value for option '--pool_by'.  Can't pool by '%s'.   Valid categories are: %s" %(k,\
              ",".join(file_name_field_order.keys()))
            raise ValueError(err_text)
    
    if opts.verbose:
        print "Pooling results by:",pool_by
    
    
    roc_success_criteria = ['binary','exact','int_exact']

    scatter_lines,correlation_lines,roc_result_lines,roc_auc_lines =\
      evaluate_test_dataset_dir(opts.trait_table_dir,\
      opts.exp_trait_table_dir,file_name_delimiter="--",\
      file_name_field_order=file_name_field_order,pool_by=pool_by,\
      roc_success_criteria=roc_success_criteria,verbose=opts.verbose)

    #Output scatter data
    
    output_fp = join(opts.output_dir,'evaluation_scatter_data.tab')
    if opts.verbose:
        print "Writing scatter plot data to:",output_fp
    file_lines = scatter_lines
    
    f = open(output_fp,"w+")
    f.writelines(file_lines)
    f.close()

    #Output correlation data
    
    output_fp = join(opts.output_dir,'evaluation_correlation_data.tab')
    
    if opts.verbose:
        print "Writing correlation data to:",output_fp
    
    file_lines = correlation_lines
    
    f = open(output_fp,"w+")
    f.writelines(file_lines)
    f.close()

    #Output raw ROC plot data
    if opts.verbose:
        print "Writing ROC data..."
    for c in roc_result_lines.keys(): 
        output_fp = join(opts.output_dir,'evaluation_roc_data_%s.tab' %c)
        if opts.verbose:
            print "Outputting ROC data for success criterion %s to: %s" %(c,output_fp)
        file_lines = roc_result_lines[c]
    
        f = open(output_fp,"w+")
        f.writelines(file_lines)
        f.close()

    #Output summary ROC AUC data
    if opts.verbose:
        print "Writing ROC AUC data..."
    
    for c in roc_auc_lines.keys(): 
        output_fp = join(opts.output_dir,'evaluation_roc_auc_data_%s.tab' %c)
        file_lines = roc_auc_lines[c]
    
        if opts.verbose:
            print "Outputting ROC AUC data for success criterion %s to: %s" %(c,output_fp)
        f = open(output_fp,"w+")
        f.writelines(file_lines)
        f.close()
def main():
    """Generate test trees and trait tables given parameters.

    For each dataset yielded by yield_genome_test_data_by_distance, writes:
    the modified test tree, the expected trait table (tab-delimited and
    BIOM formats), and a test trait table with the expected row removed.
    """
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading trait table..."
    input_trait_table = open(opts.input_trait_table,"U")

    if opts.verbose:
        print "Loading tree..."
    #PicrustNode seems to run into very slow/memory intentsive perfromance...
    #tree = DndParser(open(opts.input_tree),constructor=PicrustNode)
    tree = DndParser(open(opts.input_tree))

    if opts.verbose:
        print "Parsing trait table..."
    #Find which taxa are to be used in tests
    #(by default trait table taxa)
    trait_table_header,trait_table_fields = \
            parse_trait_table(input_trait_table)

    if opts.verbose:
       print "Ensuring tree and trait table labels are formatted consistently..."

    # Apply the same label normalization to both tree tips and table rows so
    # they can be matched up.
    label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)

    fix_tree_labels(tree,label_conversion_fns)

    trait_table_fields = convert_trait_table_entries(trait_table_fields,\
      value_conversion_fns = [],\
      label_conversion_fns = label_conversion_fns)

    # Materialize the generator so the fields can be iterated repeatedly.
    trait_table_fields = [t for t in trait_table_fields]
    print "Number of trait table fields with single quotes:",\
     len([t for t in trait_table_fields if "'" in t[0]])

    if opts.verbose:
        print "Making output directory..."
    make_output_dir(opts.output_dir)


    # Optionally restrict test dataset generation to a named set of tips.
    if opts.limit_to_tips:

        included_tips = opts.limit_to_tips.split(",")
        if opts.verbose:
            print "Limiting test datasets to %i tips: %s" %(len(included_tips),included_tips)
    else:
        included_tips = False

    # Map each method name to the factory that builds its tree-modification
    # function.
    method_fns =\
      {"exclude_tips_by_distance":\
         make_distance_based_exclusion_fn,\
       "randomize_tip_labels_by_distance":\
         make_distance_based_tip_label_randomizer
       }

    test_fn_factory = method_fns[opts.method]

    if opts.verbose:
        print "Setting tree modification method to:", opts.method
        print "(%s)" % test_fn_factory.__doc__

    modify_tree = True
    if opts.suppress_tree_modification:
        if opts.verbose:
            print "Suppressing modification of tree when making test datasets"
        modify_tree = False

    if opts.verbose:
        print "Starting generation of test datsets"

    # Lazily yields (distance, tree, tip, expected traits, test fields).
    test_datasets = \
      yield_genome_test_data_by_distance(tree,trait_table_fields,\
      test_fn_factory,min_dist = opts.min_dist,\
      max_dist=opts.max_dist,increment=opts.dist_increment,\
      modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose)

    if opts.verbose:
        print "Writing files for test  datasets"

    for curr_dist,test_tree,tip_to_predict,\
        expected_traits,test_trait_table_fields in test_datasets:

        # Skip tips outside the user-specified subset (if one was given).
        if included_tips is not False:
            if tip_to_predict not in included_tips:
                if opts.verbose:
                    print "Skipping tip %s: limiting to tip(s): %s" %(tip_to_predict,included_tips)
                continue


        #Make a safe version of tip to predict
        # So odd characters like | don't mess up OS

        safe_tip_to_predict = "'%s'"%tip_to_predict

        #Write tree
        base_name = "--".join(map(str,["test_tree",opts.method,curr_dist]))
        curr_filepath = write_tree(opts.output_dir,base_name,test_tree,safe_tip_to_predict)
        if opts.verbose:
            print "Wrote test tree to: %s" % curr_filepath

        #Write expected trait table
        base_name = "--".join(map(str,["exp_traits",opts.method,curr_dist,safe_tip_to_predict]))

        exp_trait_table_lines = [trait_table_header]
        exp_trait_table_lines.append("\t".join(expected_traits)+"\n")
        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename=os.path.join(opts.output_dir,base_name)
        if opts.verbose:
            print "Writing expected trait table to:", filename

        f=open(filename,"w")
        f.write("".join(exp_trait_table_lines))
        f.close()

        #Output a transposed, BIOM format expectation table for comparison with predict_traits output

        #NOTE: this is a clumsy way of getting the translated trait table
        # but more elegant, direct methods (directly feeding data to biom's table_factory)
        # weren't working for me readily.   In the future, we should streamline this process
        # Leaving as is for now since this code is mostly for developers so speed/elegence
        # are probably not essential here.

        #Let the hackishness begin

        #Reload the tab-delimited trait table
        header, fields = parse_trait_table(open(filename,"U"))
        fields = [f for f in fields] #converts generator to list

        #Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines =\
          transpose_trait_table_fields(fields,header,\
          id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t")

        #Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend(["\t".join(r) for r in transposed_trait_table_lines])
        trans_trait_table = '\n'.join(trans_trait_table_lines)

        #Write BIOM format expected trait table
        base_name = "--".join(map(str,["exp_biom_traits",opts.method,curr_dist,safe_tip_to_predict]))

        expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\
            table_format = "tab-delimited")

        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename=os.path.join(opts.output_dir,base_name)
        if opts.verbose:
            print "Writing BIOM-format expected trait table to:", filename

        f=open(filename,"w")
        f.write(format_biom_table(expected_biom_table))
        f.close()


        #Write test trait table
        # Drop the expected-traits row from the test table (presumably so the
        # prediction step cannot see the answer -- confirm intent upstream).
        test_trait_table_fields = test_trait_table_fields
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend(["\t".join(r)+"\n" for r in test_trait_table_fields])

        #print "Test_trait_table_lines:",test_trait_table_lines
        base_name = "--".join(map(str,["test_trait_table",opts.method,curr_dist,safe_tip_to_predict]))
        filename=os.path.join(opts.output_dir,base_name)

        if opts.verbose:
            print "Writing test trait table to:", filename

        f=open(filename,"w")
        f.write("".join(test_trait_table_lines))
        f.close()

    if opts.verbose:
        print "Done generating test datasets"
# Exemple #8
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    pool_by = opts.pool_by.split(',')


    #create output directory
    make_output_dir(opts.output_dir)

    #Construct a dict from user specified field order
    file_name_field_order = {}
    for i,field in enumerate(opts.field_order.split(',')):
        file_name_field_order[field]=i
        if opts.verbose:
            print "Assuming file names are in this order:",file_name_field_order

    for k in pool_by:
        #Check that we're only pooling by values that exist
        if k not in file_name_field_order.keys():
            err_text=\
              "Bad value for option '--pool_by'.  Can't pool by '%s'.   Valid categories are: %s" %(k,\
              ",".join(file_name_field_order.keys()))
            raise ValueError(err_text)

    if opts.verbose:
        print "Pooling results by:",pool_by

    roc_success_criteria = ['binary','exact','int_exact']

    scatter_lines,correlation_lines,roc_result_lines,roc_auc_lines =\
      evaluate_test_dataset_dir(opts.trait_table_dir,\
      opts.exp_trait_table_dir,file_name_delimiter="--",\
      file_name_field_order=file_name_field_order,pool_by=pool_by,\
      roc_success_criteria=roc_success_criteria,verbose=opts.verbose)

    #Output scatter data

    output_fp = join(opts.output_dir,'evaluation_scatter_data.tab')
    if opts.verbose:
        print "Writing scatter plot data to:",output_fp
    file_lines = scatter_lines

    f = open(output_fp,"w+")
    f.writelines(file_lines)
    f.close()

    #Output correlation data

    output_fp = join(opts.output_dir,'evaluation_correlation_data.tab')

    if opts.verbose:
        print "Writing correlation data to:",output_fp

    file_lines = correlation_lines

    f = open(output_fp,"w+")
    f.writelines(file_lines)
    f.close()

    #Output raw ROC plot data
    if opts.verbose:
        print "Writing ROC data..."
    for c in roc_result_lines.keys():
        output_fp = join(opts.output_dir,'evaluation_roc_data_%s.tab' %c)
        if opts.verbose:
            print "Outputting ROC data for success criterion %s to: %s" %(c,output_fp)
        file_lines = roc_result_lines[c]

        f = open(output_fp,"w+")
        f.writelines(file_lines)
        f.close()

    #Output summary ROC AUC data
    if opts.verbose:
        print "Writing ROC AUC data..."

    for c in roc_auc_lines.keys():
        output_fp = join(opts.output_dir,'evaluation_roc_auc_data_%s.tab' %c)
        file_lines = roc_auc_lines[c]

        if opts.verbose:
            print "Outputting ROC AUC data for success criterion %s to: %s" %(c,output_fp)
        f = open(output_fp,"w+")
        f.writelines(file_lines)
        f.close()
Exemple #9
0
def main():
    """Generate test trees given parameters"""
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.verbose:
        print "Loading trait table..."
    input_trait_table = open(opts.input_trait_table, "U")

    if opts.verbose:
        print "Loading tree..."
    #PicrustNode seems to run into very slow/memory intentsive perfromance...
    #tree = DndParser(open(opts.input_tree),constructor=PicrustNode)
    tree = DndParser(open(opts.input_tree))

    if opts.verbose:
        print "Parsing trait table..."
    #Find which taxa are to be used in tests
    #(by default trait table taxa)
    trait_table_header,trait_table_fields = \
            parse_trait_table(input_trait_table)

    if opts.verbose:
        print "Ensuring tree and trait table labels are formatted consistently..."

    label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)

    fix_tree_labels(tree, label_conversion_fns)

    trait_table_fields = convert_trait_table_entries(trait_table_fields,\
      value_conversion_fns = [],\
      label_conversion_fns = label_conversion_fns)

    trait_table_fields = [t for t in trait_table_fields]
    print "Number of trait table fields with single quotes:",\
     len([t for t in trait_table_fields if "'" in t[0]])

    if opts.verbose:
        print "Making output directory..."
    make_output_dir(opts.output_dir)

    if opts.limit_to_tips:

        included_tips = opts.limit_to_tips.split(",")
        if opts.verbose:
            print "Limiting test datasets to %i tips: %s" % (
                len(included_tips), included_tips)
    else:
        included_tips = False

    method_fns =\
      {"exclude_tips_by_distance":\
         make_distance_based_exclusion_fn,\
       "randomize_tip_labels_by_distance":\
         make_distance_based_tip_label_randomizer
       }

    test_fn_factory = method_fns[opts.method]

    if opts.verbose:
        print "Setting tree modification method to:", opts.method
        print "(%s)" % test_fn_factory.__doc__

    modify_tree = True
    if opts.suppress_tree_modification:
        if opts.verbose:
            print "Suppressing modification of tree when making test datasets"
        modify_tree = False

    if opts.verbose:
        print "Starting generation of test datsets"

    test_datasets = \
      yield_genome_test_data_by_distance(tree,trait_table_fields,\
      test_fn_factory,min_dist = opts.min_dist,\
      max_dist=opts.max_dist,increment=opts.dist_increment,\
      modify_tree=modify_tree,limit_to_tips= included_tips,verbose = opts.verbose)

    if opts.verbose:
        print "Writing files for test  datasets"

    for curr_dist,test_tree,tip_to_predict,\
        expected_traits,test_trait_table_fields in test_datasets:

        if included_tips is not False:
            if tip_to_predict not in included_tips:
                if opts.verbose:
                    print "Skipping tip %s: limiting to tip(s): %s" % (
                        tip_to_predict, included_tips)
                continue

        #Make a safe version of tip to predict
        # So odd characters like | don't mess up OS

        safe_tip_to_predict = "'%s'" % tip_to_predict

        #Write tree
        base_name = "--".join(map(str, ["test_tree", opts.method, curr_dist]))
        curr_filepath = write_tree(opts.output_dir, base_name, test_tree,
                                   safe_tip_to_predict)
        if opts.verbose:
            print "Wrote test tree to: %s" % curr_filepath

        #Write expected trait table
        base_name = "--".join(
            map(str,
                ["exp_traits", opts.method, curr_dist, safe_tip_to_predict]))

        exp_trait_table_lines = [trait_table_header]
        exp_trait_table_lines.append("\t".join(expected_traits) + "\n")
        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename = os.path.join(opts.output_dir, base_name)
        if opts.verbose:
            print "Writing expected trait table to:", filename

        f = open(filename, "w")
        f.write("".join(exp_trait_table_lines))
        f.close()

        #Output a transposed, BIOM format expectation table for comparison with predict_traits output

        #NOTE: this is a clumsy way of getting the translated trait table
        # but more elegant, direct methods (directly feeding data to biom's table_factory)
        # weren't working for me readily.   In the future, we should streamline this process
        # Leaving as is for now since this code is mostly for developers so speed/elegence
        # are probably not essential here.

        #Let the hackishness begin

        #Reload the tab-delimited trait table
        header, fields = parse_trait_table(open(filename, "U"))
        fields = [f for f in fields]  #converts generator to list

        #Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines =\
          transpose_trait_table_fields(fields,header,\
          id_row_idx=0, input_header_delimiter="\t",output_delimiter="\t")

        #Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend(
            ["\t".join(r) for r in transposed_trait_table_lines])
        trans_trait_table = '\n'.join(trans_trait_table_lines)

        #Write BIOM format expected trait table
        base_name = "--".join(
            map(str, [
                "exp_biom_traits", opts.method, curr_dist, safe_tip_to_predict
            ]))

        expected_biom_table = parse_table_to_biom(trans_trait_table.split('\n'),\
            table_format = "tab-delimited")

        #print "Expected_trait_table_lines:",exp_trait_table_lines
        filename = os.path.join(opts.output_dir, base_name)
        if opts.verbose:
            print "Writing BIOM-format expected trait table to:", filename

        f = open(filename, "w")
        f.write(format_biom_table(expected_biom_table))
        f.close()

        #Write test trait table
        test_trait_table_fields = test_trait_table_fields
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend(
            ["\t".join(r) + "\n" for r in test_trait_table_fields])

        #print "Test_trait_table_lines:",test_trait_table_lines
        base_name = "--".join(
            map(str, [
                "test_trait_table", opts.method, curr_dist, safe_tip_to_predict
            ]))
        filename = os.path.join(opts.output_dir, base_name)

        if opts.verbose:
            print "Writing test trait table to:", filename

        f = open(filename, "w")
        f.write("".join(test_trait_table_lines))
        f.close()

    if opts.verbose:
        print "Done generating test datasets"
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    tmp_dir='jobs/'
    make_output_dir(tmp_dir)

    #Run the jobs
    script_fp = join(get_picrust_project_dir(),'scripts','predict_traits.py')

    if(opts.parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(opts.parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(opts.parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        raise RuntimeError

    if(opts.verbose):
        print "Loading tree..."
        
    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]
    
    if(opts.verbose):
        print "Total number of possible tips to predict: {0}".format(len(all_tips))

    created_tmp_files=[]
    output_files={}
    output_files['counts']=[]
    if opts.reconstruction_confidence:
        output_files['variances']=[]
        output_files['upper_CI']=[]
        output_files['lower_CI']=[]

    if opts.already_calculated:
        all_tips=get_tips_not_in_precalc(all_tips,opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    if(opts.verbose):
        print "Creating temporary input files in: ",tmp_dir
    
    num_tips_per_job=1000
    for tips_to_predict in [all_tips[i:i+num_tips_per_job] for i in range(0, len(all_tips), num_tips_per_job)]:
        
        #create tmp output files
        tmp_output_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str=','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            outfile_base,extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base+"_variances.tab")
            output_files['upper_CI'].append(outfile_base+"_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base+"_lower_CI.tab")
            
            #create the job command
            cmd= "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, opts.reconstruction_confidence, tip_to_predict_str, tmp_output_fp)

        else:
            cmd= "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(script_fp, opts.observed_trait_table, opts.tree, opts.reconstructed_trait_table, tip_to_predict_str, tmp_output_fp)
            

        #NOTE: Calculating NSTI this way is convenient, 
        #but would probably be faster if we ran the NSTI calculation separate (using the --output_accuracy_metrics_only) and added it to the output file later on.
        if opts.calculate_accuracy_metrics:
            cmd=cmd+" -a"

        #add job command to the the jobs file
        jobs.write(cmd+"\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])
    if(opts.verbose):
        print "Launching parallel jobs."
        
    #run the job command
    job_prefix='picrust'
    submit_jobs(cluster_jobs_fp ,jobs_fp,job_prefix,num_jobs=opts.num_jobs,delay=opts.delay)

    if(opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if(opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base,extension = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
       #Combine output files
        if opts.verbose:
            print "Combining all output files for "+ predict_type

        combined_predictions=combine_predict_trait_output(output_files[predict_type])
        
        if opts.verbose:
            print "Writing combined file for "+predict_type

        if predict_type == 'counts':
        #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table,'w').write(format_biom_table(convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table,'w').write(combined_predictions)
        else:
            if opts.output_precalc_file_in_biom:
                open(outfile_base+"_"+predict_type+".biom",'w').write(format_biom_table(convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base+"_"+predict_type+".tab",'w').write(combined_predictions)    
        
    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)
def main():
    """Reformat an input tree and trait table for downstream PICRUSt use.

    Reads the tree and trait table named by the command-line options,
    normalizes them (bifurcation, optional minimum branch length, node
    naming, optional id re-mapping, mutual filtering of tips and rows),
    and writes three files to the output directory: the reformatted trait
    table, a pruned tree restricted to trait-table entries, and a full
    reference tree. Raises IOError if the trait table file is empty.
    """

    # Parse input to get parameters
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    tree_file = opts.input_tree
    trait_table_fp = opts.input_trait_table
    verbose = opts.verbose

    #Set output base file names
    trait_table_base = 'trait_table.tab'
    pruned_tree_base = 'pruned_tree.newick'
    reference_tree_base = 'reference_tree.newick'

    output_dir = make_output_dir(opts.output_dir, strict=False)
    output_table_fp = join(output_dir, trait_table_base)
    output_tree_fp = join(output_dir, pruned_tree_base)
    output_reference_tree_fp = join(output_dir, reference_tree_base)

    #Handle parameters with more complex defaults
    delimiter_map = {"space": " ", "tab": "\t", "comma": ","}
    input_delimiter = delimiter_map[opts.input_table_delimiter]
    output_delimiter = delimiter_map[opts.output_table_delimiter]

    if verbose:
        print "Running with options:"
        print "\t%s:%s" % ("Tree file", tree_file)
        print "\t%s:%s" % ("Trait table", trait_table_fp)
        print "\t%s:%s" % ("Output tree", output_tree_fp)
        print "\t%s:%s" % ("Output reference tree", output_reference_tree_fp)
        print "\t%s:%s" % ("Output trait table", output_table_fp)
        print "\t%s:%s" % ("Add branch length to root",
                           opts.add_branch_length_to_root)
        print "\t%s:%s" % ("Convert to NEXUS?", opts.convert_to_nexus)
        print "\t%s:%s" % ("Input trait table delimiter",
                           opts.input_table_delimiter)
        print "\t%s:%s" % ("Output trait table delimiter",
                           opts.output_table_delimiter)

    # Begin reformatting

    # NOTE(review): root_name appears unused in the visible body — presumably
    # a leftover from the BayesTraits formatting plan sketched below; confirm.
    root_name = "root"
    #format_for_bayestraits = True
    #TODO: this will become a new function in the bayestraits app controller
    #if format_for_bayestraits:
    #    convert_to_nexus = True
    #    convert_to_bifurcating = True
    #    filter_table_by_tree_tips = True
    #    filter_tree_by_table_entries = True
    #    enforce_min_branch_length = True
    #    convert_trait_floats_to_ints = True

    if opts.no_minimum_branch_length:
        min_branch_length = None
    else:
        min_branch_length = 0.0001

    #Load inputs
    if verbose:
        print "Loading tree...."

    input_tree = DndParser(open(tree_file))
    #input_tree =DndParser(open(tree_file), constructor=PicrustNode)

    #input_tree = load_picrust_tree(opts.input_tree,opts.verbose)

    if verbose:
        print "Loading trait table..."
    trait_table = open(trait_table_fp, "U")
    trait_table_lines = trait_table.readlines()
    # Fail fast on an empty trait table rather than producing empty outputs
    if not trait_table_lines:
        raise IOError(
            "No lines could be loaded from file %s. Please check the input file."
            % trait_table_fp)

    #Get id mappings from mapping file
    if opts.tree_to_trait_mapping:
        if verbose:
            print "Loading tree to trait table mapping file..."

        mapping_file = open(opts.tree_to_trait_mapping, "U")

        trait_to_tree_mapping =\
          make_id_mapping_dict(parse_id_mapping_file(mapping_file))

    else:
        if verbose:
            print "No tree to trait mapping file specified.  Assuming tree tip names and trait table names will match exactly."
        trait_to_tree_mapping = None

    # Call reformatting function using specified parameters
    # to get reference tree
    if opts.verbose:
        print """**BUILDING REFERENCE TREE (without respect to trait table)**"""

    # Trait table lines are intentionally empty here: only the tree is
    # normalized (bifurcation, node naming, min branch length), so the
    # returned table lines are discarded.
    new_reference_tree, not_useful_trait_table_lines =\
      reformat_tree_and_trait_table(\
      tree=input_tree,\
      trait_table_lines = [],\
      trait_to_tree_mapping = None,\
      input_trait_table_delimiter= None,\
      output_trait_table_delimiter= None,\
      filter_table_by_tree_tips=False,\
      convert_trait_floats_to_ints=False,\
      filter_tree_by_table_entries=False,\
      convert_to_bifurcating=True,\
      add_branch_length_to_root=False,\
      name_unnamed_nodes=True,\
      min_branch_length=min_branch_length,\
      verbose=opts.verbose)

    #Make a copy (the pruning pass below mutates its input tree)
    new_reference_tree_copy = new_reference_tree.deepcopy()

    if opts.verbose:
        print """**BUILDING PRUNED TREE AND TRAIT TABLE**"""
    # Call reformatting function using specified parameters
    new_tree, new_trait_table_lines = \
       reformat_tree_and_trait_table(tree=new_reference_tree_copy,\
       trait_table_lines = trait_table_lines,\
       trait_to_tree_mapping = trait_to_tree_mapping,\
       input_trait_table_delimiter= input_delimiter,\
       output_trait_table_delimiter=output_delimiter,\
       filter_table_by_tree_tips=True,\
       convert_trait_floats_to_ints=False,\
       filter_tree_by_table_entries=True,\
       convert_to_bifurcating=False,\
       add_branch_length_to_root=False,\
       name_unnamed_nodes=False,\
       min_branch_length=min_branch_length,\
       verbose=opts.verbose)

    #Alter reference tree to only contain tips in OTU table (and of course trait table)
    if opts.limit_tree_to_otus_fp:
        if opts.verbose:
            print "Pruning reference tree to contain only tips in OTU table (and trait table)...."
        otu_table = open(opts.limit_tree_to_otus_fp, "U")
        otu_table_lines = otu_table.readlines()
        header_line, otu_table_fields = parse_trait_table(
            otu_table_lines, delimiter=input_delimiter, has_header=False)
        header_line,trait_table_fields =\
         parse_trait_table(new_trait_table_lines,delimiter = input_delimiter)

        # Keep the union of OTU-table ids and trait-table ids that are
        # actually present in the tree
        tips_to_keep = list(otu_table_fields) + list(trait_table_fields)
        tips_to_keep_in_tree = filter_table_by_presence_in_tree(
            new_reference_tree_copy, tips_to_keep)
        new_reference_tree = filter_tree_tips_by_presence_in_table(new_reference_tree_copy,\
          tips_to_keep_in_tree,verbose=opts.verbose)

    if opts.verbose:
        print "Almost finished. Writing trees and trait table to files..."
    #Write results to files

    # Open output files
    output_trait_table_file = open(output_table_fp, "w+")
    output_tree_file = open(output_tree_fp, "w+")
    output_reference_tree_file = open(output_reference_tree_fp, "w+")

    #Output trait table file

    if opts.verbose:
        print "Writing trait table to:", output_table_fp

    output_trait_table_file.write("\n".join(new_trait_table_lines))
    # NOTE(review): this closes the *input* trait table handle alongside the
    # output — presumably deferred cleanup; confirm the ordering is intentional.
    trait_table.close()
    output_trait_table_file.close()

    #Output tree file
    if opts.verbose:
        print "Writing pruned tree to:", output_tree_fp

    # Pruned tree is written either as NEXUS or plain Newick with distances
    if opts.convert_to_nexus is True:
        lines = nexus_lines_from_tree(new_tree)
        output_tree_file.write("\n".join(map(str, lines)))
    else:
        output_tree_file.write(new_tree.getNewick(with_distances=True))

    output_tree_file.close()

    if opts.verbose:
        print "Writing reference tree to:", output_reference_tree_fp
    #Output reference tree file (always Newick, regardless of --convert_to_nexus)
    output_reference_tree_file.write(
        new_reference_tree.getNewick(with_distances=True))
    output_reference_tree_file.close()
Exemple #12
0
def main():
    option_parser, opts, args =\
                   parse_command_line_parameters(**script_info)

    tmp_dir = 'jobs/'
    make_output_dir(tmp_dir)

    #Run the jobs
    script_fp = join(get_picrust_project_dir(), 'scripts', 'predict_traits.py')

    if (opts.parallel_method == 'sge'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_sge.py')
    elif (opts.parallel_method == 'multithreaded'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs.py')
    elif (opts.parallel_method == 'torque'):
        cluster_jobs_fp = join(get_picrust_project_dir(), 'scripts',
                               'start_parallel_jobs_torque.py')
    else:
        raise RuntimeError

    if (opts.verbose):
        print "Loading tree..."

    tree = load_picrust_tree(opts.tree, opts.verbose)

    all_tips = [tip.Name for tip in tree.tips()]

    if (opts.verbose):
        print "Total number of possible tips to predict: {0}".format(
            len(all_tips))

    created_tmp_files = []
    output_files = {}
    output_files['counts'] = []
    if opts.reconstruction_confidence:
        output_files['variances'] = []
        output_files['upper_CI'] = []
        output_files['lower_CI'] = []

    if opts.already_calculated:
        all_tips = get_tips_not_in_precalc(all_tips, opts.already_calculated)
        if opts.verbose:
            print "After taking into account tips already predicted, the number of tips left to predict is: {0}".format(
                len(all_tips))

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='jobs_')
    jobs = open(jobs_fp, 'w')
    created_tmp_files.append(jobs_fp)

    if (opts.verbose):
        print "Creating temporary input files in: ", tmp_dir

    num_tips_per_job = 1000
    for tips_to_predict in [
            all_tips[i:i + num_tips_per_job]
            for i in range(0, len(all_tips), num_tips_per_job)
    ]:

        #create tmp output files
        tmp_output_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='out_predict_traits_')
        output_files['counts'].append(tmp_output_fp)

        tip_to_predict_str = ','.join(list(tips_to_predict))

        if opts.reconstruction_confidence:
            outfile_base, extension = splitext(tmp_output_fp)
            output_files['variances'].append(outfile_base + "_variances.tab")
            output_files['upper_CI'].append(outfile_base + "_upper_CI.tab")
            output_files['lower_CI'].append(outfile_base + "_lower_CI.tab")

            #create the job command
            cmd = "{0} -i {1} -t {2} -r {3} -c {4} -g {5} -o {6}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, opts.reconstruction_confidence,
                tip_to_predict_str, tmp_output_fp)

        else:
            cmd = "{0} -i {1} -t {2} -r {3} -g {4} -o {5}".format(
                script_fp, opts.observed_trait_table, opts.tree,
                opts.reconstructed_trait_table, tip_to_predict_str,
                tmp_output_fp)

        #NOTE: Calculating NSTI this way is convenient,
        #but would probably be faster if we ran the NSTI calculation separate (using the --output_accuracy_metrics_only) and added it to the output file later on.
        if opts.calculate_accuracy_metrics:
            cmd = cmd + " -a"

        #add job command to the the jobs file
        jobs.write(cmd + "\n")

    jobs.close()

    #add all output files to tmp list (used later for deletion)
    for predict_type in output_files:
        created_tmp_files.extend(output_files[predict_type])
    if (opts.verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix = 'picrust'
    submit_jobs(cluster_jobs_fp,
                jobs_fp,
                job_prefix,
                num_jobs=opts.num_jobs,
                delay=opts.delay)

    if (opts.verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files['counts'])

    if (opts.verbose):
        print "Jobs are done running."

    make_output_dir_for_file(opts.output_trait_table)
    outfile_base, extension = splitext(opts.output_trait_table)
    for predict_type in sorted(output_files):
        #Combine output files
        if opts.verbose:
            print "Combining all output files for " + predict_type

        combined_predictions = combine_predict_trait_output(
            output_files[predict_type])

        if opts.verbose:
            print "Writing combined file for " + predict_type

        if predict_type == 'counts':
            #Output in whatever format the user wants
            if opts.output_precalc_file_in_biom:
                open(opts.output_trait_table, 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(opts.output_trait_table, 'w').write(combined_predictions)
        else:
            if opts.output_precalc_file_in_biom:
                open(outfile_base + "_" + predict_type + ".biom", 'w').write(
                    format_biom_table(
                        convert_precalc_to_biom(combined_predictions)))
            else:
                open(outfile_base + "_" + predict_type + ".tab",
                     'w').write(combined_predictions)

    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)