Ejemplo n.º 1
0
def pool_test_dataset_dir(obs_dir_fp,exp_dir_fp,file_name_delimiter="--",\
        file_name_field_order=\
        {'file_type':0,"prediction_method":1,"weighting_method":2,"holdout_method":3,\
          "distance":4,"organism":5},strict=False, verbose=True,pool_by=['distance']):
    """Return pooled observation & expectation data from the given directories

    obs_dir_fp -- directory containing PICRUST-predicted genomes.  Per the
    original author's notes, these file names MUST start with
    'predict_traits' and must contain the values specified in
    file_name_field_order, separated by file_name_delimiter.  For example:

    predict_traits--exclude_tips_by_distance--0.87--'NC_000913|646311926'

    exp_dir_fp -- as obs_dir_fp above, but expectation file names (usually
    sequenced genomes with known gene content) must start with
    exp_biom_traits

    file_name_delimiter -- the delimiter that separates metadata stored in
    the filename.  It is also used to join the pooling tag built here.

    NOTE: technically this isn't the best way of doing things.  We may want
    at some point to revisit this setup and store metadata about each
    comparison in a separate file.  But storing in the filename is
    convenient for our initial analysis.

    file_name_field_order -- dict mapping each metadata field name to its
    position in the delimited filename.  This function only reads it.

    strict -- accepted for interface compatibility; not used in this
    function.

    verbose -- if True, print the pooling fields and the tag for each file;
    also forwarded to the helper functions.

    pool_by -- list of field names to pool on.  Trials whose values agree
    for every listed field share a tag.  e.g. pool_by ['distance'] pools
    traits across individual test genomes with the same holdout distance.
    'file_type' is ignored if present.  The passed-in list is never
    modified.

    The method assumes that for each file in the observed directory, a
    paired file is also found in exp_dir_fp (the pairing itself is done by
    iter_prediction_expectation_pairs).

    Process:
    1. Iterate over (observed table, expected table, filename) triples
    2. Parse the metadata fields out of each filename
    3. Build a tag holding the real value for each pool_by field and 'all'
       for every other field
    4. Merge the tables into the pooled dicts under that tag
    5. Return the pooled observation, expectation dicts (their exact
       structure is defined by update_pooled_data)
    """
    #TODO: separate out into a 'get_paired_data_from_dirs' function

    # A bare string would otherwise be iterated character by character
    if isinstance(pool_by, str):
        pool_by = [pool_by]
    # Build a filtered copy once, up front.  The previous in-loop
    # pool_by.remove('file_type') modified the caller's list and the shared
    # default argument.  file_type is excluded from the pooling tag.
    pool_by = [field for field in pool_by if field != 'file_type']

    pooled_observations = {}
    pooled_expectations = {}
    pairs = iter_prediction_expectation_pairs(obs_dir_fp,exp_dir_fp,\
      file_name_field_order,file_name_delimiter,verbose=verbose)

    # file_number (0,1,2...) is passed to update_pooled_data as a string
    for file_number,(obs_table,exp_table,filename) in enumerate(pairs):
        filename_metadata = get_metadata_from_filename(filename,\
          file_name_field_order,file_name_delimiter,verbose=verbose)

        # One slot per filename field: pooled fields keep their real value,
        # all other fields collapse to the placeholder 'all'
        combined_tag = ['all']*len(file_name_field_order)
        for field,idx in file_name_field_order.items():
            if field in pool_by:
                combined_tag[idx] = filename_metadata[field]
        tags = [file_name_delimiter.join(combined_tag)]

        if verbose:
            # Single-argument print: same output under the Python 2 print
            # statement and the print function
            print("Pooling by: %s" % (pool_by,))
            print("Combined tags: %s" % (tags,))

        pooled_observations,pooled_expectations =\
          update_pooled_data(obs_table,exp_table,tags,pooled_observations,\
          pooled_expectations,str(file_number),verbose=verbose)

    return pooled_observations,pooled_expectations
Ejemplo n.º 2
0
                    print "Missing expectation file....skipping!"
                continue
        base_tag =  '%s\t%s\t' %(holdout_method,prediction_method)
        tags = [base_tag+'all_results']
        combined_tag = base_tag +\
                "\t".join([str(field)+"_"+str(filename_components[file_name_field_order[field]]) for field in pool_by])
        tags.append(combined_tag)
        
        #if verbose:
        #  print "Pooling by:", pool_by
        #  print "Combined tags:",tags
        
        #TODO: abstract out pooling into its own function
        non_pooled_fields = [filename_components.get(file_name_field_order[k],None) for k in file_name_field_order.keys() if k not in pool_by]
        pooled_observations,pooled_expectations =\
                update_pooled_data(obs_table,exp_table,tags,pooled_observations,\
          pooled_expectations,str(file_number),verbose=verbose)


    #if verbose:
    #    for tag in pooled_observations.keys():
    #        print "Merged obs biom:", pooled_observations[tag]
    #        print "\nMedged *exp* biom:", pooled_expectations[tag]
    return run_accuracy_calculations_on_pooled_data(pooled_observations,\
      pooled_expectations,roc_success_criteria=roc_success_criteria,verbose=verbose)

def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    pool_by = opts.pool_by.split(',') 

    
Ejemplo n.º 3
0
def pool_test_dataset_dir(obs_dir_fp,exp_dir_fp,file_name_delimiter="--",\
        file_name_field_order=\
        {'file_type':0,"prediction_method":1,"weighting_method":2,"holdout_method":3,\
          "distance":4,"organism":5},strict=False, verbose=True,pool_by=['distance']):
    """Return pooled observation & expectation data from the given directories

    obs_dir_fp -- directory containing PICRUST-predicted genomes.  Per the
    original author's notes, these file names MUST start with
    'predict_traits' and must contain the values specified in
    file_name_field_order, separated by file_name_delimiter.  For example:

    predict_traits--exclude_tips_by_distance--0.87--'NC_000913|646311926'

    exp_dir_fp -- as obs_dir_fp above, but expectation file names (usually
    sequenced genomes with known gene content) must start with
    exp_biom_traits

    file_name_delimiter -- the delimiter that separates metadata stored in
    the filename.  It is also used to join the pooling tag built here.

    NOTE: technically this isn't the best way of doing things.  We may want
    at some point to revisit this setup and store metadata about each
    comparison in a separate file.  But storing in the filename is
    convenient for our initial analysis.

    file_name_field_order -- dict mapping each metadata field name to its
    position in the delimited filename.  This function only reads it.

    strict -- accepted for interface compatibility; not used in this
    function.

    verbose -- if True, print the pooling fields and the tag for each file;
    also forwarded to the helper functions.

    pool_by -- list of field names to pool on.  Trials whose values agree
    for every listed field share a tag.  e.g. pool_by ['distance'] pools
    traits across individual test genomes with the same holdout distance.
    'file_type' is ignored if present.  The passed-in list is never
    modified.

    The method assumes that for each file in the observed directory, a
    paired file is also found in exp_dir_fp (the pairing itself is done by
    iter_prediction_expectation_pairs).

    Process:
    1. Iterate over (observed table, expected table, filename) triples
    2. Parse the metadata fields out of each filename
    3. Build a tag holding the real value for each pool_by field and 'all'
       for every other field
    4. Merge the tables into the pooled dicts under that tag
    5. Return the pooled observation, expectation dicts (their exact
       structure is defined by update_pooled_data)
    """
    #TODO: separate out into a 'get_paired_data_from_dirs' function

    # A bare string would otherwise be iterated character by character
    if isinstance(pool_by, str):
        pool_by = [pool_by]
    # Build a filtered copy once, up front.  The previous in-loop
    # pool_by.remove('file_type') modified the caller's list and the shared
    # default argument.  file_type is excluded from the pooling tag.
    pool_by = [field for field in pool_by if field != 'file_type']

    pooled_observations = {}
    pooled_expectations = {}
    pairs = iter_prediction_expectation_pairs(obs_dir_fp,
                                              exp_dir_fp,
                                              file_name_field_order,
                                              file_name_delimiter,
                                              verbose=verbose)

    # file_number (0,1,2...) is passed to update_pooled_data as a string
    for file_number, (obs_table, exp_table, filename) in enumerate(pairs):
        filename_metadata = get_metadata_from_filename(filename,
                                                       file_name_field_order,
                                                       file_name_delimiter,
                                                       verbose=verbose)

        # One slot per filename field: pooled fields keep their real value,
        # all other fields collapse to the placeholder 'all'
        combined_tag = ['all'] * len(file_name_field_order)
        for field, idx in file_name_field_order.items():
            if field in pool_by:
                combined_tag[idx] = filename_metadata[field]
        tags = [file_name_delimiter.join(combined_tag)]

        if verbose:
            # Single-argument print: same output under the Python 2 print
            # statement and the print function
            print("Pooling by: %s" % (pool_by,))
            print("Combined tags: %s" % (tags,))

        pooled_observations, pooled_expectations = update_pooled_data(
            obs_table, exp_table, tags, pooled_observations,
            pooled_expectations, str(file_number), verbose=verbose)

    return pooled_observations, pooled_expectations
Ejemplo n.º 4
0
            if strict:
                raise IOError(e)
            else:
                if verbose:
                    print "Missing expectation file....skipping!"
                continue
        base_tag =  '%s\t%s\t' %(holdout_method,prediction_method)
        tags = [base_tag+'all_results']
        combined_tag = base_tag +\
                "\t".join([str(field)+"_"+str(filename_components[file_name_field_order[field]]) for field in pool_by])
        tags.append(combined_tag)

        #TODO: abstract out pooling into its own function
        non_pooled_fields = [filename_components.get(file_name_field_order[k],None) for k in file_name_field_order.keys() if k not in pool_by]
        pooled_observations,pooled_expectations =\
                update_pooled_data(obs_table,exp_table,tags,pooled_observations,\
          pooled_expectations,str(file_number),verbose=verbose)

    return run_accuracy_calculations_on_pooled_data(pooled_observations,\
      pooled_expectations,roc_success_criteria=roc_success_criteria,verbose=verbose)

def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    pool_by = opts.pool_by.split(',')


    #create output directory
    make_output_dir(opts.output_dir)

    #Construct a dict from user specified field order
    file_name_field_order = {}