def test_procrustes_monte_carlo(self):
        """Sanity-check the procrustes_monte_carlo wrapper function.

        A fixed stand-in for the shuffle function is handed to
        procrustes_monte_carlo so that every call to it returns the same
        coordinates; this is done to avoid random test failures.
        """

        def shuffle_f(coords):
            """Fake shuffle function: ignores ``coords``.

            Always returns the same hard-coded matrix (a re-ordered
            coords2), built fresh on every call.
            """
            fixed_rows = [
                [-0.16713783, 0.22321481, 0.33766418, 0.22785083, -0.23830446, -0.18754852],
                [-0.14864343, 0.07290181, -0.06250315, 0.03201972, -0.0966749, -0.10337987],
                [0.35725269, -0.00761567, 0.09044279, -0.21006839, -0.01355589, -0.04590791],
                [0.26535811, 0.09772598, 0.04339214, -0.21014987, 0.14089095, -0.10261849],
            ]
            return array(fixed_rows)

        n_trials = 100
        coords_f1 = StringIO(pcoa1_f)
        coords_f2 = StringIO(pcoa2_f)
        result = procrustes_monte_carlo(
            coords_f1,
            coords_f2,
            trials=n_trials,
            shuffle_f=shuffle_f,
        )

        # The individual components are tested elsewhere; the values below
        # were taken from the output of a run and only guard against that
        # output changing.
        # result[0]: M^2 of the actual data (compared to 3 decimal places)
        self.assertAlmostEqual(result[0], 0.0211, 3)
        # result[1]: one M^2 value per trial
        self.assertEqual(len(result[1]), n_trials)
        # result[2]: count of trials that did better than the actual data
        self.assertEqual(result[2], 0)
        # result[3]: Monte Carlo p-value
        self.assertEqual(result[3], 0.0)
    def test_procrustes_monte_carlo(self):
        """Sanity-check the procrustes_monte_carlo wrapper function.

        The shuffle step is replaced by a fake that always returns the
        same coordinates, which is done to avoid random test failures.
        """
        def shuffle_f(coords):
            """Fake shuffle function: ignores ``coords``.

            Returns a freshly built, hard-coded matrix (a re-ordered
            coords2) on every call.
            """
            row_a = [-0.16713783, 0.22321481, 0.33766418,
                     0.22785083, -0.23830446, -0.18754852]
            row_b = [-0.14864343, 0.07290181, -0.06250315,
                     0.03201972, -0.0966749, -0.10337987]
            row_c = [0.35725269, -0.00761567, 0.09044279,
                     -0.21006839, -0.01355589, -0.04590791]
            row_d = [0.26535811, 0.09772598, 0.04339214,
                     -0.21014987, 0.14089095, -0.10261849]
            return array([row_a, row_b, row_c, row_d])

        trial_count = 100
        outcome = procrustes_monte_carlo(
            self.pcoa1_f, self.pcoa2_f,
            trials=trial_count, shuffle_f=shuffle_f)

        # The individual components are tested elsewhere; these expected
        # values were read off the output of a run and only detect that
        # the output has changed.
        self.assertAlmostEqual(outcome[0], 0.0211, 3)      # actual M^2
        self.assertEqual(len(outcome[1]), trial_count)     # per-trial M^2s
        self.assertEqual(outcome[2], 0)                    # count better
        self.assertEqual(outcome[3], 0.0)                  # p-value
def compute_procrustes(result_tables, expected_pc_lookup, taxonomy_level=6, num_dimensions=3, random_trials=999):
    """ Compute Procrustes M2 and p-values for a set of results

        This is a generator: nothing is computed until it is iterated.

        result_tables: 2d list of tables to be compared to expected tables,
         where the data in the inner list is:
          [dataset_id, reference_database_id, method_id,
           parameter_combination_id, table_fp]
        expected_pc_lookup: 2d dict of dataset_id, reference_db_id to the
         filepath of the expected principal coordinate matrix
        taxonomy_level: level to compute results
        num_dimensions: passed to procrustes_monte_carlo as max_dimensions
        random_trials: passed to procrustes_monte_carlo as trials

        Yields one tuple per entry of result_tables:
         (dataset_id, reference_id, method_id, params, actual_m_squared,
          mc_p_value)

        Raises KeyError if expected_pc_lookup has no entry for a
         (dataset_id, reference_id) pair, and ValueError if parsing the
         BIOM table raises ValueError.
    """
    ### Start code copied ALMOST* directly from compute_prfs - some re-factoring for re-use is
    ### in order here. *ALMOST refers to changes to parser and variable names since expected
    ### is a pc matrix here.

    for dataset_id, reference_id, method_id, params, actual_table_fp in result_tables:
        ## look up the filepath of the expected principal coordinate matrix
        try:
            expected_pc_fp = expected_pc_lookup[dataset_id][reference_id]
        except KeyError:
            raise KeyError("Can't find expected table for (%s, %s)." % (dataset_id, reference_id))

        ## parse the actual table and collapse it at the specified taxonomic level
        try:
            # the context manager closes the handle even if parsing fails
            with open(actual_table_fp, "U") as actual_table_f:
                actual_table = parse_biom_table(actual_table_f)
        except ValueError:
            raise ValueError("Couldn't parse BIOM table: %s" % actual_table_fp)
        collapse_by_taxonomy = get_taxonomy_collapser(taxonomy_level)
        actual_table = actual_table.collapseObservationsByMetadata(collapse_by_taxonomy)
        ### End code copied directly from compute_prfs.

        # Next block of code, how do I hate thee? Let me count the ways...
        # (1) dist_bray_curtis doesn't take a BIOM Table object
        # (2) pcoa takes a qiime-formatted distance matrix as a list of lines
        # (3) pcoa return a qiime-formatted pc matrix
        # (4) procrustes_monte_carlo needs to pass through the pc "file" multiple
        #     times, so we actually *need* the pcs that get passed in to be
        #     lists of lines
        dm = dist_bray_curtis(asarray([v for v in actual_table.iterSampleData()]))
        formatted_dm = format_distance_matrix(actual_table.SampleIds, dm)
        actual_pc = pcoa(formatted_dm.split("\n")).split("\n")
        with open(expected_pc_fp, "U") as expected_pc_f:
            expected_pc = list(expected_pc_f)

        ## run Procrustes analysis with monte carlo simulation; the per-trial
        ## M^2 values and the count-better tally are not reported
        actual_m_squared, _trial_m_squareds, _count_better, mc_p_value = procrustes_monte_carlo(
            expected_pc,
            actual_pc,
            trials=random_trials,
            max_dimensions=num_dimensions,
            sample_id_map=None,
            trial_output_dir=None,
        )

        yield (dataset_id, reference_id, method_id, params, actual_m_squared, mc_p_value)
 def test_procrustes_monte_carlo(self):
     """Sanity-check the procrustes_monte_carlo wrapper with 10 trials.

     NOTE(review): no shuffle function or seed is passed here, so the
     count-better and p-value checks presumably depend on random
     trials -- confirm whether this test can fail intermittently.
     """
     n_trials = 10
     result = procrustes_monte_carlo(
         self.pcoa1_f, self.pcoa2_f, trials=n_trials)

     # The individual components are tested elsewhere; the expected values
     # were taken from the output of a run and only detect that the output
     # has changed.
     self.assertAlmostEqual(result[0], 0.0211, 3)   # actual M^2
     self.assertEqual(len(result[1]), n_trials)     # one M^2 per trial
     self.assertEqual(result[2], 0)                 # count better
     self.assertEqual(result[3], 0.0)               # p-value
 def test_procrustes_monte_carlo(self):
     """Sanity test of the procrustes_monte_carlo wrapper (10 trials).

     NOTE(review): the call below supplies neither a shuffle function
     nor a seed, so the last two checks presumably rely on the outcome
     of random trials -- confirm.
     """
     trial_count = 10
     outcome = procrustes_monte_carlo(self.pcoa1_f,
                                      self.pcoa2_f,
                                      trials=trial_count)

     # Components are covered by their own tests; these values were read
     # off a previous run and only guard against the output changing.
     observed_m2 = outcome[0]
     self.assertAlmostEqual(observed_m2, 0.0211, 3)
     observed_trial_m2s = outcome[1]
     self.assertEqual(len(observed_trial_m2s), trial_count)
     observed_count_better = outcome[2]
     self.assertEqual(observed_count_better, 0)
     observed_p_value = outcome[3]
     self.assertEqual(observed_p_value, 0.0)
Example #6
0
def main():
    """Run Procrustes analysis of each query matrix against a reference.

    The first entry of ``opts.input_fps`` is the reference coordinate
    matrix and every following entry is a query.  For each query the
    transformed reference and query coordinates are written into
    ``opts.output_dir``, and one line per query is appended to
    ``procrustes_results.txt`` in that directory.  When
    ``opts.random_trials`` is set, a Monte Carlo analysis is also run and
    its p-value, count-better and M^2 are reported; otherwise those
    columns are ``NA`` and the M^2 of the plain analysis is reported.

    Exits through ``option_parser.error`` if fewer than 10 random trials
    are requested, or if sample id maps are given but their number is not
    exactly one fewer than the number of input matrices.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        option_parser.error(
            'Must perform >= 10 trails for Monte Carlo analysis.')

    if sample_id_map_fps and \
       (len(sample_id_map_fps) + 1) != len(opts.input_fps):
        option_parser.error('If providing sample id maps, there must be '
                            'exactly one fewer sample id maps than input '
                            'coordinate matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    reference_input_fp_basename = splitext(split(reference_input_fp)[1])[0]
    # The reference output path does not depend on the query, so the same
    # file is rewritten on every iteration of the loop below.
    output_matrix1_fp = join(
        output_dir,
        '%s_transformed_reference.txt' % reference_input_fp_basename)
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo '
        'p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently '
        'adjusted for multiple comparisons.'
    ]

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_basename = splitext(split(query_input_fp)[1])[0]
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        if sample_id_map_fps:
            # keep only the first value of each mapping entry
            with open(sample_id_map_fps[i], "U") as f:
                sample_id_map = dict([
                    (k, v[0]) for k, v in fields_to_dict(f).iteritems()
                ])
        else:
            sample_id_map = None

        with open(reference_input_fp, 'U') as ref_in_f:
            with open(query_input_fp, 'U') as query_in_f:
                transf_coords1, transf_coords2, m_squared, rand_coords2 =\
                    get_procrustes_results(ref_in_f, query_in_f,
                                           sample_id_map=sample_id_map,
                                           randomize=False,
                                           max_dimensions=num_dimensions)

        transf_coords1.write(output_matrix1_fp)
        transf_coords2.write(output_matrix2_fp)

        if random_trials:
            if opts.store_trial_details:
                # '%' binds tighter than '+', so the sum must be
                # parenthesized; unparenthesized it adds an int to the
                # formatted string and raises TypeError.
                trial_output_dir = join(output_dir,
                                        'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None
            # Keep both handles open for the duration of the Monte Carlo
            # run and close them afterwards, even on error.
            with open(reference_input_fp, 'U') as coords_f1:
                with open(query_input_fp, 'U') as coords_f2:
                    (actual_m_squared, trial_m_squareds, count_better,
                     mc_p_value) = procrustes_monte_carlo(
                        coords_f1,
                        coords_f2,
                        trials=random_trials,
                        max_dimensions=num_dimensions,
                        sample_id_map=sample_id_map,
                        trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(
                mc_p_value, random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str, m_squared))
    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
def main():
    """Run Procrustes analysis of each query matrix against a reference.

    The first entry of ``opts.input_fps`` is the reference coordinate
    matrix and every following entry is a query.  For each query the
    transformed reference and query coordinates are written into
    ``opts.output_dir``, and one line per query is appended to
    ``procrustes_results.txt`` in that directory.  When
    ``opts.random_trials`` is set, a Monte Carlo analysis is also run and
    its p-value, count-better and M^2 are reported; otherwise those
    columns are ``NA`` and the M^2 of the plain analysis is reported.

    Exits through ``option_parser.error`` if fewer than 10 random trials
    are requested, or if sample id maps are given but their number is not
    exactly one fewer than the number of input matrices.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        option_parser.error('Must perform >= 10 trails for Monte Carlo analysis.')

    if sample_id_map_fps and \
       (len(sample_id_map_fps) + 1) != len(opts.input_fps):
        option_parser.error('If providing sample id maps, there must be exactly'
                            ' one fewer sample id maps than input coordinate'
                            ' matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    reference_input_fp_basename = splitext(split(reference_input_fp)[1])[0]
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently adjusted for multiple comparisons.',
    ]

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_basename = splitext(split(query_input_fp)[1])[0]
        output_matrix1_fp = join(
            output_dir,
            '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        if sample_id_map_fps:
            # keep only the first value of each mapping entry
            with open(sample_id_map_fps[i], "U") as sample_id_map_f:
                sample_id_map = dict(
                    [(k, v[0])
                     for k, v in fields_to_dict(sample_id_map_f).items()])
        else:
            sample_id_map = None

        # Context managers close the input handles even if the analysis
        # raises.
        with open(reference_input_fp, 'U') as reference_f:
            with open(query_input_fp, 'U') as query_f:
                (transformed_coords1, transformed_coords2, m_squared,
                 randomized_coords2) = get_procrustes_results(
                    reference_f,
                    query_f,
                    sample_id_map=sample_id_map,
                    randomize=False,
                    max_dimensions=num_dimensions)

        with open(output_matrix1_fp, 'w') as output_matrix1_f:
            output_matrix1_f.write(transformed_coords1)
        with open(output_matrix2_fp, 'w') as output_matrix2_f:
            output_matrix2_f.write(transformed_coords2)

        if random_trials:
            if opts.store_trial_details:
                # '%' binds tighter than '+', so the sum must be
                # parenthesized; unparenthesized it adds an int to the
                # formatted string and raises TypeError.
                trial_output_dir = join(output_dir,
                                        'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None
            # The coordinates are handed over as lists of lines, so the
            # files can be closed as soon as they have been read.
            with open(reference_input_fp, 'U') as reference_f:
                coords_f1 = list(reference_f)
            with open(query_input_fp, 'U') as query_f:
                coords_f2 = list(query_f)
            actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
                procrustes_monte_carlo(coords_f1,
                                       coords_f2,
                                       trials=random_trials,
                                       max_dimensions=num_dimensions,
                                       sample_id_map=sample_id_map,
                                       trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(mc_p_value, random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str, m_squared))
    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
def main():
    """Run Procrustes analysis on the first two input coordinate matrices.

    The transformed coordinates are written to ``pc1_transformed.txt``
    and ``pc2_transformed.txt`` in ``opts.output_dir``.  When
    ``opts.random_trials`` is set, a Monte Carlo analysis is also run and
    a one-row summary is written to
    ``<basename1>_<basename2>_procrustes_results.txt`` in the same
    directory; without random trials no summary file is written.

    Exits through ``option_parser.error`` if fewer than 10 random trials
    are requested.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    random_trials = opts.random_trials
    if random_trials is not None and random_trials < 10:
        option_parser.error('Must perform >= 10 trails for Monte Carlo analysis.')

    output_dir = opts.output_dir
    sample_id_map_fp = opts.sample_id_map_fp
    num_dimensions = opts.num_dimensions

    if not exists(output_dir):
        makedirs(output_dir)

    if opts.store_trial_details:
        trial_output_dir = '%s/trial_details/' % output_dir
    else:
        trial_output_dir = None

    input_fp1 = opts.input_fps[0]
    input_fp2 = opts.input_fps[1]
    input_fp1_basename = splitext(split(input_fp1)[1])[0]
    input_fp2_basename = splitext(split(input_fp2)[1])[0]
    output_summary_fp = '%s/%s_%s_procrustes_results.txt' %\
        (output_dir, input_fp1_basename, input_fp2_basename)
    output_matrix1_fp = '%s/pc1_transformed.txt' % output_dir
    output_matrix2_fp = '%s/pc2_transformed.txt' % output_dir

    if sample_id_map_fp:
        # keep only the first value of each mapping entry
        with open(sample_id_map_fp, "U") as sample_id_map_f:
            sample_id_map = dict(
                [(k, v[0])
                 for k, v in fields_to_dict(sample_id_map_f).items()])
    else:
        sample_id_map = None

    # Context managers close the input handles even if the analysis raises.
    with open(input_fp1, 'U') as input_f1:
        with open(input_fp2, 'U') as input_f2:
            (transformed_coords1, transformed_coords2, m_squared,
             randomized_coords2) = get_procrustes_results(
                input_f1,
                input_f2,
                sample_id_map=sample_id_map,
                randomize=False,
                max_dimensions=num_dimensions)
    with open(output_matrix1_fp, 'w') as output_matrix1_f:
        output_matrix1_f.write(transformed_coords1)
    with open(output_matrix2_fp, 'w') as output_matrix2_f:
        output_matrix2_f.write(transformed_coords2)

    if random_trials:
        summary_file_lines = ['FP1 FP2 Included_dimensions MC_p_value Count_better M^2']
        # The coordinates are handed over as lists of lines, so the files
        # can be closed as soon as they have been read.
        with open(input_fp1, 'U') as input_f1:
            coords_f1 = list(input_f1)
        with open(input_fp2, 'U') as input_f2:
            coords_f2 = list(input_f2)
        actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
            procrustes_monte_carlo(coords_f1,
                                   coords_f2,
                                   trials=random_trials,
                                   max_dimensions=num_dimensions,
                                   sample_id_map=sample_id_map,
                                   trial_output_dir=trial_output_dir)
        # truncate the p-value to the correct number of significant
        # digits
        mc_p_value_str = format_p_value_for_num_iters(mc_p_value, random_trials)
        max_dims_str = str(num_dimensions or 'alldim')
        summary_file_lines.append(
            '%s %s %s %s %d %1.3f' %
            (input_fp1, input_fp2, max_dims_str, mc_p_value_str,
             count_better, actual_m_squared))
        with open(output_summary_fp, 'w') as f:
            f.write('\n'.join(summary_file_lines))
            f.write('\n')