def test_procrustes_monte_carlo(self):
    """ sanity test of procrustes_monte_carlo wrapper function

    THIS TEST MAY RANDOMLY FAIL BECAUSE IT IS BASED ON TESTING RANDOM
    PERMUTATIONS, BUT THAT SHOULD BE RARE.
    """
    def shuffle_f(coords):
        """ A fake shuffle function -- used to avoid random test failures

        returns a re-ordered coords2
        """
        return array([
            [-0.16713783, 0.22321481, 0.33766418, 0.22785083,
             -0.23830446, -0.18754852],
            [-0.14864343, 0.07290181, -0.06250315, 0.03201972,
             -0.0966749, -0.10337987],
            [0.35725269, -0.00761567, 0.09044279, -0.21006839,
             -0.01355589, -0.04590791],
            [0.26535811, 0.09772598, 0.04339214, -0.21014987,
             0.14089095, -0.10261849]])

    observed = procrustes_monte_carlo(StringIO(pcoa1_f),
                                      StringIO(pcoa2_f),
                                      trials=100,
                                      shuffle_f=shuffle_f)
    # Sanity checks only -- the individual components are tested
    # elsewhere. These expected values were captured from a known-good
    # run to guard against regressions.
    self.assertAlmostEqual(observed[0], 0.0211, 3)
    self.assertEqual(len(observed[1]), 100)
    self.assertEqual(observed[2], 0)
    self.assertEqual(observed[3], 0.0)
def test_procrustes_monte_carlo(self):
    """ sanity test of procrustes_monte_carlo wrapper function

    THIS TEST MAY RANDOMLY FAIL BECAUSE IT IS BASED ON TESTING RANDOM
    PERMUTATIONS, BUT THAT SHOULD BE RARE.
    """
    def shuffle_f(coords):
        """ A fake shuffle function -- used to avoid random test failures

        returns a re-ordered coords2
        """
        fixed_permutation = [
            [-0.16713783, 0.22321481, 0.33766418, 0.22785083,
             -0.23830446, -0.18754852],
            [-0.14864343, 0.07290181, -0.06250315, 0.03201972,
             -0.0966749, -0.10337987],
            [0.35725269, -0.00761567, 0.09044279, -0.21006839,
             -0.01355589, -0.04590791],
            [0.26535811, 0.09772598, 0.04339214, -0.21014987,
             0.14089095, -0.10261849]]
        return array(fixed_permutation)

    # Unpack the four result components directly; expected values were
    # captured from a known-good run (individual components are tested
    # elsewhere, so this is a regression sanity check only).
    m2, trial_m2s, count_better, p_value = procrustes_monte_carlo(
        self.pcoa1_f, self.pcoa2_f, trials=100, shuffle_f=shuffle_f)
    self.assertAlmostEqual(m2, 0.0211, 3)
    self.assertEqual(len(trial_m2s), 100)
    self.assertEqual(count_better, 0)
    self.assertEqual(p_value, 0.0)
def compute_procrustes(result_tables, expected_pc_lookup, taxonomy_level=6, num_dimensions=3, random_trials=999): """ Compute Procrustes M2 and p-values for a set of results result_tables: 2d list of tables to be compared to expected tables, where the data in the inner list is: [dataset_id, reference_database_id, method_id, parameter_combination_id, table_fp] expected_pc_lookup: 2d dict of dataset_id, reference_db_id to principal coordinate matrices, for the expected result coordinate matrices taxonomy_level: level to compute results """ ### Start code copied ALMOST* directly from compute_prfs - some re-factoring for re-use is ### in order here. *ALMOST refers to changes to parser and variable names since expected ### is a pc matrix here. for dataset_id, reference_id, method_id, params, actual_table_fp in result_tables: ## parse the expected table (unless taxonomy_level is specified, this should be ## collapsed on level 6 taxonomy) try: expected_pc_fp = expected_pc_lookup[dataset_id][reference_id] except KeyError: raise KeyError, "Can't find expected table for (%s, %s)." % (dataset_id, reference_id) ## parse the actual table and collapse it at the specified taxonomic level try: actual_table = parse_biom_table(open(actual_table_fp, "U")) except ValueError: raise ValueError, "Couldn't parse BIOM table: %s" % actual_table_fp collapse_by_taxonomy = get_taxonomy_collapser(taxonomy_level) actual_table = actual_table.collapseObservationsByMetadata(collapse_by_taxonomy) ### End code copied directly from compute_prfs. # Next block of code, how do I hate thee? Let me count the ways... 
# (1) dist_bray_curtis doesn't take a BIOM Table object # (2) pcoa takes a qiime-formatted distance matrix as a list of lines # (3) pcoa return a qiime-formatted pc matrix # (4) procrustes_monte_carlo needs to pass through the pc "file" multiple # times, so we actually *need* those the pcs that get passed in to be # lists of lines dm = dist_bray_curtis(asarray([v for v in actual_table.iterSampleData()])) formatted_dm = format_distance_matrix(actual_table.SampleIds, dm) actual_pc = pcoa(formatted_dm.split("\n")).split("\n") expected_pc = list(open(expected_pc_fp, "U")) ## run Procrustes analysis with monte carlo simulation actual_m_squared, trial_m_squareds, count_better, mc_p_value = procrustes_monte_carlo( expected_pc, actual_pc, trials=random_trials, max_dimensions=num_dimensions, sample_id_map=None, trial_output_dir=None, ) yield (dataset_id, reference_id, method_id, params, actual_m_squared, mc_p_value)
def test_procrustes_monte_carlo(self):
    """ sanity test of procrustes_monte_carlo wrapper function"""
    # Unpack the four result components; expected values were captured
    # from a known-good run (components are individually tested
    # elsewhere, so this just guards against regressions).
    m2, trial_m2s, count_better, p_value = procrustes_monte_carlo(
        self.pcoa1_f, self.pcoa2_f, trials=10)
    self.assertAlmostEqual(m2, 0.0211, 3)
    self.assertEqual(len(trial_m2s), 10)
    self.assertEqual(count_better, 0)
    self.assertEqual(p_value, 0.0)
def test_procrustes_monte_carlo(self):
    """ sanity test of procrustes_monte_carlo wrapper function"""
    result = procrustes_monte_carlo(self.pcoa1_f, self.pcoa2_f, trials=10)
    # Sanity checks only -- the individual components are already
    # tested; these known values come from inspecting a previous run
    # and ensure the output has not changed.
    known_m2 = 0.0211
    known_num_trials = 10
    known_count_better = 0
    known_p_value = 0.0
    self.assertAlmostEqual(result[0], known_m2, 3)
    self.assertEqual(len(result[1]), known_num_trials)
    self.assertEqual(result[2], known_count_better)
    self.assertEqual(result[3], known_p_value)
def main():
    """Run Procrustes analyses of a reference coordinate matrix against
    one or more query matrices.

    The first input filepath is the fixed reference; each remaining input
    is transformed against it. Transformed matrices are written to
    output_dir, and a tab-separated summary (with Monte Carlo p-values
    when --random_trials is given) is written to procrustes_results.txt.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        # Bug fix: message said "trails" instead of "trials".
        option_parser.error(
            'Must perform >= 10 trials for Monte Carlo analysis.')

    if sample_id_map_fps and \
            (len(sample_id_map_fps) + 1) != len(opts.input_fps):
        option_parser.error('If providing sample id maps, there must be '
                            'exactly one fewer sample id maps than input '
                            'coordinate matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    reference_input_fp_dir, input_fn1 = split(reference_input_fp)
    reference_input_fp_basename, reference_input_fp_ext = splitext(input_fn1)
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo '
        'p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently '
        'adjusted for multiple comparisons.']

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_dir, query_input_fn = split(query_input_fp)
        query_input_fp_basename, query_input_fp_ext = splitext(query_input_fn)
        output_matrix1_fp = join(
            output_dir,
            '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        # sample id maps are matched positionally to the query inputs
        if sample_id_map_fps:
            with open(sample_id_map_fps[i], "U") as f:
                sample_id_map = dict([
                    (k, v[0]) for k, v in fields_to_dict(f).iteritems()
                ])
        else:
            sample_id_map = None

        with open(reference_input_fp, 'U') as ref_in_f:
            with open(query_input_fp, 'U') as query_in_f:
                transf_coords1, transf_coords2, m_squared, rand_coords2 =\
                    get_procrustes_results(ref_in_f, query_in_f,
                                           sample_id_map=sample_id_map,
                                           randomize=False,
                                           max_dimensions=num_dimensions)
        transf_coords1.write(output_matrix1_fp)
        transf_coords2.write(output_matrix2_fp)

        if random_trials:
            if opts.store_trial_details:
                # Bug fix: was "'trial_details_%d' % i + 2", which
                # formats '%d' with i first and then attempts str + int,
                # raising TypeError. The intended directory suffix is
                # i + 2 (queries are numbered from 2).
                trial_output_dir = join(output_dir,
                                        'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None
            # Bug fix: the original leaked these two file handles; use
            # context managers so they are closed after the MC run.
            with open(reference_input_fp, 'U') as coords_f1, \
                    open(query_input_fp, 'U') as coords_f2:
                actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
                    procrustes_monte_carlo(coords_f1,
                                           coords_f2,
                                           trials=random_trials,
                                           max_dimensions=num_dimensions,
                                           sample_id_map=sample_id_map,
                                           trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(
                mc_p_value, random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 m_squared))

    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
def main():
    """Run Procrustes analyses of a reference coordinate matrix against
    one or more query matrices.

    The first input filepath is the fixed reference; each remaining input
    is transformed against it. Transformed matrices are written to
    output_dir, and a tab-separated summary (with Monte Carlo p-values
    when --random_trials is given) is written to procrustes_results.txt.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        # Bug fix: message said "trails" instead of "trials".
        option_parser.error(
            'Must perform >= 10 trials for Monte Carlo analysis.')

    if sample_id_map_fps and \
            (len(sample_id_map_fps) + 1) != len(opts.input_fps):
        option_parser.error('If providing sample id maps, there must be exactly'
                            ' one fewer sample id maps than input coordinate'
                            ' matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    reference_input_fp_dir, input_fn1 = split(reference_input_fp)
    reference_input_fp_basename, reference_input_fp_ext = splitext(input_fn1)
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently adjusted for multiple comparisons.']

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_dir, query_input_fn = split(query_input_fp)
        query_input_fp_basename, query_input_fp_ext = splitext(query_input_fn)
        output_matrix1_fp = join(
            output_dir,
            '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        # sample id maps are matched positionally to the query inputs
        if sample_id_map_fps:
            # Bug fix: the original never closed this file handle.
            sample_id_map_f = open(sample_id_map_fps[i], "U")
            sample_id_map = dict([(k, v[0]) for k, v in
                                  fields_to_dict(sample_id_map_f).items()])
            sample_id_map_f.close()
        else:
            sample_id_map = None

        transformed_coords1, transformed_coords2, m_squared, randomized_coords2 = \
            get_procrustes_results(open(reference_input_fp, 'U'),
                                   open(query_input_fp, 'U'),
                                   sample_id_map=sample_id_map,
                                   randomize=False,
                                   max_dimensions=num_dimensions)

        output_matrix1_f = open(output_matrix1_fp, 'w')
        output_matrix1_f.write(transformed_coords1)
        output_matrix1_f.close()
        output_matrix2_f = open(output_matrix2_fp, 'w')
        output_matrix2_f.write(transformed_coords2)
        output_matrix2_f.close()

        if random_trials:
            if opts.store_trial_details:
                # Bug fix: was "'trial_details_%d' % i+2", which formats
                # '%d' with i first and then attempts str + int, raising
                # TypeError. The intended directory suffix is i + 2
                # (queries are numbered from 2).
                trial_output_dir = join(output_dir,
                                        'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None
            # procrustes_monte_carlo makes multiple passes over the pc
            # "files", so materialize them as lists of lines
            coords_f1 = list(open(reference_input_fp, 'U'))
            coords_f2 = list(open(query_input_fp, 'U'))
            actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
                procrustes_monte_carlo(coords_f1,
                                       coords_f2,
                                       trials=random_trials,
                                       max_dimensions=num_dimensions,
                                       sample_id_map=sample_id_map,
                                       trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(mc_p_value,
                                                          random_trials)
            summary_file_lines.append('%s\t%s\t%s\t%s\t%d\t%1.3f' %
                                      (reference_input_fp, query_input_fp,
                                       max_dims_str, mc_p_value_str,
                                       count_better, actual_m_squared))
        else:
            summary_file_lines.append('%s\t%s\t%s\tNA\tNA\t%1.3f' %
                                      (reference_input_fp, query_input_fp,
                                       max_dims_str, m_squared))

    # Write output summary
    f = open(output_summary_fp, 'w')
    f.write('\n'.join(summary_file_lines))
    f.write('\n')
    f.close()
def main():
    """Compare exactly two principal coordinate matrices with Procrustes
    analysis.

    Writes both transformed matrices into output_dir and, when
    --random_trials is given, a one-result summary file that includes the
    Monte Carlo p-value.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    random_trials = opts.random_trials
    if random_trials != None and random_trials < 10:
        option_parser.error(
            'Must perform >= 10 trails for Monte Carlo analysis.')

    output_dir = opts.output_dir
    sample_id_map_fp = opts.sample_id_map_fp
    num_dimensions = opts.num_dimensions

    if not exists(output_dir):
        makedirs(output_dir)

    if opts.store_trial_details:
        trial_output_dir = '%s/trial_details/' % output_dir
    else:
        trial_output_dir = None

    input_fp1, input_fp2 = opts.input_fps[0], opts.input_fps[1]
    input_fp1_dir, input_fn1 = split(input_fp1)
    input_fp1_basename, input_fp1_ext = splitext(input_fn1)
    input_fp2_dir, input_fn2 = split(input_fp2)
    input_fp2_basename, input_fp2_ext = splitext(input_fn2)

    output_summary_fp = '%s/%s_%s_procrustes_results.txt' % \
        (output_dir, input_fp1_basename, input_fp2_basename)
    output_matrix1_fp = '%s/pc1_transformed.txt' % output_dir
    output_matrix2_fp = '%s/pc2_transformed.txt' % output_dir

    if sample_id_map_fp:
        field_lookup = fields_to_dict(open(sample_id_map_fp, "U"))
        sample_id_map = dict([(k, v[0]) for k, v in field_lookup.items()])
    else:
        sample_id_map = None

    transformed_coords1, transformed_coords2, m_squared, randomized_coords2 = \
        get_procrustes_results(open(input_fp1, 'U'),
                               open(input_fp2, 'U'),
                               sample_id_map=sample_id_map,
                               randomize=False,
                               max_dimensions=num_dimensions)

    # write out both transformed coordinate matrices
    for matrix_fp, matrix_text in ((output_matrix1_fp, transformed_coords1),
                                   (output_matrix2_fp, transformed_coords2)):
        matrix_f = open(matrix_fp, 'w')
        matrix_f.write(matrix_text)
        matrix_f.close()

    if random_trials:
        summary_file_lines = [
            'FP1 FP2 Included_dimensions MC_p_value Count_better M^2']
        # the monte carlo function re-reads the pc "files", so pass them
        # in as lists of lines
        coords_f1 = list(open(input_fp1, 'U'))
        coords_f2 = list(open(input_fp2, 'U'))
        actual_m_squared, trial_m_squareds, count_better, mc_p_value = \
            procrustes_monte_carlo(coords_f1,
                                   coords_f2,
                                   trials=random_trials,
                                   max_dimensions=num_dimensions,
                                   sample_id_map=sample_id_map,
                                   trial_output_dir=trial_output_dir)
        # truncate the p-value to the correct number of significant
        # digits
        mc_p_value_str = format_p_value_for_num_iters(mc_p_value,
                                                      random_trials)
        max_dims_str = str(num_dimensions or 'alldim')
        summary_file_lines.append(
            '%s %s %s %s %d %1.3f' % (input_fp1, input_fp2,
                                      str(max_dims_str), mc_p_value_str,
                                      count_better, actual_m_squared))
        summary_f = open(output_summary_fp, 'w')
        summary_f.write('\n'.join(summary_file_lines))
        summary_f.write('\n')
        summary_f.close()