def test_get_simsam_rep_num(self):
    """Test getting number of necessary simsam reps."""
    # Observed pairs are consistent with a ceiling division of
    # desired sample size by available samples (42/13 -> 4, 50/10 -> 5).
    self.assertEqual(get_simsam_rep_num(42, 13), 4)
    self.assertEqual(get_simsam_rep_num(50, 10), 5)
    # Requesting a size we already have should be rejected.
    self.assertRaises(ValueError, get_simsam_rep_num, 42, 42)
def _build_simulated_data_commands(analysis_type, out_dir, even_otu_table_fp,
                                   map_fp, tree_fp, workflow):
    """Build shell command strings for the simulated-data analyses.

    Creates the directory tree under ``out_dir/simulated`` (one level per
    category / trial / sample size / dissimilarity) as a side effect, and
    returns a list of '&&'-joined shell pipelines (simsam.py, subsetting,
    beta diversity, PCoA) that still need to be executed. Steps whose output
    files already exist (per ``has_results``) are skipped.

    Parameters (as used by this code; semantics of helpers are defined
    elsewhere in this file):
      analysis_type -- e.g. 'gradient'; also passed to choose_data_subset.py
      out_dir -- root output directory
      even_otu_table_fp -- filepath of the (evenly sampled) OTU table
      map_fp -- filepath of the mapping file
      tree_fp -- filepath of the phylogenetic tree
      workflow -- dict with keys 'categories', 'num_sim_data_trials',
                  'sample_sizes', 'dissim', and 'metrics'

    Returns:
      list of str -- shell command pipelines to run.
    """
    cmds = []
    data_type_dir = join(out_dir, 'simulated')
    create_dir(data_type_dir)
    num_samps = get_num_samples_in_table(even_otu_table_fp)
    for category in workflow['categories']:
        # category appears to be a sequence whose first element is the
        # category name (category[0]) -- TODO confirm against caller.
        category_dir = join(data_type_dir, category[0])
        create_dir(category_dir)
        for trial_num in range(workflow['num_sim_data_trials']):
            trial_num_dir = join(category_dir, '%d' % trial_num)
            create_dir(trial_num_dir)
            for samp_size in workflow['sample_sizes']:
                samp_size_dir = join(trial_num_dir, '%d' % samp_size)
                create_dir(samp_size_dir)
                # Lots of duplicate code between these two blocks... need to
                # refactor and test.
                if samp_size <= num_samps:
                    # We already have enough real samples: subset the real
                    # table/map first, then simulate from the subset
                    # (a single simsam replicate suffices).
                    simsam_rep_num = 1
                    subset_otu_table_fp = join(samp_size_dir,
                                               basename(even_otu_table_fp))
                    subset_map_fp = join(samp_size_dir, basename(map_fp))
                    # Subsetting runs immediately (not queued in cmds) so that
                    # the sample-count assertions below can check its output.
                    if not has_results(samp_size_dir,
                            required_files=[basename(subset_otu_table_fp),
                                            basename(subset_map_fp)]):
                        run_command('choose_data_subset.py -t %s -i %s -m %s '
                                    '-c %s -n %d -o %s' % (analysis_type,
                                                           even_otu_table_fp,
                                                           map_fp,
                                                           category[0],
                                                           samp_size,
                                                           samp_size_dir))
                    # NOTE(review): asserts are stripped under python -O;
                    # consider raising instead if this check must always run.
                    assert get_num_samples_in_table(
                            subset_otu_table_fp) == samp_size
                    assert get_num_samples_in_map(subset_map_fp) == samp_size
                    for d in workflow['dissim']:
                        dissim_dir = join(samp_size_dir, repr(d))
                        create_dir(dissim_dir)
                        simsam_map_fp = join(dissim_dir,
                                add_filename_suffix(subset_map_fp,
                                        '_n%d_d%r' % (simsam_rep_num, d)))
                        simsam_otu_table_fp = join(dissim_dir,
                                add_filename_suffix(subset_otu_table_fp,
                                        '_n%d_d%r' % (simsam_rep_num, d)))
                        # Check for simulated table/map and various
                        # distance matrices / coordinates files.
                        required_simsam_files = [basename(simsam_map_fp),
                                                 basename(simsam_otu_table_fp)]
                        has_simsam_files = has_results(dissim_dir,
                                required_files=required_simsam_files)
                        has_metric_files = True
                        for metric in workflow['metrics']:
                            required_metric_files = ['dm.txt', 'map.txt',
                                                     'pc.txt']
                            if analysis_type == 'gradient':
                                required_metric_files.append(
                                        '%s_dm.txt' % category[0])
                            metric_dir = join(dissim_dir, metric[0])
                            has_metric_files = has_results(metric_dir,
                                    required_metric_files)
                            # One incomplete metric dir means the whole
                            # pipeline for this dissimilarity must be rerun.
                            if not has_metric_files:
                                break
                        if not (has_simsam_files and has_metric_files):
                            # Queue the full pipeline: simulate, then compute
                            # per-metric distance matrices and coordinates.
                            cmd = ['simsam.py -i %s -t %s -o %s -d %r -n %d '
                                   '-m %s' % (subset_otu_table_fp, tree_fp,
                                              dissim_dir, d, simsam_rep_num,
                                              subset_map_fp)]
                            for metric in workflow['metrics']:
                                metric_dir = join(dissim_dir, metric[0])
                                create_dir(metric_dir)
                                if analysis_type == 'gradient':
                                    cmd.append('distance_matrix_from_mapping.py '
                                               '-i %s -c %s -o %s' % (
                                            simsam_map_fp, category[0],
                                            join(metric_dir,
                                                 '%s_dm.txt' % category[0])))
                                cmd.append('beta_diversity.py -i %s -o %s '
                                           '-m %s -t %s' % (
                                        simsam_otu_table_fp, metric_dir,
                                        metric[0], tree_fp))
                                # beta_diversity.py names its output after the
                                # metric and input table; normalize to dm.txt.
                                cmd.append('mv %s %s' % (join(metric_dir,
                                        '%s_%s.txt' % (metric[0],
                                                splitext(basename(
                                                simsam_otu_table_fp))[0])),
                                        join(metric_dir, 'dm.txt')))
                                cmd.append('cp %s %s' % (simsam_map_fp,
                                        join(metric_dir, 'map.txt')))
                                cmd.append('principal_coordinates.py -i %s '
                                           '-o %s' % (
                                        join(metric_dir, 'dm.txt'),
                                        join(metric_dir, 'pc.txt')))
                            cmds.append(' && '.join(cmd))
                else:
                    # We need to simulate more samples than we originally have.
                    # Order is reversed vs the branch above: simulate enough
                    # replicates from the FULL table first, then subset the
                    # simulated output down to samp_size.
                    simsam_rep_num = get_simsam_rep_num(samp_size, num_samps)
                    for d in workflow['dissim']:
                        dissim_dir = join(samp_size_dir, repr(d))
                        create_dir(dissim_dir)
                        simsam_map_fp = join(dissim_dir,
                                add_filename_suffix(map_fp,
                                        '_n%d_d%r' % (simsam_rep_num, d)))
                        simsam_otu_table_fp = join(dissim_dir,
                                add_filename_suffix(even_otu_table_fp,
                                        '_n%d_d%r' % (simsam_rep_num, d)))
                        required_simsam_files = [basename(simsam_map_fp),
                                                 basename(simsam_otu_table_fp)]
                        has_simsam_files = has_results(dissim_dir,
                                required_files=required_simsam_files)
                        # NOTE(review): the subset outputs are checked under
                        # dissim_dir/subset but with the same basenames as the
                        # simsam files (choose_data_subset.py keeps input
                        # basenames) -- confirm this is intentional.
                        required_subset_files = [basename(simsam_map_fp),
                                                 basename(simsam_otu_table_fp)]
                        has_subset_files = has_results(
                                join(dissim_dir, 'subset'),
                                required_files=required_subset_files)
                        has_metric_files = True
                        for metric in workflow['metrics']:
                            required_metric_files = ['dm.txt', 'map.txt',
                                                     'pc.txt']
                            if analysis_type == 'gradient':
                                required_metric_files.append(
                                        '%s_dm.txt' % category[0])
                            metric_dir = join(dissim_dir, metric[0])
                            has_metric_files = has_results(metric_dir,
                                    required_metric_files)
                            if not has_metric_files:
                                break
                        if not (has_simsam_files and has_subset_files and
                                has_metric_files):
                            # Simulate from the full table/map ...
                            cmd = ['simsam.py -i %s -t %s -o %s -d %r -n %d '
                                   '-m %s' % (even_otu_table_fp, tree_fp,
                                              dissim_dir, d, simsam_rep_num,
                                              map_fp)]
                            # ... then subset the simulated data to samp_size.
                            subset_dir = join(dissim_dir, 'subset')
                            cmd.append('choose_data_subset.py -t %s -i %s '
                                       '-m %s -c %s -n %d -o %s' % (
                                    analysis_type, simsam_otu_table_fp,
                                    simsam_map_fp, category[0], samp_size,
                                    subset_dir))
                            subset_otu_table_fp = join(subset_dir,
                                    basename(simsam_otu_table_fp))
                            subset_map_fp = join(subset_dir,
                                    basename(simsam_map_fp))
                            for metric in workflow['metrics']:
                                metric_dir = join(dissim_dir, metric[0])
                                create_dir(metric_dir)
                                if analysis_type == 'gradient':
                                    cmd.append('distance_matrix_from_mapping.py '
                                               '-i %s -c %s -o %s' % (
                                            subset_map_fp, category[0],
                                            join(metric_dir,
                                                 '%s_dm.txt' % category[0])))
                                cmd.append('beta_diversity.py -i %s -o %s '
                                           '-m %s -t %s' % (
                                        subset_otu_table_fp, metric_dir,
                                        metric[0], tree_fp))
                                # Normalize beta_diversity.py's metric-named
                                # output file to dm.txt.
                                cmd.append('mv %s %s' % (join(metric_dir,
                                        '%s_%s.txt' % (metric[0],
                                                splitext(basename(
                                                subset_otu_table_fp))[0])),
                                        join(metric_dir, 'dm.txt')))
                                cmd.append('cp %s %s' % (subset_map_fp,
                                        join(metric_dir, 'map.txt')))
                                cmd.append('principal_coordinates.py -i %s '
                                           '-o %s' % (
                                        join(metric_dir, 'dm.txt'),
                                        join(metric_dir, 'pc.txt')))
                            cmds.append(' && '.join(cmd))
    return cmds