def generate_data(analysis_type, in_dir, out_dir, workflow, tree_fp,
                  ipython_profile=None):
    """Generates real and simulated data for each study.

    Distance matrices will be created at each even sampling depth and metric
    using the provided tree if necessary. Shuffled versions of each distance
    matrix will also be created, which can be used as negative controls.
    Additionally, simulated gradient or cluster data will be created at
    varying sample sizes and dissimilarity levels (using simsam.py).

    analysis_type should be either 'gradient' or 'cluster'.

    Will create the following (heavily nested) output directory structure:

    out_dir/
        study/
            depth/
                even depth otu table (.biom)
                real/
                    metric/
                        original/
                            map.txt
                            dm.txt
                            pc.txt
                            <category>_dm.txt (if gradient)
                        shuff_num/
                            map.txt
                            dm.txt
                            pc.txt
                            <category>_dm.txt (if gradient)
                simulated/
                    category/
                        trial_num/
                            samp_size/
                                (optional) subset files dependent on
                                samp_size
                                dissim/
                                    subset files/dirs dependent on samp_size
                                    metric/
                                        map.txt
                                        dm.txt
                                        pc.txt
                                        <category>_dm.txt (if gradient)
    """
    create_dir(out_dir)

    cmds = []
    for study in workflow:
        study_dir = join(out_dir, study)
        create_dir(study_dir)

        otu_table_fp = join(in_dir, study, 'otu_table.biom')
        map_fp = join(in_dir, study, 'map.txt')
        # NOTE: a previous revision opened map_fp here ('U' mode) without
        # ever reading or closing it, leaking one file handle per study.
        # The handle was unused, so the open() has been removed.

        for depth in workflow[study]['depths']:
            depth_dir = join(study_dir, '%d' % depth[0])
            create_dir(depth_dir)

            # Rarefy the table first since simsam.py's output tables will
            # still have even sampling depth and we don't want to lose
            # simulated samples after the fact.
            even_otu_table_fp = join(depth_dir, basename(otu_table_fp))
            if not exists(even_otu_table_fp):
                run_command('single_rarefaction.py -i %s -o %s -d %d;' % (
                        otu_table_fp, even_otu_table_fp, depth[0]))

            cmds.extend(_build_real_data_commands(analysis_type, depth_dir,
                    even_otu_table_fp, map_fp, tree_fp, workflow[study]))
            cmds.extend(_build_simulated_data_commands(analysis_type,
                    depth_dir, even_otu_table_fp, map_fp, tree_fp,
                    workflow[study]))

    run_parallel_jobs(cmds, run_command, ipython_profile=ipython_profile)
def _build_simulated_data_commands(analysis_type, out_dir, even_otu_table_fp,
                                   map_fp, tree_fp, workflow):
    """Return a list of shell command strings to build the simulated data.

    For each category / trial / sample size / dissimilarity combination,
    builds one ' && '-joined pipeline (simsam.py, optional subsetting, then
    per-metric beta diversity + PCoA). Combinations whose output files
    already exist (per has_results) are skipped so reruns are incremental.

    Two cases, depending on the requested sample size:
      - samp_size <= number of samples in the table: subset the real data
        first (done immediately via run_command, since the subsequent
        asserts need the files), then simulate with a single rep.
      - samp_size > number of samples: simulate extra reps first, then
        subset the simulated output down to samp_size inside the pipeline.

    The duplicated metric-checking / metric-command code previously inlined
    in both branches now lives in _has_metric_results and
    _append_metric_commands.
    """
    cmds = []
    data_type_dir = join(out_dir, 'simulated')
    create_dir(data_type_dir)

    num_samps = get_num_samples_in_table(even_otu_table_fp)

    for category in workflow['categories']:
        category_dir = join(data_type_dir, category[0])
        create_dir(category_dir)

        for trial_num in range(workflow['num_sim_data_trials']):
            trial_num_dir = join(category_dir, '%d' % trial_num)
            create_dir(trial_num_dir)

            for samp_size in workflow['sample_sizes']:
                samp_size_dir = join(trial_num_dir, '%d' % samp_size)
                create_dir(samp_size_dir)

                if samp_size <= num_samps:
                    # Enough real samples: subset first, then simulate with
                    # a single rep.
                    simsam_rep_num = 1

                    subset_otu_table_fp = join(samp_size_dir,
                                               basename(even_otu_table_fp))
                    subset_map_fp = join(samp_size_dir, basename(map_fp))
                    if not has_results(samp_size_dir,
                            required_files=[basename(subset_otu_table_fp),
                                            basename(subset_map_fp)]):
                        # Run immediately (not queued): the asserts below
                        # need the subset files on disk.
                        run_command('choose_data_subset.py -t %s -i %s '
                                    '-m %s -c %s -n %d -o %s' % (
                                analysis_type, even_otu_table_fp, map_fp,
                                category[0], samp_size, samp_size_dir))
                    assert (get_num_samples_in_table(subset_otu_table_fp) ==
                            samp_size)
                    assert (get_num_samples_in_map(subset_map_fp) ==
                            samp_size)

                    for d in workflow['dissim']:
                        dissim_dir = join(samp_size_dir, repr(d))
                        create_dir(dissim_dir)

                        suffix = '_n%d_d%r' % (simsam_rep_num, d)
                        simsam_map_fp = join(dissim_dir,
                                add_filename_suffix(subset_map_fp, suffix))
                        simsam_otu_table_fp = join(dissim_dir,
                                add_filename_suffix(subset_otu_table_fp,
                                                    suffix))

                        # Check for simulated table/map and various
                        # distance matrices / coordinates files.
                        has_simsam_files = has_results(dissim_dir,
                                required_files=[basename(simsam_map_fp),
                                        basename(simsam_otu_table_fp)])
                        has_metric_files = _has_metric_results(dissim_dir,
                                workflow['metrics'], analysis_type,
                                category[0])

                        if not (has_simsam_files and has_metric_files):
                            cmd = ['simsam.py -i %s -t %s -o %s -d %r '
                                   '-n %d -m %s' % (subset_otu_table_fp,
                                           tree_fp, dissim_dir, d,
                                           simsam_rep_num, subset_map_fp)]
                            _append_metric_commands(cmd, dissim_dir,
                                    workflow['metrics'], analysis_type,
                                    category[0], simsam_map_fp,
                                    simsam_otu_table_fp, tree_fp)
                            cmds.append(' && '.join(cmd))
                else:
                    # We need to simulate more samples than we originally
                    # have, then subset the simulated output.
                    simsam_rep_num = get_simsam_rep_num(samp_size,
                                                        num_samps)

                    for d in workflow['dissim']:
                        dissim_dir = join(samp_size_dir, repr(d))
                        create_dir(dissim_dir)

                        suffix = '_n%d_d%r' % (simsam_rep_num, d)
                        simsam_map_fp = join(dissim_dir,
                                add_filename_suffix(map_fp, suffix))
                        simsam_otu_table_fp = join(dissim_dir,
                                add_filename_suffix(even_otu_table_fp,
                                                    suffix))

                        # The subset files keep the simsam basenames; they
                        # just live under a 'subset' subdirectory.
                        required_files = [basename(simsam_map_fp),
                                          basename(simsam_otu_table_fp)]
                        has_simsam_files = has_results(dissim_dir,
                                required_files=required_files)
                        has_subset_files = has_results(
                                join(dissim_dir, 'subset'),
                                required_files=required_files)
                        has_metric_files = _has_metric_results(dissim_dir,
                                workflow['metrics'], analysis_type,
                                category[0])

                        if not (has_simsam_files and has_subset_files and
                                has_metric_files):
                            cmd = ['simsam.py -i %s -t %s -o %s -d %r '
                                   '-n %d -m %s' % (even_otu_table_fp,
                                           tree_fp, dissim_dir, d,
                                           simsam_rep_num, map_fp)]

                            subset_dir = join(dissim_dir, 'subset')
                            cmd.append('choose_data_subset.py -t %s -i %s '
                                       '-m %s -c %s -n %d -o %s' % (
                                    analysis_type, simsam_otu_table_fp,
                                    simsam_map_fp, category[0], samp_size,
                                    subset_dir))
                            subset_otu_table_fp = join(subset_dir,
                                    basename(simsam_otu_table_fp))
                            subset_map_fp = join(subset_dir,
                                    basename(simsam_map_fp))

                            _append_metric_commands(cmd, dissim_dir,
                                    workflow['metrics'], analysis_type,
                                    category[0], subset_map_fp,
                                    subset_otu_table_fp, tree_fp)
                            cmds.append(' && '.join(cmd))
    return cmds


def _has_metric_results(dissim_dir, metrics, analysis_type, category_name):
    """Return True iff every metric subdir of dissim_dir already has its
    expected output files (dm.txt, map.txt, pc.txt, plus the gradient
    category distance matrix when analysis_type == 'gradient').

    Vacuously True when metrics is empty, matching the original inlined
    loop's behavior.
    """
    for metric in metrics:
        required_metric_files = ['dm.txt', 'map.txt', 'pc.txt']
        if analysis_type == 'gradient':
            required_metric_files.append('%s_dm.txt' % category_name)
        if not has_results(join(dissim_dir, metric[0]),
                           required_metric_files):
            return False
    return True


def _append_metric_commands(cmd, dissim_dir, metrics, analysis_type,
                            category_name, map_fp, otu_table_fp, tree_fp):
    """Append the per-metric pipeline steps to cmd (modified in place).

    For each metric: create its output dir, optionally build the gradient
    category distance matrix, run beta_diversity.py, rename its output to
    dm.txt, copy the map, and compute principal coordinates. map_fp and
    otu_table_fp are the (possibly simulated/subset) inputs the commands
    should operate on.
    """
    for metric in metrics:
        metric_dir = join(dissim_dir, metric[0])
        create_dir(metric_dir)

        if analysis_type == 'gradient':
            cmd.append('distance_matrix_from_mapping.py -i %s -c %s '
                       '-o %s' % (map_fp, category_name,
                               join(metric_dir,
                                    '%s_dm.txt' % category_name)))

        cmd.append('beta_diversity.py -i %s -o %s -m %s -t %s' % (
                otu_table_fp, metric_dir, metric[0], tree_fp))
        # beta_diversity.py names its output <metric>_<table basename>.txt;
        # normalize it to dm.txt for downstream steps.
        cmd.append('mv %s %s' % (join(metric_dir,
                '%s_%s.txt' % (metric[0],
                        splitext(basename(otu_table_fp))[0])),
                join(metric_dir, 'dm.txt')))
        cmd.append('cp %s %s' % (map_fp, join(metric_dir, 'map.txt')))
        cmd.append('principal_coordinates.py -i %s -o %s' % (
                join(metric_dir, 'dm.txt'), join(metric_dir, 'pc.txt')))