Ejemplo n.º 1
0
def generate_data(analysis_type, in_dir, out_dir, workflow, tree_fp,
                  ipython_profile=None):
    """Generates real and simulated data for each study.

    Distance matrices will be created at each even sampling depth and metric
    using the provided tree if necessary. Shuffled versions of each distance
    matrix will also be created, which can be used as negative controls.
    Additionally, simulated gradient or cluster data will be created at varying
    sample sizes and dissimilarity levels (using simsam.py).

    Arguments:
        analysis_type - should be either 'gradient' or 'cluster'
        in_dir - input directory; for each study it must contain
            <study>/otu_table.biom and <study>/map.txt
        out_dir - output directory (created if necessary via create_dir)
        workflow - dict keyed by study name. Each value is passed on to the
            command builders and must provide a 'depths' entry whose items
            are indexable, with the integer sampling depth at index 0
        tree_fp - filepath of the tree handed to the command builders
        ipython_profile - forwarded unchanged to run_parallel_jobs

    The rarefaction step is executed immediately through run_command (and
    skipped when the rarefied table already exists); all other commands are
    collected and handed to run_parallel_jobs in a single batch.

    Will create the following (heavily nested) output directory structure:

    out_dir/
        study/
            depth/
                even depth otu table (.biom)
                real/
                    metric/
                        original/
                            map.txt
                            dm.txt
                            pc.txt
                            <category>_dm.txt (if gradient)
                        shuff_num/
                            map.txt
                            dm.txt
                            pc.txt
                            <category>_dm.txt (if gradient)
                simulated/
                    category/
                        trial_num/
                            samp_size/
                                (optional) subset files dependent on samp_size
                                dissim/
                                    subset files/dirs dependent on samp_size
                                    metric/
                                        map.txt
                                        dm.txt
                                        pc.txt
                                        <category>_dm.txt (if gradient)
    """
    create_dir(out_dir)

    cmds = []
    for study in workflow:
        study_dir = join(out_dir, study)
        create_dir(study_dir)

        otu_table_fp = join(in_dir, study, 'otu_table.biom')
        map_fp = join(in_dir, study, 'map.txt')

        # Fail fast if the mapping file is missing or unreadable. The handle
        # itself is not needed, so it is closed right away instead of being
        # leaked once per study.
        with open(map_fp):
            pass

        for depth in workflow[study]['depths']:
            depth_dir = join(study_dir, '%d' % depth[0])
            create_dir(depth_dir)

            # Rarefy the table first since simsam.py's output tables will still
            # have even sampling depth and we don't want to lose simulated
            # samples after the fact.
            even_otu_table_fp = join(depth_dir, basename(otu_table_fp))

            if not exists(even_otu_table_fp):
                run_command('single_rarefaction.py -i %s -o %s -d %d;' % (
                        otu_table_fp, even_otu_table_fp, depth[0]))

            cmds.extend(_build_real_data_commands(analysis_type, depth_dir,
                    even_otu_table_fp, map_fp, tree_fp, workflow[study]))
            cmds.extend(_build_simulated_data_commands(analysis_type,
                    depth_dir, even_otu_table_fp, map_fp, tree_fp,
                    workflow[study]))

    run_parallel_jobs(cmds, run_command, ipython_profile=ipython_profile)
Ejemplo n.º 2
0
def _build_simulated_data_commands(analysis_type, out_dir, even_otu_table_fp,
                                   map_fp, tree_fp, workflow):
    """Builds the shell commands that generate simulated data for one depth.

    Creates out_dir/simulated/<category>/<trial_num>/<samp_size>/<dissim>/
    directories as it goes and returns a list of command strings. Each string
    is a ' && '-joined chain (simsam.py, optional subsetting, then one group
    of distance matrix / principal coordinates commands per metric) covering
    a single dissimilarity directory. Directories whose expected result files
    are already present do not get a command.

    Arguments:
        analysis_type - passed to choose_data_subset.py; when it equals
            'gradient' a <category>_dm.txt is additionally produced/required
        out_dir - directory under which 'simulated' is created
        even_otu_table_fp - filepath of the evenly sampled OTU table
        map_fp - filepath of the mapping file
        tree_fp - filepath of the tree given to simsam.py/beta_diversity.py
        workflow - dict providing 'categories' and 'metrics' (items are
            indexable, name at index 0), 'num_sim_data_trials' (int),
            'sample_sizes' and 'dissim'

    Note that when the requested sample size does not exceed the number of
    samples in the table, choose_data_subset.py is executed immediately via
    run_command (not deferred) so its output can be verified.
    """
    cmds = []

    data_type_dir = join(out_dir, 'simulated')
    create_dir(data_type_dir)

    num_samps = get_num_samples_in_table(even_otu_table_fp)

    for category in workflow['categories']:
        category_name = category[0]
        category_dir = join(data_type_dir, category_name)
        create_dir(category_dir)

        for trial_num in range(workflow['num_sim_data_trials']):
            trial_num_dir = join(category_dir, '%d' % trial_num)
            create_dir(trial_num_dir)

            for samp_size in workflow['sample_sizes']:
                samp_size_dir = join(trial_num_dir, '%d' % samp_size)
                create_dir(samp_size_dir)

                if samp_size <= num_samps:
                    cmds.extend(_build_subset_then_simsam_commands(
                            analysis_type, samp_size_dir, samp_size,
                            category_name, even_otu_table_fp, map_fp, tree_fp,
                            workflow))
                else:
                    # We need to simulate more samples than we originally
                    # have.
                    cmds.extend(_build_simsam_then_subset_commands(
                            analysis_type, samp_size_dir, samp_size,
                            num_samps, category_name, even_otu_table_fp,
                            map_fp, tree_fp, workflow))
    return cmds


def _build_subset_then_simsam_commands(analysis_type, samp_size_dir,
                                       samp_size, category, even_otu_table_fp,
                                       map_fp, tree_fp, workflow):
    """Builds commands for a sample size that the real table can supply.

    The table/map are first subset down to samp_size (run immediately), then
    one simsam.py + metrics command chain is built per dissimilarity level.
    """
    simsam_rep_num = 1

    subset_otu_table_fp = join(samp_size_dir, basename(even_otu_table_fp))
    subset_map_fp = join(samp_size_dir, basename(map_fp))

    if not has_results(samp_size_dir,
                       required_files=[basename(subset_otu_table_fp),
                                       basename(subset_map_fp)]):
        run_command(
            'choose_data_subset.py -t %s -i %s -m %s -c %s -n %d -o %s' % (
                analysis_type, even_otu_table_fp, map_fp, category, samp_size,
                samp_size_dir))

    # Sanity-check the subsetting tool's output before building on top of it.
    assert get_num_samples_in_table(subset_otu_table_fp) == samp_size, (
        "Subset OTU table '%s' does not contain %d samples"
        % (subset_otu_table_fp, samp_size))
    assert get_num_samples_in_map(subset_map_fp) == samp_size, (
        "Subset mapping file '%s' does not contain %d samples"
        % (subset_map_fp, samp_size))

    cmds = []
    for d in workflow['dissim']:
        dissim_dir = join(samp_size_dir, repr(d))
        create_dir(dissim_dir)

        suffix = '_n%d_d%r' % (simsam_rep_num, d)
        simsam_map_fp = join(dissim_dir,
                             add_filename_suffix(subset_map_fp, suffix))
        simsam_otu_table_fp = join(
            dissim_dir, add_filename_suffix(subset_otu_table_fp, suffix))

        # Check for simulated table/map and various distance matrices /
        # coordinates files.
        has_simsam_files = has_results(
            dissim_dir,
            required_files=[basename(simsam_map_fp),
                            basename(simsam_otu_table_fp)])
        has_metric_files = _simulated_metric_results_exist(
            analysis_type, dissim_dir, category, workflow['metrics'])

        if has_simsam_files and has_metric_files:
            continue

        cmd = ['simsam.py -i %s -t %s -o %s -d %r -n %d -m %s' % (
            subset_otu_table_fp, tree_fp, dissim_dir, d, simsam_rep_num,
            subset_map_fp)]
        cmd.extend(_build_simulated_metric_commands(
            analysis_type, dissim_dir, category, workflow['metrics'],
            simsam_otu_table_fp, simsam_map_fp, tree_fp))
        cmds.append(' && '.join(cmd))
    return cmds


def _build_simsam_then_subset_commands(analysis_type, samp_size_dir,
                                       samp_size, num_samps, category,
                                       even_otu_table_fp, map_fp, tree_fp,
                                       workflow):
    """Builds commands for a sample size larger than the real table.

    simsam.py is run on the full table first to produce extra samples; the
    simulated output is then subset to samp_size inside <dissim>/subset and
    the metrics are computed on that subset.
    """
    simsam_rep_num = get_simsam_rep_num(samp_size, num_samps)

    cmds = []
    for d in workflow['dissim']:
        dissim_dir = join(samp_size_dir, repr(d))
        create_dir(dissim_dir)

        suffix = '_n%d_d%r' % (simsam_rep_num, d)
        simsam_map_fp = join(dissim_dir, add_filename_suffix(map_fp, suffix))
        simsam_otu_table_fp = join(
            dissim_dir, add_filename_suffix(even_otu_table_fp, suffix))
        subset_dir = join(dissim_dir, 'subset')

        # The subset files keep the simulated files' basenames, so the same
        # names are required in both directories.
        has_simsam_files = has_results(
            dissim_dir,
            required_files=[basename(simsam_map_fp),
                            basename(simsam_otu_table_fp)])
        has_subset_files = has_results(
            subset_dir,
            required_files=[basename(simsam_map_fp),
                            basename(simsam_otu_table_fp)])
        has_metric_files = _simulated_metric_results_exist(
            analysis_type, dissim_dir, category, workflow['metrics'])

        if has_simsam_files and has_subset_files and has_metric_files:
            continue

        cmd = ['simsam.py -i %s -t %s -o %s -d %r -n %d -m %s' % (
            even_otu_table_fp, tree_fp, dissim_dir, d, simsam_rep_num,
            map_fp)]
        cmd.append(
            'choose_data_subset.py -t %s -i %s -m %s -c %s -n %d -o %s' % (
                analysis_type, simsam_otu_table_fp, simsam_map_fp, category,
                samp_size, subset_dir))

        subset_otu_table_fp = join(subset_dir, basename(simsam_otu_table_fp))
        subset_map_fp = join(subset_dir, basename(simsam_map_fp))

        cmd.extend(_build_simulated_metric_commands(
            analysis_type, dissim_dir, category, workflow['metrics'],
            subset_otu_table_fp, subset_map_fp, tree_fp))
        cmds.append(' && '.join(cmd))
    return cmds


def _simulated_metric_results_exist(analysis_type, dissim_dir, category,
                                    metrics):
    """Returns True if every metric dir under dissim_dir has its results."""
    required_metric_files = ['dm.txt', 'map.txt', 'pc.txt']
    if analysis_type == 'gradient':
        required_metric_files.append('%s_dm.txt' % category)

    for metric in metrics:
        # Stop at the first incomplete metric directory.
        if not has_results(join(dissim_dir, metric[0]),
                           required_metric_files):
            return False
    return True


def _build_simulated_metric_commands(analysis_type, dissim_dir, category,
                                     metrics, otu_table_fp, map_fp, tree_fp):
    """Creates each metric dir and returns its dm/map/pc command strings."""
    cmd = []
    for metric in metrics:
        metric_dir = join(dissim_dir, metric[0])
        create_dir(metric_dir)

        dm_fp = join(metric_dir, 'dm.txt')

        if analysis_type == 'gradient':
            cmd.append('distance_matrix_from_mapping.py -i %s -c %s -o %s' % (
                map_fp, category, join(metric_dir, '%s_dm.txt' % category)))

        cmd.append('beta_diversity.py -i %s -o %s -m %s -t %s' % (
            otu_table_fp, metric_dir, metric[0], tree_fp))

        # The beta diversity output is named <metric>_<table name>.txt here;
        # rename it to the fixed 'dm.txt' that downstream code expects.
        beta_div_out_fp = join(metric_dir, '%s_%s.txt' % (
            metric[0], splitext(basename(otu_table_fp))[0]))
        cmd.append('mv %s %s' % (beta_div_out_fp, dm_fp))
        cmd.append('cp %s %s' % (map_fp, join(metric_dir, 'map.txt')))
        cmd.append('principal_coordinates.py -i %s -o %s' % (
            dm_fp, join(metric_dir, 'pc.txt')))
    return cmd