Example #1
0
def process_data(in_dir, workflow, ipython_profile=None):
    """Run statistical methods over generated data.

    For real data, creates category and method dirs for original and shuffled
    data. Under each method dir, permutation dirs will also be
    created, e.g.:

    in_dir/
        ...
            category/
                method/
                    num_perms/
                        <method>_results.txt

    For simulated data, creates method dirs under metric dirs in in_dir, e.g.:

    in_dir/
        ...
            metric/
                method/
                    <method>_results.txt
    """
    # Collect every compare_categories.py/compare_distance_matrices.py
    # invocation up front so they can all be dispatched in parallel.
    jobs = []
    for study, study_workflow in workflow.items():
        study_dir = join(in_dir, study)

        for depth in study_workflow['depths']:
            # Each depth gets its own subdirectory named after the depth value.
            depth_dir = join(study_dir, '%d' % depth[0])

            jobs += _build_real_data_methods_commands(depth_dir,
                                                      study_workflow)
            jobs += _build_simulated_data_methods_commands(depth_dir,
                                                           study_workflow)

    run_parallel_jobs(jobs, run_command, ipython_profile=ipython_profile)
Example #2
0
def generate_data(analysis_type, in_dir, out_dir, workflow, tree_fp,
                  ipython_profile=None):
    """Generates real and simulated data for each study.

    Distance matrices will be created at each even sampling depth and metric
    using the provided tree if necessary. Shuffled versions of each distance
    matrix will also be created, which can be used as negative controls.
    Additionally, simulated gradient or cluster data will be created at varying
    sample sizes and dissimilarity levels (using simsam.py).

    analysis_type should be either 'gradient' or 'cluster'.

    Will create the following (heavily nested) output directory structure:

    out_dir/
        study/
            depth/
                even depth otu table (.biom)
                real/
                    metric/
                        original/
                            map.txt
                            dm.txt
                            pc.txt
                            <category>_dm.txt (if gradient)
                        shuff_num
                            map.txt
                            dm.txt
                            pc.txt
                            <category>_dm.txt (if gradient)
                simulated/
                    category/
                        trial_num/
                            samp_size/
                                (optional) subset files dependent on samp_size
                                dissim/
                                    subset files/dirs dependent on samp_size
                                    metric/
                                        map.txt
                                        dm.txt
                                        pc.txt
                                        <category>_dm.txt (if gradient)
    """
    create_dir(out_dir)

    cmds = []
    for study in workflow:
        study_dir = join(out_dir, study)
        create_dir(study_dir)

        otu_table_fp = join(in_dir, study, 'otu_table.biom')
        # The mapping file path is passed to the command builders below; the
        # file itself does not need to be opened here.
        map_fp = join(in_dir, study, 'map.txt')

        for depth in workflow[study]['depths']:
            depth_dir = join(study_dir, '%d' % depth[0])
            create_dir(depth_dir)

            # Rarefy the table first since simsam.py's output tables will still
            # have even sampling depth and we don't want to lose simulated
            # samples after the fact.
            even_otu_table_fp = join(depth_dir, basename(otu_table_fp))

            # Skip the (expensive) rarefaction if a previous run already
            # produced the even-depth table.
            if not exists(even_otu_table_fp):
                run_command('single_rarefaction.py -i %s -o %s -d %d;' % (
                        otu_table_fp, even_otu_table_fp, depth[0]))

            cmds.extend(_build_real_data_commands(analysis_type, depth_dir,
                    even_otu_table_fp, map_fp, tree_fp, workflow[study]))
            cmds.extend(_build_simulated_data_commands(analysis_type,
                    depth_dir, even_otu_table_fp, map_fp, tree_fp,
                    workflow[study]))

    run_parallel_jobs(cmds, run_command, ipython_profile=ipython_profile)
Example #3
0
 def test_run_parallel_jobs(self):
     """Test running jobs in parallel."""
     # Doesn't error out if no jobs are submitted, which can happen during
     # a rerun of the workflow.
     self.assertIsNone(run_parallel_jobs([], int))