Esempio n. 1
0
    def test_get_color_pool(self):
        """get_color_pool returns a fresh, equal-valued pool on every call."""
        first_pool = get_color_pool()
        self.assertEqual(len(first_pool), 27)

        # A second call must produce the same values but a distinct object,
        # so that callers can consume their pool without affecting others.
        second_pool = get_color_pool()
        self.assertFloatEqual(first_pool, second_pool)
        self.assertFalse(first_pool is second_pool)
Esempio n. 2
0
def _collate_cluster_pcoa_plot_data(coords_f, map_f, category):
    """Collate PCoA coordinates by the values of a mapping file category.

    coords_f and map_f are handed to parse_coords and parse_mapping_file
    respectively; category is the field passed to group_by_field.

    Returns a list of (xs, ys, color, state) tuples, one per category value
    (state) in sorted order. xs and ys hold elements 0 and 1 of the
    coordinates of every sample belonging to that state, and color is the
    entry of the color pool at the same position as the state.

    Raises ValueError if there are more states than colors in the pool.
    """
    parsed_coords = parse_coords(coords_f)
    coords_by_sid = dict(zip(parsed_coords[0], parsed_coords[1]))

    # Element 1 of the parsed mapping goes first, followed by the entries of
    # element 0 (presumably the header row followed by the data rows --
    # confirm against parse_mapping_file).
    parsed_map = parse_mapping_file(map_f)
    map_rows = [parsed_map[1]] + list(parsed_map[0])

    sids_by_state = group_by_field(map_rows, category)
    states = sorted(sids_by_state)

    colors = get_color_pool()
    if len(colors) < len(states):
        raise ValueError("Not enough colors to uniquely color sample "
                         "groups.")

    collated = []
    # zip stops at the shorter sequence, so only the first len(states)
    # colors are used.
    for state, color in zip(states, colors):
        points = [coords_by_sid[sid] for sid in sids_by_state[state]]
        collated.append(([point[0] for point in points],
                         [point[1] for point in points],
                         color, state))

    return collated
Esempio n. 3
0
def _collect_sim_plot_data(method, category_dir, study_workflow, num_trials):
    """Parse one method's results for every trial/size/dissim/metric.

    Results are read from

    <category_dir>/<trial>/<samp_size>/<repr(d)>/<metric>/
        <method.DirectoryName>/<method.ResultsName>_results.txt

    and parsed with method.parse, which must yield an
    (effect size, p-value) pair.

    Returns a nested mapping:

    metric ->
        dissim -> {
            'sample_sizes': list,
            'effect_sizes': list of lists, one for each size,
            'p_vals': list of lists, one for each size
        }
    """
    plots_data = defaultdict(lambda:
            defaultdict(lambda: defaultdict(list)))

    for trial_num in range(num_trials):
        trial_num_dir = join(category_dir, '%d' % trial_num)

        for samp_size in study_workflow['sample_sizes']:
            samp_size_dir = join(trial_num_dir, '%d' % samp_size)

            for d in study_workflow['plot_dissim']:
                dissim_dir = join(samp_size_dir, repr(d))

                for metric in study_workflow['metrics']:
                    results_fp = join(dissim_dir, metric[0],
                                      method.DirectoryName,
                                      '%s_results.txt' % method.ResultsName)

                    # Close each results file as soon as it is parsed; this
                    # loop can open a very large number of files. 'U'
                    # (universal newlines) is kept from the original code.
                    with open(results_fp, 'U') as results_f:
                        effect_size, p_val = method.parse(results_f)

                    dissim_data = plots_data[metric[0]][d]
                    sample_sizes = dissim_data['sample_sizes']

                    # First time this sample size is seen: open a new slot
                    # in each of the parallel per-size lists.
                    if samp_size not in sample_sizes:
                        sample_sizes.append(samp_size)
                        dissim_data['effect_sizes'].append([])
                        dissim_data['p_vals'].append([])

                    samp_size_idx = sample_sizes.index(samp_size)
                    dissim_data['effect_sizes'][samp_size_idx].append(
                            effect_size)
                    dissim_data['p_vals'][samp_size_idx].append(p_val)

    return plots_data


def _draw_method_panels(analysis_type, fig, metric_plots_data, method,
                        method_idx, num_methods, num_rows, num_cols,
                        num_trials, study_workflow):
    """Draw one method's test statistic and p-value panels onto fig.

    The panels occupy the first two columns of row method_idx. For the first
    method (method_idx == 0) a legend is also drawn in the third column of
    that row.

    Raises ValueError if the legend must be drawn and analysis_type is
    neither 'gradient' nor 'cluster'.
    """
    # plot_num is 1-based indexing.
    plot_num = method_idx * num_cols + 1
    ax1 = fig.add_subplot(num_rows, num_cols, plot_num)
    ax2 = fig.add_subplot(num_rows, num_cols, plot_num + 1)

    color_pool = get_color_pool()

    legend_labels = []
    legend_lines = []
    for d, plot_data in sorted(metric_plots_data.items()):
        avg_effect_sizes, std_effect_sizes, avg_p_vals, std_p_vals = \
                _compute_plot_data_statistics(plot_data, num_trials)
        color = color_pool.pop(0)

        is_actual_data = d == 0.0
        label = 'd=%r' % d
        if is_actual_data:
            label += ' (actual data)'

        legend_labels.append(label)
        legend_lines.append(Line2D([0, 1], [0, 0], color=color,
                                   linewidth=2))

        # Make the actual data plot a bit thicker than the rest.
        line_width = 3 if is_actual_data else 1

        # Plot test statistics.
        ax1.errorbar(plot_data['sample_sizes'], avg_effect_sizes,
                     yerr=std_effect_sizes, color=color, label=label,
                     linewidth=line_width, fmt='-')

        # Plot p-values. The error bar lines are restyled separately so that
        # they are dashed as well.
        _, _, barlinecols = ax2.errorbar(
                plot_data['sample_sizes'], avg_p_vals, yerr=std_p_vals,
                color=color, label=label, linewidth=line_width,
                linestyle='--')
        barlinecols[0].set_linestyles('dashed')

    # NOTE(review): nonposx/basex/nonposy are the legacy matplotlib keyword
    # names; newer matplotlib releases renamed them. They are left untouched
    # to stay compatible with the matplotlib version this code targets.
    for ax in (ax1, ax2):
        ax.set_xscale('log', nonposx='clip', basex=2)
        ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))

    ax2.set_yscale('log', nonposy='clip')

    x_label = 'Number of samples'
    ax1.set_xlabel(x_label)
    ax2.set_xlabel(x_label)
    ax1.set_ylabel('%s (%s)' % (method.DisplayName, method.StatDisplayName))
    ax2.set_ylabel('p-value')

    min_x = min(study_workflow['sample_sizes'])
    max_x = max(study_workflow['sample_sizes'])
    ax1.set_xlim(min_x - 0.5, max_x)
    ax2.set_xlim(min_x - 0.5, max_x)

    # Label each panel just above its top-left corner. The p-value panel
    # needs a much larger offset factor; the original author found this
    # empirically (likely because its y-axis is log-scaled while the offset
    # is computed from the linear y-range).
    for ax_idx, (ax, factor) in enumerate(((ax1, 0.05), (ax2, 0.60))):
        panel_label = get_panel_label(method_idx * 2 + ax_idx)
        xmin = ax.get_xlim()[0]
        ymin, ymax = ax.get_ylim()
        yrange = ymax - ymin

        ax.text(xmin, ymax + (factor * yrange), '(%s)' % panel_label)

    if method_idx != 0:
        return

    # The legend lives in an invisible axes in the third column of the first
    # row.
    ax3 = fig.add_subplot(num_rows, num_cols, plot_num + 2, frame_on=False)
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)

    start_panel_label = get_panel_label(0)
    end_panel_label = get_panel_label(num_methods * 2 - 1)

    if analysis_type == 'gradient':
        loc = 'center'
    elif analysis_type == 'cluster':
        loc = 'center left'
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on loc.
        raise ValueError("Unrecognized analysis type %r; expected "
                         "'gradient' or 'cluster'." % (analysis_type,))

    assert len(legend_lines) == len(study_workflow['plot_dissim'])
    assert len(legend_labels) == len(study_workflow['plot_dissim'])
    legend_title = ('           Legend (Panels %s-%s)\nd = '
            '"noise" introduced to samples' % (
                start_panel_label, end_panel_label))
    ax3.legend(legend_lines, legend_labels, ncol=1, title=legend_title,
               loc=loc, fancybox=True, shadow=True)


def _save_and_close_figure(fig, out_base_fp):
    """Save fig as <out_base_fp>.pdf and <out_base_fp>.png, then close it."""
    # Imported here so that the module's import block does not need to
    # change; the file already depends on matplotlib.
    from matplotlib.pyplot import close

    fig.savefig(out_base_fp + '.pdf', format='pdf')
    fig.savefig(out_base_fp + '.png', format='png', dpi=100)

    # Release the figure; otherwise one large figure per
    # study/depth/category/metric stays alive until the process exits.
    close(fig)


def create_simulated_data_plots(analysis_type, in_dir, workflow):
    """Create plots of sample size vs effect size/p-val for each dissim.

    Plots will be placed directly under in_dir and will be named according to
    the following convention (a PNG with the same base name is also written):

    <study>_<category>_<depth>_<metric>.pdf

    analysis_type is either 'gradient' or 'cluster'; it selects the legend
    placement and is passed on to plot_pcoa.

    workflow maps each study name to a dict with the keys
    'num_sim_data_trials', 'methods', 'pcoa_dissim', 'depths', 'categories',
    'metrics', 'sample_sizes' and 'plot_dissim'. The first element of each
    depth, category and metric entry is used to build directory and file
    names.

    Input is read from <in_dir>/<study>/<depth>/simulated/<category>/...

    Raises ValueError if a legend must be drawn and analysis_type is not one
    of the recognized values.
    """
    for study in workflow:
        study_workflow = workflow[study]
        study_dir = join(in_dir, study)

        num_trials = study_workflow['num_sim_data_trials']

        # NOTE(review): the removals below modify the workflow's own list of
        # methods, so the caller's workflow is changed too. This is kept
        # as-is in case later steps rely on it.
        methods = study_workflow['methods']
        if Best() in methods:
            methods.remove(Best())
        if MantelCorrelogram() in methods:
            methods.remove(MantelCorrelogram())

        num_methods = len(methods)
        num_rows = max(num_methods, len(study_workflow['pcoa_dissim']) + 1)
        # test stat, p-val, legend/PCoA.
        num_cols = 3

        for depth in study_workflow['depths']:
            data_type_dir = join(study_dir, '%d' % depth[0], 'simulated')

            for category in study_workflow['categories']:
                category_dir = join(data_type_dir, category[0])

                # metric -> Figure
                figs = {}
                for metric in study_workflow['metrics']:
                    figs[metric[0]] = figure(num=None, figsize=(20, 20),
                                             facecolor='w', edgecolor='k')

                for method_idx, method in enumerate(methods):
                    plots_data = _collect_sim_plot_data(
                            method, category_dir, study_workflow, num_trials)

                    for metric in study_workflow['metrics']:
                        _draw_method_panels(
                                analysis_type, figs[metric[0]],
                                plots_data[metric[0]], method, method_idx,
                                num_methods, num_rows, num_cols, num_trials,
                                study_workflow)

                for metric in study_workflow['metrics']:
                    fig = figs[metric[0]]

                    # Plot PCoA in last column of figure.
                    plot_pcoa(analysis_type, fig, category_dir,
                              study_workflow, category, metric, num_rows,
                              num_cols, num_methods)

                    fig.tight_layout(pad=5.0, w_pad=2.0, h_pad=2.0)
                    _save_and_close_figure(fig, join(in_dir,
                            '%s_%s_%d_%s' % (study, category[0], depth[0],
                                             metric[0])))