Esempio n. 1
0
def plot_pcoa(analysis_type, fig, in_dir, workflow, category, metric, num_rows,
              num_cols, num_methods):
    """Add PCoA scatter plots to the last column of an existing figure.

    One subplot is drawn per dissimilarity in workflow['pcoa_dissim'],
    starting on the second row of the last column (the first row of that
    column is left for the legend). Input is read from

        <in_dir>/0/<pcoa_sample_size>/<dissim>/<metric[0]>/pc.txt
        <in_dir>/0/<pcoa_sample_size>/<dissim>/<metric[0]>/map.txt

    i.e. only trial 0 is plotted. Afterwards a rectangle is drawn around the
    PCoA plots.

    Arguments:
        analysis_type - 'gradient' (a single scatter colored by gradient
            value, with a colorbar) or 'cluster' (one scatter series per
            state, plus an additional legend)
        fig - the figure to add the subplots to
        in_dir - directory containing the per-trial subdirectories
        workflow - dict providing 'pcoa_sample_size' and 'pcoa_dissim'
        category - sequence; category[0] is handed to the collate helpers,
            category[1] labels the colorbar (gradient only) and category[2]
            maps states to legend labels (cluster only)
        metric - sequence whose first element names the metric directory
        num_rows, num_cols - dimensions of the figure's subplot grid
        num_methods - only used to continue the panel lettering after the
            first num_methods * 2 panels

    Raises ValueError if analysis_type is not recognized (detected while
    processing the first dissimilarity).
    """
    # Only the first trial is plotted.
    trial_num = 0
    samp_size = workflow['pcoa_sample_size']

    trial_num_dir = join(in_dir, '%d' % trial_num)
    samp_size_dir = join(trial_num_dir, '%d' % samp_size)

    legend_symbols = []
    legend_labels = []
    for d_idx, d in enumerate(workflow['pcoa_dissim']):
        dissim_dir = join(samp_size_dir, repr(d))
        metric_dir = join(dissim_dir, metric[0])

        pc_fp = join(metric_dir, 'pc.txt')
        map_fp = join(metric_dir, 'map.txt')

        # Both handles are closed as soon as this subplot's data has been
        # consumed, instead of lingering until garbage collection.
        # NOTE(review): mode 'U' no longer exists on Python >= 3.11; switch to
        # 'r' if this module is ever ported.
        with open(pc_fp, 'U') as pc_f, open(map_fp, 'U') as map_f:
            pc_data = parse_coords(pc_f)
            # Rewind, because the same handle is handed to the collate helper
            # below.
            pc_f.seek(0)
            assert len(pc_data[0]) == samp_size

            # Skip the first row (the legend is already at that cell).
            plot_num = (d_idx + 2) * num_cols
            ax = fig.add_subplot(num_rows, num_cols, plot_num)

            if analysis_type == 'gradient':
                xs, ys, gradient = _collate_gradient_pcoa_plot_data(
                        pc_f, map_f, category[0])
                scatter_colorbar_data = ax.scatter(xs, ys, s=80, c=gradient,
                                                   cmap='RdYlBu')
                # We have to use gridspec to get this to work with
                # tight_layout.
                cb = fig.colorbar(scatter_colorbar_data, use_gridspec=True)
                cb.set_label(category[1])
            elif analysis_type == 'cluster':
                plot_data = _collate_cluster_pcoa_plot_data(pc_f, map_f,
                                                            category[0])
                for xs, ys, color, state in plot_data:
                    ax.scatter(xs, ys, color=color, label=state)

                    # Legend entries are collected from the first
                    # dissimilarity only, so each state is listed once.
                    if d_idx == 0:
                        legend_symbols.append(Line2D(range(1), range(1),
                                              color='white', marker='o',
                                              markeredgecolor=color,
                                              markerfacecolor=color))
                        legend_labels.append(category[2].get(state, state))
            else:
                raise ValueError("Unrecognized simulated data type '%s'." %
                                 analysis_type)

        plot_title = 'd=%r' % d
        if d == 0.0:
            plot_title += ' (actual data)'
        ax.set_title(plot_title)

        ax.set_xlabel('PC1 (%1.2f%%)' % pc_data[3][0])
        ax.set_ylabel('PC2 (%1.2f%%)' % pc_data[3][1])
        ax.set_xticks([])
        ax.set_yticks([])

        # Panel letters continue after the num_methods * 2 panels that come
        # before the PCoA plots.
        panel_idx = num_methods * 2 + d_idx
        panel_label = get_panel_label(panel_idx)
        xmin = ax.get_xlim()[0]
        ymin, ymax = ax.get_ylim()
        yrange = ymax - ymin
        ax.text(xmin, ymax + (0.04 * yrange), '(%s)' % panel_label)

    if analysis_type == 'cluster':
        # Plot our new legend and add the existing one back. Calling legend()
        # replaces whatever legend the axes currently has, so the previous
        # one is re-attached as a plain artist afterwards.
        legend_ax = fig.add_subplot(num_rows, num_cols, 3, frame_on=False)
        existing_legend = legend_ax.get_legend()
        # get_legend() returns None when the axes has no legend yet.
        if existing_legend is not None:
            existing_legend.set_bbox_to_anchor((-0.05, 0.5))

        start_panel_label = get_panel_label(num_methods * 2)
        end_panel_label = get_panel_label(num_methods * 2 +
                                          len(workflow['pcoa_dissim']) - 1)

        assert len(legend_symbols) == len(legend_labels)
        legend_ax.legend(legend_symbols, legend_labels, ncol=1,
                   title='Legend (Panels %s-%s)' % (start_panel_label,
                                                    end_panel_label),
                   loc='center right', fancybox=True, shadow=True, numpoints=1,
                   bbox_to_anchor=(1.05, 0.5))

        if existing_legend is not None:
            legend_ax.add_artist(existing_legend)

    # Draw box around PCoA plots. Do the math in figure coordinates. Float
    # literals keep the fractions from truncating to 0 under Python 2 integer
    # division.
    col_width = 1.0 / num_cols
    row_height = 1.0 / num_rows

    # The rectangle is attached to the first PCoA cell (second row, last
    # column), i.e. the cell used for d_idx == 0 above.
    top_ax = fig.add_subplot(num_rows, num_cols, 2 * num_cols)
    rec = Rectangle((1 - col_width + 0.005, 0),
                    col_width - 0.005,
                    1 - row_height - 0.005,
                    fill=False, lw=2, clip_on=False,
                    transform=top_ax.figure.transFigure)
    top_ax.add_patch(rec)
Esempio n. 2
0
def _gather_simulated_results(method, category_dir, study_workflow,
                              num_trials):
    """Parse every results file of one method into nested plot data.

    Results are read from

        <category_dir>/<trial>/<samp_size>/<dissim>/<metric[0]>/
            <method.DirectoryName>/<method.ResultsName>_results.txt

    and returned as:

        metric[0] ->
            dissim -> {
                'sample_sizes': list,
                'effect_sizes': list of lists, one for each size,
                'p_vals': list of lists, one for each size
            }
    """
    plots_data = defaultdict(lambda:
            defaultdict(lambda: defaultdict(list)))

    for trial_num in range(num_trials):
        trial_num_dir = join(category_dir, '%d' % trial_num)

        for samp_size in study_workflow['sample_sizes']:
            samp_size_dir = join(trial_num_dir, '%d' % samp_size)

            for d in study_workflow['plot_dissim']:
                dissim_dir = join(samp_size_dir, repr(d))

                for metric in study_workflow['metrics']:
                    metric_dir = join(dissim_dir, metric[0])
                    method_dir = join(metric_dir, method.DirectoryName)
                    results_fp = join(method_dir,
                                      '%s_results.txt' % method.ResultsName)

                    # Close each results file right after parsing; this loop
                    # can visit a large number of files.
                    with open(results_fp, 'U') as results_f:
                        effect_size, p_val = method.parse(results_f)

                    plot_data = plots_data[metric[0]][d]
                    sample_sizes = plot_data['sample_sizes']

                    # The first time a sample size is seen, open a new slot
                    # that collects the values of all trials for that size.
                    if samp_size not in sample_sizes:
                        sample_sizes.append(samp_size)
                        plot_data['effect_sizes'].append([])
                        plot_data['p_vals'].append([])

                    samp_size_idx = sample_sizes.index(samp_size)
                    plot_data['effect_sizes'][samp_size_idx].append(
                            effect_size)
                    plot_data['p_vals'][samp_size_idx].append(p_val)

    return plots_data


def _draw_method_row(fig, method, method_idx, metric_plots_data,
                     study_workflow, num_trials, num_rows, num_cols,
                     num_methods, legend_loc):
    """Draw the test statistic and p-value panels of one method on fig.

    The two panels occupy the first two columns of row method_idx. For the
    first method (method_idx == 0), the legend describing the dissimilarity
    lines is additionally placed in the third cell of that row.
    """
    # plot_num is 1-based indexing.
    plot_num = method_idx * num_cols + 1
    ax1 = fig.add_subplot(num_rows, num_cols, plot_num)
    ax2 = fig.add_subplot(num_rows, num_cols, plot_num + 1)

    color_pool = get_color_pool()

    legend_labels = []
    legend_lines = []
    for d, plot_data in sorted(metric_plots_data.items()):
        avg_effect_sizes, std_effect_sizes, avg_p_vals, std_p_vals = \
                _compute_plot_data_statistics(plot_data, num_trials)
        color = color_pool.pop(0)

        label = 'd=%r' % d
        if d == 0.0:
            label += ' (actual data)'

        legend_labels.append(label)
        legend_lines.append(Line2D([0, 1], [0, 0], color=color, linewidth=2))

        # Make the actual data plot a bit thicker than the rest.
        if d == 0.0:
            line_width = 3
        else:
            line_width = 1

        # Plot test statistics.
        ax1.errorbar(plot_data['sample_sizes'], avg_effect_sizes,
                     yerr=std_effect_sizes, color=color, label=label,
                     linewidth=line_width, fmt='-')

        # Plot p-values. The error bars are dashed to match the dashed line.
        _, _, barlinecols = ax2.errorbar(
                plot_data['sample_sizes'], avg_p_vals, yerr=std_p_vals,
                color=color, label=label, linewidth=line_width,
                linestyle='--')
        barlinecols[0].set_linestyles('dashed')

    # NOTE(review): nonposx/basex/nonposy are the pre-3.3 Matplotlib keyword
    # names (later renamed to nonpositive/base); adjust them if the
    # Matplotlib requirement is ever raised.
    ax1.set_xscale('log', nonposx='clip', basex=2)
    ax1.xaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax2.set_xscale('log', nonposx='clip', basex=2)
    ax2.xaxis.set_major_formatter(FormatStrFormatter('%d'))

    ax2.set_yscale('log', nonposy='clip')

    x_label = 'Number of samples'
    ax1.set_xlabel(x_label)
    ax2.set_xlabel(x_label)
    ax1.set_ylabel('%s (%s)' % (method.DisplayName, method.StatDisplayName))
    ax2.set_ylabel('p-value')

    min_x = min(study_workflow['sample_sizes'])
    max_x = max(study_workflow['sample_sizes'])
    ax1.set_xlim(min_x - 0.5, max_x)
    ax2.set_xlim(min_x - 0.5, max_x)

    # The panel label is positioned in data coordinates. The p-value axis
    # needs a much larger empirical factor, most likely because its y axis is
    # log-scaled, so a linear offset does not map to the same visual gap.
    for ax_idx, (ax, factor) in enumerate(((ax1, 0.05), (ax2, 0.60))):
        panel_idx = method_idx * 2 + ax_idx
        panel_label = get_panel_label(panel_idx)
        xmin = ax.get_xlim()[0]
        ymin, ymax = ax.get_ylim()
        yrange = ymax - ymin

        ax.text(xmin, ymax + (factor * yrange), '(%s)' % panel_label)

    if method_idx == 0:
        # The legend lives in its own invisible axes next to the first row.
        ax3 = fig.add_subplot(num_rows, num_cols, plot_num + 2,
                              frame_on=False)
        ax3.get_xaxis().set_visible(False)
        ax3.get_yaxis().set_visible(False)

        start_panel_label = get_panel_label(0)
        end_panel_label = get_panel_label(num_methods * 2 - 1)

        assert len(legend_lines) == len(study_workflow['plot_dissim'])
        assert len(legend_labels) == len(study_workflow['plot_dissim'])
        legend_title = ('           Legend (Panels %s-%s)\nd = '
                '"noise" introduced to samples' % (
                    start_panel_label, end_panel_label))
        ax3.legend(legend_lines, legend_labels, ncol=1, title=legend_title,
                   loc=legend_loc, fancybox=True, shadow=True)


def create_simulated_data_plots(analysis_type, in_dir, workflow):
    """Create plots of sample size vs effect size/p-val for each dissim.

    For every study, depth, category and metric one figure is produced. Each
    method gets a row holding a test statistic panel and a p-value panel, and
    the last column holds the legend and the PCoA plots added by plot_pcoa.

    Plots will be placed directly under in_dir and will be named according to
    the following convention (one PDF and one PNG per figure):

    <study>_<category>_<depth>_<metric>.pdf
    <study>_<category>_<depth>_<metric>.png

    Results are read from
    <in_dir>/<study>/<depth[0]>/simulated/<category[0]>/...

    Arguments:
        analysis_type - 'gradient' or 'cluster'
        in_dir - directory holding the per-study results; also receives the
            generated plots
        workflow - dict mapping each study name to a dict providing
            'num_sim_data_trials', 'methods', 'depths', 'categories',
            'metrics', 'sample_sizes', 'plot_dissim' and 'pcoa_dissim' (plus
            whatever plot_pcoa reads). The workflow is not modified.

    Raises ValueError if analysis_type is not recognized.
    """
    # Validate up front rather than failing on an unbound legend location
    # halfway through plotting.
    if analysis_type == 'gradient':
        legend_loc = 'center'
    elif analysis_type == 'cluster':
        legend_loc = 'center left'
    else:
        raise ValueError("Unrecognized simulated data type '%s'." %
                         analysis_type)

    for study in workflow:
        study_workflow = workflow[study]
        study_dir = join(in_dir, study)

        num_trials = study_workflow['num_sim_data_trials']

        # Best and MantelCorrelogram are left out of these plots. Work on a
        # copy so that the caller's workflow is not altered.
        methods = list(study_workflow['methods'])
        for excluded in (Best(), MantelCorrelogram()):
            if excluded in methods:
                methods.remove(excluded)

        num_methods = len(methods)
        # One extra row for the legend that sits above the PCoA plots.
        num_rows = max(num_methods, len(study_workflow['pcoa_dissim']) + 1)
        # test stat, p-val, legend/PCoA.
        num_cols = 3

        for depth in study_workflow['depths']:
            depth_dir = join(study_dir, '%d' % depth[0])
            data_type_dir = join(depth_dir, 'simulated')

            for category in study_workflow['categories']:
                category_dir = join(data_type_dir, category[0])

                # metric -> Figure
                figs = {}
                for metric in study_workflow['metrics']:
                    figs[metric[0]] = figure(num=None, figsize=(20, 20),
                                             facecolor='w', edgecolor='k')

                for method_idx, method in enumerate(methods):
                    plots_data = _gather_simulated_results(
                            method, category_dir, study_workflow, num_trials)

                    for metric in study_workflow['metrics']:
                        _draw_method_row(figs[metric[0]], method, method_idx,
                                         plots_data[metric[0]],
                                         study_workflow, num_trials, num_rows,
                                         num_cols, num_methods, legend_loc)

                for metric in study_workflow['metrics']:
                    fig = figs[metric[0]]

                    # Plot PCoA in last column of figure.
                    plot_pcoa(analysis_type, fig, category_dir,
                              study_workflow, category, metric, num_rows,
                              num_cols, num_methods)

                    fig.tight_layout(pad=5.0, w_pad=2.0, h_pad=2.0)
                    fig.savefig(join(in_dir, '%s_%s_%d_%s.pdf' % (study,
                            category[0], depth[0], metric[0])), format='pdf')
                    fig.savefig(join(in_dir, '%s_%s_%d_%s.png' % (study,
                            category[0], depth[0], metric[0])), format='png',
                            dpi=100)
                    # NOTE(review): the figures are never closed here; if
                    # figure() is pyplot's, consider closing each one after
                    # saving to keep memory bounded across iterations.