Example #1
from collections import defaultdict

import numpy
from numpy import invert, ones, tri

# pearson, spearman, and is_empty are helpers assumed to be defined
# elsewhere in the source module.


def format_method_comparison_heatmaps(real_data_results, sim_data_results,
                                      heatmap_methods):
    shared_studies = None
    shared_categories = {}

    for depth_desc, depth_res in real_data_results.items():
        for metric, metric_res in depth_res.items():
            for method, method_res in metric_res.items():
                if any(method == m.DirectoryName for m in heatmap_methods):
                    studies = sorted(method_res.keys())

                    if shared_studies is None:
                        shared_studies = studies
                    elif studies != shared_studies:
                        raise ValueError("Not all methods to include in "
                                         "the heatmap have results for "
                                         "the same studies.")

                    for study, study_res in sorted(method_res.items()):
                        categories = [cat for cat, cat_res in
                                      sorted(study_res.items())
                                      if not is_empty(cat_res)]

                        if study not in shared_categories:
                            shared_categories[study] = set(categories)
                        else:
                            shared_categories[study] &= set(categories)

    # Gather real data effect sizes for each method (in the same order for each
    # method!).
    method_data = defaultdict(list)
    for depth_desc, depth_res in real_data_results.items():
        for metric, metric_res in depth_res.items():
            for method, method_res in metric_res.items():
                if not any(method == m.DirectoryName for m in heatmap_methods):
                    continue

                for study, study_res in sorted(method_res.items()):
                    for category, category_res in sorted(study_res.items()):
                        if category in shared_categories[study]:
                            method_data[method].append(
                                    category_res['original'].effect_size)
                            method_data[method].append(
                                    category_res['shuffled'].effect_size)

    # Gather simulated data effect sizes.
    for method, method_res in sim_data_results.items():
        if not any(method == m.DirectoryName for m in heatmap_methods):
            continue

        for study, study_res in sorted(method_res.items()):
            for depth, depth_res in sorted(study_res.items()):
                for category, category_res in sorted(depth_res.items()):
                    for trial_num, trial_num_res in sorted(category_res.items()):
                        for samp_size, samp_size_res in sorted(trial_num_res.items()):
                            for dissim, dissim_res in sorted(samp_size_res.items()):
                                for metric, metric_res in sorted(dissim_res.items()):
                                    if metric_res.isEmpty():
                                        raise ValueError("Encountered empty "
                                                "simulated data results.")
                                    else:
                                        method_data[method].append(
                                                metric_res.effect_size)

    # Make sure our data looks sane. We should have the same number of
    # observations (i.e. effect sizes) for each method.
    data_length = None
    for method, data in method_data.items():
        if data_length is None:
            data_length = len(data)
        elif len(data) != data_length:
            raise ValueError("The number of observations (i.e. effect sizes) "
                             "is not the same between all methods, so we "
                             "can't compare them.")

    # Compute the correlation coefficient between each pair of methods and put
    # the output in an array. This array can then be used to generate a
    # text-based table or heatmap.
    results = {}
    for correlation_name, correlation_fn in (('pearson', pearson),
                                             ('spearman', spearman)):
        num_methods = len(heatmap_methods)
        heatmap_data = ones((num_methods, num_methods))

        # I know this is inefficient, but it really doesn't matter for what
        # we're doing here.
        for method1_idx, method1 in enumerate(heatmap_methods):
            for method2_idx, method2 in enumerate(heatmap_methods):
                corr_coeff = correlation_fn(method_data[method1.DirectoryName],
                                            method_data[method2.DirectoryName])
                heatmap_data[method1_idx][method2_idx] = corr_coeff

        # Mask out the upper triangle. Taken from
        # http://stackoverflow.com/a/2332520
        mask = invert(tri(heatmap_data.shape[0], k=0, dtype=bool))
        heatmap_data = numpy.ma.array(heatmap_data, mask=mask)

        results[correlation_name] = heatmap_data

    return results
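
For illustration, here is a minimal self-contained sketch of the pairwise-correlation and masking step performed at the end of the function above. It assumes scipy.stats' pearsonr and spearmanr stand in for the module's own pearson and spearman helpers, and the method names and effect-size vectors are made up:

import numpy
from numpy import invert, ones, tri
from scipy.stats import pearsonr, spearmanr

# Hypothetical effect-size vectors, one per method, all the same length.
method_data = {
    'adonis': [0.52, 0.48, 0.61, 0.33],
    'anosim': [0.49, 0.55, 0.58, 0.30],
    'permanova': [0.51, 0.50, 0.60, 0.35],
}
methods = sorted(method_data)

results = {}
for name, fn in ('pearson', pearsonr), ('spearman', spearmanr):
    heatmap_data = ones((len(methods), len(methods)))
    for i, m1 in enumerate(methods):
        for j, m2 in enumerate(methods):
            # SciPy returns (coefficient, p-value); keep only the coefficient.
            heatmap_data[i, j] = fn(method_data[m1], method_data[m2])[0]

    # Mask the upper triangle so only the lower triangle remains,
    # mirroring the masking step in the function above.
    mask = invert(tri(len(methods), k=0, dtype=bool))
    results[name] = numpy.ma.array(heatmap_data, mask=mask)

print(results['pearson'])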
Example #2
def test_is_empty(self):
    """Test checking if category results are empty or not."""
    self.assertTrue(is_empty(self.cat_res1))
    self.assertTrue(is_empty(self.cat_res2))
    self.assertFalse(is_empty(self.cat_res3))
    self.assertTrue(is_empty({}))
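
The is_empty helper exercised by this test is not shown in the snippet. A hypothetical sketch of the behaviour the assertions imply, assuming a category result is a mapping of labels (e.g. 'original', 'shuffled') to result objects exposing an isEmpty() method, as the simulated-data branch in Example #1 suggests:

def is_empty(category_results):
    """Hypothetical helper: True when a category holds no usable results."""
    if not category_results:
        # An empty mapping has no results at all.
        return True
    # Assumes each value is a results object with an isEmpty() method,
    # as in the simulated-data check in Example #1.
    return all(res.isEmpty() for res in category_results.values())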