Example #1
0
 def test_one_empty(self):
     """
     Check the decomposition of a non-empty random set paired with an empty set: the only expected chunk is
     the non-empty set itself, and the single array column is expected to be 1 for the first set and 0 for
     the empty one.
     """
     nonempty = set(np.random.randint(1, 1000, size=100))
     chunks, arr = get_chunks_and_composition_array([nonempty, set()])
     self.assertEqual(chunks, [nonempty])
     self.assertEqual(arr.shape, (2, 1))
     only_column = list(arr[:, 0])
     self.assertEqual(only_column, [1, 0])
Example #2
0
 def test_two_sets(self):
     """
     Check that two overlapping sets are expected to split into three chunks (items only in the first set,
     shared items, items only in the second set) and that the composition array is binary.
     """
     expected_chunks = {frozenset(items) for items in ([1, 2], [3], [4])}
     chunks, arr = get_chunks_and_composition_array([{1, 2, 3}, {3, 4}])
     self.assertEqual(freeze_sets(chunks), expected_chunks)
     self.assertTrue(array_is_binary(arr))
Example #3
0
 def test_disjoint_large(self):
     """
     Check that random sets drawn from non-overlapping integer ranges are returned unchanged as the chunks,
     with a binary composition array in which every row and every column sums to 1.
     """
     set_count = 4
     sets = []
     for i in range(set_count):
         # Set number i only draws values from [1000 * i, 1000 * (i + 1)), so the sets cannot intersect.
         low, high = 1000 * i, 1000 * (i + 1)
         sets.append(set(np.random.randint(low, high, size=100 * (i + 1))))
     chunks, arr = get_chunks_and_composition_array(sets)
     self.assertEqual(freeze_sets(sets), freeze_sets(chunks))
     # The array must consist of zeros and ones only
     self.assertTrue(array_is_binary(arr))
     # Every column, and then every row, must sum to exactly 1
     column_sums = arr.sum(axis=0)
     self.assertEqual(set(column_sums), {1})
     row_sums = arr.sum(axis=1)
     self.assertEqual(set(row_sums), {1})
Example #4
0
 def test_order_sets_random(self):
     """
     Check that sets_ordering='random' yields more than one distinct composition array over ten runs on the
     same random sets. Inputs that decompose into a single chunk are discarded and redrawn.
     """
     while True:
         sets = make_random_sets(min_sets_count=9)
         chunks, _ = get_chunks_and_composition_array(sets)
         if len(chunks) == 1:
             # NOTE(review): presumably a single chunk cannot give distinguishable orderings - draw new sets.
             continue
         representations_set = set()
         for _ in range(10):
             _, shuffled_matrix = _get_ordered_chunks_and_composition_array(sets, sets_ordering='random')
             # Arrays are compared through their string representation.
             representations_set.add(str(shuffled_matrix))
         self.assertGreater(len(representations_set), 1)
         break
Example #5
0
 def test_chunks_and_array(self):
     """
     For random sets, test that every original set can be recreated as the union of the chunks flagged in
     the corresponding row of the array, and that the sizes of those chunks add up to the size of the set
     (i.e. the flagged chunks do not overlap).
     """
     for _ in range(100):
         sets = make_random_sets()
         chunks, arr = get_chunks_and_composition_array(sets)
         for original_set, flags in zip(sets, arr):
             selected_chunks = [chunk for flag, chunk in zip(flags, chunks) if flag]
             rebuilt_set = set.union(*selected_chunks)
             self.assertEqual(original_set, rebuilt_set)
             total_size = sum(len(chunk) for chunk in selected_chunks)
             self.assertEqual(total_size, len(original_set))
Example #6
0
 def test_chunks_for_random_sets(self):
     """
     For random sets, test that
     1) the union of the sets is the disjoint union of the chunks
     2) every chunk lies either completely inside or completely outside each of the sets
     """
     for _ in range(100):
         sets = make_random_sets()
         chunks, arr = get_chunks_and_composition_array(sets)
         all_elements = set.union(*sets)
         self.assertEqual(set.union(*chunks), all_elements)
         # Equal union plus equal total size means the chunks are pairwise disjoint.
         total_chunk_size = sum(len(chunk) for chunk in chunks)
         self.assertEqual(total_chunk_size, len(all_elements))
         for chunk, set_ in product(chunks, sets):
             # Part of the chunk outside the set: must be nothing (chunk inside) or the whole chunk (outside).
             outside_part = chunk - set_
             self.assertTrue(not outside_part or outside_part == chunk)
Example #7
0
def supervenn(sets, set_annotations=None, figsize=None, side_plots=True,
              chunks_ordering='minimize gaps', sets_ordering=None,
              reverse_chunks_order=True, reverse_sets_order=True,
              max_bruteforce_size=DEFAULT_MAX_BRUTEFORCE_SIZE, seeds=DEFAULT_SEEDS, noise_prob=DEFAULT_NOISE_PROB,
              side_plot_width=1, min_width_for_annotation=1, widths_minmax_ratio=None, side_plot_color='gray',
              dpi=None, ax=None, **kw):
    """
    Plot a diagram visualizing the relationship of multiple sets as a grid: rows are sets, columns are chunks.

    :param sets: list of sets
    :param set_annotations: list of annotations for the sets; 'Set_0', 'Set_1', ... are used when None
    :param figsize: figure size. Deprecated: a warning is issued whenever it is not None.
    :param side_plots: True / False: add small barplots on top and on the right. The top one shows, for each
        chunk, how many sets this chunk lies inside. The right one shows set sizes.
    :param chunks_ordering: method of ordering the chunks (columns of the grid)
        - 'minimize gaps' (default): use a smart algorithm to find an order of columns giving fewer gaps in each
            row, making the plot as readable as possible.
        - 'size': bigger chunks go first (or last if reverse_chunks_order=False)
        - 'occurence': chunks that are in most sets go first (or last if reverse_chunks_order=False)
        - 'random': randomly shuffle the columns
    :param sets_ordering: method of ordering the sets (rows of the grid)
        - None (default): keep the order as it is passed
        - 'minimize gaps': use a smart algorithm to find an order of rows giving fewer gaps in each column
        - 'size': bigger sets go first (or last if reverse_sets_order = False)
        - 'chunk count': sets that contain most chunks go first (or last if reverse_sets_order = False)
        - 'random': randomly shuffle
    :param reverse_chunks_order: True (default) / False. When chunks_ordering is "size" or "occurence", chunks
        with the bigger corresponding property go first if True, smaller go first if False.
    :param reverse_sets_order: True / False, works the same way as reverse_chunks_order
    :param max_bruteforce_size: maximal number of items for which bruteforce method is applied to find permutation
    :param seeds: number of different random seeds for the randomized greedy algorithm to find permutation
    :param noise_prob: probability of given element being equal to 1 in the noise array for randomized greedy
        algorithm
    :param side_plot_width: width of side plots in inches (default 1)
    :param min_width_for_annotation: for horizontal plot, don't annotate bars of widths less than this value (to
        avoid clutter). When widths_minmax_ratio is given, this threshold is passed through the same width
        balancer as the chunk sizes before being used.
    :param widths_minmax_ratio: desired max/min ratio of displayed chunk widths, default None (show actual widths)
    :param side_plot_color: color of bars in side plots, default 'gray'
    :param dpi: figure DPI. Deprecated: a warning is issued whenever it is not None.
    :param ax: axis to plot into. If ax is specified, figsize and dpi will be ignored.

    Remaining keyword arguments are forwarded to plot_binary_array; `fontsize` and `rotate_col_annotations` are
    additionally read here for the axis labels and the side plots:
    :param rotate_col_annotations: True / False, whether to print annotations vertically
    :param fontsize: font size for all text elements
    :param row_annotations_y: a number in (0, 1), position for row annotations in the row. Default 0.5 - center
        of row.
    :param col_annotations_area_height: height of area for column annotations in inches, 1 by default
    :param col_annotations_ys_count: 1 (default), 2, or 3 - use to reduce clutter in column annotations area
    :param color_by: 'row' (default) or 'column'. If 'row', all cells in same row are same color, etc.
    :param bar_height: height of cell fill as a fraction of row height, a number in (0, 1).
    :param bar_alpha: alpha for cell fills.
    :param bar_align: vertical alignment of bars, 'edge' (default) or 'center'. Only matters when bar_height < 1.
    :param color_cycle: a list of set colors, given as names of matplotlib named colors, or hex codes
        (e.g. '#1f77b4')

    :return: SupervennPlot instance with attributes `axes`, `figure`, `chunks`
        and method `get_chunk(set_indices)`. See docstring to returned object.
    """

    uses_deprecated_args = not (figsize is None and dpi is None)
    if uses_deprecated_args:
        deprecation_message = (
            'Parameters figsize and dpi of supervenn() are deprecated and will be removed in a future version.\n'
            'Instead of this:\n'
            '    supervenn(sets, figsize=(8, 5), dpi=90)'
            '\nPlease either do this:\n'
            '    plt.figure(figsize=(8, 5), dpi=90)\n'
            '    supervenn(sets)\n'
            'or plot into an existing axis by passing it as the ax argument:\n'
            '    supervenn(sets, ax=my_axis)\n'
        )
        warnings.warn(deprecation_message)

    axes = setup_axes(side_plots, figsize, dpi, ax, side_plot_width)
    main_axis = axes['main']

    if set_annotations is None:
        set_annotations = list(map('Set_{}'.format, range(len(sets))))

    chunks, composition_array = get_chunks_and_composition_array(sets)

    # Decide in which order the columns (chunks) and the rows (sets) are shown
    permutations_ = get_permutations(
        chunks,
        composition_array,
        chunks_ordering=chunks_ordering,
        sets_ordering=sets_ordering,
        reverse_chunks_order=reverse_chunks_order,
        reverse_sets_order=reverse_sets_order,
        max_bruteforce_size=max_bruteforce_size,
        seeds=seeds,
        noise_prob=noise_prob)
    chunk_order = permutations_['chunks_ordering']
    set_order = permutations_['sets_ordering']

    # Reorder everything that is indexed by chunk or by set accordingly
    chunks = [chunks[index] for index in chunk_order]
    composition_array = composition_array[:, chunk_order][set_order, :]
    set_annotations = [set_annotations[index] for index in set_order]

    # Column widths: actual chunk sizes, or balanced ones when a max/min ratio is requested
    chunk_sizes = list(map(len, chunks))

    if widths_minmax_ratio is None:
        col_widths = chunk_sizes
        effective_min_width_for_annotation = min_width_for_annotation
    else:
        widths_balancer = get_widths_balancer(chunk_sizes, widths_minmax_ratio)
        col_widths = [widths_balancer(size) for size in chunk_sizes]
        # The annotation threshold has to live on the same scale as the balanced widths.
        effective_min_width_for_annotation = widths_balancer(min_width_for_annotation)

    row_count = len(sets)

    # Main plot
    plot_binary_array(
        arr=composition_array,
        row_annotations=set_annotations,
        col_annotations=chunk_sizes,
        ax=main_axis,
        col_widths=col_widths,
        row_heights=[1] * row_count,
        min_width_for_annotation=effective_min_width_for_annotation,
        **kw)

    # Remember the limits of the main plot so that the side plots can be aligned with it
    xlim = main_axis.get_xlim()
    ylim = main_axis.get_ylim()
    fontsize = kw.get('fontsize', DEFAULT_FONTSIZE)
    plt.xlabel('ITEMS', fontsize=fontsize)
    plt.ylabel('SETS', fontsize=fontsize)

    # Side plots
    if side_plots:
        # Top: for every chunk, the number of sets containing it
        plt.sca(axes['top_side_plot'])
        side_plot(composition_array.sum(0), col_widths, 'h',
                  min_width_for_annotation=effective_min_width_for_annotation,
                  rotate_annotations=kw.get('rotate_col_annotations', False), color=side_plot_color, fontsize=fontsize)
        plt.xlim(xlim)

        # Right: sizes of the sets, in the displayed row order
        plt.sca(axes['right_side_plot'])
        set_sizes = [len(sets[index]) for index in set_order]
        side_plot(set_sizes, [1] * row_count, 'v', color=side_plot_color, fontsize=fontsize)
        plt.ylim(ylim)

    # Leave the main axis as the current one for the caller
    plt.sca(main_axis)
    return SupervennPlot(axes, plt.gcf(), break_into_chunks(sets))  # todo: break_into_chunks is called twice, fix
Example #8
0
def supervenn(sets,
              set_annotations=None,
              figsize=DEFAULT_FIGSIZE,
              side_plots=True,
              chunks_ordering='minimize gaps',
              sets_ordering=None,
              reverse_chunks_order=True,
              reverse_sets_order=True,
              max_bruteforce_size=DEFAULT_MAX_BRUTEFORCE_SIZE,
              seeds=DEFAULT_SEEDS,
              noise_prob=DEFAULT_NOISE_PROB,
              side_plot_width=1.5,
              min_width_for_annotation=1,
              widths_minmax_ratio=0,
              side_plot_color='gray',
              **kw):
    """
    Plot a diagram visualizing the relationship of multiple sets as a grid: rows are sets, columns are chunks.
    A new figure is always created; nothing is returned, and the main axis is left as the current pyplot axis.

    :param sets: list of sets
    :param set_annotations: list of annotations for the sets; 'Set_0', 'Set_1', ... are used when None
    :param figsize: figure size as (width, height)
    :param side_plots: True / False: add small barplots on top and on the right. The top one shows, for each
        chunk, how many sets this chunk lies inside. The right one shows set sizes.
    :param chunks_ordering: method of ordering the chunks (columns of the grid)
        - 'minimize gaps' (default): use a smart algorithm to find an order of columns giving fewer gaps in each
            row, making the plot as readable as possible.
        - 'size': bigger chunks go first (or last if reverse_chunks_order=False)
        - 'occurence': chunks that are in most sets go first (or last if reverse_chunks_order=False)
        - 'random': randomly shuffle the columns
    :param sets_ordering: method of ordering the sets (rows of the grid)
        - None (default): keep the order as it is passed
        - 'minimize gaps': use a smart algorithm to find an order of rows giving fewer gaps in each column
        - 'size': bigger sets go first (or last if reverse_sets_order = False)
        - 'chunk count': sets that contain most chunks go first (or last if reverse_sets_order = False)
        - 'random': randomly shuffle
    :param reverse_chunks_order: True (default) / False. When chunks_ordering is "size" or "occurence", chunks
        with the bigger corresponding property go first if True, smaller go first if False.
    :param reverse_sets_order: True / False, works the same way as reverse_chunks_order
    :param max_bruteforce_size: maximal number of items for which bruteforce method is applied to find permutation
    :param seeds: number of different random seeds for the randomized greedy algorithm to find permutation
    :param noise_prob: probability of given element being equal to 1 in the noise array for randomized greedy
        algorithm
    :param side_plot_width: width of side plots in inches (default 1.5)
    :param min_width_for_annotation: for horizontal plot, don't annotate bars of widths less than this value (to
        avoid clutter)
    :param widths_minmax_ratio: desired max/min ratio of displayed chunk widths, passed to balance_widths
        (default 0). Pass None to skip balancing and use the actual chunk sizes as widths.
    :param side_plot_color: color of bars in side plots, default 'gray'

    Remaining keyword arguments are forwarded to plot_binary_array; `fontsize` and `rotate_col_annotations` are
    additionally read here for the axis labels and the top side plot:
    :param row_annotations_y: a number in (0, 1), position for row annotations in the row. Default 0.5 - center
        of row.
    :param col_annotations_area_height: height of area for column annotations in inches, 1 by default
    :param col_annotations_ys_count: 1 (default), 2, or 3 - use to reduce clutter in column annotations area
    :param color_by: 'row' (default) or 'column'. If 'row', all cells in same row are same color, etc.
    :param bar_height: height of cell fill as a fraction of row height, a number in (0, 1).
    :param bar_alpha: alpha for cell fills.
    :param bar_align: vertical alignment of bars, 'edge' (default) or 'center'. Only matters when bar_height < 1.
    :param color_cycle: a list of colors, given as names of matplotlib named colors, or hex codes (e.g. '#1f77b4')
    """
    # Set up axes
    if not side_plots:
        plt.figure(figsize=figsize)
        main_ax = plt.gca()
    else:
        fig_width, fig_height = figsize

        # 2 x 2 grid without gaps: the top row and the right column are side_plot_width wide,
        # the main plot takes whatever is left of the figure size.
        grid_layout = dict(
            height_ratios=[side_plot_width, fig_height - side_plot_width],
            width_ratios=[fig_width - side_plot_width, side_plot_width],
            hspace=0.0,
            wspace=0.0,
        )
        _, axes = plt.subplots(2, 2, figsize=figsize, gridspec_kw=grid_layout)

        # Every axis except the main one gets its ticks removed
        for side_ax in (axes[0, 1], axes[1, 1], axes[0, 0]):
            side_ax.set_xticks([])
            side_ax.set_yticks([])
        main_ax = axes[1, 0]

    if set_annotations is None:
        set_annotations = list(map('Set_{}'.format, range(len(sets))))

    chunks, composition_array = get_chunks_and_composition_array(sets)

    # Decide in which order the columns (chunks) and the rows (sets) are shown
    permutations_ = get_permutations(chunks,
                                     composition_array,
                                     chunks_ordering=chunks_ordering,
                                     sets_ordering=sets_ordering,
                                     reverse_chunks_order=reverse_chunks_order,
                                     reverse_sets_order=reverse_sets_order,
                                     max_bruteforce_size=max_bruteforce_size,
                                     seeds=seeds,
                                     noise_prob=noise_prob)
    chunk_order = permutations_['chunks_ordering']
    set_order = permutations_['sets_ordering']

    # Reorder everything that is indexed by chunk or by set accordingly
    chunks = [chunks[index] for index in chunk_order]
    composition_array = composition_array[:, chunk_order][set_order, :]
    set_annotations = [set_annotations[index] for index in set_order]

    # Column widths: balanced unless widths_minmax_ratio is explicitly None
    chunk_sizes = list(map(len, chunks))

    if widths_minmax_ratio is None:
        col_widths = chunk_sizes
    else:
        col_widths = balance_widths(chunk_sizes, widths_minmax_ratio)

    row_count = len(sets)

    # Main plot
    plot_binary_array(arr=composition_array,
                      row_annotations=set_annotations,
                      col_annotations=chunk_sizes,
                      ax=main_ax,
                      col_widths=col_widths,
                      row_heights=[1] * row_count,
                      min_width_for_annotation=min_width_for_annotation,
                      **kw)

    # Remember the limits of the main plot so that the side plots can be aligned with it
    xlim = main_ax.get_xlim()
    ylim = main_ax.get_ylim()
    label_fontsize = kw.get('fontsize', DEFAULT_FONTSIZE)
    plt.xlabel('ITEMS', fontsize=label_fontsize)
    plt.ylabel('SETS', fontsize=label_fontsize)

    # Side plots
    if side_plots:
        # Top: for every chunk, the number of sets containing it
        plt.sca(axes[0, 0])
        side_plot(composition_array.sum(0),
                  col_widths,
                  'h',
                  min_width_for_annotation=min_width_for_annotation,
                  rotate_annotations=kw.get('rotate_col_annotations', False),
                  color=side_plot_color)
        plt.xlim(xlim)

        # Right: sizes of the sets, in the displayed row order
        plt.sca(axes[1, 1])
        set_sizes = [len(sets[index]) for index in set_order]
        side_plot(set_sizes, [1] * row_count, 'v', color=side_plot_color)
        plt.ylim(ylim)

    # Leave the main axis as the current one for the caller
    plt.sca(main_ax)
Example #9
0
 def test_disjoint_small(self):
     """
     Check that two disjoint singletons are returned as two chunks, each of them belonging to exactly one set.

     The order of the returned chunks is not assumed here, so the composition array may be either of the two
     2x2 permutation matrices and is not required to be the identity.
     """
     sets = [{1}, {2}]
     chunks, arr = get_chunks_and_composition_array(sets)
     self.assertEqual(freeze_sets(sets), freeze_sets(chunks))
     rows = np.asarray(arr).tolist()
     # Order-independent shape/content check: the rows are exactly the two unit vectors.
     self.assertEqual(sorted(tuple(row) for row in rows), [(0, 1), (1, 0)])
     # Whatever the column order is, each row must select precisely the chunk equal to its own set.
     for set_, row in zip(sets, rows):
         selected_chunks = [chunk for chunk, is_included in zip(chunks, row) if is_included]
         self.assertEqual(selected_chunks, [set_])
Example #10
0
 def test_single_set(self):
     """
     Check that a single random set is expected to come back as the only chunk, with a 1x1 composition array
     whose sole entry equals 1.
     """
     only_set = set(np.random.randint(1, 1000, size=100))
     sets = [only_set]
     chunks, arr = get_chunks_and_composition_array(sets)
     self.assertEqual(chunks, sets)
     expected_shape = (1, 1)
     self.assertEqual(arr.shape, expected_shape)
     self.assertEqual(arr[0, 0], 1)