def test_one_empty(self):
    """A non-empty set paired with an empty one gives a single chunk that only the first set contains."""
    nonempty = set(np.random.randint(1, 1000, size=100))
    chunks, arr = get_chunks_and_composition_array([nonempty, set()])

    # The only chunk is the non-empty set itself.
    self.assertEqual(chunks, [nonempty])

    # Two rows (one per set), one column (one per chunk); only the first row is marked.
    self.assertEqual(arr.shape, (2, 1))
    only_column = list(arr[:, 0])
    self.assertEqual(only_column, [1, 0])
def test_two_sets(self):
    """Two overlapping sets split into a left-only, a shared and a right-only chunk."""
    expected_chunks = {frozenset([1, 2]), frozenset([3]), frozenset([4])}

    chunks, arr = get_chunks_and_composition_array([{1, 2, 3}, {3, 4}])

    self.assertEqual(freeze_sets(chunks), expected_chunks)
    self.assertTrue(array_is_binary(arr))
def test_disjoint_large(self):
    """Sets drawn from non-overlapping value ranges must come back unchanged as the chunks."""
    set_count = 4
    sets = []
    for i in range(set_count):
        # Set i draws from [1000 * i, 1000 * (i + 1)), so no two sets can share an item.
        low, high = 1000 * i, 1000 * (i + 1)
        sets.append(set(np.random.randint(low, high, size=100 * (i + 1))))

    chunks, arr = get_chunks_and_composition_array(sets)
    self.assertEqual(freeze_sets(sets), freeze_sets(chunks))

    # The array must consist of zeros and ones only
    self.assertTrue(array_is_binary(arr))

    # Every column and every row of arr must sum to 1
    column_sums = set(arr.sum(0))
    row_sums = set(arr.sum(1))
    self.assertEqual(column_sums, {1})
    self.assertEqual(row_sums, {1})
def test_order_sets_random(self):
    """With sets_ordering='random', ten runs on the same input must not all give the same array."""
    while True:
        sets = make_random_sets(min_sets_count=9)
        chunks, _composition = get_chunks_and_composition_array(sets)

        # Draw again when the sets collapse to a single chunk.
        if len(chunks) == 1:
            continue

        seen_representations = set()
        for _ in range(10):
            _ordered_chunks, ordered_array = _get_ordered_chunks_and_composition_array(
                sets, sets_ordering='random')
            seen_representations.add(str(ordered_array))

        self.assertGreater(len(seen_representations), 1)
        break
def test_chunks_and_array(self):
    """
    For random sets, test that every original set can be recreated as the union of the chunks
    indicated in the corresponding row of the array, and that those chunks do not overlap
    (their sizes add up to the size of the set).
    """
    for _ in range(100):
        sets = make_random_sets()
        chunks, arr = get_chunks_and_composition_array(sets)
        for set_, row in zip(sets, arr):
            chunks_in_this_set = [chunk for is_included, chunk in zip(row, chunks) if is_included]
            # Use set().union(...) rather than set.union(...): when the row selects no chunks
            # (an empty input set), set.union() with no arguments raises TypeError, whereas
            # set().union() correctly yields the empty set.
            recreated_set = set().union(*chunks_in_this_set)
            self.assertEqual(set_, recreated_set)
            self.assertEqual(sum(len(chunk) for chunk in chunks_in_this_set), len(set_))
def test_chunks_for_random_sets(self):
    """
    For random sets, test that
    1) the union of the sets is the disjoint union of the chunks
    2) each chunk lies either completely inside or completely outside any given set
    """
    for _ in range(100):
        sets = make_random_sets()
        chunks, arr = get_chunks_and_composition_array(sets)

        # 1) Same elements overall, and no element counted in two chunks.
        all_elements = set.union(*sets)
        self.assertEqual(set.union(*chunks), all_elements)
        total_chunk_size = sum(len(chunk) for chunk in chunks)
        self.assertEqual(total_chunk_size, len(all_elements))

        # 2) Removing a set from a chunk leaves either nothing or the whole chunk.
        for chunk in chunks:
            for set_ in sets:
                leftover = chunk - set_
                self.assertTrue(not leftover or leftover == chunk)
def supervenn(sets, set_annotations=None, figsize=None, side_plots=True,
              chunks_ordering='minimize gaps', sets_ordering=None,
              reverse_chunks_order=True, reverse_sets_order=True,
              max_bruteforce_size=DEFAULT_MAX_BRUTEFORCE_SIZE,
              seeds=DEFAULT_SEEDS, noise_prob=DEFAULT_NOISE_PROB,
              side_plot_width=1, min_width_for_annotation=1,
              widths_minmax_ratio=None, side_plot_color='gray',
              dpi=None, ax=None, **kw):
    """
    Plot a diagram visualizing relationship of multiple sets.

    :param sets: list of sets
    :param set_annotations: list of annotations for the sets; defaults to 'Set_0', 'Set_1', ...
    :param figsize: figure size (deprecated, triggers a warning when not None)
    :param side_plots: True / False: add small barplots on top and on the right. On top, for each chunk it is shown,
    how many sets does this chunk lie inside. On the right, set sizes are shown.
    :param chunks_ordering: method of ordering the chunks (columns of the grid)
        - 'minimize gaps' (default): use a smart algorithm to find an order of columns giving fewer gaps in each row,
            making the plot as readable as possible.
        - 'size': bigger chunks go first (or last if reverse_chunks_order=False)
        - 'occurence': chunks that are in most sets go first (or last if reverse_chunks_order=False)
        - 'random': randomly shuffle the columns
    :param sets_ordering: method of ordering the sets (rows of the grid)
        - None (default): keep the order as it is passed
        - 'minimize gaps': use a smart algorithm to find an order of rows giving fewer gaps in each column
        - 'size': bigger sets go first (or last if reverse_sets_order = False)
        - 'chunk count': sets that contain most chunks go first (or last if reverse_sets_order = False)
        - 'random': randomly shuffle
    :param reverse_chunks_order: True (default) / False when chunks_ordering is "size" or "occurence",
        chunks with bigger corresponding property go first if reverse_chunks_order=True, smaller go first if False.
    :param reverse_sets_order: True / False, works the same way as reverse_chunks_order
    :param max_bruteforce_size: maximal number of items for which bruteforce method is applied to find permutation
    :param seeds: number of different random seeds for the randomized greedy algorithm to find permutation
    :param noise_prob: probability of given element being equal to 1 in the noise array for randomized greedy algorithm
    :param side_plot_width: width of side plots in inches (default 1)
    :param side_plot_color: color of bars in side plots, default 'gray'
    :param dpi: figure DPI (deprecated, triggers a warning when not None)
    :param ax: axis to plot into. If ax is specified, figsize and dpi will be ignored.
    :param min_width_for_annotation: for horizontal plot, don't annotate bars of widths less than this value (to avoid
    clutter)
    :param widths_minmax_ratio: desired max/min ratio of displayed chunk widths, default None (show actual widths)
    :param rotate_col_annotations: True / False, whether to print annotations vertically
    :param fontsize: font size for all text elements
    :param row_annotations_y: a number in (0, 1), position for row annotations in the row. Default 0.5 - center of row.
    :param col_annotations_area_height: height of area for column annotations in inches, 1 by default
    :param col_annotations_ys_count: 1 (default), 2, or 3 - use to reduce clutter in column annotations area
    :param color_by: 'row' (default) or 'column'. If 'row', all cells in same row are same color, etc.
    :param bar_height: height of cell fill as a fraction of row height, a number in (0, 1).
    :param bar_alpha: alpha for cell fills.
    :param bar_align: vertical alignment of bars, 'edge' (default) or 'center'. Only matters when bar_height < 1.
    :param color_cycle: a list of set colors, given as names of matplotlib named colors, or hex codes (e.g. '#1f77b4')

    :return: SupervennPlot instance with attributes `axes`, `figure`, `chunks`
        and method `get_chunk(set_indices)`. See docstring to returned object.
    """
    if figsize is not None or dpi is not None:
        # stacklevel=2 attributes the warning to the caller's line rather than to this function.
        warnings.warn(
            'Parameters figsize and dpi of supervenn() are deprecated and will be removed in a future version.\n'
            'Instead of this:\n'
            ' supervenn(sets, figsize=(8, 5), dpi=90)'
            '\nPlease either do this:\n'
            ' plt.figure(figsize=(8, 5), dpi=90)\n'
            ' supervenn(sets)\n'
            'or plot into an existing axis by passing it as the ax argument:\n'
            ' supervenn(sets, ax=my_axis)\n',
            stacklevel=2
        )

    axes = setup_axes(side_plots, figsize, dpi, ax, side_plot_width)
    main_ax = axes['main']

    if set_annotations is None:
        set_annotations = ['Set_{}'.format(i) for i in range(len(sets))]

    chunks, composition_array = get_chunks_and_composition_array(sets)

    # Find permutations of rows and columns
    permutations_ = get_permutations(
        chunks,
        composition_array,
        chunks_ordering=chunks_ordering,
        sets_ordering=sets_ordering,
        reverse_chunks_order=reverse_chunks_order,
        reverse_sets_order=reverse_sets_order,
        max_bruteforce_size=max_bruteforce_size,
        seeds=seeds,
        noise_prob=noise_prob)

    # Apply permutations: columns follow the chunk order, rows and annotations follow the set order
    chunks_order = permutations_['chunks_ordering']
    sets_order = permutations_['sets_ordering']
    chunks = [chunks[i] for i in chunks_order]
    composition_array = composition_array[:, chunks_order]
    composition_array = composition_array[sets_order, :]
    set_annotations = [set_annotations[i] for i in sets_order]

    # Main plot
    chunk_sizes = [len(chunk) for chunk in chunks]

    if widths_minmax_ratio is not None:
        widths_balancer = get_widths_balancer(chunk_sizes, widths_minmax_ratio)
        col_widths = [widths_balancer(chunk_size) for chunk_size in chunk_sizes]
        # The annotation threshold must be expressed in the same rescaled units as the column widths
        effective_min_width_for_annotation = widths_balancer(min_width_for_annotation)
    else:
        col_widths = chunk_sizes
        effective_min_width_for_annotation = min_width_for_annotation

    plot_binary_array(
        arr=composition_array,
        row_annotations=set_annotations,
        col_annotations=chunk_sizes,
        ax=main_ax,
        col_widths=col_widths,
        row_heights=[1] * len(sets),
        min_width_for_annotation=effective_min_width_for_annotation,
        **kw)

    xlim = main_ax.get_xlim()
    ylim = main_ax.get_ylim()

    # Label the main axis explicitly instead of through pyplot's "current axes", so the labels
    # land on the right axis regardless of what plot_binary_array left as current.
    fontsize = kw.get('fontsize', DEFAULT_FONTSIZE)
    main_ax.set_xlabel('ITEMS', fontsize=fontsize)
    main_ax.set_ylabel('SETS', fontsize=fontsize)

    # Side plots
    if side_plots:
        plt.sca(axes['top_side_plot'])
        side_plot(composition_array.sum(0), col_widths, 'h',
                  min_width_for_annotation=effective_min_width_for_annotation,
                  rotate_annotations=kw.get('rotate_col_annotations', False), color=side_plot_color,
                  fontsize=fontsize)
        plt.xlim(xlim)  # keep the top bars aligned with the main plot's columns

        plt.sca(axes['right_side_plot'])
        side_plot([len(sets[i]) for i in sets_order], [1] * len(sets), 'v',
                  color=side_plot_color, fontsize=fontsize)
        plt.ylim(ylim)  # keep the right bars aligned with the main plot's rows

    plt.sca(main_ax)
    # todo: break_into_chunks is called twice, fix
    return SupervennPlot(axes, plt.gcf(), break_into_chunks(sets))
def supervenn(sets, set_annotations=None, figsize=DEFAULT_FIGSIZE, side_plots=True,
              chunks_ordering='minimize gaps', sets_ordering=None,
              reverse_chunks_order=True, reverse_sets_order=True,
              max_bruteforce_size=DEFAULT_MAX_BRUTEFORCE_SIZE,
              seeds=DEFAULT_SEEDS, noise_prob=DEFAULT_NOISE_PROB,
              side_plot_width=1.5, min_width_for_annotation=1,
              widths_minmax_ratio=0, side_plot_color='gray', **kw):
    """
    Plot a diagram visualizing relationship of multiple sets.

    A new figure is always created; the function returns None.

    :param sets: list of sets
    :param set_annotations: list of annotations for the sets; defaults to 'Set_0', 'Set_1', ...
    :param figsize: figure size as (width, height)
    :param side_plots: True / False: add small barplots on top and on the right. On top, for each chunk it is shown,
    how many sets does this chunk lie inside. On the right, set sizes are shown.
    :param chunks_ordering: method of ordering the chunks (columns of the grid)
        - 'minimize gaps' (default): use a smart algorithm to find an order of columns giving fewer gaps in each row,
            making the plot as readable as possible.
        - 'size': bigger chunks go first (or last if reverse_chunks_order=False)
        - 'occurence': chunks that are in most sets go first (or last if reverse_chunks_order=False)
        - 'random': randomly shuffle the columns
    :param sets_ordering: method of ordering the sets (rows of the grid)
        - None (default): keep the order as it is passed
        - 'minimize gaps': use a smart algorithm to find an order of rows giving fewer gaps in each column
        - 'size': bigger sets go first (or last if reverse_sets_order = False)
        - 'chunk count': sets that contain most chunks go first (or last if reverse_sets_order = False)
        - 'random': randomly shuffle
    :param reverse_chunks_order: True (default) / False when chunks_ordering is "size" or "occurence",
        chunks with bigger corresponding property go first if reverse_chunks_order=True, smaller go first if False.
    :param reverse_sets_order: True / False, works the same way as reverse_chunks_order
    :param max_bruteforce_size: maximal number of items for which bruteforce method is applied to find permutation
    :param seeds: number of different random seeds for the randomized greedy algorithm to find permutation
    :param noise_prob: probability of given element being equal to 1 in the noise array for randomized greedy algorithm
    :param side_plot_width: width of side plots in inches (default 1.5)
    :param side_plot_color: color of bars in side plots, default 'gray'
    :param min_width_for_annotation: for horizontal plot, don't annotate bars of widths less than this value (to avoid
    clutter)
    :param widths_minmax_ratio: desired max/min ratio of displayed chunk widths. Pass None to show actual widths;
        any other value, including the default 0, is forwarded to balance_widths.
    :param rotate_col_annotations: True / False (default), forwarded to the top side plot as rotate_annotations
    :param fontsize: font size of the axis labels, DEFAULT_FONTSIZE when not given
    :param row_annotations_y: a number in (0, 1), position for row annotations in the row. Default 0.5 - center of row.
    :param col_annotations_area_height: height of area for column annotations in inches, 1 by default
    :param col_annotations_ys_count: 1 (default), 2, or 3 - use to reduce clutter in column annotations area
    :param color_by: 'row' (default) or 'column'. If 'row', all cells in same row are same color, etc.
    :param bar_height: height of cell fill as a fraction of row height, a number in (0, 1).
    :param bar_alpha: alpha for cell fills.
    :param bar_align: vertical alignment of bars, 'edge' (default) or 'center'. Only matters when bar_height < 1.
    :param color_cycle: a list of colors, given as names of matplotlib named colors, or hex codes (e.g. '#1f77b4')
    """
    # Set up axes
    if side_plots:
        fig_width, fig_height = figsize
        # 2x2 grid: top row / right column have size side_plot_width, the main plot takes the rest
        height_ratios = [side_plot_width, fig_height - side_plot_width]
        width_ratios = [fig_width - side_plot_width, side_plot_width]
        _, axes = plt.subplots(2, 2, figsize=figsize,
                               gridspec_kw={
                                   'height_ratios': height_ratios,
                                   'width_ratios': width_ratios,
                                   'hspace': 0.0,
                                   'wspace': 0.0
                               })
        # Everything except the main (bottom-left) axis is shown without ticks
        for side_ax in axes[0, 1], axes[1, 1], axes[0, 0]:
            side_ax.set_xticks([])
            side_ax.set_yticks([])
        main_ax = axes[1, 0]
    else:
        plt.figure(figsize=figsize)
        main_ax = plt.gca()

    if set_annotations is None:
        set_annotations = ['Set_{}'.format(i) for i in range(len(sets))]

    chunks, composition_array = get_chunks_and_composition_array(sets)

    # Find permutations of rows and columns
    permutations_ = get_permutations(chunks,
                                     composition_array,
                                     chunks_ordering=chunks_ordering,
                                     sets_ordering=sets_ordering,
                                     reverse_chunks_order=reverse_chunks_order,
                                     reverse_sets_order=reverse_sets_order,
                                     max_bruteforce_size=max_bruteforce_size,
                                     seeds=seeds,
                                     noise_prob=noise_prob)

    # Apply permutations: columns follow the chunk order, rows and annotations follow the set order
    chunks_order = permutations_['chunks_ordering']
    sets_order = permutations_['sets_ordering']
    chunks = [chunks[i] for i in chunks_order]
    composition_array = composition_array[:, chunks_order]
    composition_array = composition_array[sets_order, :]
    set_annotations = [set_annotations[i] for i in sets_order]

    # Main plot
    chunk_sizes = [len(chunk) for chunk in chunks]
    if widths_minmax_ratio is not None:
        col_widths = balance_widths(chunk_sizes, widths_minmax_ratio)
    else:
        col_widths = chunk_sizes
    plot_binary_array(arr=composition_array,
                      row_annotations=set_annotations,
                      col_annotations=chunk_sizes,
                      ax=main_ax,
                      col_widths=col_widths,
                      row_heights=[1] * len(sets),
                      min_width_for_annotation=min_width_for_annotation,
                      **kw)

    xlim = main_ax.get_xlim()
    ylim = main_ax.get_ylim()

    # Label the main axis explicitly instead of through pyplot's "current axes", so the labels
    # land on the right axis regardless of what plot_binary_array left as current.
    fontsize = kw.get('fontsize', DEFAULT_FONTSIZE)
    main_ax.set_xlabel('ITEMS', fontsize=fontsize)
    main_ax.set_ylabel('SETS', fontsize=fontsize)

    # Side plots
    if side_plots:
        plt.sca(axes[0, 0])
        side_plot(composition_array.sum(0),
                  col_widths,
                  'h',
                  min_width_for_annotation=min_width_for_annotation,
                  rotate_annotations=kw.get('rotate_col_annotations', False),
                  color=side_plot_color)
        plt.xlim(xlim)  # keep the top bars aligned with the main plot's columns

        plt.sca(axes[1, 1])
        side_plot([len(sets[i]) for i in sets_order],
                  [1] * len(sets),
                  'v',
                  color=side_plot_color)
        plt.ylim(ylim)  # keep the right bars aligned with the main plot's rows

    # Leave the main axis as pyplot's current axis for any follow-up plt.* calls
    plt.sca(main_ax)
def test_disjoint_small(self):
    """Two disjoint singletons: each set is its own chunk, and each row selects exactly that chunk."""
    sets = [{1}, {2}]
    chunks, arr = get_chunks_and_composition_array(sets)
    self.assertEqual(freeze_sets(sets), freeze_sets(chunks))

    # The order of the chunks is not guaranteed, so arr may be either of the two 2x2
    # permutation matrices. Check it against the actual chunk order instead of
    # hard-coding np.eye(2).
    self.assertEqual(arr.shape, (2, 2))
    self.assertEqual(set(arr.ravel().tolist()), {0, 1})
    for set_, row in zip(sets, arr):
        selected_chunks = [chunk for is_included, chunk in zip(row, chunks) if is_included]
        self.assertEqual(selected_chunks, [set_])
def test_single_set(self):
    """A single input set is returned as the only chunk, with a 1x1 array holding 1."""
    only_set = set(np.random.randint(1, 1000, size=100))
    sets = [only_set]

    chunks, arr = get_chunks_and_composition_array(sets)

    self.assertEqual(chunks, sets)
    self.assertEqual(arr.shape, (1, 1))
    top_left = arr[0, 0]
    self.assertEqual(top_left, 1)