def test_filter_subsets(min, max):
    """Check that size-bounded ``UpSet`` data equals manually filtered data.

    ``min`` and ``max`` are the subset-size bounds under test.  They shadow
    the builtins of the same name but are part of the signature, so they are
    left unchanged.
    """
    samples = generate_samples(0, 5000, 3)
    unfiltered = UpSet(samples, subset_size='auto')
    filtered = UpSet(samples, subset_size='auto',
                     min_subset_size=min, max_subset_size=max)

    # Build the expected result by filtering the unbounded data by hand.
    all_sizes = unfiltered.intersections
    within_bounds = np.logical_and(all_sizes >= min, all_sizes <= max)
    expected_intersections = all_sizes[within_bounds]

    full_df = unfiltered._df
    expected_df = full_df[full_df.index.isin(expected_intersections.index)]

    assert_series_equal(filtered.intersections, expected_intersections)

    def _pack_binary(X):
        # Fold the columns into a single code per row: each step doubles the
        # running value and adds the next column, so the first column ends up
        # as the most significant bit.
        packed = 0
        for _, column in pd.DataFrame(X).items():
            packed *= 2
            packed += column
        return packed

    df_codes = _pack_binary(expected_df.index.to_frame())
    intersection_codes = _pack_binary(expected_intersections.index.to_frame())

    # Map every row's code to the position of its subset among the expected
    # intersections.
    code_to_position = pd.Series(np.arange(len(intersection_codes)),
                                 index=intersection_codes)
    expected_df['_bin'] = pd.Series(df_codes).map(code_to_position)

    assert_frame_equal(filtered._df, expected_df)
def Venn_Upset(adata, genelists, size_height=3):
    """Draw an UpSet plot of membership in the given gene-list columns.

    ``genelists`` is a list of column names in ``adata.var``, for example
    ``['Deep_1', 'Deep_2']``.  Those columns become the index handed to
    ``UpSet`` and rows are counted per combination.  ``size_height`` is
    forwarded as ``intersection_plot_elements``.

    Returns the ``UpSet`` object after its ``plot`` method has been called.
    """
    from upsetplot import UpSet, plot  # noqa: F401  (``plot`` is not used here)

    wanted_columns = genelists + ['highly_variable']
    membership = pd.DataFrame(adata.var[wanted_columns]).set_index(genelists)

    upset = UpSet(
        membership,
        subset_size='count',
        intersection_plot_elements=size_height,
    )
    upset.plot()
    return upset
def UpSetFromLists(listOflist, labels, size_height=3, showplot=True):
    """Build an UpSet plot from several collections of items.

    Parameters
    ----------
    listOflist : sequence of collections
        One collection of items per set.
    labels : sequence of str
        Name of each set, in the same order as ``listOflist``.
    size_height : int, optional
        Forwarded to ``UpSet`` as ``intersection_plot_elements``.  It was
        previously ignored in favour of a hard-coded 3, which is still the
        default.
    showplot : bool, optional
        The plot is drawn only when this is exactly ``True``.

    Returns
    -------
    The ``UpSet`` object.
    """
    from upsetplot import UpSet

    # Accept tuples and other sequences, not only lists.
    item_lists = list(listOflist)
    set_names = list(labels)

    # Universe of distinct items across all collections.
    universe = list({item for items in item_lists for item in items})
    items = pd.Series(universe, index=universe)

    # One boolean membership column per set, plus an all-True 'all' column
    # that survives as the data column once the set columns form the index.
    membership = pd.concat(
        [items.isin(members) for members in item_lists + [items]],
        axis=1,
    )
    membership.columns = set_names + ['all']
    membership = membership.set_index(set_names)

    upset = UpSet(
        membership,
        subset_size='count',
        intersection_plot_elements=size_height,
    )
    if showplot is True:
        upset.plot()
    return upset
def test_index_must_be_bool(x):
    """Index levels must be boolean-like: 0/1 ints pass, other ints fail.

    The assignment targets previously listed ``'cat2'`` twice and skipped
    ``'cat1'``, so ``cat1`` was never converted and ``cat2`` was written
    twice.  Source and target column lists are now the same.
    """
    categories = ['cat0', 'cat1', 'cat2']

    # Truthy ints are okay
    x = x.reset_index()
    x[categories] = x[categories].astype(int)
    x = x.set_index(categories).iloc[:, 0]

    UpSet(x)

    # other ints are not
    x = x.reset_index()
    x[categories] = x[categories] + 1
    x = x.set_index(categories).iloc[:, 0]
    with pytest.raises(ValueError, match='not boolean'):
        UpSet(x)
def plot_upset(sets, path):
    """Save an UpSet plot of ``sets`` to ``path``.

    ``sets`` is passed to ``from_contents``.  With fewer than two sets there
    is nothing to intersect, so a message is printed and no file is written.

    The figure is created through ``pyplot`` and is now closed once saved;
    previously every call left one more open figure behind.
    """
    if len(sets) <= 1:
        print(f'plot_upset: No sets to intersect for {path}')
        return

    df_upset = from_contents(sets)
    upset_plot = UpSet(df_upset,
                       sort_by='degree',
                       sort_categories_by='cardinality',
                       show_counts=True,
                       show_percentages=True)
    fig = plt.figure()
    try:
        upset_plot.plot(fig=fig)
        fig.savefig(path)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
def test_vertical():
    """Compare the grid aspect ratio of horizontal and vertical layouts."""
    X = generate_data(n_samples=100)

    def _grid_size(orientation):
        # Lay out the grid on a fresh figure and report (width, height).
        fig = matplotlib.figure.Figure()
        UpSet(X, orientation=orientation).make_grid(fig)
        return fig.get_figwidth(), fig.get_figheight()

    horz_width, horz_height = _grid_size('horizontal')
    assert horz_height < horz_width

    vert_width, vert_height = _grid_size('vertical')
    assert horz_width / horz_height > vert_width / vert_height

    # TODO: test axes positions, plot order, bar orientation
def test_element_size():
    """Check how ``element_size`` drives the figure size set by ``make_grid``."""
    X = generate_data(n_samples=100)

    figwidths = []
    figheights = []
    for element_size in range(10, 50, 5):
        fig = matplotlib.figure.Figure()
        UpSet(X, element_size=element_size).make_grid(fig)
        figwidths.append(fig.get_figwidth())
        figheights.append(fig.get_figheight())

    # Absolute width increases
    width_steps = np.diff(figwidths)
    assert np.all(width_steps > 0)

    aspect = np.divide(figwidths, figheights)
    # Font size stays constant, so aspect ratio decreases
    assert np.all(np.diff(aspect) < 0)
    # But doesn't decrease by much
    assert np.all(aspect[:-1] / aspect[1:] < 1.1)

    # With element_size=None the figure size must be left untouched.
    fig = matplotlib.figure.Figure()
    figsize_before = (fig.get_figwidth(), fig.get_figheight())
    UpSet(X, element_size=None).make_grid(fig)
    figsize_after = (fig.get_figwidth(), fig.get_figheight())
    assert figsize_before == figsize_after
def test_sort_sets_by_deprecation(x, sort_sets_by):
    """``sort_sets_by`` warns and renders the same as ``sort_categories_by``.

    ``pytest.warns(None)`` is deprecated in pytest 7 and rejected by
    pytest 8, so the second construction now records warnings with the
    standard library and checks that the deprecation did not fire.
    """
    import warnings

    with pytest.warns(DeprecationWarning, match='sort_sets_by'):
        upset1 = UpSet(x, sort_sets_by=sort_sets_by)

    with warnings.catch_warnings(record=True) as recorded:
        # Record everything, including warnings that were already shown once.
        warnings.simplefilter('always')
        upset2 = UpSet(x, sort_categories_by=sort_sets_by)
    assert not any(
        issubclass(w.category, DeprecationWarning)
        and 'sort_sets_by' in str(w.message)
        for w in recorded
    )

    def _render_raw(upset):
        # Draw onto a fresh figure and return the raw pixel buffer.
        fig = matplotlib.figure.Figure()
        upset.plot(fig)
        buffer = io.BytesIO()
        fig.savefig(buffer, format='raw')
        return buffer.getvalue()

    assert _render_raw(upset1) == _render_raw(upset2)
def plot_intersection(data: dict, plot_outfile: str = "upsetplot.pdf"):
    """
    Draw a quantitative Venn-style (UpSet) diagram of unique identifiers.

    Arguments:
        (REQUIRED) data: dict of lists, transformed with from_contents
        (OPTIONAL) plot_outfile: save the figure here; if falsy, the figure
                                 is shown instead of being saved
    """
    upset = UpSet(
        data,
        show_counts=True,
        show_percentages=True,
        sort_categories_by=None,
    )
    upset.plot()

    if not plot_outfile:
        plt.show()
        return
    plt.savefig(plot_outfile)
def upsetplot_miss(data):
    """Return an ``UpSet`` of co-occurring missing values in ``data``.

    Only columns containing at least one null are considered.  Each of them
    contributes a boolean ``<column>_NA`` index level, and rows where every
    level is ``False`` (nothing missing) are left out.
    """
    has_null = data.isnull().any()
    null_cols_df = data.loc[:, has_null]
    missingness = pd.isna(null_cols_df).rename(columns=lambda x: x + '_NA')

    # The first indicator replaces the existing index; the rest are appended
    # as further levels.
    for position, indicator in enumerate(missingness.columns):
        null_cols_df = null_cols_df.set_index(missingness[indicator],
                                              append=position > 0)

    # Index key of a row with no missing value in any of the null columns.
    nothing_missing = (False, ) * sum(has_null)
    has_missing = null_cols_df.index != nothing_missing
    rows_with_missing = null_cols_df.loc[has_missing, :]

    return UpSet(rows_with_missing,
                 subset_size='count',
                 show_counts=True,
                 sort_by='cardinality')
def lp_dist(data, percentage=False, scale=1, fname=None):
    """Plot pattern combination frequencies as an UpSet plot.

    Parameters
    ----------
    data : AnnData
        Spatial formatted AnnData
    percentage : bool, optional
        If True, label each bar as a percentage else label as a count,
        by default False
    scale : int, optional
        scale > 1 scales the plot larger, scale < 1 scales the plot smaller,
        by default 1
    fname : str, optional
        Save the figure to specified filename, by default None.  This
        parameter was previously accepted but never used.
    """
    # One single-column frame per pattern, indexed by (cell, gene), holding
    # only the non-null values of ``data.to_df(<pattern>)``.
    per_pattern = []
    for p in PATTERN_NAMES:
        p_df = data.to_df(p).reset_index().melt(id_vars="cell")
        p_df = p_df[~p_df["value"].isna()]
        p_df = p_df.set_index(["cell", "gene"])
        per_pattern.append(p_df)

    # A single comparison already gives a boolean frame; entries missing
    # after the concat compare unequal to 1 and so become False.
    sample_labels = pd.concat(per_pattern, axis=1) == 1
    sample_labels.columns = PATTERN_NAMES

    # NOTE: unlabeled samples (no pattern set) are deliberately kept.

    # ``degree`` is the negated number of patterns per sample, so the
    # descending sort below puts samples with fewer patterns first; ties are
    # broken by the pattern columns.
    sample_labels["degree"] = -sample_labels[PATTERN_NAMES].sum(axis=1)
    sample_labels = (
        sample_labels.reset_index()
        .sort_values(["degree"] + PATTERN_NAMES, ascending=False)
        .drop("degree", axis=1)
    )

    upset = UpSet(
        from_indicators(PATTERN_NAMES, data=sample_labels),
        element_size=scale * 40,
        # Hide combinations covering less than 0.1% of the samples.
        min_subset_size=sample_labels.shape[0] * 0.001,
        facecolor="lightgray",
        sort_by=None,
        show_counts=(not percentage),
        show_percentages=percentage,
    )

    # Colour the single-pattern subsets of every pattern that occurs at all.
    for p, color in zip(PATTERN_NAMES, PATTERN_COLORS):
        if sample_labels[p].sum() > 0:
            upset.style_subsets(present=p, max_degree=1, facecolor=color)

    upset.plot()
    plt.suptitle(
        f"Localization Patterns\n{data.n_obs} cells, {data.n_vars} genes")

    if fname:
        plt.savefig(fname)
def plotUpset(adata,
              study_col=None,
              ct_col=None,
              mn_key="MetaNeighborUS",
              metaclusters="MetaNeighborUS_1v1_metaclusters",
              outlier_label="outliers",
              show=True):
    """Plot UpSet plot for intersections between datasets and metaclusters

    Shows how replicability depends on the input dataset

    Arguments:
        adata {AnnData} -- AnnData object containing the output of
            MetaNeighborUS 1vBest, and extractMetaClusters

    Keyword Arguments:
        study_col {str} -- If None, inferred from
            adata.uns[f'{mn_key}_params']['study_col'], else the name of a
            column in adata.obs (default: {None})
        ct_col {str} -- If None, inferred from
            adata.uns[f'{mn_key}_params']['ct_col'], else the name of a
            column in adata.obs (default: {None})
        mn_key {str} -- Location of MetaNeighborUS results
            (default: {'MetaNeighborUS'})
        metaclusters {str} -- Location of extractMetaClusters results in
            adata.uns, or the metacluster Series itself
            (default: {'MetaNeighborUS_1v1_metaclusters'})
        outlier_label {str} -- Name of outlier_label in metaclusters
            (extractMetaClusters results) (default: {'outliers'})
        show {bool} -- Flag for showing plot or return UpSet object
            (default: {True})

    Returns:
        The UpSet object when ``show`` is False, otherwise None.
    """
    if study_col is None:
        study_col = adata.uns[f"{mn_key}_params"]["study_col"]
    else:
        assert study_col in adata.obs_keys(), "Study Col not in adata"

    if ct_col is None:
        ct_col = adata.uns[f"{mn_key}_params"]["ct_col"]
    else:
        assert ct_col in adata.obs_keys(), "Cluster Col not in adata"

    if isinstance(metaclusters, str):
        assert (metaclusters in adata.uns_keys()
                ), "Run extractMetaClusters or pass Metacluster Series"
        metaclusters = adata.uns[metaclusters]

    pheno, _, _ = create_cell_labels(adata, study_col, ct_col)
    pheno = pheno.drop_duplicates().set_index("study_ct")

    # For every metacluster, the studies of its member "study_ct" labels.
    studies = [
        pheno.loc[members, study_col].values.tolist()
        for members in metaclusters.values
    ]
    membership = dict(zip(metaclusters.index, studies))

    # One row per metacluster, one boolean column per study.
    df = pd.DataFrame(
        [{name: True for name in names} for names in membership.values()],
        index=membership.keys(),
    )
    df = df.fillna(False)
    df = df[df.index != outlier_label]

    # Count metaclusters per combination of studies.
    df = df.groupby(df.columns.tolist(), as_index=False).size()
    if not isinstance(df, pd.Series):
        # For pandas versions <1.0.0 size returns the correct series.  Newer
        # versions return a frame whose last column is "size", so the study
        # columns are moved into the index by hand.
        cols = df.columns[:-1].copy()
        for col in cols:
            df.set_index(df[col], append=True, inplace=True)
        df.index = df.index.droplevel(0)
        df = df["size"]

    us = UpSet(df, sort_by="cardinality")
    if show:
        # Constructing UpSet draws nothing, so plot before showing.
        us.plot()
        plt.show()
    else:
        return us
def test_add_catplot():
    """Exercise ``UpSet.add_catplot`` with Series and DataFrame input."""
    pytest.importorskip('seaborn')

    series_data = generate_data(n_samples=100)
    series_upset = UpSet(series_data)
    # smoke test
    series_upset.add_catplot('violin')
    fig = matplotlib.figure.Figure()
    series_upset.plot(fig)

    # can't provide value with Series
    with pytest.raises(ValueError):
        series_upset.add_catplot('violin', value='foo')

    # check the above add_catplot did not break the state
    series_upset.plot(fig)

    frame_data = generate_data(n_samples=100)
    frame_data.name = 'foo'
    frame_data = frame_data.to_frame()
    frame_upset = UpSet(frame_data, sum_over=False)

    # must provide value with DataFrame
    with pytest.raises(ValueError):
        frame_upset.add_catplot('violin')
    frame_upset.add_catplot('violin', value='foo')

    # not a known column
    with pytest.raises(ValueError):
        frame_upset.add_catplot('violin', value='bar')
    frame_upset.plot(fig)

    # invalid plot kind raises error when plotting
    frame_upset.add_catplot('foobar', value='foo')
    with pytest.raises(AttributeError):
        frame_upset.plot(fig)
from matplotlib import pyplot as plt
from upsetplot import UpSet

# Load the dataset into a DataFrame
boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)

# Rank features by Spearman correlation with the target (median house
# value) and keep the five with the highest correlation
correls = boston_df.corrwith(
    pd.Series(boston.target), method='spearman'
).sort_values()
top_features = correls.index[-5:]

# Boolean indicator of whether each top feature is above its column median;
# the '>' suffix keeps the indicator names distinct from the raw columns
boston_above_avg = (
    (boston_df > boston_df.median(axis=0))[top_features]
    .rename(columns=lambda x: x + '>')
)

# Use the indicator columns as the index of boston_df and add the target
# (median house value) as a regular column
boston_df = (
    pd.concat([boston_df, boston_above_avg], axis=1)
    .set_index(list(boston_above_avg.columns))
    .assign(median_value=boston.target)
)

# UpSet plot it!
upset = UpSet(boston_df, sum_over=False, intersection_plot_elements=3)
upset.add_catplot(value='median_value', kind='strip', color='blue')
upset.add_catplot(value='AGE', kind='strip', color='black')
upset.plot()
plt.show()
def test_param_validation(kw):
    """``UpSet`` must raise ``ValueError`` for the keyword arguments in ``kw``."""
    data = generate_data(n_samples=100)
    with pytest.raises(ValueError):
        UpSet(data, **kw)
# Index the binary alteration matrix by the selected genes.
x_df_3_binary = x_df_3_binary.set_index(selected_genes)

# Keep only the samples with a positive label and binarise them the same way.
y_ind = y > 0
x_df_mets_3 = x_df_3.T[y_ind].T
x_df_mets_3_binary = x_df_mets_3.T > 0.
# Call form of print: valid on Python 3 and prints the same tuple on Python 2.
print(x_df_mets_3_binary.shape)
x_df_mets_3_binary = x_df_mets_3_binary.set_index(selected_genes)

# Small Arial font for every text element of the figure.
font = {'family': 'Arial', 'weight': 'normal', 'size': 5}
matplotlib.rc('font', **font)

# UpSet plot over the AR / TP53 / MDM4 indicator columns.
dd = x_df_3_binary.reset_index().set_index(['AR', 'TP53', 'MDM4'])
upset = UpSet(dd,
              subset_size='count',
              intersection_plot_elements=6,
              show_counts=True,
              with_lines=True,
              element_size=10)
fig = plt.figure(constrained_layout=False, figsize=(8, 6))
upset.plot(fig)
fig.subplots_adjust(bottom=0.2, top=0.9, left=0.08, right=0.99)

# Save as PNG first, then as PDF.
saving_dir = join(PLOTS_PATH, 'figure4')
filename = join(saving_dir, 'figure4_ar_tp53_mdm4.png')
plt.savefig(filename, dpi=300)

# The PDF font type must be set before the PDF is written.
matplotlib.rcParams['pdf.fonttype'] = 42
filename = join(saving_dir, 'figure4_ar_tp53_mdm4.pdf')
plt.savefig(filename)
top_features = correls.index[-5:]

# Boolean indicator of whether each top feature is above its column median;
# the '>' suffix keeps the indicator names distinct from the raw columns
boston_above_avg = (
    (boston_df > boston_df.median(axis=0))[top_features]
    .rename(columns=lambda x: x + '>')
)

# Use the indicator columns as the index of boston_df and add the target
# (median house value) as a regular column
boston_df = (
    pd.concat([boston_df, boston_above_avg], axis=1)
    .set_index(list(boston_above_avg.columns))
    .assign(median_value=boston.target)
)

# UpSet plot it!
upset = UpSet(
    boston_df,
    subset_size='count',
    intersection_plot_elements=3,
)
upset.add_catplot(value='median_value', kind='strip', color='blue')
upset.add_catplot(value='AGE', kind='strip', color='black')
upset.plot()
plt.title("UpSet with catplots, for orientation='horizontal'")
plt.show()

# And again in vertical orientation
upset = UpSet(
    boston_df,
    subset_size='count',
    intersection_plot_elements=3,
    orientation='vertical',
)
upset.add_catplot(value='median_value', kind='strip', color='blue')
upset.add_catplot(value='AGE', kind='strip', color='black')
upset.plot()
def gen_upset_plot(self, className=None):
    """Render the per-sample peptide UpSet plot as an embeddable card.

    Builds set membership from ``self.results.samples`` (one set of
    ``peptides`` per ``sample_name``), drops intersections holding less than
    0.5% of the total peptide count, and draws an UpSet plot with a
    peptide-length catplot.  The plot is horizontal for up to 100 remaining
    intersections and vertical otherwise.  The figure is saved to
    ``self.fig_dir / "upsetplot.svg"`` and embedded base64-encoded.

    Parameters
    ----------
    className : str, optional
        Passed as ``className`` to the outermost ``div``.

    Returns
    -------
    The outer ``div`` wrapping the card.
    """

    def _restyle_count_labels(ax, offset):
        # Put each text label on a single line, turn it vertical and shift
        # its second position coordinate by ``offset``.
        for child in ax.get_children():
            if isinstance(child, plotText):
                child.set_text(child.get_text().replace('\n', ' '))
                child.set_rotation('vertical')
                first, second = child.get_position()
                child.set_position((first, second + offset))

    # Summed over samples, so a peptide found in several samples counts
    # once per sample.
    total_peps = np.sum([len(s.peptides) for s in self.results.samples])
    data = from_contents({s.sample_name: set(s.peptides)
                          for s in self.results.samples})

    # Drop intersections below the 0.5% threshold.  ``unique()`` returns a
    # separate index, so dropping while looping is safe.
    for intersection in data.index.unique():
        if len(data.loc[intersection, :]) / total_peps < 0.005:
            data.drop(index=intersection, inplace=True)

    data['peptide_length'] = np.vectorize(len)(data['id'])
    n_sets = len(data.index.unique())

    if n_sets <= 100:
        # Plot horizontal
        upset = UpSet(data, sort_by='cardinality', show_counts=True)
        upset.add_catplot(value='peptide_length', kind='boxen', color='gray')
        plot = upset.plot()
        plot['totals'].grid(False)
        ylim = plot['intersections'].get_ylim()[1]
        # Extra headroom for the vertical labels.
        plot['intersections'].set_ylim((0, ylim * 1.1))
        _restyle_count_labels(plot['intersections'], 0.02 * ylim)
    else:
        # Plot vertical
        upset = UpSet(data, subset_size='count', orientation='vertical',
                      sort_by='cardinality', sort_categories_by=None,
                      show_counts=True)
        upset.add_catplot(value='peptide_length', kind='boxen', color='gray')
        plot = upset.plot()
        lim = plot['intersections'].get_xlim()
        plot['intersections'].set_xlim([0, lim[1] * 1.6])
        plot['totals'].grid(False)
        ylim = plot['totals'].get_ylim()[1]
        _restyle_count_labels(plot['totals'], 0.1 * ylim)

    plt.draw()
    upset_fig = f'{self.fig_dir / "upsetplot.svg"}'
    plt.savefig(upset_fig, bbox_inches="tight")

    # Read the SVG back through a context manager so the handle is closed.
    with open(upset_fig, 'rb') as svg_file:
        encoded_upset_fig = base64.b64encode(svg_file.read()).decode()

    card = div(className='card', style="height: 100%")
    card.add(div([b('UpSet Plot'),
                  p('Only intersections > 0.5% are displayed')],
                 className='card-header'))
    plot_body = div(img(src=f'data:image/svg+xml;base64,{encoded_upset_fig}',
                        className='img-fluid',
                        style='width: 100%; height: auto'),
                    className='card-body')
    card.add(plot_body)
    return div(card, className=className)