def dotplot(df, cutoff=0.05, figsize=(3.5, 6), top_term=10, scale=1):
    """Visualize enrichr results.

    Each retained term is drawn as one dot: the x position is
    -log10(adjusted p-value), the dot area grows with the term's hit
    count and the dot colour encodes the rounded 'Combined Score'.

    The input DataFrame is never modified; all derived columns live on a
    private copy.

    :param df: GSEApy DataFrame results. Must provide 'Term' and
               'Combined Score' columns, plus either 'Overlap'
               ("<hits>/<background>") and 'Adjusted P-value', or the
               GSEA-style 'fdr' and 'matched_size' columns.
    :param cutoff: p-adjust cut-off; terms with a larger value are dropped.
    :param figsize: tuple, figure size passed to matplotlib.
    :param top_term: number of enriched terms to show (most significant
                     ones are kept).
    :param scale: dotplot point size scale.
    :return: a dotplot (matplotlib figure) for enrichr terms, or None
             when no term passes the cut-off.
    """
    # Work on a private copy so the caller's DataFrame stays untouched.
    df = df.copy()
    if 'fdr' in df.columns:
        # gsea results
        df = df.rename(columns={'fdr': 'Adjusted P-value'})
        if 'Count' not in df.columns:
            # Marker areas below are driven by 'Count'; for GSEA-style
            # input use the matched gene number.
            # NOTE(review): 'Term' and 'Combined Score' are still required
            # further down - confirm GSEA frames really carry them.
            df['Count'] = df['matched_size']
    else:
        # enrichr results: 'Overlap' looks like "<hits>/<background>"
        df['Count'] = df['Overlap'].str.split("/").str[0].astype(int)

    # pvalue cut off
    df = df[df['Adjusted P-value'] <= cutoff]
    if df.empty:
        logging.warning("Warning: No enrich terms when cuttoff = %s", cutoff)
        return None

    # Keep the `top_term` MOST significant terms. Sorting in descending
    # order leaves the smallest p-value last, i.e. at the top of the y axis.
    df = df.sort_values(by='Adjusted P-value', ascending=False).tail(top_term)

    # x axis values
    padj = df['Adjusted P-value'].astype(float)
    combined_score = df['Combined Score'].round().astype('int')
    x = -np.log10(padj.values)
    # y axis index and values
    y = list(range(len(df)))
    labels = df['Term'].values
    # Plain ndarray so argmax/argmin below are positional.
    area = np.asarray(np.pi * (df['Count'] * scale) ** 2)

    # create scatter plot
    if hasattr(sys, 'ps1'):
        # working inside python console, show figure
        fig, ax = plt.subplots(figsize=figsize)
    else:
        # If working on commandline, don't show figure
        fig = Figure(figsize=figsize)
        # Kept from the original: instantiate a canvas for the bare Figure.
        FigureCanvas(fig)
        ax = fig.add_subplot(111)

    # np.percentile of a scalar is the scalar itself, so the colour range
    # is simply the min/max of the rounded scores.
    vmin = combined_score.min()
    vmax = combined_score.max()
    sc = ax.scatter(x=x, y=y, s=area, edgecolors='face', c=combined_score,
                    cmap=plt.cm.RdBu, vmin=vmin, vmax=vmax)
    ax.set_xlabel("-log$_{10}$(Adjust P-value)", fontsize=16)
    ax.yaxis.set_major_locator(plt.FixedLocator(y))
    ax.yaxis.set_major_formatter(plt.FixedFormatter(labels))
    ax.set_yticklabels(labels, fontsize=16)
    ax.grid()

    # colorbar
    cax = fig.add_axes([0.93, 0.20, 0.07, 0.22])
    cbar = fig.colorbar(sc, cax=cax)
    # Boolean instead of the legacy 'off' string, which is truthy.
    cbar.ax.tick_params(right=False)
    cbar.ax.set_title('Com-\nscore', loc='left', fontsize=12)

    # Pick representative dots (positions) for the size legend.
    if len(df) >= 3:
        # largest, closest-to-mean and smallest marker
        idx = unique([
            area.argmax(),
            np.abs(area - area.mean()).argmin(),
            area.argmin(),
        ])
    else:
        # for terms less than 3: show every dot
        idx = range(len(df))
    idx = list(idx)

    # scale of dots
    ax2 = fig.add_axes([0.93, 0.55, 0.09, 0.06 * len(idx)])
    # One empty scatter per legend entry, drawn with the real marker area
    # so that handle size and label agree.
    handles = [ax2.scatter([], [], s=area[i], edgecolors='none') for i in idx]
    legend_labels = df['Count'].iloc[idx].tolist()
    ax.legend(handles, legend_labels, frameon=True, fontsize=12,
              handlelength=2, loc=8, borderpad=1.8, handletextpad=1,
              title='Count', scatterpoints=1)
    return fig
def dotplot(df, column='Adjusted P-value', title='', cutoff=0.05, top_term=10,
            sizes=None, norm=None, legend=True, figsize=(6, 5.5),
            cmap='RdBu_r', ofname=None, **kwargs):
    """Visualize enrichr results.

    Each retained term is drawn as one dot: x is the value of `column`
    (-log10 transformed for the two p-value columns), the dot size grows
    with the number of hits parsed from 'Overlap' and the colour encodes
    the rounded 'Combined Score'.

    The input DataFrame is never modified.

    :param df: GSEApy DataFrame results. Needs the 'Term', 'Overlap'
               ("<hits>/<background>") and 'Combined Score' columns in
               addition to `column`.
    :param column: which column of DataFrame to show. Default: Adjusted P-value
    :param title: figure title
    :param cutoff: terms with 'column' value <= cut-off are shown. Only
                   applied when `column` is 'Adjusted P-value' or 'P-value'.
    :param top_term: number of enriched terms to show.
    :param sizes: tuple, (min, max) scatter size. Not functional for now
    :param norm: None, a (vmin, vmax) tuple or a matplotlib.colors.Normalize
                 object used to scale hit counts to marker sizes. A passed
                 Normalize object gets ``clip`` set to True.
    :param legend: bool, whether to show legend.
    :param figsize: tuple, figure size.
    :param cmap: matplotlib colormap
    :param ofname: output file name. If None, don't save figure
    :param kwargs: accepted for backward compatibility; currently unused.
    :return: the matplotlib axes; None when the figure was written to
             `ofname`; a warning string when no term is left to plot.
    :raises ValueError: if a p-value cannot be converted to float, or if
                        `norm` has an unsupported type.
    """
    # Work on a private copy so the caller's DataFrame stays untouched.
    df = df.copy()
    colname = column
    is_pvalue = colname in ('Adjusted P-value', 'P-value')

    if is_pvalue:
        # Values may arrive as strings; convert once and fail loudly if
        # any entry is not a number.
        try:
            df[colname] = df[colname].map(float)
        except (TypeError, ValueError):
            raise ValueError(
                'some value in %s could not be typecast to `float`' % colname)
        df = df[df[colname] <= cutoff]

    if len(df) < 1:
        # Nothing to draw; the string return value is kept for callers
        # that already check for it.
        msg = "Warning: No enrich terms when cutoff = %s" % cutoff
        return msg

    if is_pvalue:
        df = df.assign(logAP=-np.log10(df[colname]))
        colname = 'logAP'

    # sorting the dataframe for better visualization: ascending order
    # leaves the largest values last, i.e. at the top of the y axis
    df = df.sort_values(by=colname).iloc[-top_term:, :]

    # number of hits is the part of 'Overlap' before the slash
    hits = df['Overlap'].str.split("/").str[0].astype(int)

    # x axis values
    x = df[colname].values
    combined_score = df['Combined Score'].round().astype('int')
    # y axis index and values
    y = list(range(len(df)))
    ylabels = df['Term'].values

    # control the size of scatter and legend marker
    levels = np.sort(hits.unique())
    if norm is None:
        norm = Normalize()
    elif isinstance(norm, tuple):
        norm = Normalize(*norm)
    elif not isinstance(norm, Normalize):
        raise ValueError("``norm`` must be None, tuple, "
                         "or Normalize object.")
    min_width, max_width = np.r_[20, 100] * plt.rcParams["lines.linewidth"]
    norm.clip = True
    if not norm.scaled():
        # first call autoscales vmin/vmax from the observed hit counts
        norm(np.asarray(levels))
    scaled = norm(levels)
    widths = np.asarray(min_width + scaled * (max_width - min_width))
    if scaled.mask.any():
        widths[scaled.mask] = 0
    size_by_hits = dict(zip(levels, widths))
    area = hits.map(size_by_hits).values

    # create scatter plot
    if hasattr(sys, 'ps1') and (ofname is None):
        # working inside python console, show figure
        fig, ax = plt.subplots(figsize=figsize)
    else:
        # If working on commandline, don't show figure
        fig = Figure(figsize=figsize)
        # Kept from the original: instantiate a canvas for the bare Figure.
        FigureCanvas(fig)
        ax = fig.add_subplot(111)

    # np.percentile of a scalar is the scalar itself, so the colour range
    # is simply the min/max of the rounded scores.
    vmin = combined_score.min()
    vmax = combined_score.max()
    sc = ax.scatter(x=x, y=y, s=area, edgecolors='face', c=combined_score,
                    cmap=cmap, vmin=vmin, vmax=vmax)

    if is_pvalue:
        xlabel = "-log$_{10}$(%s)" % column
    else:
        xlabel = column
    ax.set_xlabel(xlabel, fontsize=14, fontweight='bold')
    ax.yaxis.set_major_locator(plt.FixedLocator(y))
    ax.yaxis.set_major_formatter(plt.FixedFormatter(ylabels))
    ax.set_yticklabels(ylabels, fontsize=16)
    ax.grid()

    # colorbar
    cax = fig.add_axes([0.95, 0.20, 0.03, 0.22])
    cbar = fig.colorbar(sc, cax=cax)
    cbar.ax.tick_params(right=True)
    cbar.ax.set_title('Combined\nScore', loc='left', fontsize=12)

    if legend:
        if len(df) >= 3:
            # largest, closest-to-mean and smallest marker
            idx = unique([
                area.argmax(),
                np.abs(area - area.mean()).argmin(),
                area.argmin(),
            ])
        else:
            # for terms less than 3: show every dot
            idx = range(len(df))
        idx = list(idx)
        # empty scatters only serve as legend handles of the right size
        legend_markers = [ax.scatter([], [], s=area[ix], c='b') for ix in idx]
        ax.legend(legend_markers, hits.iloc[idx].tolist(), title='Hits')

    ax.set_title(title, fontsize=20, fontweight='bold')

    if ofname is not None:
        fig.savefig(ofname, bbox_inches='tight', dpi=300)
        return
    return ax