def _plot_deviation(self, true_values: np.ndarray, estim_values: np.ndarray, save=None, show=True, title=None, return_axs=False):
    """
    Plot the deviation of estimated coefficients from reference (true) coefficients.

    For every parameter (row ``i`` of `true_values`) the element-wise difference
    ``estim_values[i, :] - true_values[i, :]`` is computed and shown as one violin
    labelled ``coef_<i>``.

    :param true_values: Reference coefficients, indexed as ``[parameter, :]``.
        Dask arrays are computed before plotting.
    :param estim_values: Estimated coefficients, indexed like `true_values`.
        Dask arrays are computed before plotting.
    :param save: Path+file name stem to save the plot to.
        File will be save+"_deviation_violin.png".
        Does not save if save is None.
    :param show: Whether to display plot.
    :param title: Plot title. No title is set if None.
    :param return_axs: Whether to return the axis object.
    :return: Matplotlib axis object if `return_axs` is True, otherwise None.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    if isinstance(true_values, dask.array.core.Array):
        true_values = true_values.compute()
    if isinstance(estim_values, dask.array.core.Array):
        estim_values = estim_values.compute()

    n_par = true_values.shape[0]
    n_obs = estim_values.shape[1]

    # Long-format table: all deviations of coef_0, then all of coef_1, ...
    deviation = np.concatenate([
        estim_values[i, :] - true_values[i, :]
        for i in range(n_par)
    ])
    labels = ["coef_" + str(i) for i in range(n_par)]
    summary_fit = pd.DataFrame({
        "deviation": deviation,
        # Explicit category list keeps the violins in parameter order
        # (coef_2 before coef_10) instead of lexicographic order.
        "coefficient": pd.Categorical(np.repeat(labels, n_obs), categories=labels),
    })

    # Draw with interactive mode off, but remember the caller's setting so
    # that it can be restored afterwards instead of being forced on.
    was_interactive = plt.isinteractive()
    plt.ioff()
    fig = None
    try:
        fig, ax = plt.subplots()
        sns.violinplot(x=summary_fit["coefficient"], y=summary_fit["deviation"], ax=ax)

        if title is not None:
            ax.set_title(title)

        # Save, show and return figure.
        if save is not None:
            fig.savefig(save + '_deviation_violin.png')

        if show:
            plt.show()
    finally:
        # Release the figure and restore interactive mode even if plotting
        # or saving failed.
        if fig is not None:
            plt.close(fig)
        if was_interactive:
            plt.ion()

    return ax if return_axs else None
def groupwise_solve_lm(dmat, apply_fun: callable, constraints: np.ndarray):
    r"""
    Solve GLMs by estimating the distribution parameters of each unique group of
    observations independently and then solving for the design matrix `dmat`.

    Idea:
    $$
        \theta &= f(x) \\
        \Rightarrow f^{-1}(\theta) &= x \\
            &= (D \cdot D^{+}) \cdot x \\
            &= D \cdot (D^{+} \cdot x) \\
            &= D \cdot x' = f^{-1}(\theta)
    $$

    :param dmat: design matrix which should be solved for (numpy or dask array).
    :param apply_fun: callable taking one argument, the vector of group
        assignments (one integer label per row of `dmat`, indexing the unique
        rows of `dmat`). Should compute a group-wise parameter solution.

        Example method calculating group-wise means:
        ::
            def apply_fun(grouping):
                groupwise_means = data.groupby(grouping).mean(dim="observations").values

                return np.log(groupwise_means)

        Here `data` is not passed by this function; `apply_fun` has to obtain it
        itself, e.g. by closing over it.
    :param constraints: tensor (all parameters x dependent parameters)
        Tensor that encodes how the complete parameter set, which includes
        dependent parameters, arises from independent parameters:
        all = <constraints, indep>.
        This form of constraints is used in vector generalized linear models (VGLMs).
    :return: tuple of (apply_fun(grouping), x_prime, rmsd, rank, s) where x_prime
        is the parameter matrix solved for `dmat` and rmsd, rank, s are the
        remaining outputs of the least-squares solver.
    :raises ValueError: if `dmat` has more than 500 unique rows.
    :raises Warning: if the result of `apply_fun` contains NaN.
    """

    def _as_numpy(arr):
        # Materialise dask arrays; leave anything else untouched.
        if isinstance(arr, dask.array.core.Array):
            return arr.compute()
        return arr

    # Get unique rows of design matrix and vector with group assignments:
    if isinstance(dmat, dask.array.core.Array):
        # axis argument not supported by dask in .unique()
        unique_design, inverse_idx = np.unique(dmat.compute(), axis=0, return_inverse=True)
        unique_design = dask.array.from_array(unique_design, chunks=unique_design.shape)
    else:
        unique_design, inverse_idx = np.unique(dmat, axis=0, return_inverse=True)
    # The shape of the inverse index returned together with `axis` is not the
    # same in every NumPy release; flatten it so that `apply_fun` always
    # receives one label per observation.
    inverse_idx = inverse_idx.reshape(-1)

    if unique_design.shape[0] > 500:
        raise ValueError("large least-square problem in init, likely defined a numeric predictor as categorical")

    full_rank = constraints.shape[1]
    # matrix_rank not supported by dask, so evaluate it on in-memory arrays.
    # Each operand is converted on its own: `dmat` and `constraints` do not
    # have to be of the same array type.
    rank = np.linalg.matrix_rank(
        np.matmul(_as_numpy(unique_design), _as_numpy(constraints))
    )
    if full_rank > rank:
        logger.error("model is not full rank!")

    # Get group-wise means in linker space based on group assignments
    # based on unique rows of design matrix:
    params = apply_fun(inverse_idx)

    # Use least-squares solver to compute model parameterization
    # accounting for dependent parameters, ie. degrees of freedom
    # of the model which appear as groups in the design matrix
    # and are not accounted for by parameters but which are
    # accounted for by constraints:
    # <X, <theta, H>> = means -> <<X, theta>, H> = means -> lstsqs for theta
    # (This is faster and more accurate than using matrix inversion.)
    logger.debug(" ** Solve lstsq problem")
    if np.any(np.isnan(params)):
        raise Warning("entries of params were nan which will throw error in lstsq")
    # The solver call is kept exactly as before (same operands, no extra
    # keyword arguments) so that results for numpy and dask inputs are unchanged.
    # Note that `rank` is overwritten here by the rank reported by the solver.
    x_prime, rmsd, rank, s = np.linalg.lstsq(
        np.matmul(unique_design, constraints),
        params
    )

    return params, x_prime, rmsd, rank, s
def _plot_coef_vs_ref(self, true_values: np.ndarray, estim_values: np.ndarray, size=1, log=False, save=None, show=True, ncols=5, row_gap=0.3, col_gap=0.25, title=None, return_axs=False):
    """
    Plot estimated coefficients against reference (true) coefficients.

    One scatter panel is drawn per parameter (row ``i`` of `true_values`),
    together with the identity line and the Pearson correlation in the panel
    title.

    :param true_values: Reference coefficients, indexed as ``[parameter, :]``.
        Dask arrays are computed before plotting.
    :param estim_values: Estimated coefficients, indexed like `true_values`.
        Dask arrays are computed before plotting.
    :param size: Point size.
    :param log: Whether to plot ``log(value + 1)`` instead of the raw values.
    :param save: Path+file name stem to save plots to.
        File will be save+"_parameter_scatter.png".
        Does not save if save is None.
    :param show: Whether to display plot.
    :param ncols: Number of columns in plot grid if multiple parameters are plotted.
    :param row_gap: Vertical gap between panel rows relative to panel height.
    :param col_gap: Horizontal gap between panel columns relative to panel width.
    :param title: Panel title stem, "parameter" if None. Each panel is titled
        title+"_<i> (R=<correlation>)".
    :param return_axs: Whether to return axis objects.
    :return: List of matplotlib axis objects if `return_axs` is True, otherwise None.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt
    from matplotlib import gridspec
    from matplotlib import rcParams

    if isinstance(true_values, dask.array.core.Array):
        true_values = true_values.compute()
    if isinstance(estim_values, dask.array.core.Array):
        estim_values = estim_values.compute()

    n_par = true_values.shape[0]
    # Never use more columns than there are panels, but keep at least a
    # 1 x 1 grid so that neither the division below nor GridSpec fails.
    ncols = max(1, min(ncols, n_par))
    # Ceiling division: just enough rows to hold n_par panels.
    nrows = max(1, -(-n_par // ncols))

    if title is None:
        title = "parameter"

    # Draw with interactive mode off, but remember the caller's setting so
    # that it can be restored afterwards instead of being forced on.
    was_interactive = plt.isinteractive()
    plt.ioff()
    fig = None
    axs = []
    try:
        gs = gridspec.GridSpec(
            nrows=nrows,
            ncols=ncols,
            hspace=row_gap,
            wspace=col_gap
        )
        fig = plt.figure(
            figsize=(
                ncols * rcParams['figure.figsize'][0],  # width in inches
                nrows * rcParams['figure.figsize'][1] * (1 + row_gap)  # height in inches
            )
        )

        # Build axis objects in loop.
        for i in range(n_par):
            ax = fig.add_subplot(gs[i])
            axs.append(ax)

            x = true_values[i, :]
            y = estim_values[i, :]
            if log:
                x = np.log(x + 1)
                y = np.log(y + 1)

            # NOTE(review): seaborn documents `size` as a grouping variable;
            # if a fixed marker size is intended, `s=size` may be what is
            # wanted here - confirm before changing.
            sns.scatterplot(x=x, y=y, size=size, ax=ax, legend=False)

            # Identity line spanning the joint range of both axes.
            lower = np.min([np.min(x), np.min(y)])
            upper = np.max([np.max(x), np.max(y)])
            diagonal = np.array([lower, upper])
            sns.lineplot(x=diagonal, y=diagonal, ax=ax)

            # Add correlation into title:
            correlation = np.round(np.corrcoef(x, y)[0, 1], 3)
            ax.set_title(title + "_" + str(i) + " (R=" + str(correlation) + ")")
            ax.set_xlabel("true parameter")
            ax.set_ylabel("estimated parameter")

        # Save, show and return figure.
        if save is not None:
            fig.savefig(save + '_parameter_scatter.png')

        if show:
            plt.show()
    finally:
        # Release the figure and restore interactive mode even if plotting
        # or saving failed.
        if fig is not None:
            plt.close(fig)
        if was_interactive:
            plt.ion()

    return axs if return_axs else None