Example #1
0
    def _plot_deviation(self,
                        true_values: np.ndarray,
                        estim_values: np.ndarray,
                        save=None,
                        show=True,
                        title=None,
                        return_axs=False):
        """
        Plot the deviation of estimated from reference (true) coefficients.

        Draws one violin per parameter (first axis of the input arrays) showing
        the distribution of `estim_values - true_values` along the second axis.

        :param true_values: Reference values, indexed as [parameter, column].
            Dask arrays are computed before plotting.
        :param estim_values: Estimated values, indexed like `true_values`.
            Dask arrays are computed before plotting.
        :param save: Path+file name stem to save plot to.
            File will be save+"_deviation_violin.png". Does not save if save is None.
        :param show: Whether to display plot.
        :param title: Plot title. No title is set if None.
        :param return_axs: Whether to return the axis object.
        :return: Matplotlib axis object if return_axs is True, otherwise None.
            The figure is closed before this method returns.
        """
        import seaborn as sns
        import matplotlib.pyplot as plt

        if isinstance(true_values, dask.array.core.Array):
            true_values = true_values.compute()
        if isinstance(estim_values, dask.array.core.Array):
            estim_values = estim_values.compute()

        n_par = true_values.shape[0]
        n_cols = estim_values.shape[1]
        # Long-format table: one row per (parameter, column) pair.
        # ignore_index avoids repeated index labels across the per-parameter frames.
        summary_fit = pd.concat([
            pd.DataFrame({
                "deviation": estim_values[i, :] - true_values[i, :],
                "coefficient": ["coef_" + str(i)] * n_cols
            }) for i in range(n_par)
        ], ignore_index=True)
        summary_fit['coefficient'] = summary_fit['coefficient'].astype(
            "category")

        # Switch interactive mode off only for the duration of this call and
        # restore the caller's setting afterwards instead of forcing it on.
        was_interactive = plt.isinteractive()
        plt.ioff()
        fig = None
        try:
            fig, ax = plt.subplots()
            sns.violinplot(x=summary_fit["coefficient"],
                           y=summary_fit["deviation"],
                           ax=ax)

            if title is not None:
                ax.set_title(title)

            # Save and show figure.
            if save is not None:
                fig.savefig(save + '_deviation_violin.png')

            if show:
                plt.show()
        finally:
            # Release the figure and restore the interactive state even if
            # plotting or saving failed.
            if fig is not None:
                plt.close(fig)
            if was_interactive:
                plt.ion()

        if return_axs:
            return ax
        return None
Example #2
0
def groupwise_solve_lm(dmat, apply_fun: callable, constraints: np.ndarray):
    r"""
    Solve GLMs by estimating the distribution parameters of each unique group of observations independently and
    solving then for the design matrix `dmat`.

    Idea:
    $$
        \theta &= f(x) \\
        \Rightarrow f^{-1}(\theta) &= x \\
            &= (D \cdot D^{+}) \cdot x \\
            &= D \cdot (D^{+} \cdot x) \\
            &= D \cdot x' = f^{-1}(\theta)
    $$

    :param dmat: design matrix which should be solved for (numpy or dask array).
    :param apply_fun: some callable function taking one argument: the vector that assigns
        each row of `dmat` to its row in the matrix of unique design rows.
        Should compute a group-wise parameter solution.

        Example method calculating group-wise means:
        ::
            def apply_fun(grouping):
                groupwise_means = data.groupby(grouping).mean(dim="observations").values

                return np.log(groupwise_means)

        In this example, `data` is not passed by this function; `apply_fun` has to
        obtain it from its enclosing scope.
    :param constraints: tensor (all parameters x dependent parameters)
        Tensor that encodes how complete parameter set which includes dependent
        parameters arises from independent parameters: all = <constraints, indep>.
        This form of constraints is used in vector generalized linear models (VGLMs).
        May be a numpy or a dask array, independently of the type of `dmat`.

    :return: tuple of (apply_fun(grouping), x_prime, rmsd, rank, s) where x_prime is the parameter matrix solved for
        `dmat` and rmsd, rank, s are the remaining outputs of the least-squares solver.
    :raises ValueError: if the design matrix has more than 500 unique rows.
    :raises Warning: if the output of `apply_fun` contains NaN entries.
    """
    dask_array_type = dask.array.core.Array

    # Get unique rows of design matrix and vector with group assignments:
    if isinstance(dmat, dask_array_type):
        # axis argument not supported by dask in .unique()
        unique_design, inverse_idx = np.unique(dmat.compute(),
                                               axis=0,
                                               return_inverse=True)
        unique_design = dask.array.from_array(unique_design,
                                              chunks=unique_design.shape)
    else:
        unique_design, inverse_idx = np.unique(dmat,
                                               axis=0,
                                               return_inverse=True)
    if unique_design.shape[0] > 500:
        raise ValueError(
            "large least-square problem in init, likely defined a numeric predictor as categorical"
        )

    # matrix_rank not supported by dask: materialise each operand on its own,
    # so that a dask design matrix can be combined with a numpy constraint
    # matrix (and vice versa) without calling .compute() on a numpy array.
    if isinstance(unique_design, dask_array_type):
        dense_design = unique_design.compute()
    else:
        dense_design = unique_design
    if isinstance(constraints, dask_array_type):
        dense_constraints = constraints.compute()
    else:
        dense_constraints = constraints

    full_rank = constraints.shape[1]
    design_rank = np.linalg.matrix_rank(
        np.matmul(dense_design, dense_constraints))
    if full_rank > design_rank:
        logger.error("model is not full rank!")

    # Get group-wise means in linker space based on group assignments
    # based on unique rows of design matrix:
    params = apply_fun(inverse_idx)

    # Use least-squares solver to compute model parameterization
    # accounting for dependent parameters, ie. degrees of freedom
    # of the model which appear as groups in the design matrix
    # and are not accounted for by parameters but which are
    # accounted for by constraints:
    # <X, <theta, H> = means -> <X, theta>, H> = means -> lstsqs for theta
    # (This is faster and more accurate than using matrix inversion.)
    logger.debug(" ** Solve lstsq problem")
    if np.any(np.isnan(params)):
        raise Warning(
            "entries of params were nan which will throw error in lstsq")
    x_prime, rmsd, rank, s = np.linalg.lstsq(
        np.matmul(unique_design, constraints), params)

    return params, x_prime, rmsd, rank, s
Example #3
0
    def _plot_coef_vs_ref(self,
                          true_values: np.ndarray,
                          estim_values: np.ndarray,
                          size=1,
                          log=False,
                          save=None,
                          show=True,
                          ncols=5,
                          row_gap=0.3,
                          col_gap=0.25,
                          title=None,
                          return_axs=False):
        """
        Plot estimated coefficients against reference (true) coefficients.

        Draws one scatter panel per parameter (first axis of the input arrays)
        together with the identity line, and reports the correlation
        coefficient of the plotted values in the panel title.

        :param true_values: Reference values, indexed as [parameter, column].
            Dask arrays are computed before plotting.
        :param estim_values: Estimated values, indexed like `true_values`.
            Dask arrays are computed before plotting.
        :param size: Forwarded as the `size` argument of seaborn's scatterplot.
        :param log: Whether to plot log(value + 1) instead of the raw values.
        :param save: Path+file name stem to save plots to.
            File will be save+"_parameter_scatter.png". Does not save if save is None.
        :param show: Whether to display plot.
        :param ncols: Maximum number of columns in plot grid.
        :param row_gap: Vertical gap between panel rows relative to panel height.
        :param col_gap: Horizontal gap between panel columns relative to panel width.
        :param title: Prefix of the panel titles, defaults to "parameter".
        :param return_axs: Whether to return axis objects.
        :return: List of matplotlib axis objects if return_axs is True, otherwise None.
            The figure is closed before this method returns.
        """
        import seaborn as sns
        import matplotlib.pyplot as plt
        from matplotlib import gridspec
        from matplotlib import rcParams

        if isinstance(true_values, dask.array.core.Array):
            true_values = true_values.compute()
        if isinstance(estim_values, dask.array.core.Array):
            estim_values = estim_values.compute()

        n_par = true_values.shape[0]
        ncols = min(ncols, n_par)
        # Ceiling division: a partially filled last row needs exactly one
        # additional row, irrespective of how many panels it holds.
        nrows = (n_par + ncols - 1) // ncols

        if title is None:
            title = "parameter"

        # Switch interactive mode off only for the duration of this call and
        # restore the caller's setting afterwards instead of forcing it on.
        was_interactive = plt.isinteractive()
        plt.ioff()
        fig = None
        axs = []
        try:
            gs = gridspec.GridSpec(nrows=nrows,
                                   ncols=ncols,
                                   hspace=row_gap,
                                   wspace=col_gap)

            fig = plt.figure(figsize=(
                ncols * rcParams['figure.figsize'][0],  # width in inches
                nrows * rcParams['figure.figsize'][1] *
                (1 + row_gap)  # height in inches
            ))

            # Build axis objects in loop.
            for i in range(n_par):
                ax = fig.add_subplot(gs[i])
                axs.append(ax)

                x = true_values[i, :]
                y = estim_values[i, :]
                if log:
                    x = np.log(x + 1)
                    y = np.log(y + 1)

                sns.scatterplot(x=x, y=y, size=size, ax=ax, legend=False)

                # Identity line spanning the joint range of both axes.
                diagonal = np.array([
                    np.min([np.min(x), np.min(y)]),
                    np.max([np.max(x), np.max(y)])
                ])
                sns.lineplot(x=diagonal, y=diagonal, ax=ax)

                title_i = title + "_" + str(i)
                # Add correlation into title:
                title_i = title_i + " (R=" + str(
                    np.round(np.corrcoef(x, y)[0, 1], 3)) + ")"
                ax.set_title(title_i)
                ax.set_xlabel("true parameter")
                ax.set_ylabel("estimated parameter")

            # Save and show figure.
            if save is not None:
                fig.savefig(save + '_parameter_scatter.png')

            if show:
                plt.show()
        finally:
            # Release the figure and restore the interactive state even if
            # plotting or saving failed.
            if fig is not None:
                plt.close(fig)
            if was_interactive:
                plt.ion()

        if return_axs:
            return axs
        return None