Exemple #1
0
    def from_formula(cls, formula, data, window, weights=None, subset=None,
                     *args, **kwargs):
        """Create a model from a patsy formula and a data frame.

        Parameters
        ----------
        formula : str or formula-like
            Formula handed to ``patsy.dmatrices``.
        data : DataFrame-like
            Data the formula is evaluated against. It must support
            ``.loc`` indexing when ``subset`` is given.
        window : object
            Forwarded to the model constructor as the ``window`` keyword.
        weights : object, optional
            Forwarded as the ``weights`` keyword only when not None.
        subset : object, optional
            Row selector applied as ``data.loc[subset]`` before the design
            matrices are built.
        *args, **kwargs
            Forwarded to ``cls(endog, exog, ...)``. Two keywords are
            handled here: ``eval_env`` (removed from ``kwargs``) and
            ``missing`` (defaults to ``'skip'``).

            ``eval_env`` may be None (a stack depth of 2 is used), ``-1``
            (an empty ``EvalEnvironment``), any other integer depth
            (incremented by one because this method adds a stack frame),
            or an ``EvalEnvironment`` instance, which is used unchanged.

        Returns
        -------
        The constructed model, with ``formula`` and ``data.frame`` attached.

        Raises
        ------
        ValueError
            If the left-hand side of the formula evaluates to more than
            one column.
        """
        from patsy import EvalEnvironment, NAAction, dmatrices

        if subset is not None:
            data = data.loc[subset]
        eval_env = kwargs.pop('eval_env', None)
        if eval_env is None:
            eval_env = 2
        elif isinstance(eval_env, EvalEnvironment):
            # Already a concrete environment: there is no stack depth to
            # adjust. Previously this case fell into ``eval_env += 1`` and
            # raised TypeError.
            pass
        elif eval_env == -1:
            eval_env = EvalEnvironment({})
        else:
            eval_env += 1  # we're going down the stack again
        missing = kwargs.get('missing', 'skip')
        # Make patsy raise on missing values, with no value types declared
        # as NA, instead of applying its default NA handling; the model
        # receives its own ``missing`` option below.
        na_action = NAAction(on_NA='raise', NA_types=[])
        # NOTE: dmatrices must be called directly from this frame; an
        # integer eval_env is a stack depth relative to the call site.
        result = dmatrices(formula, data, eval_env, return_type='dataframe',
                           NA_action=na_action)

        endog, exog = result
        if (endog.ndim > 1 and endog.shape[1] > 1) or endog.ndim > 2:
            raise ValueError('endog has evaluated to an array with multiple '
                             'columns that has shape {0}. This occurs when '
                             'the variable converted to endog is non-numeric'
                             ' (e.g., bool or str).'.format(endog.shape))

        kwargs.update({'missing': missing,
                       'window': window})
        if weights is not None:
            kwargs['weights'] = weights
        mod = cls(endog, exog, *args, **kwargs)
        mod.formula = formula
        # since we got a dataframe, attach the original
        mod.data.frame = data
        return mod
    def __init__(
            self,  # Observations
            formula,
            data,
            # NA action
            NA_action="drop",
            # Environment
            eval_env=0,
            # Forest size and parallelism
            n_estimators=500,
            n_jobs=1,
            **kwargs):
        """Fit a random forest classifier described by a patsy formula.

        :param formula: Formula-like object passed to ``dmatrices`` to
        build the response and predictor design matrices.

        :param data: Object in which the variables named in ``formula``
        are looked up.

        :param NA_action: Passed to ``dmatrices`` as its NA action.

        :param eval_env: Environment (or stack depth) used to resolve
        names that are not found in ``data``.

        :param n_estimators: Forwarded to ``RandomForestClassifier``.

        :param n_jobs: Forwarded to ``RandomForestClassifier``.

        :param kwargs: Any additional keyword arguments for
        ``RandomForestClassifier``.
        """

        # Model specifications
        self.model_type = "random_forest"
        self.formula = formula
        self.data = data

        # Design matrices. The capture call has to stay in this frame:
        # ``reference=1`` offsets an integer ``eval_env`` by one frame.
        environment = EvalEnvironment.capture(eval_env, reference=1)
        response, predictors = dmatrices(formula, data, environment,
                                         NA_action)
        self._y_design_info = response.design_info
        self._x_design_info = predictors.design_info

        # Build the forest, train it, and keep the fitted estimator
        forest_options = dict(kwargs, n_estimators=n_estimators,
                              n_jobs=n_jobs)
        forest = RandomForestClassifier(**forest_options)
        forest.fit(predictors, response)
        self.rf = forest
Exemple #3
0
 def rerp(
         self,
         # rERPRequest arguments
         event_query,
         start_time,
         stop_time,
         formula,
         name=None,
         eval_env=0,
         bad_event_query=None,
         all_or_nothing=False,
         # multi_rerp arguments
         artifact_query="has _ARTIFACT_TYPE",
         artifact_type_field="_ARTIFACT_TYPE",
         overlap_correction=True,
         regression_strategy="auto"):
     """Compute a single rERP by delegating to ``multi_rerp``.

     The first group of arguments is packed into one ``rERPRequest``; the
     second group is forwarded unchanged to ``multi_rerp``. Returns the
     only element of the list that ``multi_rerp`` gives back.
     """
     # Capture in this frame: ``reference=1`` offsets an integer
     # ``eval_env`` by one stack frame.
     captured_env = EvalEnvironment.capture(eval_env, reference=1)
     single_request = rERPRequest(event_query, start_time, stop_time,
                                  formula,
                                  name=name,
                                  eval_env=captured_env,
                                  bad_event_query=bad_event_query,
                                  all_or_nothing=all_or_nothing)
     results = self.multi_rerp(
         [single_request],
         artifact_query=artifact_query,
         artifact_type_field=artifact_type_field,
         overlap_correction=overlap_correction,
         regression_strategy=regression_strategy)
     # One request in, exactly one result out.
     assert len(results) == 1
     return results[0]
Exemple #4
0
 def rerp(
         self,
         # rERPRequest arguments
         event_query,
         start_time,
         stop_time,
         formula,
         name=None,
         eval_env=0,
         bad_event_query=None,
         all_or_nothing=False,
         # multi_rerp arguments
         artifact_query="has _ARTIFACT_TYPE",
         artifact_type_field="_ARTIFACT_TYPE",
         overlap_correction=True,
         regression_strategy="auto"):
     """Convenience wrapper: run ``multi_rerp`` for exactly one request.

     Builds an ``rERPRequest`` from the request arguments, hands it to
     ``multi_rerp`` together with the remaining options, and returns the
     single resulting rERP.
     """
     # Must be captured here; ``reference=1`` shifts an integer depth by
     # one frame.
     env = EvalEnvironment.capture(eval_env, reference=1)

     request_options = dict(name=name,
                            eval_env=env,
                            bad_event_query=bad_event_query,
                            all_or_nothing=all_or_nothing)
     the_request = rERPRequest(event_query, start_time, stop_time, formula,
                               **request_options)

     regression_options = dict(artifact_query=artifact_query,
                               artifact_type_field=artifact_type_field,
                               overlap_correction=overlap_correction,
                               regression_strategy=regression_strategy)
     computed = self.multi_rerp([the_request], **regression_options)

     assert len(computed) == 1
     return computed[0]
    def _fit_transform(self, data, y=None):
        """Build the design matrix for ``data`` and remember its design info.

        The formula is first passed through ``_drop_intercept``. The
        resulting design info is stored on ``self.design_``. Returns the
        design as a dataframe when ``self.return_type == 'dataframe'``,
        otherwise as a plain numpy array. ``y`` is accepted but not used.
        """
        # Capture in this frame; ``reference=2`` offsets an integer
        # ``self.eval_env`` by two stack frames.
        env = EvalEnvironment.capture(self.eval_env, reference=2)
        design_frame = dmatrix(
            _drop_intercept(self.formula, self.add_intercept),
            data,
            eval_env=env,
            NA_action=self.NA_action,
            return_type='dataframe',
        )
        self.design_ = design_frame.design_info

        wants_frame = self.return_type == 'dataframe'
        return design_frame if wants_frame else np.array(design_frame)
Exemple #6
0
def test_issue_11():
    """Level mismatches must raise a ``PatsyError`` that carries an origin.

    A design is built from data whose ``X`` takes the values 0..3, then
    re-applied to data containing ``X == 4``. The resulting error has to
    point at the first four characters of the formula (``"C(X)"``).
    """
    # Give a sensible error message for level mismatches
    # (At some points we've failed to put an origin= on these errors)
    data = {"X": [0, 1, 2, 3], "Y": [1, 2, 3, 4]}
    formula = "C(X) + Y"
    # X == 4 is a level that does not occur in ``data``.
    new_data = {"X": [0, 0, 1, 2, 3, 3, 4], "Y": [1, 2, 3, 4, 5, 6, 7]}
    info = dmatrix(formula, data)
    try:
        build_design_matrices([info.design_info.builder], new_data)
    except PatsyError as e:  # ``except X, e`` is a SyntaxError on Python 3
        assert e.origin == Origin(formula, 0, 4)
    else:
        # Without this branch the test passed silently when no error
        # was raised at all.
        raise AssertionError("expected PatsyError for unseen level in X")
Exemple #7
0
def test_issue_11():
    """Check that a level mismatch is reported with a usable origin.

    The design matrix is fitted on ``X`` values 0..3 and then rebuilt for
    data that also contains ``X == 4``. The ``PatsyError`` that results
    must reference the span ``formula[0:4]``, i.e. ``"C(X)"``.
    """
    # Give a sensible error message for level mismatches
    # (At some points we've failed to put an origin= on these errors)
    data = {"X" : [0,1,2,3], "Y" : [1,2,3,4]}
    formula = "C(X) + Y"
    # The trailing 4 in X was never seen when the design was built.
    new_data = {"X" : [0,0,1,2,3,3,4], "Y" : [1,2,3,4,5,6,7]}
    info = dmatrix(formula, data)
    try:
        build_design_matrices([info.design_info.builder], new_data)
    except PatsyError as e:  # Python 3 compatible form of ``except X, e``
        assert e.origin == Origin(formula, 0, 4)
    else:
        # Fail loudly if no exception is raised; the original test would
        # have passed in that case.
        raise AssertionError("expected PatsyError for unseen level in X")
Exemple #8
0
    def from_formula(cls,
                     formula,
                     data,
                     window,
                     weights=None,
                     subset=None,
                     *args,
                     **kwargs):
        """Build a model instance from a patsy formula.

        Parameters
        ----------
        formula : str or formula-like
            Formula passed to ``patsy.dmatrices``.
        data : DataFrame-like
            Source of the variables used in ``formula``. Indexed with
            ``data.loc[subset]`` when ``subset`` is given.
        window : object
            Passed to the model constructor as ``window``.
        weights : object, optional
            Passed to the model constructor as ``weights`` when not None.
        subset : object, optional
            Row selector for ``data.loc``.
        *args, **kwargs
            Passed on to ``cls(endog, exog, ...)``. ``eval_env`` is
            consumed here and ``missing`` defaults to ``"skip"``.

            ``eval_env`` may be None (stack depth 2), ``-1`` (an empty
            ``EvalEnvironment``), another integer depth (incremented by
            one to account for this method's frame), or an
            ``EvalEnvironment`` instance, which is used as is.

        Returns
        -------
        The model, with ``formula`` and ``data.frame`` set on it.

        Raises
        ------
        ValueError
            If the formula's left-hand side evaluates to multiple columns.
        """
        from patsy import EvalEnvironment, NAAction, dmatrices

        if subset is not None:
            data = data.loc[subset]
        eval_env = kwargs.pop("eval_env", None)
        if eval_env is None:
            eval_env = 2
        elif isinstance(eval_env, EvalEnvironment):
            # A ready-made environment has no stack depth to adjust.
            # It used to reach ``eval_env += 1`` and raise TypeError.
            pass
        elif eval_env == -1:
            eval_env = EvalEnvironment({})
        else:
            eval_env += 1  # we're going down the stack again
        missing = kwargs.get("missing", "skip")

        # patsy is told to raise on NA with an empty list of NA types; the
        # ``missing`` option is forwarded to the model constructor instead.
        na_action = NAAction(on_NA="raise", NA_types=[])
        # NOTE: keep this call in this frame; an integer eval_env is a
        # stack depth measured from the call site.
        result = dmatrices(
            formula,
            data,
            eval_env,
            return_type="dataframe",
            NA_action=na_action,
        )

        endog, exog = result
        if (endog.ndim > 1 and endog.shape[1] > 1) or endog.ndim > 2:
            raise ValueError("endog has evaluated to an array with multiple "
                             "columns that has shape {0}. This occurs when "
                             "the variable converted to endog is non-numeric"
                             " (e.g., bool or str).".format(endog.shape))

        kwargs.update({"missing": missing, "window": window})
        if weights is not None:
            kwargs["weights"] = weights
        mod = cls(endog, exog, *args, **kwargs)
        mod.formula = formula
        # since we got a dataframe, attach the original
        mod.data.frame = data
        return mod
Exemple #9
0
    def _fit_transform(self, data, y=None):
        """Build the design matrix for ``data`` and record its metadata.

        Stores the design info on ``self.design_`` and the design's column
        names on ``self.feature_names_``.

        Parameters
        ----------
        data : dict-like
            Input data; looked up by the variables in ``self.formula``.
        y : ignored
            Accepted but not used by this method.

        Returns
        -------
        The design as a dataframe when ``self.return_type == 'dataframe'``,
        otherwise as a numpy array.
        """
        # Capture in this frame; ``reference=2`` offsets an integer
        # ``self.eval_env`` by two stack frames.
        eval_env = EvalEnvironment.capture(self.eval_env, reference=2)
        formula = _drop_intercept(self.formula, self.add_intercept)

        design = dmatrix(formula,
                         data,
                         eval_env=eval_env,
                         NA_action=self.NA_action,
                         return_type='dataframe')
        self.design_ = design.design_info
        # This assignment used to sit after an exhaustive if/else return
        # and was never executed; set it before returning.
        self.feature_names_ = design.design_info.column_names

        if self.return_type == 'dataframe':
            return design
        return np.array(design)
Exemple #10
0
    def fit(self, data, y=None):
        """Fit a clone of the wrapped estimator on formula-derived matrices.

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Contains features and possible labels.
            Column names need to match variables in formula.
        y : ignored
            Accepted but not used; the response comes from the formula.

        Returns
        -------
        self
        """
        # Capture in this frame; ``reference=1`` offsets an integer
        # ``self.eval_env`` by one stack frame.
        env = EvalEnvironment.capture(self.eval_env, reference=1)
        response, predictors = dmatrices(
            _drop_intercept(self.formula, self.add_intercept),
            data,
            eval_env=env,
            NA_action=self.NA_action,
        )
        self.design_y_ = response.design_info
        self.design_X_ = predictors.design_info

        # Flatten the response to 1d so sklearn does not warn about it.
        flat_response = column_or_1d(response)
        self.estimator_ = clone(self.estimator).fit(predictors, flat_response)
        return self
Exemple #11
0
 def rerp(self,
          name,
          event_query,
          start_time,
          stop_time,
          formula,
          artifact_query="has _ARTIFACT_TYPE",
          artifact_type_field="_ARTIFACT_TYPE",
          overlap_correction=True,
          regression_strategy="auto",
          eval_env=0):
     """Run ``multi_rerp`` with a single ``rERPSpec``.

     The spec is built from ``name``, ``event_query``, ``start_time``,
     ``stop_time`` and ``formula``; all other arguments are forwarded to
     ``multi_rerp``, whose return value is returned unchanged.
     """
     # Capture in this frame; ``reference=1`` offsets an integer
     # ``eval_env`` by one stack frame.
     captured_env = EvalEnvironment.capture(eval_env, reference=1)
     spec = rERPSpec(name, event_query, start_time, stop_time, formula)
     return self.multi_rerp(
         [spec],
         artifact_query=artifact_query,
         artifact_type_field=artifact_type_field,
         overlap_correction=overlap_correction,
         regression_strategy=regression_strategy,
         eval_env=captured_env)
Exemple #12
0
    def transform(self, data):
        """Evaluate every factor of every term and stack the results.

        Each factor in ``self.termlist`` is evaluated against ``data``.
        One-dimensional results are reshaped into single-column arrays;
        two-dimensional numpy arrays and scipy sparse matrices are kept as
        they are. Results are keyed by ``factor.code``, so a factor that
        occurs in several terms contributes only once.

        Parameters
        ----------
        data : object
            Passed to each factor's ``eval``.

        Returns
        -------
        numpy.ndarray or scipy sparse CSR matrix
            Column-wise stack of the single-column vectors followed by the
            matrices. The result is sparse if any piece is sparse; a lone
            dense piece is returned as is.

        Raises
        ------
        RuntimeError
            If a factor evaluates to an unsupported type, or if no factor
            produced any output.
        """
        vectors = dict()
        matrices = dict()

        for term in self.termlist:
            for e in term.factors:
                state = {}
                eval_env = EvalEnvironment.capture(0)
                eval_env = eval_env.with_outer_namespace(self.feature_fns)
                # Return value is not needed; presumably this call fills
                # in ``state`` for ``eval`` below -- confirm against patsy.
                e.memorize_passes_needed(state, eval_env)
                mat = e.eval(state, data)

                if len(mat.shape) == 1:
                    if isinstance(mat, pd.Series):
                        mat = mat.values
                    # Turn the 1d vector into an (n, 1) column.
                    vectors[e.code] = np.reshape(mat, (mat.shape[0], 1))
                elif isinstance(mat, (np.ndarray, spmatrix)):
                    matrices[e.code] = mat
                else:
                    raise RuntimeError("Unsupported data format: {}".format(
                        type(mat)))

        list_of_mats = list(vectors.values()) + list(matrices.values())

        # The original "No features found" branch could never run: an
        # empty list reached np.concatenate and raised ValueError instead.
        if not list_of_mats:
            raise RuntimeError("No features found")

        if any(isinstance(m, spmatrix) for m in list_of_mats):
            return hstack(list_of_mats, format='csr')
        if len(list_of_mats) == 1:
            return list_of_mats[0]
        return np.concatenate(list_of_mats, axis=1)
Exemple #13
0
    def fit(self, data, y=None):
        """Fit a clone of the wrapped estimator using the formula.

        Besides fitting, this records the design info of both matrices
        (``design_y_``, ``design_X_``) and the predictor column names
        (``feature_names_``).

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Contains features and possible labels.
            Column names need to match variables in formula.
        y : ignored
            Accepted but not used; the response comes from the formula.

        Returns
        -------
        self
        """
        # Capture in this frame; ``reference=1`` offsets an integer
        # ``self.eval_env`` by one stack frame.
        env = EvalEnvironment.capture(self.eval_env, reference=1)
        target_design, feature_design = dmatrices(
            _drop_intercept(self.formula, self.add_intercept),
            data,
            eval_env=env,
            NA_action=self.NA_action,
        )
        self.design_y_ = target_design.design_info
        x_info = feature_design.design_info
        self.design_X_ = x_info
        self.feature_names_ = x_info.column_names

        # sklearn warns about column-vector targets, so flatten to 1d.
        target_1d = column_or_1d(target_design)
        fresh_estimator = clone(self.estimator)
        self.estimator_ = fresh_estimator.fit(feature_design, target_1d)
        return self
Exemple #14
0
    def multi_rerp(
            self,
            rerp_specs,
            artifact_query="has _ARTIFACT_TYPE",
            artifact_type_field="_ARTIFACT_TYPE",
            overlap_correction=True,
            regression_strategy="auto",
            eval_env=0):
        """Compute several rERPs at once by delegating to ``multi_rerp_impl``.

        :param rerp_specs: The rERP specifications to compute.

        :param artifact_query: Forwarded to ``multi_rerp_impl``.

        :param artifact_type_field: Forwarded to ``multi_rerp_impl``.

        :param overlap_correction: Forwarded to ``multi_rerp_impl``.

        :param regression_strategy: This can be "continuous", "by-epoch",
        or "auto". If "continuous", we always build one giant regression
        model, treating the data as continuous. If "auto", we use the
        (much faster) approach of generating a single regression model
        and then applying it to each latency separately -- but *only* if
        this will produce the same result as doing the full regression.
        If "by-epoch" (written "epoch" in the original note), then we
        either use the fast method, or else error out. Changing this
        argument never affects the actual output of this function -- if
        it does, that's a bug! In general, we can do the fast thing if:

        -- any artifacts affect either all or none of each epoch, and
        -- either, overlap_correction=False,
        -- or, overlap_correction=True and there are in fact no overlaps.

        :param eval_env: Environment or stack depth; resolved here with
        ``EvalEnvironment.capture(..., reference=1)``.

        :return: Whatever ``multi_rerp_impl`` returns.
        """
        # Design notes kept from an earlier, unfinished sketch that sat
        # unreachable after the return statement. The real work is done by
        # multi_rerp_impl, so treat these as background only:
        #
        # For artifact and bad data in general counting:
        # make an intervalset for each kind of bad data
        # intersect each with the "wanted data" spans to throw away
        # irrelevantly bad data
        # and then do a special union operation that counts which and how
        # many of the inputs is non-zero at each point, to calculate shares
        #
        # First get a representation of all okay data
        # starting with which spans we have recordings for,
        # then subtract artifacts,
        # then subtract NAs
        #
        # Make a design matrix for each
        # Figure out which data points are okay:
        #   -- where there is some entry in the design matrix
        #   -- where there is no artifact
        #   -- where all the design matrixes are non-NA
        #
        # list like (start, stop, info), sort then scan to find overlaps.
        # info can be a reference to a row in a design matrix, or it could
        # be a note that the given span is off-limits.
        # If overlap_correction=False, we are going to handle each data
        # span individually. The only question is whether any of them have
        # partial overlaps with artifacts.
        eval_env = EvalEnvironment.capture(eval_env, reference=1)
        return multi_rerp_impl(self, rerp_specs, artifact_query,
                               artifact_type_field, overlap_correction,
                               regression_strategy, eval_env)
    def __init__(
            self,  # Observations
            suitability_formula,
            data,
            # Spatial structure
            n_neighbors,
            neighbors,
            # NA action
            NA_action="drop",
            # Predictions
            data_pred=None,
            # Environment
            eval_env=0,
            # Chains
            burnin=1000,
            mcmc=1000,
            thin=1,
            # Starting values
            beta_start=0,
            Vrho_start=1,
            # Priors
            mubeta=0,
            Vbeta=1000,
            priorVrho=-1.0,  # -1="1/Gamma"
            shape=0.5,
            rate=0.0005,
            Vrho_max=10,
            # Various
            seed=1234,
            verbose=1,
            save_rho=0,
            save_p=0):
        """Function to fit a model_binomial_iCAR model.

        The function model_binomial_iCAR estimates the parameters of a
        Binomial model with iCAR process for spatial autocorrelation
        in a hierarchical Bayesian framework.

        The sampler runs inside this constructor; on return the results
        are available as attributes (``mcmc``, ``betas``, ``Vrho``,
        ``deviance``, ``rho``, ``theta_pred``, ``theta_latent``).

        :param suitability_formula: A formula-like object that can be
        used to construct a design matrix (see ``patsy.dmatrices``).
        The code reads the first two columns of the response matrix and
        treats the last column of the predictor matrix as the cell
        index.

        :param data: A dict-like object that can be used to look up
        variables referenced in ``suitability_formula``.

        :param n_neighbors: A vector of integers that indicates the
        number of neighbors (adjacent entities) of each spatial
        entity. length(n.neighbors) indicates the total number of
        spatial entities. Must provide ``.astype`` (e.g. a numpy array).

        :param neighbors: A vector of integers indicating the
        neighbors (adjacent entities) of each spatial entity. Must be
        of the form c(neighbors of entity 1, neighbors of entity 2,
        ... , neighbors of the last entity). Length of the neighbors
        vector should be equal to sum(n.neighbors). Must provide
        ``.astype`` (e.g. a numpy array).

        :param NA_action: What to do with rows that contain missing
        values (see ``patsy.dmatrices``).

        :param data_pred: Optional dataset for predictions. When None,
        predictions are made for the fitting data.

        :param eval_env: Environment used to look up any variables
        referenced in suitability_formula that cannot be found in data
        (see ``patsy.dmatrices``).

        :param burnin: Number of iterations for the burnin phase.

        :param mcmc: The number of Gibbs iterations for the
        sampler. Total number of Gibbs iterations is equal to
        burnin+mcmc. burnin+mcmc must be divisible by 10 and superior
        or equal to 100 so that the progress bar can be displayed.

        :param thin: The thinning interval used in the simulation. The
        number of mcmc iterations must be divisible by this value.

        :param beta_start: Starting values for beta parameters. This
        can either be a scalar or a p-length vector. The scalar -99
        requests starting values from a classic logistic regression.
        A vector must provide ``.astype`` (e.g. a numpy array).

        :param Vrho_start: Positive scalar indicating the starting
        value for the variance of the spatial random effects.

        :param mubeta: Means of the priors for the beta parameters of the
        suitability process. mubeta must be either a scalar or a
        p-length vector. If mubeta takes a scalar value, then that
        value will serve as the prior mean for all of the betas. The
        default value is set to 0 for an uninformative prior.

        :param Vbeta: Variances of the Normal priors for the beta
        parameters of the suitability process. Vbeta must be either a
        scalar or a p-length vector. If Vbeta takes a scalar value,
        then that value will serve as the prior variance for all of
        the betas. The default variance is large and set to 1000 for
        an uninformative flat prior.

        :param priorVrho: Type of prior for the variance of the
        spatial random effects. Can be set to a fixed positive scalar,
        or to an inverse-gamma distribution ("1/Gamma") with
        parameters shape and rate, or to a uniform distribution
        ("Uniform") on the interval [0,Vrho.max]. Default set to
        "1/Gamma". NOTE(review): the value is converted with
        ``float()``, so it has to be numeric here; the signature marks
        -1 as "1/Gamma". The numeric code for "Uniform" is not visible
        in this function -- confirm against the C extension.

        :param shape: The shape parameter for the Gamma prior on the
        precision of the spatial random effects. Default value is
        shape=0.5 for uninformative prior.

        :param rate: The rate (1/scale) parameter for the Gamma prior
        on the precision of the spatial random effects. Default value
        is rate=0.0005 for uninformative prior.

        :param Vrho_max: Upper bound for the uniform prior of the
        spatial random effect variance. Default set to 10.

        :param seed: The seed for the random number generator. Default
        set to 1234.

        :param verbose: A switch (0,1) which determines whether or not
        the progress of the sampler is printed to the screen. Default
        is 1: a progress bar is printed, indicating the step (in %)
        reached by the Gibbs sampler.

        :param save_rho: A switch (0,1) which determines whether or
        not the sampled values for rhos are saved. Default is 0: the
        posterior mean is computed and returned in the rho.pred
        vector. Be careful, setting save.rho to 1 might require a
        large amount of memory.

        :param save_p: A switch (0,1) which determines whether or not
        the sampled values for predictions are saved. Default is 0:
        the posterior mean is computed and returned in the theta.pred
        vector. Be careful, setting save.p to 1 might require a large
        amount of memory.

        :return: An object of class model_binomial_iCAR.

        """

        # ====================
        # Model specifications
        # ====================

        # Keep every constructor argument on the instance.
        self.model_type = "binomial_iCAR"
        self.suitability_formula = suitability_formula
        self.data = data
        self.n_neighbors = n_neighbors
        self.neighbors = neighbors
        self.NA_action = NA_action
        self.data_pred = data_pred
        self.eval_env = eval_env
        self.burnin = burnin
        # NOTE: self.mcmc is overwritten with the sample array further down.
        self.mcmc = mcmc
        self.thin = thin
        self.beta_start = beta_start
        self.Vrho_start = Vrho_start
        self.mubeta = mubeta
        self.Vbeta = Vbeta
        self.priorVrho = priorVrho
        self.shape = shape
        self.rate = rate
        self.Vrho_max = Vrho_max
        self.seed = seed
        self.verbose = verbose
        self.save_rho = save_rho
        self.save_p = save_p

        # ========
        # Form response, covariate matrices and model parameters
        # ========

        # Patsy
        # The capture must happen in this frame; reference=1 offsets an
        # integer eval_env by one stack frame.
        eval_env = EvalEnvironment.capture(eval_env, reference=1)
        y, x = dmatrices(suitability_formula, data, eval_env, NA_action)
        # Design infos are kept so prediction data can be encoded the same way.
        self._y_design_info = y.design_info
        self._x_design_info = x.design_info

        # Response
        # First response column; passed to the sampler as Y_obj.
        Y = y[:, 0]
        nobs = len(Y)
        # Second response column; passed to the sampler as T_obj
        # (presumably the number of trials -- TODO confirm).
        T = y[:, 1]
        # Suitability
        X_arr = x[:, :-1]  # We remove the last column (cells)
        ncol_X = X_arr.shape[1]
        X = X_arr.flatten("F")  # Flatten X by column (R/Fortran style)
        # Spatial correlation
        ncell = len(n_neighbors)
        cells = x[:, -1]  # Last column of x
        # Predictions
        # Without prediction data, predict for the observations themselves.
        if (data_pred is None):
            X_pred = X
            cells_pred = cells
            npred = nobs
        if (data_pred is not None):
            (x_pred, ) = build_design_matrices([self._x_design_info],
                                               data_pred)
            X_pred = x_pred[:, :-1]
            X_pred = X_pred.flatten("F")  # Flatten X_pred
            cells_pred = x_pred[:, -1]
            npred = len(cells_pred)
        # Model parameters
        npar = ncol_X
        ngibbs = mcmc + burnin
        nthin = thin
        nburn = burnin
        # Number of retained samples after thinning.
        nsamp = mcmc // thin

        # ========
        # Initial starting values for M-H
        # ========

        # -99 is a sentinel requesting logistic-regression starting values.
        if (np.size(beta_start) == 1 and beta_start == -99):
            # Use starting coefficient from logistic regression
            print("Using estimates from classic logistic regression as"
                  " starting values for betas")
            mod_LR = LogisticRegression(solver="lbfgs")
            mod_LR = mod_LR.fit(X_arr, Y)
            beta_start = np.ravel(mod_LR.coef_)
        # Broadcast a scalar starting value to one value per parameter.
        if (np.size(beta_start) == 1 and beta_start != -99):
            beta_start = np.ones(npar) * beta_start
        else:
            beta_start = beta_start
        rho_start = np.zeros(ncell)  # Set to zero
        Vrho_start = Vrho_start

        # ========
        # Form and check priors
        # ========
        # Scalar priors are broadcast to length npar; vectors are used as
        # given. The self-assignments below are no-ops.
        if (np.size(mubeta) == 1):
            mubeta = np.ones(npar) * mubeta
        else:
            mubeta = mubeta
        if (np.size(Vbeta) == 1):
            Vbeta = np.ones(npar) * Vbeta
        else:
            Vbeta = Vbeta
        shape = shape
        rate = rate
        Vrho_max = Vrho_max
        priorVrho = priorVrho

        # ========
        # call C code to draw sample
        # ========

        # Every argument is cast explicitly to the type handed to the
        # extension (int / float / int32 array / float64 array).
        Sample = hbm.binomial_iCAR(
            # Constants and data
            ngibbs=int(ngibbs),
            nthin=int(nthin),
            nburn=int(nburn),
            nobs=int(nobs),
            ncell=int(ncell),
            np=int(npar),
            Y_obj=Y.astype(np.int32),
            T_obj=T.astype(np.int32),
            X_obj=X.astype(np.float64),  # X must be flattened.
            # Spatial correlation
            C_obj=cells.astype(np.int32),  # Must start at 0 for C.
            nNeigh_obj=n_neighbors.astype(np.int32),
            Neigh_obj=neighbors.astype(np.int32),  # Must start at 0 for C.
            # Predictions
            npred=int(npred),
            X_pred_obj=X_pred.astype(np.float64),
            C_pred_obj=cells_pred.astype(np.int32),
            # Starting values for M-H
            beta_start_obj=beta_start.astype(np.float64),
            rho_start_obj=rho_start.astype(np.float64),
            Vrho_start=float(Vrho_start),
            # Defining priors
            mubeta_obj=mubeta.astype(np.float64),
            Vbeta_obj=Vbeta.astype(np.float64),
            priorVrho=float(priorVrho),
            shape=float(shape),
            rate=float(rate),
            Vrho_max=float(Vrho_max),
            # Seed
            seed=int(seed),
            # Verbose
            verbose=int(verbose),
            # Save rho and p
            save_rho=int(save_rho),
            save_p=int(save_p))

        # Array of MCMC samples
        # One row per retained sample: npar beta columns, then Sample[2]
        # and Sample[3] (their means are stored as Vrho and deviance below).
        MCMC = np.zeros(shape=(nsamp, npar + 2))
        MCMC[:, :npar] = np.array(Sample[0]).reshape(npar, nsamp).transpose()
        MCMC[:, npar] = Sample[2]
        MCMC[:, npar + 1] = Sample[3]
        # Replaces the iteration count stored in self.mcmc above.
        self.mcmc = MCMC
        posterior_means = np.mean(MCMC, axis=0)
        self.betas = posterior_means[:-2]
        self.Vrho = posterior_means[-2]
        self.deviance = posterior_means[-1]

        # Save rho
        # NOTE: self.rho is only set when save_rho is exactly 0 or 1.
        if (save_rho == 0):
            self.rho = np.array(Sample[1])
        if (save_rho == 1):
            self.rho = np.array(Sample[1]).reshape(ncell, nsamp).transpose()

        # Save pred
        # NOTE: self.theta_pred is only set when save_p is exactly 0 or 1.
        if (save_p == 0):
            self.theta_pred = np.array(Sample[5])
        if (save_p == 1):
            self.theta_pred = np.array(Sample[5]).reshape(npred,
                                                          nsamp).transpose()

        # theta_latent
        self.theta_latent = np.array(Sample[4])
    def __init__(
            self,  # Observations
            suitability_formula,
            data,
            # Spatial structure
            n_neighbors,
            neighbors,
            # NA action
            NA_action="drop",
            # Predictions
            data_pred=None,
            # Environment
            eval_env=0,
            # Chains
            burnin=1000,
            mcmc=1000,
            thin=1,
            # Starting values
            beta_start=0,
            Vrho_start=1,
            # Priors
            mubeta=0,
            Vbeta=1.0e6,
            priorVrho=-1.0,  # -1="1/Gamma"
            shape=0.5,
            rate=0.0005,
            Vrho_max=10,
            # Various
            seed=1234,
            verbose=1,
            save_rho=0,
            save_p=0):
        """Function to fit a model_binomial_iCAR model.

        The function model_binomial_iCAR estimates the parameters of a
        Binomial model with iCAR process for spatial autocorrelation
        in a hierarchical Bayesian framework.

        The sampler runs inside this constructor; on return the results
        are available as attributes (``mcmc``, ``betas``, ``Vrho``,
        ``deviance``, ``rho``, ``theta_pred``, ``theta_latent``).

        :param suitability_formula: Formula passed to ``dmatrices``. The
        code reads the first two columns of the response matrix and
        treats the last column of the predictor matrix as the cell
        index.

        :param data: Object in which the formula's variables are looked
        up.

        :param n_neighbors: Per-cell neighbor counts; its length is
        used as the number of cells. Must provide ``.astype``.

        :param neighbors: Neighbor indices passed to the sampler. Must
        provide ``.astype``.

        :param NA_action: Passed to ``dmatrices``.

        :param data_pred: Optional dataset for predictions. When None,
        predictions are made for the fitting data.

        :param eval_env: Environment or stack depth used to resolve
        names not found in ``data``.

        :param burnin: Number of burn-in iterations.

        :param mcmc: Number of iterations after burn-in; the total
        number of iterations is ``mcmc + burnin``.

        :param thin: Thinning interval; ``mcmc // thin`` samples are
        kept.

        :param beta_start: Scalar or vector of starting values for the
        betas. The scalar -99 requests starting values from a logistic
        regression. A vector must provide ``.astype``.

        :param Vrho_start: Starting value passed to the sampler as a
        float.

        :param mubeta: Scalar (broadcast to all betas) or vector of
        prior means.

        :param Vbeta: Scalar (broadcast to all betas) or vector of
        prior variances.

        :param priorVrho: Numeric code for the prior on Vrho; the
        signature marks -1 as "1/Gamma".

        :param shape: Passed to the sampler as a float.

        :param rate: Passed to the sampler as a float.

        :param Vrho_max: Passed to the sampler as a float.

        :param seed: Passed to the sampler as an int.

        :param verbose: Passed to the sampler as an int.

        :param save_rho: 0 or 1; with 1 the rho output is reshaped to
        one row per retained sample.

        :param save_p: 0 or 1; with 1 the prediction output is reshaped
        to one row per retained sample.
        """

        # ====================
        # Model specifications
        # ====================

        # Keep every constructor argument on the instance.
        self.model_type = "binomial_iCAR"
        self.suitability_formula = suitability_formula
        self.data = data
        self.n_neighbors = n_neighbors
        self.neighbors = neighbors
        self.NA_action = NA_action
        self.data_pred = data_pred
        self.eval_env = eval_env
        self.burnin = burnin
        # NOTE: self.mcmc is overwritten with the sample array further down.
        self.mcmc = mcmc
        self.thin = thin
        self.beta_start = beta_start
        self.Vrho_start = Vrho_start
        self.mubeta = mubeta
        self.Vbeta = Vbeta
        self.priorVrho = priorVrho
        self.shape = shape
        self.rate = rate
        self.Vrho_max = Vrho_max
        self.seed = seed
        self.verbose = verbose
        self.save_rho = save_rho
        self.save_p = save_p

        # ========
        # Form response, covariate matrices and model parameters
        # ========

        # Patsy
        # The capture must happen in this frame; reference=1 offsets an
        # integer eval_env by one stack frame.
        eval_env = EvalEnvironment.capture(eval_env, reference=1)
        y, x = dmatrices(suitability_formula, data, eval_env, NA_action)
        # Design infos are kept so prediction data can be encoded the same way.
        self._y_design_info = y.design_info
        self._x_design_info = x.design_info

        # Response
        # First response column; passed to the sampler as Y_obj.
        Y = y[:, 0]
        nobs = len(Y)
        # Second response column; passed to the sampler as T_obj
        # (presumably the number of trials -- TODO confirm).
        T = y[:, 1]
        # Suitability
        X_arr = x[:, :-1]  # We remove the last column (cells)
        ncol_X = X_arr.shape[1]
        X = X_arr.flatten("F")  # Flatten X by column (R/Fortran style)
        # Spatial correlation
        ncell = len(n_neighbors)
        cells = x[:, -1]  # Last column of x
        # Predictions
        # Without prediction data, predict for the observations themselves.
        if (data_pred is None):
            X_pred = X
            cells_pred = cells
            npred = nobs
        if (data_pred is not None):
            (x_pred, ) = build_design_matrices([self._x_design_info],
                                               data_pred)
            X_pred = x_pred[:, :-1]
            X_pred = X_pred.flatten("F")  # Flatten X_pred
            cells_pred = x_pred[:, -1]
            npred = len(cells_pred)
        # Model parameters
        npar = ncol_X
        ngibbs = mcmc + burnin
        nthin = thin
        nburn = burnin
        # Number of retained samples after thinning.
        nsamp = mcmc // thin

        # ========
        # Initial starting values for M-H
        # ========

        # -99 is a sentinel requesting logistic-regression starting values.
        if (np.size(beta_start) == 1 and beta_start == -99):
            # Use starting coefficient from logistic regression
            print("Using estimates from classic logistic regression as"
                  " starting values for betas")
            mod_LR = LogisticRegression(solver="lbfgs")
            mod_LR = mod_LR.fit(X_arr, Y)
            beta_start = np.ravel(mod_LR.coef_)
        # Broadcast a scalar starting value to one value per parameter.
        if (np.size(beta_start) == 1 and beta_start != -99):
            beta_start = np.ones(npar) * beta_start
        else:
            beta_start = beta_start
        rho_start = np.zeros(ncell)  # Set to zero
        Vrho_start = Vrho_start

        # ========
        # Form and check priors
        # ========
        # Scalar priors are broadcast to length npar; vectors are used as
        # given. The self-assignments below are no-ops.
        if (np.size(mubeta) == 1):
            mubeta = np.ones(npar) * mubeta
        else:
            mubeta = mubeta
        if (np.size(Vbeta) == 1):
            Vbeta = np.ones(npar) * Vbeta
        else:
            Vbeta = Vbeta
        shape = shape
        rate = rate
        Vrho_max = Vrho_max
        priorVrho = priorVrho

        # ========
        # call C code to draw sample
        # ========

        # Every argument is cast explicitly to the type handed to the
        # extension (int / float / int32 array / float64 array).
        Sample = hsdm.binomial_iCAR(
            # Constants and data
            ngibbs=int(ngibbs),
            nthin=int(nthin),
            nburn=int(nburn),
            nobs=int(nobs),
            ncell=int(ncell),
            np=int(npar),
            Y_obj=Y.astype(np.int32),
            T_obj=T.astype(np.int32),
            X_obj=X.astype(np.float64),  # X must be flattened.
            # Spatial correlation
            C_obj=cells.astype(np.int32),  # Must start at 0 for C.
            nNeigh_obj=n_neighbors.astype(np.int32),
            Neigh_obj=neighbors.astype(np.int32),  # Must start at 0 for C.
            # Predictions
            npred=int(npred),
            X_pred_obj=X_pred.astype(np.float64),
            C_pred_obj=cells_pred.astype(np.int32),
            # Starting values for M-H
            beta_start_obj=beta_start.astype(np.float64),
            rho_start_obj=rho_start.astype(np.float64),
            Vrho_start=float(Vrho_start),
            # Defining priors
            mubeta_obj=mubeta.astype(np.float64),
            Vbeta_obj=Vbeta.astype(np.float64),
            priorVrho=float(priorVrho),
            shape=float(shape),
            rate=float(rate),
            Vrho_max=float(Vrho_max),
            # Seed
            seed=int(seed),
            # Verbose
            verbose=int(verbose),
            # Save rho and p
            save_rho=int(save_rho),
            save_p=int(save_p))

        # Array of MCMC samples
        # One row per retained sample: npar beta columns, then Sample[2]
        # and Sample[3] (their means are stored as Vrho and deviance below).
        MCMC = np.zeros(shape=(nsamp, npar + 2))
        MCMC[:, :npar] = np.array(Sample[0]).reshape(npar, nsamp).transpose()
        MCMC[:, npar] = Sample[2]
        MCMC[:, npar + 1] = Sample[3]
        # Replaces the iteration count stored in self.mcmc above.
        self.mcmc = MCMC
        posterior_means = np.mean(MCMC, axis=0)
        self.betas = posterior_means[:-2]
        self.Vrho = posterior_means[-2]
        self.deviance = posterior_means[-1]

        # Save rho
        # NOTE: self.rho is only set when save_rho is exactly 0 or 1.
        if (save_rho == 0):
            self.rho = np.array(Sample[1])
        if (save_rho == 1):
            self.rho = np.array(Sample[1]).reshape(ncell, nsamp).transpose()

        # Save pred
        # NOTE: self.theta_pred is only set when save_p is exactly 0 or 1.
        if (save_p == 0):
            self.theta_pred = np.array(Sample[5])
        if (save_p == 1):
            self.theta_pred = np.array(Sample[5]).reshape(npred,
                                                          nsamp).transpose()

        # theta_latent
        self.theta_latent = np.array(Sample[4])
Exemple #17
0
Age_Calc:Caseduplication   1.4583     2.1848   0.667 0.506120 
# Now to try this in `patsy`.  
# 
# Steps:  
# 1. See how the model description is derived from the formula  
# 2. Build the design matrix that the formula specifies  
# 3. Use the design matrix in order to create the model in `scikit-learn`

# In[87]:

from patsy import ModelDesc, EvalEnvironment


# In[88]:

# Capture the calling environment; it is handed to from_formula below.
env = EvalEnvironment.capture()
# Step 1: derive the model description from the formula string.
# NOTE(review): recent patsy releases appear to accept only the formula in
# ModelDesc.from_formula -- confirm that the installed version takes ``env``.
predicted_lat_age_mtx = ModelDesc.from_formula('Predicted ~ Age_Calc * Case', env)


# In[89]:

# Bare expression: shows the model description when run as a notebook cell.
predicted_lat_age_mtx


# In[90]:

from patsy import dmatrix


# In[91]: