def estimate_movement_std(position_info):
    '''Estimate the movement standard deviation from a lag-1 regression.

    Fits a Gaussian GLM of `position` on `lagged_position` with no
    intercept and returns the square root of the fitted scale.

    Parameters
    ----------
    position_info : data container understood by `patsy.dmatrices`
        Must expose the variables `position` and `lagged_position`
        referenced by the model formula.

    Returns
    -------
    movement_std : scalar
        Square root of the scale of the fitted model.

    '''
    formula = 'position ~ lagged_position - 1'
    endog, exog = dmatrices(formula, position_info)
    model = GLM(endog, exog, family=families.Gaussian())
    return np.sqrt(model.fit().scale)
def estimate_movement_variance(position, lagged_position, speed):
    '''Estimate movement variability from a lag-1 Gaussian regression.

    Regresses `position` on `lagged_position` (no intercept) with a
    Gaussian GLM.

    NOTE(review): despite the function name, the returned value is the
    square root of the fitted scale, not the scale itself. Confirm
    which one callers expect before changing it.

    Parameters
    ----------
    position : array_like
    lagged_position : array_like
    speed : unused
        Accepted for interface compatibility; not read by this function.

    Returns
    -------
    scalar
        Square root of the scale of the fitted model.

    '''
    response, design_matrix = dmatrices(
        'position ~ lagged_position - 1',
        dict(position=position, lagged_position=lagged_position))
    gaussian_fit = GLM(
        response, design_matrix, family=families.Gaussian()).fit()
    return np.sqrt(gaussian_fit.scale)
def estimate_movement_std(position):
    '''Estimate the per-dimension movement standard deviation.

    Time points containing a NaN in any position dimension are dropped,
    then each dimension is fit separately with a Gaussian GLM relating
    consecutive samples. The square root of each fit's scale is returned.

    WARNING: Need to use on original position, not interpolated position.

    Parameters
    ----------
    position : ndarray, shape (n_time, n_position_dim)

    Returns
    -------
    movement_std : ndarray, shape (n_position_dim,)

    '''
    position = atleast_2d(position)
    has_nan = np.isnan(position).any(axis=1)
    valid_position = position[~has_nan]

    def _scale_std(trace):
        # NOTE(review): the earlier samples (trace[:-1]) are the response
        # and the later samples (trace[1:]) the regressor -- confirm this
        # ordering is intended.
        fit = GLM(trace[:-1], trace[1:], family=families.Gaussian()).fit()
        return np.sqrt(fit.scale)

    return np.array([_scale_std(trace) for trace in valid_position.T])
def run_regression(x, y, feature_df, model=families.Gaussian()):
    """Fit a Gaussian GLM through R's `glm.fit` and score it.

    Parameters
    ----------
    x : 2-D array
        Design matrix; its `shape` provides the R matrix dimensions.
    y : array_like
        Response values.
    feature_df : DataFrame-like
        Its "features" column is forwarded to `glm_fit` as `factors`.
        It is returned unmodified.
    model : optional
        Not used by the current implementation.

    Returns
    -------
    feature_df
        The input object, unchanged.
    r2
        Result of `compute_rpy2_rsquares(y, fitted_y)`.

    """
    # Disabled statsmodels-based variant:
    # regression_results = sm.GLM(y, x, family=model).fit(maxiter=1000)
    # feature_df["pvalue"] = list(regression_results.pvalues)
    # feature_df["coefficient"] = list(regression_results.params)
    # r2 = compute_rsquares(y, regression_results)
    n_rows, n_cols = x.shape
    r_x = robjects.r.matrix(x, nrow=n_rows, ncol=n_cols)
    r_y = robjects.r.array(y)
    results = stats.glm_fit(
        r_x, r_y, family=stats.gaussian(), factors=feature_df["features"])
    fitted_y = list(stats.fitted(results))

    # Coefficient table from the R summary; built but not returned.
    coefficient_names = list(
        stats.summary_glm(results).rx2('coefficients').colnames)
    coefficient_values = pandas2ri.ri2py(
        stats.summary_glm(results).rx2('coefficients'))
    dataset = pd.DataFrame({
        name: coefficient_values[:, column]
        for column, name in enumerate(coefficient_names)
    })

    r2 = compute_rpy2_rsquares(y, fitted_y)
    return feature_df, r2
"""Calculates the evidence of being in a replay state based on the current speed and the speed in the previous time step. """ from functools import partial import numpy as np from patsy import dmatrices from statsmodels.api import GLM, families from statsmodels.tsa.tsatools import lagmat from .core import scale_likelihood FAMILY = families.Gaussian(link=families.links.log()) FORMULA = 'speed ~ lagged_speed - 1' def speed_likelihood(speed, lagged_speed, replay_coefficients, replay_scale, no_replay_coefficients, no_replay_scale, speed_threshold=4.0): """Calculates the evidence of being in a replay state based on the current speed and the speed in the previous time step. Parameters ---------- speed : ndarray, shape (n_time,) lagged_speed : ndarray, shape (n_time,)
def penalized_IRLS(design_matrix, response, sqrt_penalty_matrix=None,
                   penalty=_EPS, family=families.Gaussian(),
                   max_iterations=25, prior_weights=None, offset=None,
                   tolerance=1E-8):
    '''Estimate coefficients and associated statistics of models in the
    exponential family.

    Runs iteratively reweighted least squares on the design matrix
    augmented with the scaled square-root penalty matrix, stopping when
    the change in coefficients drops below `tolerance`, when the weighted
    least squares solve fails, or after `max_iterations` iterations.

    Parameters
    ----------
    design_matrix : ndarray, shape (n_observations, n_covariates)
        A 1-D input is treated as a single covariate column.
    response : ndarray, shape (n_observations,)
        A 1-D input is reshaped to a column vector.
    sqrt_penalty_matrix : ndarray, optional, shape (n_covariates, n_covariates)
        Stacked under `design_matrix`; defaults to the identity matrix.
    penalty : float or ndarray, optional
        `np.sqrt(penalty)` multiplies `sqrt_penalty_matrix`, so it must
        broadcast against that matrix.
    family : statsmodels.api.family instance, optional
    max_iterations : int, optional
    prior_weights : ndarray, optional, shape (n_observations,)
        Defaults to ones shaped like `response`.
    offset : ndarray, optional, shape (n_observations,)
        Defaults to zeros shaped like `response`.
    tolerance : float, optional
        Convergence threshold on the norm of the coefficient change.

    Returns
    -------
    results : Results
        Object with the fields:

        coefficients : ndarray, shape (n_covariates,)
            All NaN if the weighted least squares solve failed.
        is_converged : bool
        coefficient_covariance : ndarray, shape (n_covariates, n_covariates)
        AIC : float
        deviance : float
        degrees_of_freedom : float
        scale : float

    '''
    if design_matrix.ndim < 2:
        design_matrix = design_matrix[:, np.newaxis]
    if response.ndim < 2:
        response = response[:, np.newaxis]

    n_observations, n_covariates = design_matrix.shape

    if prior_weights is None:
        prior_weights = np.ones_like(response)

    if offset is None:
        offset = np.zeros_like(response)

    if sqrt_penalty_matrix is None:
        sqrt_penalty_matrix = np.eye(n_covariates, dtype=design_matrix.dtype)

    is_converged = False

    predicted_response = family.starting_mu(response)
    linear_predictor = family.link(predicted_response)

    # Penalty rows appended to the design matrix have zero pseudo-response
    # and unit weight.
    # NOTE(review): slicing `response[:n_covariates]` yields n_covariates
    # rows only when n_observations >= n_covariates -- confirm callers
    # guarantee this.
    sqrt_penalty_matrix = np.sqrt(penalty) * sqrt_penalty_matrix
    augmented_weights = np.ones_like(response[:n_covariates])
    full_design_matrix = np.concatenate((design_matrix, sqrt_penalty_matrix))
    augmented_response = np.zeros_like(response[:n_covariates])

    coefficients = np.zeros((n_covariates, ))

    for _ in range(max_iterations):
        link_derivative = family.link.deriv(predicted_response)
        pseudo_data = (linear_predictor +
                       (response - predicted_response) * link_derivative -
                       offset)
        weights = prior_weights / (
            family.variance(predicted_response) * link_derivative**2)

        full_response = np.concatenate((pseudo_data, augmented_response))
        full_weights = np.concatenate((np.sqrt(weights), augmented_weights))

        coefficients_old = coefficients.copy()
        try:
            coefficients = np.linalg.lstsq(
                full_design_matrix * full_weights,
                full_response * full_weights, rcond=None)[0]
        except (np.linalg.LinAlgError, ValueError):
            # `Logger.warn` is a deprecated alias; use `warning`.
            logger.warning(
                'Weighted least squares failed. Returning NaN coefficients.')
            coefficients *= np.nan
            break

        linear_predictor = offset + design_matrix @ coefficients
        predicted_response = family.link.inverse(linear_predictor)

        # use deviance change instead?
        # NOTE(review): on the first iteration `coefficients_old` is 1-D
        # while `coefficients` is a column vector, so the difference
        # broadcasts to a matrix before the norm is taken.
        coefficients_change = np.linalg.norm(coefficients - coefficients_old)
        if coefficients_change < tolerance:
            is_converged = True
            break

    # Summary statistics use the weights from the last iteration that ran.
    U, singular_values, Vt = _weighted_design_matrix_svd(
        design_matrix, sqrt_penalty_matrix, weights)
    degrees_of_freedom = get_effective_degrees_of_freedom(U)
    scale, is_estimated_scale = estimate_scale(
        family, response, predicted_response, prior_weights,
        degrees_of_freedom)
    coefficient_covariance = get_coefficient_covariance(
        U, singular_values, Vt, scale)
    deviance = family.deviance(
        response, predicted_response, prior_weights, scale)
    log_likelihood = family.loglike(
        response, predicted_response, prior_weights, scale)
    aic = estimate_aic(log_likelihood, degrees_of_freedom)

    return Results(
        coefficients=np.squeeze(coefficients),
        is_converged=is_converged,
        coefficient_covariance=coefficient_covariance,
        AIC=aic,
        deviance=deviance,
        degrees_of_freedom=degrees_of_freedom,
        scale=scale)
def fit_speed_model(speed, lagged_speed):
    '''Fit a log-link Gaussian GLM of speed on lagged speed, no intercept.

    Parameters
    ----------
    speed : array_like
        Response variable.
    lagged_speed : array_like
        Single regressor.

    Returns
    -------
    fit : fitted GLM results
        The object returned by `GLM(...).fit()`.

    '''
    FORMULA = 'speed ~ lagged_speed - 1'
    response, design_matrix = dmatrices(
        FORMULA, dict(speed=speed, lagged_speed=lagged_speed))
    # The family expects a link *instance*, matching the module-level
    # `FAMILY` definition; passing the bare class is rejected by current
    # statsmodels releases.
    family = families.Gaussian(link=families.links.log())
    return GLM(response, design_matrix, family=family).fit()