def linear_model(input_data, output_data, k_value, rmse_graph):
    """Cross-validate an OLS regression and print a report.

    The data is split into ``k_value`` shuffled folds.  For every fold an
    ordinary-least-squares model (with an intercept) is fitted on the
    training part and its root-mean-squared error (RMSE) is measured on the
    held-out part.  The average RMSE over all folds is printed.  The user is
    prompted on stdin; on answering ``y``/``Y`` the summary of the fold model
    with the lowest RMSE and a Breusch-Pagan heteroscedasticity test on that
    model are printed as well.

    Parameters
    ----------
    input_data : array
        Predictors.  Must support ``len()`` and indexing with an array of
        integer row positions (``input_data[index_array]``).
    output_data : array
        Target values, indexed the same way as ``input_data``.
    k_value : int
        Number of cross-validation folds.
    rmse_graph : int
        When equal to ``1``, a matplotlib figure comparing the held-out
        predictions with the actual targets is shown.

    Returns
    -------
    None
        All results are printed / plotted.

    Notes
    -----
    The code is written so that it runs on both Python 2 and Python 3 and
    with both old and current releases of scikit-learn, statsmodels and
    matplotlib (see the inline comments for each fallback).
    """
    rule = '============================================================================='
    print(rule)
    print('Linear Regression Results:')

    import statsmodels.api as sm
    import statsmodels.stats.diagnostic as ssd

    # scikit-learn moved KFold from ``cross_validation`` (removed in 0.20)
    # to ``model_selection`` and changed its signature; prefer the current
    # API and fall back to the legacy one.
    try:
        from sklearn.model_selection import KFold
    except ImportError:
        KFold = None
    if KFold is not None:
        splitter = KFold(n_splits=k_value, shuffle=True)
        folds = list(splitter.split(input_data))
    else:
        from sklearn import cross_validation
        folds = list(cross_validation.KFold(
            len(input_data), shuffle=True, n_folds=k_value))

    fitted_models = []       # one fitted OLS result per fold
    fold_rmse = []           # held-out RMSE per fold
    predicted_by_fold = []   # held-out predictions, one list per fold
    actual_by_fold = []      # matching actual targets, one list per fold

    for train_index, test_index in folds:
        X_train, X_test = input_data[train_index], input_data[test_index]
        y_train, y_test = output_data[train_index], output_data[test_index]

        design_train = sm.add_constant(X_train)
        fitted = sm.OLS(y_train, design_train).fit()
        fitted_models.append(fitted)

        # ``add_constant`` skips the intercept column whenever the data it
        # is given already contains a constant column.  Deciding that
        # separately for the test fold can disagree with the training fold
        # (e.g. a one-row fold is "constant" in every column), so mirror
        # the decision that was actually taken for the training design.
        n_features = 1 if np.ndim(X_train) == 1 else np.shape(X_train)[1]
        if np.shape(design_train)[1] > n_features:
            design_test = sm.add_constant(X_test, has_constant='add')
        else:
            design_test = np.asarray(X_test).reshape(len(X_test), -1)

        # Predict once and reuse the result for both storage and RMSE.
        predictions = np.dot(design_test, fitted.params)
        predicted_by_fold.append(predictions.tolist())
        actual_by_fold.append(y_test.tolist())

        rmse = (np.sum((predictions - y_test) ** 2) / X_test.shape[0]) ** .5
        fold_rmse.append(rmse)

    best_index = fold_rmse.index(min(fold_rmse))

    # ``raw_input`` only exists on Python 2; ``input`` is its Python 3 name.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    summary = read_line(
        'Need the summary of Linear Regression then give "y" or "Y":')
    if summary in ('y', 'Y'):
        best_model = fitted_models[best_index]
        print(best_model.summary())

        # Heteroscedasticity check on the best fold model.
        # NOTE(review): the original nesting of this report was lost; it is
        # kept inside the summary branch because that is the only place
        # where the selected model is guaranteed to be bound -- confirm.
        # statsmodels renamed the misspelled ``het_breushpagan`` to
        # ``het_breuschpagan``; use whichever this installation provides.
        het_test = getattr(ssd, 'het_breuschpagan', None)
        if het_test is None:
            het_test = ssd.het_breushpagan
        test_values = het_test(best_model.resid, best_model.model.exog)
        test_labels = [
            'LM stat: ', 'p-val LM test: ',
            'f-stat(error var,not depend on x):', 'p-value for the f-stat :'
        ]
        # ``list(...)`` is required on Python 3, where ``zip`` is lazy.
        result_table = np.array(list(zip(test_labels, test_values)))
        print(rule)
        print('Breush-pagan:\n%s' % (result_table,))

    print(rule)
    print('Avg. RMSE_Regression for Given K_value Folds %s'
          % (sum(fold_rmse) / len(fold_rmse),))

    if rmse_graph == 1:
        # Graph of held-out predictions against the actual targets, with
        # the folds concatenated in the order they were evaluated.
        import matplotlib.pyplot as plt
        predicted = [
            value for fold in predicted_by_fold for value in fold
        ]
        actual = [
            value for fold in actual_by_fold for value in fold
        ]
        indices = list(range(len(actual)))
        plt.plot(indices, predicted, 'yo-')
        # ``plt.hold`` was removed in matplotlib 3.0, where successive
        # plots always share the axes; only call it when it still exists.
        if hasattr(plt, 'hold'):
            plt.hold(True)
        plt.text(2, 19, r'Blue=Actual,Yellow=Predicted')
        plt.plot(indices, actual, 'bo-')
        plt.title('Actual Vs Predcited Graph')
        plt.ylabel('Target variables')
        plt.xlabel('No. of Datasets')
        plt.show()
def _fit_pirls(self, alpha, start_params=None, maxiter=100, tol=1e-8,
               scale=None, cov_type='nonrobust', cov_kwds=None, use_t=None,
               weights=None):
    """Estimate the model by penalized iteratively reweighted least squares.

    Parameters
    ----------
    alpha
        Penalization weight(s); handed unchanged to
        ``self.penal.penalty_matrix``.
    start_params : array-like, optional
        Initial coefficients.  When omitted, the starting mean is taken from
        ``self.family.starting_mu``.
    maxiter : int
        Maximum number of reweighting passes.  With ``0`` no pass is run and
        the result is built directly from ``start_params``.
    tol : float
        Tolerance passed to ``_check_convergence`` (third argument; the
        fourth is fixed at ``0``).
    scale
        Stored on ``self.scaletype``.
    cov_type, cov_kwds, use_t
        Forwarded to ``GLMGamResults``.
    weights : array-like, optional
        Per-observation weights; an array of ones is used when omitted.

    Returns
    -------
    GLMGamResultsWrapper
        Wrapped results carrying ``method``, ``fit_history`` and
        ``converged``.

    Raises
    ------
    PerfectSeparationError
        If, for a one-dimensional response, the fitted mean reproduces the
        response (``np.allclose(mu - endog, 0)``).

    Notes
    -----
    TODO: this method overwrites several attributes on ``self``:
    ``data_weights``, ``scaletype``, ``scale``, ``weights``, ``mu`` and,
    when missing, ``_offset_exposure``.  Some of them (e.g. ``mu`` and
    ``weights``) might not be necessary.

    TODO: alpha still needs rescaling, e.g.
    ``alpha = alpha * len(y) * self.scale / 100``.
    """
    response = self.endog
    design = self.exog  # smoother.basis
    penalty = self.penal.penalty_matrix(alpha=alpha)

    n_obs, _ = design.shape

    # TODO what are these values?
    self.data_weights = np.ones(n_obs) if weights is None else weights

    if not hasattr(self, '_offset_exposure'):
        self._offset_exposure = 0

    # TODO: check default scale types (e.g. 'dev')
    self.scaletype = scale
    # The scale is held at one while iterating.
    self.scale = 1

    if start_params is not None:
        eta = np.dot(design, start_params) + self._offset_exposure
        mu = self.family.fitted(eta)
    else:
        mu = self.family.starting_mu(response)
        eta = self.family.predict(mu)

    history = {
        'params': [None, start_params],
        'deviance': [np.inf, self.family.deviance(response, mu)],
    }
    # The convergence check watches the deviance trace.
    criterion = history['deviance']
    converged = False

    if maxiter == 0:
        # Special case: evaluate the likelihood at the supplied parameter
        # vector without performing any reweighting pass.
        mu = self.family.fitted(eta)
        self.scale = self.estimate_scale(mu)
        wls_results = lm.RegressionResults(self, start_params, None)
        iteration = 0

    for iteration in range(maxiter):
        # TODO: is this equivalent to point 1 of page 136:
        # w = 1 / (V(mu) * g'(mu)) ?
        self.weights = self.data_weights * self.family.weights(mu)

        # TODO: is this equivalent to point 1 of page 136:
        # z = g(mu)(y - mu) + X beta ?
        link_slope = self.family.link.deriv(mu)
        working_response = (eta + link_slope * (response - mu)
                            - self._offset_exposure)

        # Augmented-matrix solve, point 2a on page 136.
        wls_results = penalized_wls(working_response, design, penalty,
                                    self.weights)

        eta = np.dot(design, wls_results.params).ravel()
        eta += self._offset_exposure
        mu = self.family.fitted(eta)

        # The scale is not updated here: not needed in GLM/LEF models,
        # though dispersion models might need
        # ``self.scale = self.estimate_scale(mu)``.
        history = self._update_history(wls_results, mu, history)

        perfectly_fitted = (response.squeeze().ndim == 1
                            and np.allclose(mu - response, 0))
        if perfectly_fitted:
            msg = "Perfect separation detected, results not available"
            raise PerfectSeparationError(msg)

        # TODO need atol, rtol
        # args of _check_convergence: (criterion, iteration, atol, rtol)
        converged = _check_convergence(criterion, iteration, tol, 0)
        if converged:
            break

    self.mu = mu
    self.scale = self.estimate_scale(mu)

    results = GLMGamResults(self, wls_results.params,
                            wls_results.normalized_cov_params,
                            self.scale,
                            cov_type=cov_type, cov_kwds=cov_kwds,
                            use_t=use_t)

    history['iteration'] = iteration + 1
    results.method = "PIRLS"
    results.fit_history = history
    results.converged = converged

    return GLMGamResultsWrapper(results)
def fit(self, start_params=None, maxiter=100, method='IRLS', tol=1e-8,
        scale=None, cov_type='nonrobust', cov_kwds=None, use_t=None,
        **kwargs):
    """
    Fits a generalized linear model for a given family.

    Parameters
    ----------
    start_params : array-like, optional
        Initial guess of the solution for the loglikelihood maximization.
        The default is family-specific and is given by the
        ``family.starting_mu(endog)``. If start_params is given then the
        initial mean will be calculated as ``np.dot(exog, start_params)``
        (plus the combined offset/exposure term).
    maxiter : int, optional
        Default is 100.  With ``maxiter=0`` no iteration is run and the
        results are built from ``start_params`` directly.
    method : string
        Default is 'IRLS' for iteratively reweighted least squares.  This
        is currently the only method available for GLM fit; the argument
        is accepted but not consulted.
    tol : float
        Convergence tolerance.  Default is 1e-8.
    scale : string or float, optional
        `scale` can be 'X2', 'dev', or a float
        The default value is None, which uses `X2` for Gamma, Gaussian,
        and Inverse Gaussian.
        `X2` is Pearson's chi-square divided by `df_resid`.
        The default is 1 for the Binomial and Poisson families.
        `dev` is the deviance divided by df_resid
    cov_type : string
        Forwarded to ``GLMResults``.
    cov_kwds : optional
        Forwarded to ``GLMResults``.
    use_t : optional
        Forwarded to ``GLMResults``.

    Returns
    -------
    GLMResultsWrapper
        Wrapped results.  ``fit_history`` holds the parameter/deviance
        trace and the number of iterations; ``converged`` records whether
        the convergence check succeeded.

    Raises
    ------
    ValueError
        If the deviance at the starting values is nan.
    PerfectSeparationError
        If, for a one-dimensional response, the fitted mean reproduces
        the response.

    Notes
    -----
    This method does not take any extra undocumented ``kwargs``; any that
    are passed are ignored.

    The call overwrites ``data_weights``, ``scaletype``,
    ``_offset_exposure``, ``weights``, ``scale`` and ``mu`` on the model
    and, for the Binomial family, ``endog``.
    """
    # NOTE: ``endog`` keeps referring to the response as it was *before*
    # the Binomial preprocessing below; it is used for the data weights
    # and the perfect-separation check.
    endog = self.endog
    if endog.ndim > 1 and endog.shape[1] == 2:
        data_weights = endog.sum(1)  # weights are total trials
    else:
        data_weights = np.ones((endog.shape[0]))
    # ``data_weights`` is always one-dimensional at this point, so the
    # former "expand a scalar weight" branch could never run and was
    # dropped.
    self.data_weights = data_weights
    self.scaletype = scale
    if isinstance(self.family, families.Binomial):
        # this checks what kind of data is given for Binomial.
        # family will need a reference to endog if this is to be removed
        # from preprocessing
        self.endog = self.family.initialize(self.endog)

    # Construct a combined offset/exposure term.  Note that
    # exposure has already been logged if present.
    offset_exposure = 0.
    if hasattr(self, 'offset'):
        offset_exposure = self.offset
    if hasattr(self, 'exposure'):
        offset_exposure = offset_exposure + self.exposure
    self._offset_exposure = offset_exposure

    wlsexog = self.exog
    if start_params is None:
        mu = self.family.starting_mu(self.endog)
        lin_pred = self.family.predict(mu)
    else:
        lin_pred = np.dot(wlsexog, start_params) + offset_exposure
        mu = self.family.fitted(lin_pred)
    dev = self.family.deviance(self.endog, mu)
    if np.isnan(dev):
        raise ValueError("The first guess on the deviance function "
                         "returned a nan.  This could be a boundary "
                         " problem and should be reported.")

    # first guess on the deviance is assumed to be scaled by 1.
    # params are none to start, so they line up with the deviance
    history = dict(params=[None, start_params], deviance=[np.inf, dev])
    converged = False
    criterion = history['deviance']
    # This special case is used to get the likelihood for a specific
    # params vector.
    if maxiter == 0:
        mu = self.family.fitted(lin_pred)
        self.scale = self.estimate_scale(mu)
        wls_results = lm.RegressionResults(self, start_params, None)
        iteration = 0
    for iteration in range(maxiter):
        # Reweight, form the working response and solve the weighted
        # least-squares problem for the next coefficient estimate.
        self.weights = data_weights * self.family.weights(mu)
        wlsendog = (lin_pred + self.family.link.deriv(mu) * (self.endog - mu)
                    - offset_exposure)
        wls_results = lm.WLS(wlsendog, wlsexog, self.weights).fit()
        lin_pred = np.dot(self.exog, wls_results.params) + offset_exposure
        mu = self.family.fitted(lin_pred)
        history = self._update_history(wls_results, mu, history)
        self.scale = self.estimate_scale(mu)
        if endog.squeeze().ndim == 1 and np.allclose(mu - endog, 0):
            msg = "Perfect separation detected, results not available"
            raise PerfectSeparationError(msg)
        converged = _check_convergence(criterion, iteration, tol)
        if converged:
            break
    self.mu = mu

    glm_results = GLMResults(self, wls_results.params,
                             wls_results.normalized_cov_params,
                             self.scale,
                             cov_type=cov_type, cov_kwds=cov_kwds,
                             use_t=use_t)

    history['iteration'] = iteration + 1
    glm_results.fit_history = history
    # Expose the outcome of the convergence check instead of discarding
    # it, so callers can tell a converged fit from one that merely hit
    # ``maxiter``.
    glm_results.converged = converged
    return GLMResultsWrapper(glm_results)