Example #1
def linear_model(input_data, output_data, k_value, rmse_graph):
    import numpy as np
    import statsmodels.api as sm
    from sklearn.model_selection import KFold

    print('=' * 77)
    print('Linear Regression Results:')
    folds = k_value
    # K-fold cross-validation for linear regression
    kf = KFold(n_splits=folds, shuffle=True)
    totRMSE_Reg = []
    # List to store the fitted OLS results of each fold
    clf1 = [None] * folds
    predict_store = []
    actualy_store = []
    i = 0
    for train_index, test_index in kf.split(input_data):
        X_train, X_test = input_data[train_index], input_data[test_index]
        y_train, y_test = output_data[train_index], output_data[test_index]
        # Ordinary least squares with an intercept term
        X1 = sm.add_constant(X_train)
        model2 = sm.OLS(y_train, X1)
        clf1[i] = model2.fit()
        # Predictions on the held-out fold
        y_pred = np.dot(sm.add_constant(X_test), clf1[i].params)
        predict_store.append(y_pred.tolist())
        actualy_store.append(y_test.tolist())
        # Root mean squared error on the test fold
        RMSE = (np.sum((y_pred - y_test) ** 2) / X_test.shape[0]) ** .5
        totRMSE_Reg.append(RMSE)
        i += 1
    # Report the fold with the lowest test RMSE
    RMSE_min = min(totRMSE_Reg)
    RMSE_min_index = totRMSE_Reg.index(RMSE_min)
    summary = input('Need the summary of Linear Regression? Enter "y" or "Y": ')
    if summary in ('y', 'Y'):
        clf = clf1[RMSE_min_index]
        print(clf.summary())
        # Homoscedasticity check: Breusch-Pagan test on the residuals
        import statsmodels.stats.diagnostic as ssd
        test1 = ssd.het_breuschpagan(clf.resid, clf.model.exog)
        test1_stat = ['LM stat: ', 'p-value (LM test): ',
                      'F-stat (error variance independent of x): ',
                      'p-value (F-stat): ']
        print('=' * 77)
        print('Breusch-Pagan:')
        for label, value in zip(test1_stat, test1):
            print(label, value)
    print('=' * 77)
    print('Avg. RMSE_Regression for Given K_value Folds',
          sum(totRMSE_Reg) / folds)
    if rmse_graph == 1:
        # Graphical comparison of predicted vs. actual values
        import matplotlib.pyplot as plt
        predict_store1 = [v for sublist in predict_store for v in sublist]
        actualy_store1 = [v for sublist in actualy_store for v in sublist]
        indices = range(len(actualy_store1))
        plt.plot(indices, predict_store1, 'yo-', label='Predicted')
        plt.plot(indices, actualy_store1, 'bo-', label='Actual')
        plt.legend()
        plt.title('Actual vs. Predicted')
        plt.ylabel('Target variable')
        plt.xlabel('Observation index')
        plt.show()
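
A minimal call sketch, assuming input_data and output_data are NumPy arrays of matching length; the synthetic data below is made up for illustration:

import numpy as np

# Synthetic regression data: 3 features, linear signal plus noise (illustrative)
rng = np.random.default_rng(0)
X = rng.normal(size=(120, 3))
y = X @ np.array([2.0, -1.0, 0.5]) + rng.normal(scale=0.5, size=120)

# 5-fold cross-validation; rmse_graph=1 also plots actual vs. predicted
linear_model(X, y, 5, 1)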
Example #2
    def _fit_pirls(self, alpha, start_params=None, maxiter=100, tol=1e-8,
                   scale=None, cov_type='nonrobust', cov_kwds=None, use_t=None,
                   weights=None):
        """fit model with penalized reweighted least squares

        """
        # TODO: this currently modifies several attributes
        # self.scale, self.scaletype, self.mu, self.weights
        # self.data_weights,
        # and possibly self._offset_exposure
        # several of those might not be necessary, e.g. mu and weights

        # alpha = alpha * len(y) * self.scale / 100
        # TODO: we need to rescale alpha
        endog = self.endog
        wlsexog = self.exog  # smoother.basis
        spl_s = self.penal.penalty_matrix(alpha=alpha)

        nobs, n_columns = wlsexog.shape

        # TODO what are these values?
        if weights is None:
            self.data_weights = np.array([1.] * nobs)
        else:
            self.data_weights = weights

        if not hasattr(self, '_offset_exposure'):
            self._offset_exposure = 0

        self.scaletype = scale
        # TODO: check default scale types
        # self.scaletype = 'dev'
        # during iteration
        self.scale = 1

        if start_params is None:
            mu = self.family.starting_mu(endog)
            lin_pred = self.family.predict(mu)
        else:
            lin_pred = np.dot(wlsexog, start_params) + self._offset_exposure
            mu = self.family.fitted(lin_pred)
        dev = self.family.deviance(endog, mu)

        history = dict(params=[None, start_params], deviance=[np.inf, dev])
        converged = False
        criterion = history['deviance']
        # This special case is used to get the likelihood for a specific
        # params vector.
        if maxiter == 0:
            mu = self.family.fitted(lin_pred)
            self.scale = self.estimate_scale(mu)
            wls_results = lm.RegressionResults(self, start_params, None)
            iteration = 0

        for iteration in range(maxiter):

            # TODO: is this equivalent to point 1 of page 136:
            # w = 1 / (V(mu) * g'(mu))  ?
            self.weights = self.data_weights * self.family.weights(mu)

            # TODO: is this equivalent to point 1 of page 136:
            # z = g(mu)(y - mu) + X beta  ?
            wlsendog = (lin_pred + self.family.link.deriv(mu) * (endog - mu)
                        - self._offset_exposure)

            # this defines the augmented matrix point 2a on page 136
            wls_results = penalized_wls(wlsendog, wlsexog, spl_s, self.weights)
            lin_pred = np.dot(wlsexog, wls_results.params).ravel()
            lin_pred += self._offset_exposure
            mu = self.family.fitted(lin_pred)

            # We don't need to update scale in GLM/LEF models
            # We might need it in dispersion models.
            # self.scale = self.estimate_scale(mu)
            history = self._update_history(wls_results, mu, history)

            if endog.squeeze().ndim == 1 and np.allclose(mu - endog, 0):
                msg = "Perfect separation detected, results not available"
                raise PerfectSeparationError(msg)

            # TODO need atol, rtol
            # args of _check_convergence: (criterion, iteration, atol, rtol)
            converged = _check_convergence(criterion, iteration, tol, 0)
            if converged:
                break
        self.mu = mu
        self.scale = self.estimate_scale(mu)
        glm_results = GLMGamResults(self, wls_results.params,
                                    wls_results.normalized_cov_params,
                                    self.scale,
                                    cov_type=cov_type, cov_kwds=cov_kwds,
                                    use_t=use_t)

        glm_results.method = "PIRLS"
        history['iteration'] = iteration + 1
        glm_results.fit_history = history
        glm_results.converged = converged

        return GLMGamResultsWrapper(glm_results)
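
_fit_pirls is an internal method; the usual entry point is GLMGam.fit, which dispatches to it. A minimal usage sketch, assuming statsmodels' GAM API (statsmodels.gam.api); the data below is made up for illustration:

import numpy as np
from statsmodels.gam.api import GLMGam, BSplines

# Synthetic smooth signal plus Gaussian noise (illustrative data)
rng = np.random.default_rng(0)
x = rng.uniform(0, 1, (200, 1))
y = np.sin(2 * np.pi * x[:, 0]) + rng.normal(scale=0.3, size=200)

# B-spline basis for the single smooth term
bs = BSplines(x, df=[10], degree=[3])

# alpha weights the roughness penalty; fit() runs the PIRLS loop above
gam = GLMGam(y, smoother=bs, alpha=[1.0])
res = gam.fit()
print(res.summary())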
Example #3
    def fit(self,
            start_params=None,
            maxiter=100,
            method='IRLS',
            tol=1e-8,
            scale=None,
            cov_type='nonrobust',
            cov_kwds=None,
            use_t=None,
            **kwargs):
        """
        Fits a generalized linear model for a given family.

        Parameters
        ----------
        maxiter : int, optional
            Default is 100.
        method : string
            Default is 'IRLS' for iteratively reweighted least squares.  This
            is currently the only method available for GLM fit.
        scale : string or float, optional
            `scale` can be 'X2', 'dev', or a float.
            The default value is None, which uses 'X2' for the Gamma,
            Gaussian, and inverse Gaussian families, and 1 for the
            Binomial and Poisson families.
            'X2' is Pearson's chi-square divided by `df_resid`;
            'dev' is the deviance divided by `df_resid`.
        tol : float
            Convergence tolerance.  Default is 1e-8.
        start_params : array-like, optional
            Initial guess of the solution for the loglikelihood maximization.
            The default is family-specific and is given by the
            ``family.starting_mu(endog)``. If start_params is given then the
            initial mean will be calculated as ``np.dot(exog, start_params)``.

        Notes
        -----
        This method does not take any extra undocumented ``kwargs``.
        """
        endog = self.endog
        if endog.ndim > 1 and endog.shape[1] == 2:
            data_weights = endog.sum(1)  # weights are total trials
        else:
            data_weights = np.ones((endog.shape[0]))
        self.data_weights = data_weights
        if np.shape(self.data_weights) == () and self.data_weights > 1:
            self.data_weights = self.data_weights * np.ones((endog.shape[0]))
        self.scaletype = scale
        if isinstance(self.family, families.Binomial):
            # this checks what kind of data is given for Binomial.
            # family will need a reference to endog if this is to be removed from
            # preprocessing
            self.endog = self.family.initialize(self.endog)

        # Construct a combined offset/exposure term.  Note that
        # exposure has already been logged if present.
        offset_exposure = 0.
        if hasattr(self, 'offset'):
            offset_exposure = self.offset
        if hasattr(self, 'exposure'):
            offset_exposure = offset_exposure + self.exposure
        self._offset_exposure = offset_exposure

        wlsexog = self.exog
        if start_params is None:
            mu = self.family.starting_mu(self.endog)
            lin_pred = self.family.predict(mu)
        else:
            lin_pred = np.dot(wlsexog, start_params) + offset_exposure
            mu = self.family.fitted(lin_pred)
        dev = self.family.deviance(self.endog, mu)
        if np.isnan(dev):
            raise ValueError("The first guess on the deviance function "
                             "returned a nan. This could be a boundary "
                             "problem and should be reported.")

        # first guess on the deviance is assumed to be scaled by 1.
        # params are none to start, so they line up with the deviance
        history = dict(params=[None, start_params], deviance=[np.inf, dev])
        converged = False
        criterion = history['deviance']
        # This special case is used to get the likelihood for a specific
        # params vector.
        if maxiter == 0:
            mu = self.family.fitted(lin_pred)
            self.scale = self.estimate_scale(mu)
            wls_results = lm.RegressionResults(self, start_params, None)
            iteration = 0
        for iteration in range(maxiter):
            self.weights = data_weights * self.family.weights(mu)
            wlsendog = (lin_pred + self.family.link.deriv(mu) *
                        (self.endog - mu) - offset_exposure)
            wls_results = lm.WLS(wlsendog, wlsexog, self.weights).fit()
            lin_pred = np.dot(self.exog, wls_results.params) + offset_exposure
            mu = self.family.fitted(lin_pred)
            history = self._update_history(wls_results, mu, history)
            self.scale = self.estimate_scale(mu)
            if endog.squeeze().ndim == 1 and np.allclose(mu - endog, 0):
                msg = "Perfect separation detected, results not available"
                raise PerfectSeparationError(msg)
            converged = _check_convergence(criterion, iteration, tol)
            if converged:
                break
        self.mu = mu

        glm_results = GLMResults(self,
                                 wls_results.params,
                                 wls_results.normalized_cov_params,
                                 self.scale,
                                 cov_type=cov_type,
                                 cov_kwds=cov_kwds,
                                 use_t=use_t)

        history['iteration'] = iteration + 1
        glm_results.fit_history = history
        return GLMResultsWrapper(glm_results)
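
This fit method is the standard statsmodels GLM entry point. A minimal usage sketch with simulated Poisson data (illustrative, not from the source):

import numpy as np
import statsmodels.api as sm

# Simulated Poisson regression problem (illustrative data)
rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(500, 2)))
y = rng.poisson(np.exp(X @ np.array([0.5, 0.3, -0.2])))

# fit() runs the IRLS loop shown above
model = sm.GLM(y, X, family=sm.families.Poisson())
res = model.fit()
print(res.params)
print('IRLS iterations:', res.fit_history['iteration'])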