# Isolate the MSFT returns: pick the rows whose 'Ticker' index level is
# 'MSFT', then drop that level from the index.
is_msft = all_returns.index.get_level_values('Ticker') == 'MSFT'
msft_returns = all_returns.iloc[is_msft]
msft_returns.index = msft_returns.index.droplevel('Ticker')

# Build up a new DataFrame with AAPL and MSFT returns by concatenating along
# the column axis. The first row is skipped so that NaN values won't
# interfere with the model.
return_data = pd.concat([aapl_returns, msft_returns], axis=1).iloc[1:]
return_data.columns = ['AAPL', 'MSFT']

# Add a column of ones (the intercept term) to the regressor
X = sm.add_constant(return_data['AAPL'])

# Construct and fit the model: MSFT returns regressed on AAPL returns
model = sm.OLS(return_data['MSFT'], X).fit()

# Print the summary
# print(model.summary())

# Plot returns of AAPL and MSFT
# plt.plot(return_data['AAPL'], return_data['MSFT'], 'r.')
#
# Add an axis to the plot
# ax = plt.axis()
#
# Initialize `x` (x axis)
# x = np.linspace(ax[0], ax[1] + 0.01)
#
# Plot the regression line
# plt.plot(x, model.params[0] + model.params[1] * x, 'b', lw=2)
#
# Customize the plot
# plt.grid(True)
# plt.axis('tight')
import statsmodels.api as sm
from scipy import stats
from scipy.stats import mstats  # masked-array statistics live under scipy.stats
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Regress Close on Open. `df` is expected to be defined earlier in the file.
X, Y = df[['Open']], df[['Close']]

# Simple linear regression. This has to run BEFORE the constant is added:
# add_constant prepends the 'const' column, so afterwards X.iloc[:, 0] would
# be a column of ones rather than 'Open'.
slope, intercept, r_value, p_value, std_err = stats.linregress(
    X.iloc[:, 0], Y.iloc[:, 0]
)
yhat = slope * X.iloc[:, 0] + intercept  # this is the regression line

# Design matrix with an intercept column for the statsmodels fit below.
X = sm.add_constant(X)

# NOTE(review): `Xf` is not defined in this snippet; it is presumably a frame
# created earlier that has a 'Close' column and shares df's index - confirm.
Xf['yhat'] = yhat
Xf[['Close', 'yhat']].plot()
yhat.plot()

model = sm.OLS(Y, X)
result = model.fit()
print(result.summary())

# Standard error of prediction and the lower/upper prediction bounds.
prstd, lower, upper = wls_prediction_std(result)

# Label the three results and attach them to Xf. (The previous attempt to plot
# them by passing the raw arrays as column labels could not work and has been
# dropped; the plot below, by column name, is the working equivalent.)
prstd = pd.DataFrame(prstd, index=Xf.index, columns=['prstd'])
lower = pd.DataFrame(lower, index=Xf.index, columns=['lower'])
upper = pd.DataFrame(upper, index=Xf.index, columns=['upper'])
Xf['prstd'] = prstd['prstd']
Xf['lower'] = lower['lower']
Xf['upper'] = upper['upper']
Xf[['Close', 'prstd', 'lower', 'upper']].plot()

#%% REGRESSION LINE
#import statsmodels.api as sm
def fit(self, data_frame: DataFrame):
    """Fit a power-series model constrained at the reference temperature.

    The value ``c_0`` and the slope ``c_1`` at ``t_ref`` are taken from
    ``data_frame`` and subtracted from the experimental data, so only the
    coefficients of the powers other than 0 and 1 are estimated. The
    estimation is done in two passes: a plain OLS fit to obtain influence
    diagnostics, then a WLS fit in which influential observations are
    down-weighted. The coefficients of powers 0 and 1 (``a_0``, ``a_1``)
    are then recovered from the reference constraints.

    Parameters
    ----------
    data_frame : DataFrame
        Object exposing ``temp``, ``experiment``, ``reference_temperature``,
        ``reference_value`` and ``reference_cvalue``.

    Notes
    -----
    Sets ``data_frame``, ``aux_values``, ``aux_weights``, ``updated_matrix``,
    ``aux_fit``, ``aux_coefficients``, ``fit_coefficients``,
    ``source_matrix``, ``fit``, ``heat_capacity_matrix`` and
    ``fit_heat_capacity`` on ``self``.

    NOTE(review): assigning ``self.fit`` replaces this method on the instance
    with the fitted values, so ``fit`` cannot be called twice on the same
    object. Kept because callers may read ``self.fit``.

    NOTE(review): the coefficient indexing below looks like it assumes
    ``self.min_power <= 0`` - confirm against the callers.
    """
    self.data_frame = data_frame
    t_ref = data_frame.reference_temperature
    t_ref_vector = t_ref * np.ones(len(data_frame.temp))
    c_0 = data_frame.reference_value
    c_1 = data_frame.reference_cvalue

    (self.aux_values, self.aux_weights) = auxiliary_function(data_frame)

    # Experimental data with the constrained constant and linear parts removed.
    updated_experiment = (
        data_frame.experiment
        - c_0 * np.ones(len(data_frame.temp))
        - c_1 * (data_frame.temp - t_ref_vector)
    )

    # One regressor per power i (0 and 1 excluded): T**i with its value and
    # first derivative at t_ref removed.
    self.updated_matrix = np.column_stack([
        data_frame.temp ** i
        + (i - 1) * t_ref_vector ** i
        - i * data_frame.temp * t_ref_vector ** (i - 1)
        for i in range(self.min_power, self.max_power + 1)
        if i not in [0, 1]
    ])

    ols_result = sm.OLS(updated_experiment, self.updated_matrix).fit()
    # cooks_distance_influential = 4/(len(self.data_frame.temp - (self.max_power - self.min_power) - 1))
    # ols_cooks_distance = OLSInfluence(ols_result).cooks_distance[1]
    # ols_influence = OLSInfluence(ols_result).influence

    # DFBETAS has one row per observation and one column per regressor.
    # Build a weight VECTOR: 0.1 for observations where any DFBETAS value
    # exceeds 2, 1 otherwise. (The previous loop rebound `w` to a scalar and
    # compared a whole row to 2, so no per-observation weighting took place.)
    # NOTE(review): the threshold is one-sided as in the original; large
    # negative DFBETAS are not flagged - confirm whether abs() is wanted.
    dfbetas = np.asarray(OLSInfluence(ols_result).dfbetas)
    dfbetas = dfbetas.reshape(len(dfbetas), -1)
    w = np.where((dfbetas > 2).any(axis=1), 0.1, 1.0)

    self.aux_fit = sm.WLS(updated_experiment, self.updated_matrix, weights=w).fit()
    self.aux_coefficients = self.aux_fit.params

    # aux_coefficients layout: negative powers first (index i - min_power),
    # then powers >= 2 (index i - 2 - min_power).
    a_1 = (
        c_1
        - sum([
            i * self.aux_coefficients[i - self.min_power] * t_ref ** (i - 1)
            for i in range(self.min_power, 0)
        ])
        - sum([
            i * self.aux_coefficients[i - 2 - self.min_power] * t_ref ** (i - 1)
            for i in range(2, self.max_power + 1)
        ])
    )
    a_0 = (
        c_0
        - a_1 * t_ref
        - sum([
            self.aux_coefficients[i - self.min_power] * t_ref ** i
            for i in range(self.min_power, 0)
        ])
        - sum([
            self.aux_coefficients[i - 2 - self.min_power] * t_ref ** i
            for i in range(2, self.max_power + 1)
        ])
    )

    # Full coefficient vector in power order: negative powers, a_0, a_1,
    # then powers >= 2.
    self.fit_coefficients = []
    self.fit_coefficients.extend(self.aux_coefficients[:-self.min_power])
    self.fit_coefficients.extend([a_0, a_1])
    self.fit_coefficients.extend(self.aux_coefficients[-self.min_power:])

    # Plain power basis T**i (ones for i == 0) and the fitted values.
    self.source_matrix = np.vstack([
        data_frame.temp ** i if i != 0 else np.ones(len(data_frame.temp))
        for i in range(self.min_power, self.max_power + 1)
    ]).T
    self.fit = np.dot(self.source_matrix, self.fit_coefficients)

    # Term-by-term derivative of the basis, i * T**(i - 1), applied to the
    # same coefficients.
    self.heat_capacity_matrix = np.vstack([
        i * data_frame.temp ** (i - 1)
        for i in range(self.min_power, self.max_power + 1)
    ]).T
    self.fit_heat_capacity = np.dot(self.heat_capacity_matrix, self.fit_coefficients)
y_new #%% Stats Models import numpy as np import statsmodels.api as sm from statsmodels.tools import add_constant x = [[0, 1], [5, 1], [15, 2], [25, 2], [35, 11], [45, 15], [55, 34], [60, 35]] x y = [4, 5, 20, 14, 32, 22, 38, 43] y x = sm.add_constant(x) #constant term of 1 added x model3 = sm.OLS(y, x) model3 results = model3.fit() results results.summary() results.rsquared #coeff of determination results.rsquared_adj results.params #bo, b1, b2 results.fittedvalues results.predict(x) #%%AIC & BIC #https://pypi.org/project/RegscorePy #pip install RegscorePy import RegscorePy
''' RSquare and Adjusted RSquare '''

# Manual computation from the training targets and their predictions:
# residual and total sums of squares, then R^2 = 1 - SS_res / SS_tot.
SS_Residual = sum((y_train - y_pred) ** 2)
SS_total = sum((y_train - np.mean(y_train)) ** 2)
R_Square = 1 - float(SS_Residual) / SS_total

# Adjusted R^2 penalises the number of regressors (columns of X_train).
Adjusted_R_Square = (
    1
    - (1 - R_Square)
    * (len(y_train) - 1)
    / (len(y_train) - X_train.shape[1] - 1)
)

#------------------------------------------------#
# Same two statistics as reported by a statsmodels OLS fit with an intercept.
import statsmodels.api as sm

X1 = sm.add_constant(X_train)
result = sm.OLS(y_train, X1).fit()
print('R Square : ', result.rsquared)
print('Adjusted R Square: ', result.rsquared_adj)
def loadings_matrix(
    X,
    Y,
    feature_selection="stepwise",
    stepwise="Forward",
    criterion="pvalue",
    threshold=0.05,
    n_components=0.95,
    verbose=False,
):
    r"""
    Estimate the loadings matrix using stepwise regression or PCR.

    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_features)
        Features matrix, where n_samples is the number of samples and
        n_features is the number of features.
    Y : DataFrame of shape (n_samples, n_assets)
        Target matrix, where n_samples is the number of samples and
        n_assets is the number of assets.
    feature_selection: str 'stepwise' or 'PCR', optional
        Indicate the method used to estimate the loadings matrix.
        The default is 'stepwise'.
    stepwise: str 'Forward' or 'Backward', optional
        Indicate the method used for stepwise regression. Only used when
        feature_selection is 'stepwise'. The default is 'Forward'.
    criterion : str, can be {'pvalue', 'AIC', 'SIC', 'R2' or 'R2_A'}
        The default is 'pvalue'. The criterion used to select the best
        features:

        - 'pvalue': select the features based on p-values.
        - 'AIC': select the features based on lowest Akaike Information Criterion.
        - 'SIC': select the features based on lowest Schwarz Information Criterion.
        - 'R2': select the features based on highest R Squared.
        - 'R2_A': select the features based on highest Adjusted R Squared.
    threshold : scalar, optional
        Is the maximum p-value for each variable that will be
        accepted in the model. The default is 0.05.
    n_components : int, float, None or str, optional
        if 1 < n_components (int), it represents the number of components
        that will be kept. if 0 < n_components < 1 (float), it represents
        the percentage of variance that is explained by the components
        kept. See `PCA <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html>`_
        for more details. Only used when feature_selection is 'PCR'.
        The default is 0.95.
    verbose : bool, optional
        Enable verbose output. The default is False.

    Returns
    -------
    loadings : DataFrame
        A DataFrame with one row per column of Y and one column per
        regressor ('const' followed by the columns of X). Regressors not
        selected for an asset keep a loading of zero.

    Raises
    ------
    ValueError
        When X or Y is not a DataFrame, or when feature_selection or
        stepwise is not one of the supported options.

    """
    if not isinstance(X, pd.DataFrame):
        raise ValueError("X must be a DataFrame")

    if not isinstance(Y, pd.DataFrame):
        raise ValueError("Y must be a DataFrame")

    # Validate the options once, up front. Previously an unknown
    # feature_selection fell through both branches and silently returned an
    # all-zero matrix, and a bad stepwise value was only detected inside the
    # per-asset loop.
    if feature_selection not in ("stepwise", "PCR"):
        raise ValueError("feature_selection must be 'stepwise' or 'PCR'")
    if feature_selection == "stepwise" and stepwise not in ("Forward", "Backward"):
        raise ValueError("Choose and adecuate stepwise method")

    rows = Y.columns.tolist()
    cols = ["const"] + X.columns.tolist()
    loadings = pd.DataFrame(
        np.zeros((len(rows), len(cols))), index=rows, columns=cols
    )

    for i in rows:
        if feature_selection == "stepwise":
            if stepwise == "Forward":
                included = forward_regression(
                    X, Y[i], criterion=criterion, threshold=threshold, verbose=verbose
                )
            else:
                included = backward_regression(
                    X, Y[i], criterion=criterion, threshold=threshold, verbose=verbose
                )
            # Refit on the selected features only; the loadings of the
            # features left out stay at zero.
            results = sm.OLS(Y[i], sm.add_constant(X[included])).fit()
            params = results.params
            loadings.loc[i, params.index.tolist()] = params
        else:
            # PCR's result is flattened and labelled with 'const' followed by
            # the columns of X.
            beta = PCR(X, Y[i], n_components=n_components)
            beta = pd.Series(np.ravel(beta), index=cols)
            loadings.loc[i, cols] = beta

    return loadings