Beispiel #1
0
# Pick out the MSFT rows of the (multi-indexed) returns and drop the
# 'Ticker' index level so that only the remaining level(s) index the data.
is_msft = all_returns.index.get_level_values('Ticker') == 'MSFT'
msft_returns = all_returns.iloc[is_msft]
msft_returns.index = msft_returns.index.droplevel('Ticker')

# Put the AAPL and MSFT returns side by side (concat along the column axis).
# The first row is skipped ([1:]) so that NaN values won't interfere with
# the model.
return_data = pd.concat([aapl_returns, msft_returns], axis=1)[1:]
return_data.columns = ['AAPL', 'MSFT']

# Regress MSFT on AAPL; add_constant adds a column of ones (the intercept).
X = sm.add_constant(return_data['AAPL'])
model = sm.OLS(return_data['MSFT'], X).fit()
# Print the summary
# print(model.summary())

# Plot returns of AAPL and MSFT
# plt.plot(return_data['AAPL'], return_data['MSFT'], 'r.')
# # Add an axis to the plot
# ax = plt.axis()
# # Initialize `x` (x axis)
# x = np.linspace(ax[0], ax[1] + 0.01)
# # # Plot the regression line
# plt.plot(x, model.params[0] + model.params[1] * x, 'b', lw=2)

# # # Customize the plot
# plt.grid(True)
# plt.axis('tight')
import statsmodels.api as sm
from scipy import mstats
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# Simple linear regression of 'Close' on 'Open', plotted together with the
# OLS prediction standard error and prediction interval bounds.
from scipy import stats  # provides linregress

X, Y = df[['Open']], df[['Close']]
X = sm.add_constant(X)
# add_constant() prepends the 'const' column, so the predictor has to be
# selected by name: X.iloc[:, 0] would be the all-ones column, not 'Open'.
slope, intercept, r_value, p_value, std_err = stats.linregress(X['Open'], Y['Close'])

yhat = slope * X['Open'] + intercept  # this is the regression line
# NOTE(review): Xf is not defined in this section; presumably a DataFrame
# that has a 'Close' column and shares df's index -- confirm.
Xf['yhat'] = yhat
Xf[['Close', 'yhat']].plot()
yhat.plot()

model = sm.OLS(Y, X)
result = model.fit()
print(result.summary())

# Standard error of prediction plus lower/upper prediction interval bounds.
prstd, lower, upper = wls_prediction_std(result)
# Wrap the raw results into single-column frames indexed like Xf before
# storing them; selecting Xf columns with the raw arrays themselves (as the
# earlier draft of this script did) cannot work because column selection
# needs labels.
prstd = pd.DataFrame(prstd, index=Xf.index, columns=['prstd'])
lower = pd.DataFrame(lower, index=Xf.index, columns=['lower'])
upper = pd.DataFrame(upper, index=Xf.index, columns=['upper'])
Xf['prstd'] = prstd
Xf['lower'] = lower
Xf['upper'] = upper
Xf[['Close', 'prstd', 'lower', 'upper']].plot()

#%% REGRESSION LINE

#import statsmodels.api as sm
    def fit(self, data_frame: DataFrame):
        """Fit a power series in temperature that is pinned at the reference point.

        The model is ``sum(a_i * T**i for i in range(min_power, max_power + 1))``.
        ``a_0`` and ``a_1`` are not regressed directly. Instead the regression
        uses the basis functions
        ``T**i + (i - 1) * t_ref**i - i * T * t_ref**(i - 1)`` (``i`` not in
        ``{0, 1}``), which vanish together with their first derivative at
        ``T = t_ref``. The fitted curve therefore has the value
        ``reference_value`` and the slope ``reference_cvalue`` at
        ``reference_temperature``; ``a_0`` and ``a_1`` are recovered afterwards
        from the regressed coefficients.

        An unweighted OLS fit is run first; observations with any DFBETAS
        value above 2 get weight 0.1 (all others 1.0) in the final WLS fit.

        Parameters
        ----------
        data_frame : DataFrame
            Object exposing ``temp``, ``experiment``, ``reference_temperature``,
            ``reference_value`` and ``reference_cvalue``. It is also passed to
            ``auxiliary_function``.
            NOTE(review): ``temp`` and ``experiment`` are presumably 1-D
            arrays of equal length -- confirm.

        Notes
        -----
        Sets ``data_frame``, ``aux_values``, ``aux_weights``,
        ``updated_matrix``, ``aux_fit``, ``aux_coefficients``,
        ``fit_coefficients``, ``source_matrix``, ``fit``,
        ``heat_capacity_matrix`` and ``fit_heat_capacity`` on ``self``.
        The coefficient bookkeeping assumes ``min_power <= 0`` and
        ``max_power >= 2`` -- TODO confirm against the constructor.
        """
        self.data_frame = data_frame

        temp = data_frame.temp
        n_points = len(temp)

        t_ref = data_frame.reference_temperature
        t_ref_vector = t_ref * np.ones(n_points)

        c_0 = data_frame.reference_value
        c_1 = data_frame.reference_cvalue

        (self.aux_values, self.aux_weights) = auxiliary_function(data_frame)

        # Remove the part of the signal fixed by the reference constraints.
        updated_experiment = data_frame.experiment - c_0 * np.ones(n_points) - \
            c_1 * (temp - t_ref_vector)

        # One column per regressed power; powers 0 and 1 are determined by the
        # constraints and therefore skipped.
        self.updated_matrix = np.column_stack([
            temp ** i + (i - 1) * t_ref_vector ** i - i * temp * t_ref_vector ** (i - 1)
            for i in range(self.min_power, self.max_power + 1)
            if i not in (0, 1)
        ])

        ols_result = sm.OLS(updated_experiment, self.updated_matrix).fit()

        # Build a per-observation weight vector. (Previously the loop rebound
        # the whole weight variable to a scalar, so WLS received one uniform
        # weight taken from the last observation only.)
        dfbetas = np.asarray(OLSInfluence(ols_result).dfbetas)
        if dfbetas.ndim > 1:
            # One row per observation, one column per regressor.
            influential = (dfbetas > 2).any(axis=1)
        else:
            influential = dfbetas > 2
        weights = np.where(influential, 0.1, 1.0)

        self.aux_fit = sm.WLS(updated_experiment,
                              self.updated_matrix,
                              weights=weights).fit()

        self.aux_coefficients = self.aux_fit.params
        # Positional view of the coefficients: params may be a labelled Series
        # when the inputs are pandas objects.
        coeffs = np.asarray(self.aux_coefficients)

        # Coefficients are stored as [negative powers..., powers >= 2...], so a
        # power i >= 2 sits at index i - 2 - min_power.
        a_1 = c_1 - \
            sum(i * coeffs[i - self.min_power] * t_ref ** (i - 1)
                for i in range(self.min_power, 0)) - \
            sum(i * coeffs[i - 2 - self.min_power] * t_ref ** (i - 1)
                for i in range(2, self.max_power + 1))
        a_0 = c_0 - a_1 * t_ref - \
            sum(coeffs[i - self.min_power] * t_ref ** i
                for i in range(self.min_power, 0)) - \
            sum(coeffs[i - 2 - self.min_power] * t_ref ** i
                for i in range(2, self.max_power + 1))

        # Full coefficient list ordered by power: negative powers, a_0, a_1,
        # then powers >= 2.
        self.fit_coefficients = []
        self.fit_coefficients.extend(coeffs[:-self.min_power])
        self.fit_coefficients.extend([a_0, a_1])
        self.fit_coefficients.extend(coeffs[-self.min_power:])

        self.source_matrix = np.vstack([
            temp ** i if i != 0 else np.ones(n_points)
            for i in range(self.min_power, self.max_power + 1)
        ]).T

        # NOTE: this rebinds ``fit`` on the instance to the fitted values,
        # shadowing this method for that instance. Kept as is because callers
        # may read ``fit`` as data.
        self.fit = np.dot(self.source_matrix, self.fit_coefficients)

        # Term-by-term derivative of the power series with respect to temp.
        self.heat_capacity_matrix = np.vstack([
            i * temp ** (i - 1)
            for i in range(self.min_power, self.max_power + 1)
        ]).T
        self.fit_heat_capacity = np.dot(self.heat_capacity_matrix,
                                        self.fit_coefficients)
Beispiel #4
0
y_new  # NOTE(review): bare name not defined in this section -- presumably left over from an earlier cell; confirm

#%% Stats Models
# Multiple linear regression with statsmodels OLS on a small hand-written
# dataset. The bare expressions (x, y, model3, ...) only have a visible effect
# when the cell is run interactively, where the value is echoed.

import numpy as np
import statsmodels.api as sm

from statsmodels.tools import add_constant
# Eight observations, two predictors each.
x = [[0, 1], [5, 1], [15, 2], [25, 2], [35, 11], [45, 15], [55, 34], [60, 35]]
x
# One response value per observation.
y = [4, 5, 20, 14, 32, 22, 38, 43]
y

x = sm.add_constant(x)  # add a constant column of ones (the intercept term)
x
model3 = sm.OLS(y, x)
model3
results = model3.fit()
results
results.summary()
results.rsquared  # coefficient of determination (R squared)
results.rsquared_adj  # adjusted R squared
results.params  # b0, b1, b2 (intercept and the two slopes)

# Predictions for the training observations, obtained two ways.
results.fittedvalues
results.predict(x)

#%%AIC & BIC
# Package page: https://pypi.org/project/RegscorePy
# Install with: pip install RegscorePy
import RegscorePy
''' RSquare and Adjusted RSquare '''

# Manual computation from the residual and total sums of squares.
residuals = y_train - y_pred
deviations = y_train - np.mean(y_train)

SS_Residual = sum(residuals**2)
SS_total = sum(deviations**2)

R_Square = 1 - float(SS_Residual) / SS_total

# Adjusted R squared penalises the number of predictors (columns of X_train).
n_obs = len(y_train)
n_features = X_train.shape[1]
Adjusted_R_Square = 1 - (1 - R_Square) * (n_obs - 1) / (n_obs - n_features - 1)

#------------------------------------------------#

# The same two quantities as reported by a statsmodels OLS fit with intercept.
import statsmodels.api as sm
X1 = sm.add_constant(X_train)

result = sm.OLS(y_train, X1).fit()

print('R Square : ', result.rsquared)
print('Adjusted R Square: ', result.rsquared_adj)











Beispiel #6
0
def loadings_matrix(
    X,
    Y,
    feature_selection="stepwise",
    stepwise="Forward",
    criterion="pvalue",
    threshold=0.05,
    n_components=0.95,
    verbose=False,
):
    r"""
    Estimate the loadings matrix using stepwise regression or principal
    components regression (PCR).

    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_features)
        Features matrix, where n_samples is the number of samples and
        n_features is the number of features.
    Y : DataFrame of shape (n_samples, n_assets)
        Target matrix, where n_samples is the number of samples and
        n_assets is the number of assets.
    feature_selection: str 'stepwise' or 'PCR', optional
        Indicate the method used to estimate the loadings matrix.
        The default is 'stepwise'.
    stepwise: str 'Forward' or 'Backward', optional
        Indicate the method used for stepwise regression. Only used when
        feature_selection is 'stepwise'. The default is 'Forward'.
    criterion : str, can be {'pvalue', 'AIC', 'SIC', 'R2' or 'R2_A'}
        The default is 'pvalue'. The criterion used to select the best features:

        - 'pvalue': select the features based on p-values.
        - 'AIC': select the features based on lowest Akaike Information Criterion.
        - 'SIC': select the features based on lowest Schwarz Information Criterion.
        - 'R2': select the features based on highest R Squared.
        - 'R2_A': select the features based on highest Adjusted R Squared.
    threshold : scalar, optional
        Is the maximum p-value for each variable that will be
        accepted in the model. The default is 0.05.
    n_components : int, float, None or str, optional
        if 1 < n_components (int), it represents the number of components that
        will be kept. if 0 < n_components < 1 (float), it represents the
        percentage of variance that is explained by the components kept.
        See `PCA <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html>`_
        for more details. The default is 0.95.
    verbose : bool, optional
        Enable verbose output. The default is False.

    Returns
    -------
    loadings : DataFrame
        A DataFrame with the loadings matrix: one row per column of Y and
        one column per column of X, preceded by a 'const' column. Entries
        for features that were not selected are zero.

    Raises
    ------
    ValueError
        When X or Y is not a DataFrame, or when feature_selection or
        stepwise is not one of the supported options.

    """
    if not isinstance(X, pd.DataFrame):
        raise ValueError("X must be a DataFrame")

    if not isinstance(Y, pd.DataFrame):
        raise ValueError("Y must be a DataFrame")

    # Validate the options once, up front. An unknown feature_selection used
    # to fall through both branches and silently return an all-zero matrix.
    if feature_selection not in ("stepwise", "PCR"):
        raise ValueError("feature_selection must be 'stepwise' or 'PCR'")
    if feature_selection == "stepwise" and stepwise not in ("Forward", "Backward"):
        raise ValueError("Choose and adecuate stepwise method")

    rows = Y.columns.tolist()
    cols = ["const"] + X.columns.tolist()
    loadings = pd.DataFrame(
        np.zeros((len(rows), len(cols))), index=rows, columns=cols
    )

    for i in rows:
        if feature_selection == "stepwise":
            if stepwise == "Forward":
                select_features = forward_regression
            else:
                select_features = backward_regression
            included = select_features(X,
                                       Y[i],
                                       criterion=criterion,
                                       threshold=threshold,
                                       verbose=verbose)
            results = sm.OLS(Y[i], sm.add_constant(X[included])).fit()
            params = results.params
            # Only the selected features (and the constant) are written;
            # every other entry of the row keeps its initial zero.
            loadings.loc[i, params.index.tolist()] = params
        else:  # feature_selection == "PCR"
            beta = PCR(X, Y[i], n_components=n_components)
            # NOTE(review): assumes PCR returns one coefficient per entry of
            # cols (constant first) -- confirm against PCR.
            beta = pd.Series(np.ravel(beta), index=cols)
            loadings.loc[i, cols] = beta

    return loadings