Example #1
    def _estimator_relationship(self):
        """Score each (feature, delta) pair with the configured estimator.

        Returns a DataFrame of effect estimates (Spearman correlations or
        OLS coefficients) and a dict of p-values keyed by (feature, delta).
        """
        evaluation_dataframe = pd.DataFrame(
            0.0,
            index=self.feature_value_dataframe.columns,
            columns=self.delta_dataframe.columns)
        p_value_dict = {}
        if self.estimator == 'correlation':
            for delta_col in self.delta_dataframe.columns:
                for feature_col in self.feature_value_dataframe.columns:
                    correlation, pvalue = spearmanr(
                        self.feature_value_dataframe[feature_col],
                        self.delta_dataframe[delta_col],
                        nan_policy='omit')
                    if pd.isnull(correlation):
                        correlation = 0
                        pvalue = 1
                    evaluation_dataframe.at[feature_col,
                                            delta_col] = correlation
                    p_value_dict[(feature_col, delta_col)] = pvalue
        elif self.estimator == 'regression':
            X = self.feature_value_dataframe.values
            for delta_col in self.delta_dataframe.columns:
                y = self.delta_dataframe[delta_col].values

                ols = linear_model.LinearRegression()
                ols.fit(X, y)
                evaluation_dataframe[delta_col] = ols.coef_

                # coef_pval returns the intercept p-value first; skip it so
                # the remaining values line up with the feature columns.
                for feature_col, p_val in zip(
                        self.feature_value_dataframe.columns,
                        stats.coef_pval(ols, X, y)[1:]):
                    p_value_dict[(feature_col, delta_col)] = p_val

        return evaluation_dataframe, p_value_dict
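For orientation, a minimal self-contained call to regressors.stats.coef_pval might look like the sketch below (the two-feature synthetic design and the seed are illustrative assumptions, not part of any example above). The returned array holds one p-value per coefficient, with the intercept first.

import numpy as np
from sklearn import linear_model
from regressors import stats

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 2))              # two synthetic predictors
y = 3.0 * X[:, 0] + rng.normal(size=100)   # only the first predictor matters

ols = linear_model.LinearRegression()
ols.fit(X, y)

# Three values: [intercept, x1, x2]; only x1 should come out significant.
print(stats.coef_pval(ols, X, y))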
Example #2
def BAYESIAN(x: np.ndarray, y: np.ndarray) -> Tuple[float, float, float]:
    clf = BayesianRidge()
    clf.fit(x, y)
    m, q = clf.coef_[0], clf.intercept_
    # This is not an actual probability, but it should be interpretable as one.
    try:
        p = 1 - np.nanmean(stats.coef_pval(clf, x, y))
    except np.linalg.LinAlgError:
        p = np.nan
    return m, q, p
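A quick sanity check of the helper above on toy single-feature data (a sketch; the imports simply mirror what the function already assumes to be in scope):

import numpy as np
from typing import Tuple
from sklearn.linear_model import BayesianRidge
from regressors import stats

x = np.linspace(0.0, 1.0, 50).reshape(-1, 1)   # one predictor column
y = 2.0 * x.ravel() + 0.1 + np.random.default_rng(1).normal(scale=0.05, size=50)

m, q, p = BAYESIAN(x, y)
print(m, q, p)   # slope near 2, intercept near 0.1, p close to 1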
Example #3
    def _modified_regressor_summary(clf, X, y, xlabels=None):
        """
        Output summary statistics for a fitted regression model.

        Parameters
        ----------
        clf : sklearn.linear_model
            A fitted scikit-learn linear regression model with a `predict()` method.
        X : numpy.ndarray
            Training data used to fit the classifier.
        y : numpy.ndarray
            Target training values, of shape = [n_samples].
        xlabels : list, tuple
            The labels for the predictors.
        """
        # Check and/or make xlabels
        ncols = X.shape[1]
        if xlabels is None:
            xlabels = np.array(
                ['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str')
        elif isinstance(xlabels, (tuple, list)):
            xlabels = np.array(xlabels, dtype='str')
        # Make sure dims of xlabels matches dims of X
        if xlabels.shape[0] != ncols:
            raise AssertionError(
                "Dimension of xlabels {0} does not match "
                "X {1}.".format(xlabels.shape, X.shape))
        # Create data frame of coefficient estimates and associated stats
        coef_df = pd.DataFrame(
            index=['_intercept'] + list(xlabels),
            columns=['Estimate', 'Std. Error', 't value', 'p value']
        )
        coef_df['Estimate'] = np.concatenate(
            (np.round(np.array([clf.intercept_]), 6), np.round((clf.coef_), 6)))
        coef_df['Std. Error'] = np.round(stats.coef_se(clf, X, y), 6)
        coef_df['t value'] = np.round(stats.coef_tval(clf, X, y), 4)
        coef_df['p value'] = np.round(stats.coef_pval(clf, X, y), 6)
        # Create data frame to summarize residuals
        resids = stats.residuals(clf, X, y, r_type='raw')
        resids_df = pd.DataFrame({
            'Min': pd.Series(np.round(resids.min(), 4)),
            '1Q': pd.Series(np.round(np.percentile(resids, q=25), 4)),
            'Median': pd.Series(np.round(np.median(resids), 4)),
            '3Q': pd.Series(np.round(np.percentile(resids, q=75), 4)),
            'Max': pd.Series(np.round(resids.max(), 4)),
        }, columns=['Min', '1Q', 'Median', '3Q', 'Max'])

        metrics_dict = {
            'R2': stats.metrics.r2_score(y, clf.predict(X)),
            'Adj R2': stats.adj_r2_score(clf, X, y),
            'F-statistic': stats.f_stat(clf, X, y),
        }
        return resids_df, coef_df, metrics_dict
Example #4
    def coefficient_picks(self, coeff, type_of_reg):
        print("-----------------" + type_of_reg + "-----------------")
        num_of_nonzero = np.sum(coeff.astype(float) != 0)
        num_of_zero = coeff.size - num_of_nonzero

        print(type_of_reg + " picked " + str(num_of_nonzero) +
              " variables and eliminated the other " + str(num_of_zero) +
              " variables")

        print(
            "It picked these columns with these p-values \n",
            pd.concat([
                pd.DataFrame(self.X.columns[coeff.astype(float) != 0.00]),
                pd.DataFrame(
                    stats.coef_pval(self.regr_dict[type_of_reg], self.X_test,
                                    self.y_test)[1:][coeff.astype(float) != 0])
            ],
                      axis=1))
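Note that stats.coef_pval returns the intercept's p-value first, which is why the slice [1:] above drops it before aligning the remaining values with the selected feature columns.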
Example #5
def LeastR(bt, PID, count):

    X = DrugUse.loc[PID][DrugUse.columns]

    nn = X.index.value_counts().to_frame()  # rows per subject in PID
    nn = nn.sort_index()
    nn = nn.values
    S = np.array(nn) - 1

    X = X.values
    Y = np.array(LabTest.loc[PID]['Lab Test Value'])
    Y = Y.reshape(-1, 1)

    Xbar = Xava.loc[PID]
    Ybar = Yava.loc[PID]

    Xbar = Xbar.values
    Ybar = Ybar.values
    Ybar = Ybar.reshape(-1, 1)

    t = np.random.randn(Y.shape[0], 1)

    Z = buildZ(nn, (Y.shape[0], len(PID)))
    # D = buildD(S, (np.sum(S), Y.shape[0]))

    delta = X - Z.dot(Xbar)
    Phi = Y - Z.dot(Ybar) - t

    # delta = D.dot(X)
    # Phi = D.dot(Y)

    Phi = Phi.reshape(-1)

    reg = linear_model.LassoCV(alphas=[0.0039], cv=5)
    reg.fit(delta, Phi)
    bt_new = reg.coef_.reshape(-1, 1)

    p_value = stats.coef_pval(reg, delta, Phi)[1:]
    p_value = p_value.reshape(-1, 1)
    bt_new = (bt * count) / (count + 1) + bt_new / (count + 1)

    return bt_new, p_value
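Because LassoCV is given a single candidate alpha here, the cross-validation is effectively a formality: the final refit on all of the data is equivalent to a plain Lasso at alpha=0.0039.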
Example #6
def get_coefficients(data: pd.DataFrame):
    # Clean the data (Thien's approach) and fit a linear regression
    d3 = data

    d3['data_lagged7'] = (d3.sort_values(by=['date'], ascending=True)
                           .groupby(['campaign_name'])['common_cost'].shift(7))
    d3['data_lagged1'] = (d3.sort_values(by=['date'], ascending=True)
                           .groupby(['campaign_name'])['common_cost'].shift(1))
    d3['data_lagged2'] = (d3.sort_values(by=['date'], ascending=True)
                           .groupby(['campaign_name'])['common_cost'].shift(2))
    d3['data_lagged3'] = (d3.sort_values(by=['date'], ascending=True)
                           .groupby(['campaign_name'])['common_cost'].shift(3))


    d3 = d3.dropna(axis=1, how='all')
    d3 = d3.dropna(subset=['data_lagged7', 'data_lagged1',
                           'data_lagged2', 'data_lagged3'])
    d_final = d3[['campaign_name', 'date', 'data_lagged1', 'data_lagged7',
                  'data_lagged2', 'facebookads_actions_post_reaction',
                  'facebookads_actions_landing_page_view',
                  'facebookads_actions_link_click',
                  'facebookads_actions_leadgen_other']]

    # Three predictors for a multiple regression; for a simple linear
    # regression, select a single column instead.
    X = d_final[['facebookads_actions_post_reaction',
                 'facebookads_actions_landing_page_view',
                 'facebookads_actions_link_click']]
    Y = d_final['data_lagged1']

    # with sklearn
    regr = linear_model.LinearRegression()
    regr.fit(X, Y)
    print('Intercept: \n', regr.intercept_)
    print('Coefficients: \n', regr.coef_)

    from regressors import stats
    print("coef_pval:\n", stats.coef_pval(regr, X, Y))

    return {
        'facebookads_actions_post_reaction': regr.coef_[0],
        'facebookads_actions_landing_page_view': regr.coef_[1],
        "facebookads_actions_link_click": regr.coef_[2]
    }
Example #7
sum_values = [0] * 10
for i in range(0, 1000):

    X_train, X_test = train_test_split(data, test_size=0.3)

    # Sets the dependant variables into their own data structures
    y_train = X_train["Generation [kWh]"]
    y_test = X_test["Generation [kWh]"]
    # Removes the dependant variables from the X sets
    X_train = X_train.drop(columns="Generation [kWh]")
    X_test = X_test.drop(columns="Generation [kWh]")

    # NOTE: the normalize= keyword was removed from scikit-learn estimators
    # in 1.2, so this line requires an older release (see the sketch after
    # this example for a rough modern equivalent).
    ridge = Ridge(alpha=.005, normalize=True).fit(X_train, y_train)

    p_vals = stats.coef_pval(ridge, X_train, y_train)

    columns = list(X_train.columns)

    # Accumulate the first ten p-values across iterations
    # (use j so the outer loop counter i is not shadowed).
    for j, p in enumerate(p_vals[0:10]):
        sum_values[j] += p

for i in range(0, len(sum_values)):
    sum_values[i] = sum_values[i] / 1000
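On scikit-learn 1.2 and later, where the normalize= keyword no longer exists, a rough equivalent is to scale the features explicitly before fitting. A minimal sketch, assuming StandardScaler standardization (the old normalize=True divided by the l2 norm instead, so coefficients will differ slightly):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
ridge = Ridge(alpha=.005).fit(scaler.transform(X_train), y_train)
p_vals = stats.coef_pval(ridge, scaler.transform(X_train), y_train)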
Example #8
            regr = regression
            regr.fit(train_X, train_y)
            print(
                f'--------------------------------- {regression_name} ---------------------------------'
            )
            print('Train:', regr.score(train_X, train_y))
            print('Val:', regr.score(val_X, val_y))
            print('OOS:', regr.score(oos_X, oos_y))

            from regressors import stats
            #print(stats.summary(regr, train_X, train_y.squeeze(), train_X.columns.tolist()))
            print(
                stats.summary(regr, oos_X,
                              oos_y.squeeze().reshape(-1, 1),
                              oos_X.columns.tolist()))
            p_vals = pd.Series(stats.coef_pval(regr, oos_X,
                                               oos_y.squeeze().reshape(-1, 1)),
                               index=['_intercept'] + X_cols,
                               name=out['iter_step'])
            p_vals['Regression Model'] = regression_name
            p_value_summary = pd.concat((p_value_summary, p_vals), axis=1)

            best_model = 'LinearRegression'
            if regression_name == best_model:
                coef = np.concatenate(
                    (np.round(np.array([regr.intercept_]).reshape(-1, 1),
                              6), np.round((regr.coef_.reshape(-1, 1)),
                                           6))).squeeze()
                coef = pd.Series(coef,
                                 index=['_intercept'] + X_cols,
                                 name=out['iter_step'])
                coef['Regression Model'] = regression_name
Example #9
reg.score(Y, contents_value)  # reg and Y are presumably defined earlier in the original (truncated) script

reg = LinearRegression(fit_intercept = False).fit(dwellings_type, contents_value)
reg.coef_
reg.score(dwellings_type, contents_value)


reg = LinearRegression(fit_intercept = False).fit(Y[['current_income']], contents_value)
reg.coef_
reg.score(Y[['current_income']], contents_value)

lin = Lasso(alpha=0.0000000000001,precompute=True,max_iter=10000,
            positive=True, random_state=9999, selection='random', fit_intercept = False)
lin.fit(dwellings_type, contents_value)
lin.coef_ 
stats.coef_pval(rr_scaled, X_train, Y_train)  # rr_scaled, X_train, Y_train presumably come from earlier in the original script

# Define the model; each assignment overwrites the previous one, so only the
# last (interaction) form is used by `obj` below.
model = lambda b, X: b[0] * X.iloc[:, 0] + b[1] * X.iloc[:, 1] + b[2] * X.iloc[:, 2]
model = lambda b, X: b[0] * X.iloc[:, 0] + b[1] * X.iloc[:, 1] + b[2] * X.iloc[:, 2] + b[3] * X.iloc[:, 3]
model = lambda b, X: b[0] * X.iloc[:, 0] * X.iloc[:, 3] + b[1] * X.iloc[:, 1] * X.iloc[:, 3] + b[2] * X.iloc[:, 2]
obj = lambda b, Y, X: np.sum(np.abs(Y - model(b, X))**2)
bnds = [(0, None), (0, None), (0, None), (0, None)]
xinit = np.array([0, 0, 0, 0])
res = minimize(obj, args=(contents_value, dwellings_type), x0=xinit, bounds = bnds)
print(f"b1={res.x[0]}, b2={res.x[1]}, b3={res.x[2]}")

Example #10
import pandas as pd
import numpy as np
from sklearn import linear_model
from regressors import stats

data = pd.read_csv("engajamento.csv", sep=';')

X = data[[
    'col', 'abert', 'aut', 'comp', 'conf', 'disp', 'freq', 'ident', 'visao',
    'val', 'op', 'org', 'rel'
]]

y = np.array(data['eng'])

reg = linear_model.Ridge(alpha=.5)
reg.fit(X, y)

ols = linear_model.LinearRegression()
ols.fit(X, y)

print("coef_pval:\n", stats.coef_pval(ols, X, y))

print("\n=========== SUMMARY ===========")
xlabels = [
    'col', 'abert', 'aut', 'comp', 'conf', 'disp', 'freq', 'ident', 'visao',
    'val', 'op', 'org', 'rel'
]
stats.summary(ols, X, y, xlabels)
Example #11
    def alpha_check(strategy, index_ticker="BSESN"):
        error = False
        success = False
        error_message_list = []
        output = ""
        message = "Request Recieved"

        if strategy:
            strategy_returns = StrategyReturns.objects.filter(
                strategy=strategy).order_by('date')
        else:
            error = True
            success = False
            # message = "Strategy missing!"
            error_message_list.append("Strategy missing!")

        if index_ticker:
            index_returns = IndexDailyReturn.objects.filter(
                index__ticker=index_ticker).order_by('date')

        else:
            error = True
            success = False
            # message = "Index Code missing!"
            error_message_list.append("Index Code missing!")

        if not error:
            date_list_strat = list(map(lambda x: x.date, strategy_returns))
            return_strat_list = list(
                map(lambda x: x.return_strategy, strategy_returns))
            df_strategy = pd.DataFrame(
                {
                    'Date': date_list_strat,
                    'Return Strategy': return_strat_list
                },
                columns=['Date', 'Return Strategy'])
            df_strategy['Excess Return Strategy'] = df_strategy[
                'Return Strategy'] - risk_free_rate

            date_list_index = list(map(lambda x: x.date, index_returns))
            return_index_list = list(map(lambda x: x.return_1d, index_returns))
            df_index = pd.DataFrame(
                {
                    'Date': date_list_index,
                    'Return Index': return_index_list
                },
                columns=['Date', 'Return Index'])
            df_index['Excess Return Index'] = df_index[
                'Return Index'] - risk_free_rate
            df_final = pd.merge(
                df_strategy,
                df_index[['Date', 'Return Index', 'Excess Return Index']],
                on='Date',
                how='left')

            X = df_final['Excess Return Index'].values.reshape(-1, 1)
            Y = df_final['Excess Return Strategy'].values.reshape(-1, 1)
            col_x_mean = np.nanmean(X, axis=0)
            inds_x = np.where(np.isnan(X))
            X[inds_x] = np.take(col_x_mean, inds_x[1])

            col_y_mean = np.nanmean(Y, axis=0)
            inds_y = np.where(np.isnan(Y))
            Y[inds_y] = np.take(col_y_mean, inds_y[1])
            regressor = LinearRegression()
            regressor.fit(X, Y)
            beta = regressor.coef_
            alpha = regressor.intercept_
            p_values = stats.coef_pval(regressor, X, Y)
            alpha_significance = p_values[0]
            beta_significance = p_values[1]
            strategy.alpha = alpha
            strategy.alpha_significance = alpha_significance
            strategy.beta = beta
            strategy.beta_significance = beta_significance
            strategy.save()
            error = False
            success = True
            message = "Alpha, Beta calculated"

        else:
            error = True
            success = False
            message = "Function input incorrect"

        return {
            'output': output,
            'message': message,
            'error': error,
            'error_message_list': error_message_list,
            'success': success
        }
Example #12
    def calculate_p_values(self):
        return stats.coef_pval(self.model, self.params_df, self.result_nd)
Example #13
        trainy.append(x)
#We are not using random sampling here, since we have multiple seasons per NBA team.
#Instead we split the ten years of data to compare 2009-2014 against 2014-19,
#then flip the split and see what happens to our fit.
#Do any variables show temporal differences in model fit?
NBAX_train = NBA_x.iloc[trainx, :]
NBAX_test = NBA_x.iloc[trainy, :]
NBAY_train = NBA_y.iloc[trainx, :]
NBAY_test = NBA_y.iloc[trainy, :]

regressor = LinearRegression()
regressor.fit(NBAX_train, NBAY_train)  #training the algorithm

ols = linear_model.LinearRegression()
ols.fit(NBAX_train, NBAY_train)
print("coef_pval:\n", stats.coef_pval(ols, NBAX_train, NBAY_train))
Data1 = [
    stats.coef_tval(ols, NBAX_train, NBAY_train),
    stats.coef_pval(ols, NBAX_train, NBAY_train)
]
Data2 = [
    stats.coef_tval(ols, NBAX_test, NBAY_test),
    stats.coef_pval(ols, NBAX_test, NBAY_test)
]
stats.adj_r2_score(ols, NBAX_train, NBAY_train)
stats.adj_r2_score(ols, NBAX_test, NBAY_test)

#To retrieve the intercept:
print(regressor.intercept_)
#For retrieving the slope:
print(regressor.coef_)
Example #14
0
    )  #tree.DecisionTreeClassifier() #LogisticRegression() #SVC(kernel="linear")
    rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(2))

    rfecv.fit(x_data, y_data)
    print('number of features selected:', rfecv.n_features_)

    x_new = rfecv.transform(x_data)

    selected_inds = rfecv.get_support(indices=True)
    selected_ranks = rfecv.ranking_

    selected_feats = [training_head[ind] for ind in selected_inds]
    #print(selected_feats)

    #print(rfecv.estimator_.coef_)
    pvals = stats.coef_pval(rfecv.estimator_, x_new, y_data)
    #print(pvals)

    cl1 = LogisticRegressionCV(cv=10, penalty='l2', fit_intercept=True)

    cl1.fit(x_data, y_data)

    coefs = cl1.coef_
    intercept = cl1.intercept_[0]

    pvals_cur = stats.coef_pval(cl1, x_data, y_data)

    all_headers = []
    all_headers.append('intercept')
    all_headers.extend(training_head)
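One caveat on this last example: regressors.stats.coef_pval derives its p-values from OLS-style t-statistics on the model residuals, so applying it to classifiers such as LogisticRegressionCV (or the RFECV-wrapped estimator) yields heuristic figures rather than the likelihood-based Wald p-values normally reported for logistic coefficients.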