def LassoLarsIC_df(X, y, criterion):
    """
    Fit sklearn's LassoLarsIC on the inputs and summarise the selected model.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names label the coefficients.
    y : pandas.DataFrame
        Target data; only the first column is used.
    criterion : str
        Information criterion handed straight to LassoLarsIC
        (e.g. 'aic' or 'bic').

    Returns
    -------
    results_df : pandas.DataFrame
        A single row (index 'results') with the columns 'rss', 'intercept'
        and one column per feature of X holding that feature's coefficient.
    remaining_features : list
        Names of the features whose fitted coefficient is non-zero.
    """
    target = y.iloc[:, 0]
    model = LassoLarsIC(criterion=criterion)
    model.fit(X, target)

    # model.score() is the R^2 of the fit, not the residual sum of squares
    # that the 'rss' column promises, so the RSS is computed explicitly.
    residuals = target - model.predict(X)
    model_rss = float((residuals ** 2).sum())

    feature_names = list(X.columns)
    coefficients = list(model.coef_)
    results_df = pd.DataFrame(
        [[model_rss, model.intercept_] + coefficients],
        index=['results'],
        columns=['rss', 'intercept'] + feature_names)

    # Select on the coefficients themselves. Filtering the whole row and
    # then skipping the first two survivors would drop a real feature
    # whenever the rss or the intercept happens to be exactly zero.
    remaining_features = [
        name for name, coef in zip(feature_names, coefficients) if coef != 0
    ]

    return results_df, remaining_features
# Example #2
# 0
def trainData(fileName):
    """
    Train a LassoLarsIC model on daily price data and plot a one-step forecast.

    fileName - path of a CSV file with a 'date' column (used as the index,
    formatted as '%Y-%m-%d') and at least the columns 'open', 'high', 'low',
    'close' and 'volume'.

    Side effects: prints intermediate data and the test score, dumps the
    fitted model to '<fileName>.m' with joblib, and shows a matplotlib plot
    of the closing price together with the forecast. Returns None.
    """
    prices = pd.read_csv(fileName, index_col='date').sort_index()

    # Work on a copy so the derived columns below are written to an
    # independent frame instead of a slice of the CSV data
    # (avoids pandas' SettingWithCopyWarning).
    df = prices[['open', 'high', 'low', 'close', 'volume']].copy()
    df['HL_PCT'] = (df['high'] - df['low']) / df['close'] * 100.0
    df['PCT_change'] = (df['close'] - df['open']) / df['open'] * 100.0
    df = df[['close', 'HL_PCT', 'PCT_change', 'volume']].copy()

    forecast_col = 'close'
    df.fillna(value=-99999, inplace=True)
    # Number of rows ahead that the label looks; also the number of
    # trailing rows that are left without a label and get forecast.
    forecast_out = 1
    df['label'] = df[forecast_col].shift(-forecast_out)

    print(df.shape)
    print(df)
    # `axis` must be given by keyword: pandas 2.0 no longer accepts it
    # positionally.
    X = np.array(df.drop(['label'], axis=1))

    X = preprocessing.scale(X)

    # The last `forecast_out` rows have no label yet: keep them aside for
    # prediction and train on everything before them.
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    df.dropna(inplace=True)
    print(X)
    print(X_lately)
    y = np.array(df['label'])
    print(X.shape)
    print(y.shape)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.2)

    clf = LassoLarsIC(max_iter=100)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    joblib.dump(clf, "%s.m" % fileName)
    print(accuracy, "---------score------")

    forecast_set = clf.predict(X_lately)

    print(forecast_out)
    style.use('ggplot')
    df['Forecast'] = np.nan

    # NOTE(review): after dropna() the last row is the last *labelled* day,
    # so forecasts are dated from there - confirm this is the intended
    # alignment.
    last_date = df.iloc[-1].name
    next_date = datetime.datetime.strptime(last_date, '%Y-%m-%d')
    print(forecast_set)

    blank_cells = [np.nan] * (len(df.columns) - 1)
    for predicted in forecast_set:
        # Calendar arithmetic instead of adding 86400 epoch seconds, which
        # lands on the wrong day when a DST change makes a day 25 hours long.
        next_date += datetime.timedelta(days=1)
        # Format the label like the existing index entries so the index
        # stays homogeneous (strings) rather than mixing in datetimes.
        df.loc[next_date.strftime('%Y-%m-%d')] = blank_cells + [predicted]
    print(df.tail(forecast_out))

    df['close'].plot()
    df['Forecast'].plot()
    plt.show()
# Example #3
# 0
def _mean_cv_score(estimator, x, y, n_splits, scoring):
    """Return the mean shuffled K-fold cross-validation score of estimator on (x, y)."""
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    return np.mean(cross_val_score(estimator, x, y, scoring=scoring, cv=folds))


def modeling(x_train,
             x_test,
             y_train,
             y_test,
             poly_order=1,
             criterion='bic',
             iterations=1000,
             lars_ic=False,
             lasso_alpha=None,
             kfold=True,
             k_n_splits=2,
             k_scoring='r2',
             var_name=None,
             scale=True):
    """
    Function that produces a tuple of linear regression results plus train and test data. Assumes that
    data has been split already. Default arguments will return a model trained on scaled data using
    LASSO linear regression with K fold cross validation.
    x_train - input variables for model training (expects pandas Series/DataFrame)
    x_test -  input variables for model testing (expects pandas Series/DataFrame)
    y_train - target variables for model training (expects pandas Series)
    y_test - target variables for model testing (expects pandas Series)
    poly_order - order of polynomial transform to be applied to x_train and x_test
    criterion - which information criterion will be used to compute best model; default is BIC
    iterations - number of iterations for minimizing cost function
    lars_ic - (bool) whether to fit Sklearn's LassoLarsIC, which uses the information criterion to
    determine the optimal alpha, instead of a plain Lasso
    lasso_alpha - alpha for the plain Lasso model (ignored when lars_ic is True). If None, 1.0 is used
    kfold - (bool) whether to use the Sklearn KFold object to cross validate training data
    k_n_splits - how many k splits to use when doing KFold validation
    k_scoring - what metric of model fit the KFold object should return, default is R2
    var_name - what name the variable being tested will have in the pandas DataFrame produced. If None,
    a name is built from the first four characters of the y_test series name, the polynomial order,
    the number of splits and (when not 1000) the iteration count
    scale - (bool) whether to scale the data or not; uses the scale_features helper.

    Function returns a tuple of objects:
    0 - fitted Sklearn model, 1 - transformed training X, 2 - transformed testing X,
    3 - one-row results table (pandas DataFrame), 4 - y_train, 5 - y_test.
    The printout for every option repeats these indices.
    """

    if var_name is None:
        # str() keeps the default working for unnamed or non-string series names.
        var_name = f'{str(y_test.name)[0:4]}_polyO{poly_order}_{k_n_splits}ksplits'
        if iterations != 1000:
            var_name = f'{var_name}_iter{iterations}'

    # Using scaling function to scale features
    if scale:
        x_train_scaled, x_test_scaled = scale_features(x_train, x_test)
    else:
        x_train_scaled, x_test_scaled = x_train, x_test

    # Producing Polynomial Features (1 being linear regression)
    poly = PolynomialFeatures(poly_order)
    x_poly_train = poly.fit_transform(x_train_scaled)
    x_poly_test = poly.transform(x_test_scaled)

    if lars_ic:
        # NOTE(review): newer scikit-learn releases dropped the `normalize`
        # argument - TODO confirm against the pinned version.
        lars_poly = LassoLarsIC(
            criterion=criterion,
            fit_intercept=True,
            normalize=False,
            max_iter=iterations,
        )
        lars_poly.fit(x_poly_train, y_train)
        score = lars_poly.score(x_poly_test, y_test)
        # The selected model is the one with the lowest criterion value, which
        # is what the message below reports; the mean over the whole alpha
        # path does not describe any single model.
        ic_score = np.min(lars_poly.criterion_)
        optimal_alpha = lars_poly.alpha_

        if kfold:
            cvs_mean_score = _mean_cv_score(lars_poly, x_poly_train, y_train,
                                            k_n_splits, k_scoring)

            print(
                f'''The R-2 for a LASSO Least Angle Regression model with with a Polynomial Order of {poly_order} is {score}.\n The model with the lowest {criterion} of {ic_score} has a LASSO alpha of {optimal_alpha} \n Function returns a tuple indexed as follows: \n 0 - Sklearn lasso-regression object  \n  1 - training X data (np array) \n 2 - testing X data (np array)  \n 3  -  Model results table (pandas DataFrame obj) \n  4  -  training Y data (np array)  \n  5  -  testing Y data (np array)'''
            )

            results = pd.DataFrame(
                data=[[score, ic_score, optimal_alpha, cvs_mean_score]],
                columns=['R2', f'{criterion}', 'Optimal_alpha', 'Mean_cvs'],
                index=[var_name])
        else:
            print(
                f'''The R-2 for a LASSO Least Angle Regression model with with a Polynomial Order of {poly_order} is {score}.\n The model with the lowest {criterion} of {ic_score} has a LASSO alpha of {optimal_alpha}\n Function returns a tuple indexed as follows: \n 0 - Sklearn lasso-regression object  \n  1 - training X data (np array) \n 2 - testing X data (np array)  \n 3  -  Model results table (pandas DataFrame obj) \n  4  -  training Y data (np array)  \n  5  -  testing Y data (np array) '''
            )

            # Without cross validation this table has no 'Mean_cvs' column.
            results = pd.DataFrame(
                data=[[score, ic_score, optimal_alpha]],
                columns=['R2', f'{criterion}', 'Optimal_alpha'],
                index=[var_name])

        return lars_poly, x_poly_train, x_poly_test, results, y_train, y_test

    # Lasso does not accept alpha=None, so the documented "default arguments"
    # call would fail at fit time; fall back to Lasso's own default of 1.0.
    if lasso_alpha is None:
        lasso_alpha = 1.0

    lasso_reg = Lasso(alpha=lasso_alpha,
                      normalize=False,
                      max_iter=iterations,
                      random_state=42)
    lasso_reg.fit(x_poly_train, y_train)
    score = lasso_reg.score(x_poly_test, y_test)

    if kfold:
        cvs_mean_score = _mean_cv_score(lasso_reg, x_poly_train, y_train,
                                        k_n_splits, k_scoring)

        print(
            f'''The R-2 for a model with with a Polynomial Order of {poly_order} and a Lasso Alpha of {lasso_alpha} is {np.round(score,4)}.\n  Function returns a tuple indexed as follows:  \n  0 - Sklearn lasso-regression object  \n  1 - training X data (np array) \n 2 - testing X data (np array) \n   3  -  Model results table (pandas DataFrame obj)  \n  4  -  training Y data (np array)  \n  5  -  testing Y data (np array) '''
        )
    else:
        cvs_mean_score = None

        print(
            f'''The R-2 for a model with with a Polynomial Order of {poly_order} and a Lasso Alpha of {lasso_alpha} is {np.round(score,4)}.\n  Function returns a tuple indexed as follows:  \n  0 - Sklearn lasso-regression object  \n  1 - training X data (np array) \n 2 - testing X data (np array) \n  3  -  Model results table (pandas DataFrame obj)  \n  4  -  training Y data (np array)  \n  5  -  testing Y data (np array) '''
        )

    # A plain Lasso has no information criterion or selected alpha, so those
    # cells stay empty.
    results = pd.DataFrame(
        data=[[score, None, None, cvs_mean_score]],
        columns=['R2', f'{criterion}', 'Optimal_alpha', 'Mean_cvs'],
        index=[var_name])

    return lasso_reg, x_poly_train, x_poly_test, results, y_train, y_test
##  'OPEC': -1.0037125526070625,
##  'PRS International Country Risk Guide': 0.0,
##  'South_American': 1.1666702294227076,
##  'World Economic Forum EOS': -1.1639115442413683,
##  'Years_In_Nato': 0.0,
##  'alcconsumption': 0.59855758131369263,
##  'armedforcesrate': 0.0,
##  'employrate': -2.2695726938628469,
##  'femaleemployrate': 1.0671515028671372,
##  'incomeperperson': 1.191656220279911,
##  'internetuserate': -2.4535120774767076,
##  'lifeexpectancy': 0.0}

from sklearn.metrics import mean_squared_error
# Mean squared error of model_aic's predictions on the training and test splits.
train_error_aic = mean_squared_error(tar_train, model_aic.predict(pred_train))
test_error_aic = mean_squared_error(tar_test, model_aic.predict(pred_test))
print('training data MSE', train_error_aic, sep='\n')
print('test data MSE', test_error_aic, sep='\n')


# model_aic.score() on each split, reported as R-square.
rsquared_train_aic = model_aic.score(pred_train, tar_train)
rsquared_test_aic = model_aic.score(pred_test, tar_test)
print('training data R-square', rsquared_train_aic, sep='\n')
print('test data R-square', rsquared_test_aic, sep='\n')

#!/usr/bin/env python

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsIC
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

# NOTE(review): load_boston is deprecated/removed in recent scikit-learn
# releases - confirm the pinned version before running this script.
boston = datasets.load_boston()

# Shuffle features and targets together with a fixed seed, then hold out the
# last 10% of the rows for testing.
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)
X_train, X_test = X[:offset], X[offset:]
Y_train, Y_test = Y[:offset], Y[offset:]

# Fit with AIC as the selection criterion and report the held-out score.
regressor = LassoLarsIC(criterion="aic").fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
# Example #6
# 0
    print "R^2: ", r2

    print "\n**********测试LassoLarsIC类**********"
    lassoLarsIC = LassoLarsIC()
    # lassoLarsIC = LassoLarsIC(criterion='bic')
    # 拟合训练集
    lassoLarsIC.fit(train_X, train_Y.values.ravel())
    # 打印模型的系数
    print "系数:", lassoLarsIC.coef_
    print "截距:", lassoLarsIC.intercept_
    print '训练集R2: ', r2_score(train_Y, lassoLarsIC.predict(train_X))

    # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者
    # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏.
    test_Y_pred = lassoLarsIC.predict(test_X)
    print "测试集得分:", lassoLarsIC.score(test_X, test_Y)
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    tss, rss, ess, r2 = xss(Y, lassoLarsIC.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试ElasticNet类**********"
    # 在初始化ElasticNet类时, 指定超参数α和ρ, 默认值分别是1.0和0.5.
    elasticNet = ElasticNet(alpha=1.0, l1_ratio=0.5)
    # 拟合训练集
    elasticNet.fit(train_X, train_Y)