Example #1
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split


def ridge_regressor(df):
    """
    INPUT: Pandas dataframe
    OUTPUT: R^2 and Mean Absolute Error performance metrics, feature coefficients
    """
    y = df.pop("price").values
    X = df.values
    feature_names = df.columns
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)

    clf = Ridge(alpha=1.0)
    clf.fit(xtrain, ytrain)

    score = clf.score(xtest, ytest)
    feat_imps = clf.coef_
    ypredict = clf.predict(xtest)
    mae = np.mean(np.absolute(ytest - ypredict))
    mae_percent = np.mean(np.absolute(ytest - ypredict) / ytest)
    return (
        "R^2 is ",
        score,
        "RMSE is ",
        rmse,
        "MAE percent is ",
        mae_percent,
        "Feature coefficients are ",
        list(zip(feature_names, feat_imps)),
    )
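
# A short usage sketch for ridge_regressor (an illustration, not part of the original
# example): it assumes the imports above and a DataFrame with a numeric "price" column
# plus feature columns; the toy data below is made up.
import pandas as pd

houses = pd.DataFrame({
    "sqft":  [850, 900, 1200, 1500, 1100, 950],
    "rooms": [2, 2, 3, 4, 3, 2],
    "price": [200000, 210000, 280000, 340000, 260000, 215000],
})
print(ridge_regressor(houses))  # note: the function pops "price" from the DataFrame in place
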
def compute_linear_model(mfs, measures):
    from sklearn.linear_model import Ridge
    from sklearn import linear_model

    # try different ones
    clf = Ridge(alpha = 1.0)
    #clf = RidgeCV(alphas=[0.1, 1.0, 10.0])
    #clf = linear_model.LinearRegression()

    # explain fexp using BMD + the MFS data
    fexp = measures[:, measures.shape[1]-1]

    bmd = measures[:, 0]
    bmd = bmd.reshape((bmd.shape[0], 1))

    #print "BMD: ", bmd
    #print "FEXP: ", fexp
    #print "MFS; ", mfs

    #PCA
    #from sklearn.decomposition import PCA
    #pca = PCA(n_components=12)
    #pca.fit(mfs)
    #mfs_pca = pca.transform(mfs)

    X = np.hstack((bmd, mfs))
    clf.fit(X, fexp)

    # Results
    #print "Coefs:", clf.coef_
    print "Score (R^2):", clf.score(X, fexp)
Example #3
def ridgereg(a):
    print("Doing ridge regression")
    clf = Ridge(alpha=a)
    clf.fit(base_X, base_Y)
    print ("Score = %f" % clf.score(base_X, base_Y))
    clf_pred = clf.predict(X_test)
    write_to_file("ridge.csv", clf_pred)
Example #4
def comparaison_ridge_lasso(X,Y):
    X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=random.seed())
    clf_lasso = Lasso(selection='random', random_state=random.seed())
    clf_ridge = Ridge()
    clf_lasso.fit(X_train,Y_train)
    clf_ridge.fit(X_train,Y_train)
    score_lasso=clf_lasso.score(X_test,Y_test)
    score_ridge=clf_ridge.score(X_test,Y_test)
    print("Precision de Lasso={:3.2f}% \nPrecision de Ridge={:3.2f}%\n".format(score_lasso*100,score_ridge*100))
Example #5
def test_huber_better_r2_score():
    # Test that huber returns a better r2 score than ridge on the non-outliers.
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=100)
    huber.fit(X, y)
    linear_loss = np.dot(X, huber.coef_) + huber.intercept_ - y
    mask = np.abs(linear_loss) < huber.epsilon * huber.scale_
    huber_score = huber.score(X[mask], y[mask])
    huber_outlier_score = huber.score(X[~mask], y[~mask])

    # The Ridge regressor should be influenced by the outliers and hence
    # give a worse score on the non-outliers as compared to the huber regressor.
    ridge = Ridge(fit_intercept=True, alpha=0.01)
    ridge.fit(X, y)
    ridge_score = ridge.score(X[mask], y[mask])
    ridge_outlier_score = ridge.score(X[~mask], y[~mask])
    assert_greater(huber_score, ridge_score)

    # The huber model should also fit poorly on the outliers.
    assert_greater(ridge_outlier_score, huber_outlier_score)
Example #6
def training_predict_ridge(df):

    results =[]
    # repeat independently 10 times
    for train,test in KFold(len(df),n_folds = 10,shuffle = True):
        para = process_ridge(df.T[train].T)
        clf = Ridge(alpha = para)
        clf.fit(df[predictors].T[train].T,df[target1].T[train].values.ravel())

        sc = clf.score(df[predictors].T[test].T,df[target1].T[test].values.ravel())
        results.append(sc)
    return results
Example #7
def test_alpha_opti(X,Y,nb_tests):
    score_lasso=0
    score_ridge=0
    score_lasso_opti=0
    score_ridge_opti=0
    for i in range(0,nb_tests):
        X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=random.seed())
        clf_lasso = Lasso(selection='random', random_state=random.seed())
        clf_ridge = Ridge()
        clf_lasso.fit(X_train,Y_train)
        clf_ridge.fit(X_train,Y_train)
        score_lasso+=clf_lasso.score(X_test,Y_test)
        score_ridge+=clf_ridge.score(X_test,Y_test)
        clf_lasso_opti = Lasso(selection='random', random_state=random.seed(),alpha=0.1)
        clf_ridge_opti = Ridge(alpha=0.1)
        clf_lasso_opti.fit(X_train,Y_train)
        clf_ridge_opti.fit(X_train,Y_train)
        score_lasso_opti+=clf_lasso_opti.score(X_test,Y_test)
        score_ridge_opti+=clf_ridge_opti.score(X_test,Y_test)
    print("Lasso (opti - non-opti) : {:3.3f}%".format(100*(score_lasso_opti-score_lasso)/nb_tests))
    print("Ridge (opti - non-opti) : {:3.3f}%".format(100*(score_ridge_opti-score_ridge)/nb_tests))
Example #8
def _regression_surface(
        userdata,
        switch_indiceses,
        corpus,
        filename):
    """Analyze data and make plot of document position and length vs. labeling
    time.
    """
    doclengths = []
    positions = []
    times = []
    for user, data in userdata.items():
        curdoclengths = _get_doclengths_for_user(userdata, user, corpus)
        switch_indices = switch_indiceses[user]
        user_times = _build_data_times(user, data)
        for i in range(1, len(switch_indices)):
            if switch_indices[i] - switch_indices[i-1] == 16:
                doclengths.extend(
                    curdoclengths[switch_indices[i-1]:switch_indices[i]])
                positions.extend(np.arange(1, 17))
                times.extend(
                    user_times[switch_indices[i-1]:switch_indices[i]])
    doclengths = np.array(doclengths)
    positions = np.array(positions)
    times = np.array(times)
    model_inputs = np.stack((doclengths, positions), axis=-1)
    ridge_model = Ridge()
    ridge_model.fit(model_inputs, times)
    r2 = ridge_model.score(model_inputs, times)
    fig, axis = plt.subplots(1, 1)
    xdata = np.arange(1, 17)
    for doclength in [30, 50, 100, 200, 500, 1000]:
        inputs = np.stack((np.array([doclength]*len(xdata)), xdata), axis=-1)
        ydata = ridge_model.predict(inputs)
        axis.plot(
            xdata,
            ydata,
            linewidth=2,
            label=str(doclength))
        # apparently, all of the lines go down by 6.02762577314 from first
        # labeling time to 16th
        # axis.annotate(str(ydata[0] - ydata[-1]), (xdata[-1], ydata[-1]))
    box = axis.get_position()
    axis.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    legend = axis.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    legend.set_title('Document length (in tokens)')
    axis.set_title('$R^2=$'+str(r2))
    axis.set_xlabel('Document order')
    axis.set_ylabel('Time (seconds)')
    fig.savefig(filename, bbox_inches='tight')
Example #9
def regress( X, y, iterations = 10 ):
    ridge_model = Ridge( alpha=.1).fit(X,y)
    print("within sample R^2: "+str(ridge_model.score(X,y)))
    print('\n')

    linear_scores = []
    kernel_scores = []
    for i in range(iterations):
        ( X_train,
          X_test,
          y_train,
          y_test 
        ) = cross_validation.train_test_split( X, y, random_state=randint(0,100))

        model = Ridge( alpha=10.0 )
        model.fit(X_train,y_train)
        linear_scores.append(model.score(X_test,y_test))

    print ( 'linear scores:\tmean = '+
            str(np.average(linear_scores))+
            '\tstd dev = '+
            str(np.std(linear_scores))
          )
Example #10
def test_sag_regressor():
    """tests if the sag regressor performs well"""
    xmin, xmax = -5, 5
    n_samples = 20
    tol = .001
    max_iter = 20
    alpha = 0.1
    rng = np.random.RandomState(0)
    X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)

    # simple linear function without noise
    y = 0.5 * X.ravel()

    clf1 = Ridge(tol=tol, solver='sag', max_iter=max_iter,
                 alpha=alpha * n_samples)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)
    score1 = clf1.score(X, y)
    score2 = clf2.score(X, y)
    assert_greater(score1, 0.99)
    assert_greater(score2, 0.99)

    # simple linear function with noise
    y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()

    clf1 = Ridge(tol=tol, solver='sag', max_iter=max_iter,
                 alpha=alpha * n_samples)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)
    score1 = clf1.score(X, y)
    score2 = clf2.score(X, y)
    assert_greater(score1, 0.5)
    assert_greater(score2, 0.5)
Example #11
def build_model(train_file, test_file, attr_file, model_out, predictions_out, algorithm='ridge'):
  classifiers = ['ridge', 'linear', 'lasso', 'rf', 'en']
  if algorithm not in classifiers:
    raise NotImplementedError("only implemented algorithms: " + str(classifiers))

  train_data = pd.read_pickle(train_file)
  attrs = read_attrs(attr_file)

  target_attr = attrs[0]
  usable_attrs = attrs[1:]

  if algorithm == 'ridge':
    clf = Ridge()
  elif algorithm == 'linear':
    clf = LinearRegression()
  elif algorithm == 'lasso':
    clf = Lasso()
  elif algorithm == 'en':
    clf = ElasticNet()
  else:
    clf = RandomForestRegressor()

  clf.fit(train_data[usable_attrs], train_data[target_attr])

  test_data = pd.read_pickle(test_file)
  predictions = clf.predict(test_data[usable_attrs])
  errors = predictions - test_data[target_attr]

  prediction_results = test_data[[target_attr] + usable_attrs].copy()
  prediction_results['predicted'] = predictions
  prediction_results.to_pickle(predictions_out)

  print "Modeling '%s'" % target_attr
  print "   Train:", train_file, '(%d examples)' % len(train_data)
  print "   Test:", test_file, '(%d examples)' % len(test_data)
  print "Algorithm:", algorithm

  if hasattr(clf, 'coef_'):
    print 'Coefficients:'
    for i,c in enumerate(clf.coef_):
      print '    %-20s' % usable_attrs[i] + ':', '%20.4f' % c

  print 'MSE  : %10.4f' % np.mean(errors ** 2)
  print 'medSE: %10.4f' % np.median(errors ** 2)
  print 'SSE  : %10.4f' % np.sum(errors ** 2)
  print 'Variance score: %.4f' % clf.score(test_data[usable_attrs], test_data[target_attr])

  pickle.dump(clf, open(model_out, 'wb'))
Example #12
 def forward_selection(self, data, labels, weights, num_features):
     """Iteratively adds features to the model"""
     clf = Ridge(alpha=0, fit_intercept=True, random_state=self.random_state)
     used_features = []
     for _ in range(min(num_features, data.shape[1])):
         max_ = -100000000
         best = 0
         for feature in range(data.shape[1]):
             if feature in used_features:
                 continue
             clf.fit(data[:, used_features + [feature]], labels,
                     sample_weight=weights)
             score = clf.score(data[:, used_features + [feature]],
                               labels,
                               sample_weight=weights)
             if score > max_:
                 best = feature
                 max_ = score
         used_features.append(best)
     return np.array(used_features)
def run_full_example(df, ridge_alpha=1.0, test_set_fraction=0.5):
    
    #convert Pandas DataFrame to a feature matrix
    X,y,col_names = data_frame_to_matrix(df, 'energy', ['weather'])

    #split into training and test sets
    Xtrain,Xtest,ytrain,ytest = train_test_split(X, y, test_size=test_set_fraction)
    print('# of training samples: {}'.format(len(ytrain)))
    print('# of test samples: {}'.format(len(ytest)))
    print('alpha: {:.2f}'.format(ridge_alpha))
    print('')

    #create a Ridge object
    rr = Ridge(alpha=ridge_alpha)

    #fit the training data
    rr.fit(Xtrain, ytrain)

    #print out the weights and their names
    for weight,cname in zip(rr.coef_, col_names):
        print "{}: {:.6f}".format(cname, weight)
    print "Intercept: {:.6f}".format(rr.intercept_)
    print ''

    #compute the prediction on the test set
    ypred = rr.predict(Xtest)

    #compute the sum-of-squares error on the test set, which is
    #proportional to the log likelihood
    sqerr = np.sum((ytest - ypred)**2) / len(ytest)
    print('Normalized Sum-of-squares Error: {:.3f}'.format(sqerr))

    #compute the sum-of-squares error for a model that is just
    #comprised of the mean on the training set
    sqerr_mean_only = np.sum((ytest - ytrain.mean())**2) / len(ytest)
    print('Normalized Sum-of-squares Error for mean-only: {:.3f}'.format(sqerr_mean_only))

    #print out the R-squared on the test set
    r2 = rr.score(Xtest, ytest)
    print "R-squared: {:.2f}".format(r2)
    print ''    
    def _random_search(self, random_iter, x, y):
        # Default Values
        alpha = 1.0
        best_score = -sys.maxsize

        if random_iter > 0:
            sys.stdout.write("Do a random search %d times" % random_iter)
            param_dist = {"alpha": uniform(loc=0.0001, scale=10-0.0001)}
            param_list = [{"alpha": alpha}, ]
            param_list.extend(list(ParameterSampler(param_dist,
                                                    n_iter=random_iter-1,
                                                    random_state=self._rng)))
            for idx, d in enumerate(param_list):
                rr = Ridge(alpha=d["alpha"],
                           fit_intercept=True,
                           normalize=False,
                           copy_X=True,
                           max_iter=None,
                           tol=0.001,
                           solver='auto')

                train_x, test_x, train_y, test_y = \
                    train_test_split(x, y, test_size=0.5,
                                     random_state=self._rng)
                rr.fit(train_x, train_y)
                sc = rr.score(test_x, test_y)
                # Tiny output
                m = "."
                if idx % 10 == 0:
                    m = "#"
                if sc > best_score:
                    m = "<"
                    best_score = sc
                    alpha = d['alpha']
                sys.stdout.write(m)
                sys.stdout.flush()
            sys.stdout.write("Using alpha: %f\n" % alpha)
        return alpha
def apply_ridge( X_train, Y_train, alpha=None ):
    alphas = [ alpha ]
    if not alpha: alphas = sorted(set(x for x in [alpha, 0.1, 1.0/3.0, 1.0, 10.0/3.0, 10.0] if x))
    ALPHA_VALS = {}
    for a in alphas:
        model = Ridge(alpha=a, 
                      fit_intercept=True, 
                      normalize=False, 
                      copy_X=True, 
                      max_iter=None, 
                      tol=0.001, 
                      solver='auto')
        # sample_weights = [ 1.0/float(len(Y)) for x in Y ]
        model.fit( X_train, Y_train )# , sample_weight=sample_weights)
        R2 = model.score(X_train, Y_train)
        L2 = dot(model.coef_,model.coef_)
        ALPHA_VALS [a ] = [ a, R2, L2, [x for x in model.coef_] ]
        print "ALPHA: %.2f \t R^2=%7.4f \t L2_NORM(THETA)=%10.2f \t THETA[1:N]=%s" % ( a, R2, L2, model.coef_ )
    # A = sorted([ ALPHA_VALS[x] for x in ALPHA_VALS [ a, R2, L2, model.coef_[:] ], key=lambda x: x[1], reversed=True )
    Theta = [ float( model.intercept_ ) , ]
    Theta.extend( [ float( x ) for x in model.coef_])
    ( model, Theta, J, SCORE ) = performance_analysis( model, Theta, X_train, Y_train, debug=1 )
    return ( model, Theta, J, SCORE )
def example4():
    #generate the dataset 
    df = generate_solar_data(num_samples=1000)

    #convert Pandas DataFrame to a feature matrix
    X,y,col_names = data_frame_to_matrix(df, 'energy', ['weather'])

    #split into training and test sets
    Xtrain,Xtest,ytrain,ytest = train_test_split(X, y, test_size=0.5)

    #create a Ridge object
    rr = Ridge()

    #fit the training data
    rr.fit(Xtrain, ytrain)

    #print out the weights and their names
    for weight,cname in zip(rr.coef_, col_names):
        print "{}: {:.6f}".format(cname, weight)
    print "Intercept: {:.6f}".format(rr.intercept_)

    #print out the R-squared on the test set
    r2 = rr.score(Xtest, ytest)
    print "R-squared: {:.2f}".format(r2)
Example #17
print('Crime dataset')
print('linear model intercept: {}'.format(linreg.intercept_))
print('linear model coeff:\n{}'.format(linreg.coef_))
print('R-squared score (training): {:.3f}'.format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(linreg.score(X_test, y_test)))

# ridge regression approach --------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state = 0)

linridge = Ridge(alpha=20.0).fit(X_train, y_train)

print('Crime dataset')
print('ridge regression linear model intercept: {}'.format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'.format(linridge.coef_))
print('R-squared score (training): {:.3f}'.format(linridge.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(linridge.score(X_test, y_test)))
print('Number of non-zero features: {}'.format(np.sum(linridge.coef_ != 0)))

# ridge regression with normalization approach --------------------------------

scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state = 0)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

print('Crime dataset')
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

#Let's build a Linear regression on the Boston dataset
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X,y=mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Linreg = LinearRegression()
lr = Linreg.fit(X_train, y_train)
print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

# Ridge regression-------------------------------------------------------------
from sklearn.linear_model import Ridge
ridge=Ridge().fit(X_train,y_train)
print('Training set score : {}'.format(ridge.score(X_train,y_train)))
print('Test set score : {}'.format(ridge.score(X_test,y_test)))

"""
The Ridge model makes a trade-off between the simplicity of the model (near-zero
coefficients) and its performance on the training set. How much importance the
model places on simplicity versus training set performance can be specified by the
user, using the alpha parameter. In the previous example, we used the default parameter
alpha=1.0. There is no reason why this will give us the best trade-off, though.
The optimum setting of alpha depends on the particular dataset we are using.
Increasing alpha forces coefficients to move more toward zero, which decreases
training set performance but might help generalization. For example
"""
ridge10=Ridge(alpha=10).fit(X_train,y_train)
print('Training set score : {}'.format(ridge10.score(X_train,y_train)))
print('Test set score : {}'.format(ridge10.score(X_test,y_test)))
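
# Since the best alpha depends on the dataset, one option (a sketch reusing the same
# X_train/y_train split from above) is to let cross-validation choose it with RidgeCV:
import numpy as np
from sklearn.linear_model import RidgeCV

alphas = np.logspace(-3, 3, 13)  # candidate alphas from 0.001 to 1000
ridge_cv = RidgeCV(alphas=alphas).fit(X_train, y_train)
print('Best alpha : {}'.format(ridge_cv.alpha_))
print('Training set score : {}'.format(ridge_cv.score(X_train, y_train)))
print('Test set score : {}'.format(ridge_cv.score(X_test, y_test)))
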
y = np.array(y_list)

# =============================================================================

# PERFORM ML PREDICTION

np.random.seed(0)
split_idxs = np.random.permutation(len(X))

# Split Data (Training Testing)
X_train = X[split_idxs[:-500]]
y_train = y[split_idxs[:-500]]
X_test = X[split_idxs[-500:]]
y_test = y[split_idxs[-500:]]

# Ridge Regression
from sklearn.linear_model import Ridge
clf = Ridge(alpha=1.0)
clf.fit(X_train,y_train)

predictions = clf.predict(X_test)
print(predictions)
print(clf.score(X_test,y_test))

text_file = open("Log.txt", "a")
ts = time.time()
text_file.write("Data collected from %s" % datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S\n'))
text_file.write("Prediction Score: %f\n\n" % clf.score(X_test,y_test))
text_file.close()
    avg_train_score = 0
    avg_test_score = 0

    target_data_file = "targets_%s.dat" % target
    print "Starting to train a model to predict %s..." % target.replace('_', ' ')
    target_matrix = cPickle.load(open('2013-04-20 183207/' + target_data_file, 'r'))
    print "Converting targets to CSR Matrix to make life easier..."
    target_matrix = np.array(target_matrix)

    kf = KFold(len(target_matrix), n_folds=3, indices=True, shuffle=True)
    for train_index, test_index in kf:
        print "Beginning Fold"
        kfold_train = feature_matrix[train_index]
        kfold_test = feature_matrix[test_index]
        kfold_train_target = target_matrix[train_index]
        kfold_test_target = target_matrix[test_index]
        #clf = SGDRegressor(n_iter=1000, shuffle=True)
        clf = Ridge()
        clf.fit(kfold_train, kfold_train_target)

        score_train = clf.score(kfold_train, kfold_train_target)
        score_test = clf.score(kfold_test, kfold_test_target)

        print "R^2 Score On Training Data:", score_train
        avg_train_score += score_train
        print "R^2 Score On Validation Data:", score_test
        avg_test_score += score_test
    avg_train_score = avg_train_score/3.0
    avg_test_score = avg_test_score/3.0
    print "Average Score on Training Data:", avg_train_score
    print "Average Score on Testing Data:", avg_test_score
Example #21
                      cv=10)
search.fit(Xs, ys)
search.best_params_

# In[17]:

######Ridge
X_train, X_test, y_train, y_test = train_test_split(Xs,
                                                    ys,
                                                    test_size=0.2,
                                                    random_state=10)
ridge = Ridge(alpha=1, normalize=False)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
# # Compute and print R^2 and RMSE
print("R^2: {}".format(ridge.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(" Test Root Mean Squared Error: {}".format(rmse))

# In[30]:

y0_pred = ridge.predict(X_test)
y1_pred = ridge.predict(X_train)
# # Compute and print R^2 and RMSE
print("R^2: {}".format(ridge.score(X_test, y_test)))
rmse0 = np.sqrt(mean_squared_error(y_test, y0_pred))
rmse = np.sqrt(mean_squared_error(y_train, y1_pred))
print("Root Mean Squared Error for Test: {}".format(rmse0))
print("Root Mean Squared Error for Train: {}".format(rmse))

# In[164]:
Example #22
class Regressor():
    """
    Wraps scikit-learn regressors.


    Parameters
    ----------

    strategy : string, default = "LightGBM" (if installed else "XGBoost")
        The choice for the regressor.
        Available strategies = "LightGBM" (if installed), "XGBoost",
        "RandomForest", "ExtraTrees", "Tree", "Bagging", "AdaBoost" or "Linear"

    **params : parameters of the corresponding regressor.
        Examples : n_estimators, max_depth...

    """
    def __init__(self, **params):

        if ("strategy" in params):
            self.__strategy = params["strategy"]
        else:
            if (lgbm_installed):
                self.__strategy = "LightGBM"
            else:
                self.__strategy = "XGBoost"

        self.__regress_params = {}

        self.__regressor = None
        self.__set_regressor(self.__strategy)
        self.__col = None

        self.set_params(**params)
        self.__fitOK = False

    def get_params(self, deep=True):

        params = {}
        params["strategy"] = self.__strategy
        params.update(self.__regress_params)

        return params

    def set_params(self, **params):

        self.__fitOK = False

        if 'strategy' in params.keys():
            self.__set_regressor(params['strategy'])

            for k, v in self.__regress_params.items():
                if k not in self.get_params().keys():
                    warnings.warn("Invalid parameter for regressor " +
                                  str(self.__strategy) +
                                  ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)

        for k, v in params.items():
            if (k == "strategy"):
                pass
            else:
                if k not in self.__regressor.get_params().keys():
                    warnings.warn("Invalid parameter for regressor " +
                                  str(self.__strategy) +
                                  ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)
                    self.__regress_params[k] = v

    def __set_regressor(self, strategy):

        self.__strategy = strategy

        if (strategy == 'RandomForest'):
            self.__regressor = RandomForestRegressor(n_estimators=400,
                                                     max_depth=10,
                                                     max_features='sqrt',
                                                     bootstrap=True,
                                                     n_jobs=-1,
                                                     random_state=0)

        elif (strategy == 'XGBoost'):
            self.__regressor = XGBRegressor(n_estimators=500,
                                            max_depth=6,
                                            learning_rate=0.05,
                                            colsample_bytree=0.8,
                                            colsample_bylevel=1.,
                                            subsample=0.9,
                                            nthread=-1,
                                            seed=0)

        elif (strategy == "LightGBM"):
            if (lgbm_installed):
                self.__regressor = LGBMRegressor(n_estimators=500,
                                                 learning_rate=0.05,
                                                 colsample_bytree=0.8,
                                                 subsample=0.9,
                                                 nthread=-1,
                                                 seed=0)
            else:
                warnings.warn(
                    "Package lightgbm is not installed. Model LightGBM will be"
                    "replaced by XGBoost")
                self.__strategy = "XGBoost"
                self.__regressor = XGBRegressor(n_estimators=500,
                                                max_depth=6,
                                                learning_rate=0.05,
                                                colsample_bytree=0.8,
                                                colsample_bylevel=1.,
                                                subsample=0.9,
                                                nthread=-1,
                                                seed=0)

        elif (strategy == 'ExtraTrees'):
            self.__regressor = ExtraTreesRegressor(n_estimators=400,
                                                   max_depth=10,
                                                   max_features='sqrt',
                                                   bootstrap=True,
                                                   n_jobs=-1,
                                                   random_state=0)

        elif (strategy == 'Tree'):
            self.__regressor = DecisionTreeRegressor(
                criterion='mse',
                splitter='best',
                max_depth=None,
                min_samples_split=2,
                min_samples_leaf=1,
                min_weight_fraction_leaf=0.0,
                max_features=None,
                random_state=0,
                max_leaf_nodes=None,
                presort=False)

        elif (strategy == "Bagging"):
            self.__regressor = BaggingRegressor(base_estimator=None,
                                                n_estimators=500,
                                                max_samples=.9,
                                                max_features=.85,
                                                bootstrap=False,
                                                bootstrap_features=False,
                                                n_jobs=-1,
                                                random_state=0)

        elif (strategy == "AdaBoost"):
            self.__regressor = AdaBoostRegressor(base_estimator=None,
                                                 n_estimators=400,
                                                 learning_rate=.05,
                                                 random_state=0)

        elif (strategy == "Linear"):
            self.__regressor = Ridge(alpha=1.0,
                                     fit_intercept=True,
                                     normalize=False,
                                     copy_X=True,
                                     max_iter=None,
                                     tol=0.001,
                                     solver='auto',
                                     random_state=0)

        else:
            raise ValueError(
                "Strategy invalid. Please choose between 'LightGBM' "
                "(if installed), 'XGBoost', 'RandomForest', 'ExtraTrees', "
                "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")

    def fit(self, df_train, y_train):
        """

        Fits Regressor.

        Parameters
        ----------

        df_train : pandas dataframe of shape = (n_train, n_features)
        The train dataset with numerical features.

        y_train : pandas series of shape = (n_train, )
        The target for regression tasks.


        Returns
        -------
        self

        """

        # sanity checks
        if ((type(df_train) != pd.SparseDataFrame)
                and (type(df_train) != pd.DataFrame)):
            raise ValueError("df_train must be a DataFrame")

        if (type(y_train) != pd.core.series.Series):
            raise ValueError("y_train must be a Series")

        self.__regressor.fit(df_train.values, y_train)
        self.__col = df_train.columns
        self.__fitOK = True

        return self

    def feature_importances(self):
        """
        Computes feature importances. Regressor must be fitted before.

        Parameters
        ----------

        None

        Returns
        -------

        importance : dict
            Dictionary containing a measure of feature importance (value)
            for each feature (key).

        """

        if self.__fitOK:

            if (self.get_params()["strategy"] in ["Linear"]):

                importance = {}
                f = np.abs(self.get_estimator().coef_)

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in [
                    "LightGBM", "XGBoost", "RandomForest", "ExtraTrees", "Tree"
            ]):

                importance = {}
                f = self.get_estimator().feature_importances_

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["AdaBoost"]):

                importance = {}
                norm = self.get_estimator().estimator_weights_.sum()

                try:
                    # XGB, RF, ET, Tree and AdaBoost
                    # TODO: Refactor this part
                    f = sum(
                        weight * est.feature_importances_
                        for weight, est in zip(
                            self.get_estimator().estimator_weights_,
                            self.get_estimator().estimators_)) / norm  # noqa

                except Exception:
                    f = sum(weight * np.abs(est.coef_) for weight, est in zip(
                        self.get_estimator().estimator_weights_,
                        self.get_estimator().estimators_)) / norm  # noqa

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["Bagging"]):

                importance = {}
                importance_bag = []

                for i, b in enumerate(self.get_estimator().estimators_):

                    d = {}

                    try:
                        # XGB, RF, ET, Tree and AdaBoost
                        f = b.feature_importances_
                    except Exception:
                        f = np.abs(b.coef_)  # Linear

                    estimator = self.get_estimator()
                    items = enumerate(estimator.estimators_features_[i])
                    for j, c in items:
                        d[self.__col[c]] = f[j]

                    importance_bag.append(d.copy())

                for i, col in enumerate(self.__col):
                    importance[col] = np.mean(
                        list(filter(lambda x: x != 0, [
                            k[col] if col in k else 0 for k in importance_bag
                        ])))

            else:

                importance = {}

            return importance

        else:

            raise ValueError("You must call the fit function before !")

    def predict(self, df):
        '''

        Predicts the target.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
        The dataset with numerical features.


        Returns
        -------
        y : array of shape = (n, )
        The target to be predicted.

        '''

        try:
            if not callable(getattr(self.__regressor, "predict")):
                raise ValueError("predict attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            return self.__regressor.predict(df.values)

        else:
            raise ValueError("You must call the fit function before !")

    def transform(self, df):
        '''

        Transforms df.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
        The dataset with numerical features.


        Returns
        -------
        df_transform : pandas dataframe of shape = (n, n_selected_features)
        The transformed dataset with its most important features.

        '''

        try:
            if not callable(getattr(self.__regressor, "transform")):
                raise ValueError("transform attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            return self.__regressor.transform(df.values)
        else:
            raise ValueError("You must call the fit function before !")

    def score(self, df, y, sample_weight=None):
        """

        Returns the coefficient of determination R^2 of the prediction.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        y : pandas series of shape = (n,)
            The numerical encoded target for classification tasks.

        Returns
        -------
        score : float
        R^2 of self.predict(df) wrt. y.

        """

        try:
            if not callable(getattr(self.__regressor, "score")):
                raise ValueError("score attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame)
                    and (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            if (type(y) != pd.core.series.Series):
                raise ValueError("y must be a Series")

            return self.__regressor.score(df.values, y, sample_weight)
        else:
            raise ValueError("You must call the fit function before !")

    def get_estimator(self):
        return copy(self.__regressor)
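
# A minimal usage sketch for the Regressor wrapper above (illustrative only; the tiny
# DataFrame is made up, and it assumes the class plus the imports it relies on are
# available). The "Linear" strategy is backed by Ridge internally.
import pandas as pd

df_train = pd.DataFrame({"x1": [0.0, 1.0, 2.0, 3.0], "x2": [1.0, 0.5, 0.2, 0.1]})
y_train = pd.Series([1.0, 2.1, 2.9, 4.2])

reg = Regressor(strategy="Linear", alpha=0.5)
reg.fit(df_train, y_train)
print(reg.score(df_train, y_train))        # R^2 on the training data
print(reg.feature_importances())           # abs(coef_) per column for the "Linear" strategy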
Example #23
# create and train a few models
lr = LinearRegression(normalize=True)
lr.fit(X_train, Y_train)

lasso = Lasso(alpha=0.01)
lasso.fit(X_train, Y_train)

ridge = Ridge(alpha=0.1)
ridge.fit(X_train, Y_train)

rfr = RandomForestRegressor()
rfr.fit(X_train, Y_train)

mlp = MLPRegressor(hidden_layer_sizes=(200,), max_iter=1000)
mlp.fit(X_train, Y_train)

# print model R^2 scores for comparison (a regressor's score() method returns R^2)

acc_lr = lr.score(X_test, Y_test)
acc_lasso = lasso.score(X_test, Y_test)
acc_ridge = ridge.score(X_test, Y_test)
acc_rfr = rfr.score(X_test, Y_test)
acc_mlp = mlp.score(X_test, Y_test)

print "LinearRegression: ", acc_lr
print "Lasso: ", acc_lasso
print "Ridge: ", acc_ridge
print "RandomForestRegressor: ", acc_rfr
print "MLPRegressor: ", acc_mlp
Example #24
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=3)
print(len(X_test), len(y_test))
lr = LinearRegression()
lr.fit(X_train, y_train)
rr = Ridge(
    alpha=0.01
)  # the higher the alpha value, the more the coefficients are restricted; with a low alpha
# the coefficients are barely restricted and ridge closely resembles plain linear regression
rr.fit(X_train, y_train)
rr100 = Ridge(alpha=100)  # a high alpha value for comparison
rr100.fit(X_train, y_train)
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
Ridge_train_score = rr.score(X_train, y_train)
Ridge_test_score = rr.score(X_test, y_test)
Ridge_train_score100 = rr100.score(X_train, y_train)
Ridge_test_score100 = rr100.score(X_test, y_test)
print "linear regression train score:", train_score
print "linear regression test score:", test_score
print "ridge regression train score low alpha:", Ridge_train_score
print "ridge regression test score low alpha:", Ridge_test_score
print "ridge regression train score high alpha:", Ridge_train_score100
print "ridge regression test score high alpha:", Ridge_test_score100
# plt.plot(rr.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markers
# plt.plot(rr100.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Ridge; $\alpha = 100$') # alpha here is for transparency
# plt.plot(lr.coef_,alpha=0.4,linestyle='none',marker='o',markersize=7,color='green',label='Linear Regression')
# plt.xlabel('Coefficient Index',fontsize=16)
# plt.ylabel('Coefficient Magnitude',fontsize=16)
# plt.legend(fontsize=13,loc=4)
Example #25
# Code starts here
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
r2_lasso = lasso.score(X_test, y_test)
print(r2_lasso)

# --------------
from sklearn.linear_model import Ridge

# Code starts here
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
r2_ridge = ridge.score(X_test, y_test)
print(r2_ridge)
# Code ends here

# --------------
from sklearn.model_selection import cross_val_score

#Code starts here
regressor = LinearRegression()

# Initiate cross validation score
score = cross_val_score(regressor, X_train, y_train, scoring='r2', cv=10)
print(score)
#calculate mean of the score
mean_score = np.mean(score)
Example #26
def create_model(df, y, X, X_train, X_test, y_train, y_test, degree,
                 random_state, test_size, alpha):

    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    ss = StandardScaler()
    ss.fit(X_train)

    X_train_scaled = ss.transform(X_train)
    X_test_scaled = ss.transform(X_test)

    linreg_norm = LinearRegression()
    linreg_norm.fit(X_train_scaled, y_train)

    X_cat = df[['Month', 'Origin', 'Dest']]
    X_train_cat, X_test_cat, y_train, y_test = train_test_split(
        X_cat, y, test_size=test_size, random_state=random_state)
    # OneHotEncode Categorical variables
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(X_train_cat)

    X_train_ohe = ohe.transform(X_train_cat)
    X_test_ohe = ohe.transform(X_test_cat)

    columns = ohe.get_feature_names(input_features=X_train_cat.columns)
    cat_train_df = pd.DataFrame(X_train_ohe.todense(), columns=columns)
    cat_test_df = pd.DataFrame(X_test_ohe.todense(), columns=columns)
    X_train_all = pd.concat([pd.DataFrame(X_train_scaled), cat_train_df],
                            axis=1)
    X_test_all = pd.concat([pd.DataFrame(X_test_scaled), cat_test_df], axis=1)
    linreg_all = LinearRegression()
    linreg_all.fit(X_train_all, y_train)

    print('Baseline model Continuous and Categorical')
    print('Training r^2:', linreg_all.score(X_train_all, y_train))
    print('Testing r^2:', linreg_all.score(X_test_all, y_test))
    print('Training MSE:',
          mean_squared_error(y_train, linreg_all.predict(X_train_all)))
    print('Testing MSE:',
          mean_squared_error(y_test, linreg_all.predict(X_test_all)))

    print("\n")

    lasso = Lasso(alpha=alpha)  #Lasso is also known as the L1 norm.
    lasso.fit(X_train_all, y_train)
    print('Lasso')
    print('Training r^2:', lasso.score(X_train_all, y_train))
    print('Testing r^2:', lasso.score(X_test_all, y_test))
    print('Training MSE:',
          mean_squared_error(y_train, lasso.predict(X_train_all)))
    print('Testing MSE:', mean_squared_error(y_test,
                                             lasso.predict(X_test_all)))

    print("\n")

    ridge = Ridge(alpha=alpha)  #Ridge is also known as the L2 norm.
    ridge.fit(X_train_all, y_train)
    print('Ridge')
    print('Training r^2:', ridge.score(X_train_all, y_train))
    print('Testing r^2:', ridge.score(X_test_all, y_test))
    print('Training MSE:',
          mean_squared_error(y_train, ridge.predict(X_train_all)))
    print('Testing MSE:', mean_squared_error(y_test,
                                             ridge.predict(X_test_all)))

    print("\n")

    poly_features = PolynomialFeatures(degree)

    # transforms the existing features to higher degree features.
    X_train_poly = poly_features.fit_transform(X_train)

    # fit the transformed features to Linear Regression
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)

    # predicting on training data-set
    y_train_predicted = poly_model.predict(X_train_poly)

    # predicting on test data-set
    y_test_predict = poly_model.predict(poly_features.fit_transform(X_test))

    # evaluating the model on training dataset
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    r2_train = r2_score(y_train, y_train_predicted)

    # evaluating the model on test dataset
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predict))
    r2_test = r2_score(y_test, y_test_predict)

    print("\n")

    print(" Polynomial training set")

    print("MSE of training set is {}".format(rmse_train))
    print("R2 score of training set is {}".format(r2_train))

    print("\n")

    print("Polynomial test set")

    print("MSE of test set is {}".format(rmse_test))
    print("R2 score of test set is {}".format(r2_test))

    print("\n")

    print('Cross Validation for Polynomial model')

    lm = LinearRegression()

    # store scores in scores object
    # we can't use accuracy as our evaluation metric since that's only relevant for classification problems
    # RMSE is not directly available so we will use MSE
    scores = cross_val_score(lm, X_train_poly, y_train, cv=10, scoring='r2')
    mse_scores = cross_val_score(lm,
                                 X_train_poly,
                                 y_train,
                                 cv=10,
                                 scoring='neg_mean_squared_error')
    print('Cross Validation Mean r2:', np.mean(scores))
    print('Cross Validation Mean MSE:', np.mean(mse_scores))
    print('Cross Validation 10 Fold Score:', scores)
    print('Cross Validation 10 Fold mean squared error', -(mse_scores))
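
    # mse_scores holds negated MSEs (scoring='neg_mean_squared_error'), so per-fold
    # RMSEs can be recovered by taking the square root of their negation.
    rmse_scores = np.sqrt(-mse_scores)
    print('Cross Validation Mean RMSE:', np.mean(rmse_scores))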
Example #27
y = boston.target

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=66,
                                                    shuffle=True,
                                                    test_size=0.2)

from sklearn.linear_model import LinearRegression, Ridge, Lasso

# models
model1 = LinearRegression()
model2 = Ridge()
model3 = Lasso()

model1.fit(x_train, y_train)
model2.fit(x_train, y_train)
model3.fit(x_train, y_train)

linear_score = model1.score(x_test, y_test)
ridge_score = model2.score(x_test, y_test)
lasso_score = model3.score(x_test, y_test)

# evaluation
print('linear_score: ', linear_score)
print('ridge_score: ', ridge_score)
print('lasso_score: ', lasso_score)

# y_pred = model1.predict(x_test)
# print(y_pred)
Example #28
################################################## RIDGE REGRESSION

# PARAMETER TUNING

features = ['c1','c2','c3','c4','c5','c6','c7','c8']

msk = np.random.rand(len(tf)) < 0.8
train = tf[msk].reset_index(drop=True)
test = tf[~msk].reset_index(drop=True)

row_list = []

for n in range(0,1001):
    clf = Ridge(alpha=n)
    clf.fit(train[features],train.nrtg)
    score = clf.score(test[features],test.nrtg)
    dict1 = {'alpha':n,'score':score}
    row_list.append(dict1)
    
alpha_df = pd.DataFrame(row_list)

alpha = alpha_df[alpha_df.score == alpha_df.score.max()].alpha.values[0]

# RIDGE REGRESSION

clf = Ridge(alpha=alpha)

clf.fit(tf[features],tf.nrtg)

coefficients = clf.coef_
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import mglearn

X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
ridge = Ridge().fit(X_train, y_train)
print("[default value of alpha]")
print("training set score: %f" % ridge.score(X_train, y_train))
print("test set score: %f" % ridge.score(X_test, y_test))

# Model with high value of alpha (regularization parameter)
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("[alpha 10]")
print("training set score: %f" % ridge10.score(X_train, y_train))
print("test set score: %f" % ridge10.score(X_test, y_test))

# Model with low value of alpha
ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print("[alpha 0.1]")
print("training set score: %f" % ridge01.score(X_train, y_train))
print("test set score: %f" % ridge01.score(X_test, y_test))

plt.title("ridge_coefficients")
plt.plot(ridge.coef_, 'o', label="Ridge alpha=1")
plt.plot(ridge10.coef_, 'o', label="Ridge alpha=10")
plt.plot(ridge01.coef_, 'o', label="Ridge alpha=0.1")
plt.ylim(-25, 25)
plt.legend()
plt.show()

pdx = wine_quality[all_colnms]
pdy = wine_quality["quality"]

x_train,x_test,y_train,y_test = train_test_split(pdx,pdy,train_size = 0.7,random_state=42)

alphas = [1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0]

initrsq = 0

print ("\nRidge Regression: Best Parameters\n")
for alph in alphas:
    ridge_reg = Ridge(alpha=alph) 
    ridge_reg.fit(x_train,y_train)    
    tr_rsqrd = ridge_reg.score(x_train,y_train)
    ts_rsqrd = ridge_reg.score(x_test,y_test)    

    if ts_rsqrd > initrsq:
        print ("Lambda: ",alph,"Train R-Squared value:",round(tr_rsqrd,5),"Test R-squared value:",round(ts_rsqrd,5))
        initrsq = ts_rsqrd

# Coefficients of Ridge regression for the best alpha value
ridge_reg = Ridge(alpha=0.001) 
ridge_reg.fit(x_train,y_train) 
 

print ("\nRidge Regression coefficient values of Alpha = 0.001\n")
for i in range(11):
    print (all_colnms[i],": ",ridge_reg.coef_[i])
Example #31
#plt.show()

data=pd.read_csv('ridge.csv')
# plot the traffic count data
plt.plot(data['TRAFFIC_COUNT'])
plt.show()

X=data[data.columns[1:5]]  # feature columns
y=data['TRAFFIC_COUNT']  # traffic counts (the values to predict)
poly=PolynomialFeatures(5)  # degree 5 worked well in testing
# X becomes the polynomial feature matrix
X=poly.fit_transform(X)
# split all data into training and test sets; test_size is the test fraction, random_state is the random seed
train_set_X, test_set_X , train_set_y, test_set_y = cross_validation.train_test_split(X,y,test_size=0.3,random_state=0)
# create the ridge regression instance
clf=Ridge(alpha=1.0,fit_intercept = True)
# fit the regressor on the training set
clf.fit(train_set_X,train_set_y)
# compute the goodness of fit (R^2) on the test set; clf.score returns about 0.7375
# R^2 measures how good the fit is: its maximum is 1, it has no lower bound, and a model
# that outputs the same value for every input scores 0
clf.score(test_set_X,test_set_y)

start=200  # next, plot the fitted curve over the range 200 to 300
end=300
y_pre=clf.predict(X)  # fitted values from predict
time=np.arange(start,end)
plt.plot(time,y[start:end],'b', label="real")
plt.plot(time,y_pre[start:end],'r', label='predict')
# show the real data (blue) and the fitted curve (red)
plt.legend(loc='upper left')  # set the legend position
plt.show()
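
# A tiny self-contained check of the R^2 description in the comments above: a model
# that predicts the same value (the mean of y) for every input scores exactly 0.
from sklearn.metrics import r2_score

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_const = np.full_like(y_true, y_true.mean())
print(r2_score(y_true, y_const))  # prints 0.0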
Example #32
        # Initialize scikit-learn ridge regression model
        model_ridge_scikit = RidgeRegression(alpha=alpha)

        # Trains scikit-learn ridge regression model
        model_ridge_scikit.fit(x_poly_train, y_train)

        print('Results for scikit-learn RidgeRegression model with alpha={}'.
              format(alpha))

        # Test model on training set
        score_mse_ridge_scikit_train = score_mean_squared_error(
            model_ridge_scikit, x_poly_train, y_train)
        print('Training set mean squared error: {:.4f}'.format(
            score_mse_ridge_scikit_train))

        score_r2_ridge_scikit_train = model_ridge_scikit.score(
            x_poly_train, y_train)
        print('Training set r-squared scores: {:.4f}'.format(
            score_r2_ridge_scikit_train))

        # Save MSE and R-squared training scores
        scores_mse_ridge_scikit_train.append(score_mse_ridge_scikit_train)
        scores_r2_ridge_scikit_train.append(score_r2_ridge_scikit_train)

        # Test model on validation set
        score_mse_ridge_scikit_val = score_mean_squared_error(
            model_ridge_scikit, x_poly_val, y_val)
        print('Validation set mean squared error: {:.4f}'.format(
            score_mse_ridge_scikit_val))

        score_r2_ridge_scikit_val = model_ridge_scikit.score(x_poly_val, y_val)
        print('Validation set r-squared scores: {:.4f}'.format(
            score_r2_ridge_scikit_val))
Example #33
print ("Linear regression (order 5) score is: {0}".format(lr_5_model.score(X_test_poly, y_test)))

plt.plot(xx, yy_poly)
plt.plot(X_test, y_test, "o")
plt.ylim([0, 30])
plt.title("Linear regression (order 5) result")
plt.show()


ridge_model = Ridge(alpha=1, normalize=False)
ridge_model.fit(X_train_poly, y_train)
yy_ridge = ridge_model.predict(xx_poly)

# Todo: write to report
print ("Ridge regression (order 5) score is: {0}".format(ridge_model.score(X_test_poly, y_test)))
print ("y2= {0} + {1} x + {2} x*x + {3} x*x*x + {4} x*x*x*x +{5} x*x*x*x*x".
       format(ridge_model.intercept_[0], ridge_model.coef_[0][0], ridge_model.coef_[0][1], ridge_model.coef_[0][2],
              ridge_model.coef_[0][3], ridge_model.coef_[0][4]))

plt.plot(xx, yy_ridge)
plt.plot(X_test, y_test, "o")
plt.ylim([0, 30])
plt.title("Ridge regression (order 5) result")
plt.show()

# Compare
# 1. The model with the highest score is: Ridge model (order 5)
# 2. Ridge model can prevent over-fitting: yes
# 3. Ridge model is nearly equivalent to LR model (order 5) if alpha=0: yes
# 4. A larger alpha results in a larger coefficient for x*x*x*x*x: no
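
# A quick check of point 3 above (a sketch reusing X_train_poly, X_test_poly, y_train,
# y_test and lr_5_model from this example): with a near-zero alpha, ridge scores
# essentially the same as the order-5 linear regression.
ridge_no_reg = Ridge(alpha=1e-8, normalize=False)
ridge_no_reg.fit(X_train_poly, y_train)
print("Ridge (alpha ~ 0, order 5) score is: {0}".format(ridge_no_reg.score(X_test_poly, y_test)))
print("Linear regression (order 5) score is: {0}".format(lr_5_model.score(X_test_poly, y_test)))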
Example #34
    linreg.score(X_train, y_train)))
print("R-Squared Value for Test Set: {:.3f}".format(
    linreg.score(X_test, y_test)))

# KNeighborsRegressor
knnreg = KNeighborsRegressor(n_neighbors=2)
knnreg.fit(X_train, y_train)

print('R-squared train score: {:.3f}'.format(knnreg.score(X_train, y_train)))
print('R-squared test score: {:.3f}'.format(knnreg.score(X_test, y_test)))

# Ridge
ridge = Ridge()
ridge.fit(X_train, y_train)

print('R-squared score (training): {:.3f}'.format(ridge.score(
    X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(ridge.score(X_test, y_test)))

# Lasso
lasso = Lasso(max_iter=10000)
lasso.fit(X_train, y_train)

print('R-squared score (training): {:.3f}'.format(lasso.score(
    X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(lasso.score(X_test, y_test)))

lasso = Lasso(alpha=100, max_iter=10000)
lasso.fit(train_processed, train['revenue'])
results = lasso.predict(test_processed)
results_2 = np.exp(results)
print(results_2)
Example #35
'''
In ridge regression the coefficients (w) are chosen not only so that they
predict well on the training data, but also so that they satisfy an additional
constraint: we want the magnitude of the coefficients to be as small as
possible. In other words, all entries of w should be close to zero, which
means each feature should have as little influence on the outcome as possible.
'''

from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

'''
Increasing alpha forces the coefficients to shrink toward values close to zero,
which lowers performance on the training set but may improve generalization.
'''
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test)))

# With very small values of alpha the constraint on the coefficients is barely
# enforced, and we end up with a model that resembles plain linear regression
ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge01.score(X_train, y_train)))
Example #36
#Evaluate the model
plt.figure(figsize=(15, 10))

ft_importances_lm.plot(kind='barh')
plt.show()

#R2 Value

print("RSquare Value for Simple Regresssion TEST data is-")
print(np.round(lm.score(features_test, labels_test) * 100, 2))

print("RSquare Value for Lasso Regresssion TEST data is-")
print(np.round(lm_lasso.score(features_test, labels_test) * 100, 2))

print("RSquare Value for Ridge Regresssion TEST data is-")
print(np.round(lm_ridge.score(features_test, labels_test) * 100, 2))

print("RSquare Value for Elastic Net Regresssion TEST data is-")
print(np.round(lm_elastic.score(features_test, labels_test) * 100, 2))

#Predict on test and training data

predict_test_lm = lm.predict(features_test)
predict_test_lasso = lm_lasso.predict(features_test)
predict_test_ridge = lm_ridge.predict(features_test)
predict_test_elastic = lm_elastic.predict(features_test)

#Print the Loss Function - MSE & MAE

import numpy as np
from sklearn import metrics
def prediction_ridge (X_train, Y_train, X_test, Y_test,alpha,normalize):

    # Print shapes of the training and testing data sets
    #print ("Shapes of the training and testing data sets")
    #print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    #Create our regression object

    lreg = Ridge(alpha=alpha, normalize=normalize)

    #do a linear regression, except only on the training
    lreg.fit(X_train,Y_train)

    #print("The estimated intercept coefficient is %.2f " %lreg.intercept_)
    #print("The number of coefficients used was %d " % len(lreg.coef_))



    # Set a DataFrame from the Facts
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]


    # Set a new column lining up the coefficients from the linear regression
    coeff_df["Coefficient"] = pd.Series(lreg.coef_)


    # Show
    #coeff_df

    #highest correlation between a fact and fraction votes
    #print ("Highest correlation fact: %s is %.9f" % (cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"], coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]) )

    #sns_plot = sns.jointplot(coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"Fraction Votes",pd.merge(X_test,pd.DataFrame(Y_test), right_index=True, left_index=True),kind="scatter")


    #Predictions on training and testing sets
    pred_train = lreg.predict(X_train)
    pred_test = lreg.predict(X_test)

    # The mean square error
    #print("Fit a model X_train, and calculate MSE with Y_train: %.6f"  % np.mean((Y_train - pred_train) ** 2))
    #print("Fit a model X_train, and calculate MSE with X_test and Y_test: %.6f"  %np.mean((Y_test - pred_test) ** 2))

    #Explained variance score: 1 is perfect prediction
    #print("Variance score: %.2f" % lreg.score(X_test, Y_test))

    result={}
    result["method"]="Ridge %.3f  " %alpha
    if normalize :
        result["normalize"]="Y"
    else:
        result["normalize"]="N"
    result["X_train_shape"]=X_train.shape
    result["Y_train_shape"]=Y_train.shape
    result["X_test_shape"]=X_test.shape
    result["Y_test_shape"]=Y_test.shape
    result["intercept"]=lreg.intercept_
    result["num_coef"]=len(lreg.coef_)
    result["max_fact"]=cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"]
    result["max_fact_value"]=coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]
    result["MSE_train"]=np.mean((Y_train - pred_train) ** 2)
    result["MSE_test"]=np.mean((Y_test - pred_test) ** 2)
    result["variance"]=lreg.score(X_test, Y_test)
    return pred_test,coeff_df,pred_train,result
Example #38
0

def load_extended_boston():
    boston = load_boston()
    X = MinMaxScaler().fit_transform(boston.data)
    X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
    return X, boston.target


X, y = load_extended_boston()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train, y_train)
print("train accuracy: {:.2f}".format(ridge.score(X_train, y_train)))
print("test accuracy: {:.2f}".format(ridge.score(X_test, y_test)))

ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("train accuracy: {:.2f}".format(ridge10.score(X_train, y_train)))
print("test accuracy: {:.2f}".format(ridge10.score(X_test, y_test)))

ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print("train accuracy: {:.2f}".format(ridge01.score(X_train, y_train)))
print("test accuracy: {:.2f}".format(ridge01.score(X_test, y_test)))

lr = LinearRegression().fit(X_train, y_train)

plt.plot(ridge10.coef_, '^', label="Ridge alpha=10")
plt.plot(ridge.coef_, 's', label="Ridge alpha=1")
plt.plot(ridge01.coef_, 'v', label="Ridge alpha=0.1")
Example #39
0
from sklearn.linear_model import Ridge

ridge = Ridge()

ridge.fit(X_train, y_train)


# In[12]:

pred_test = ridge.predict(X_test)
pred_test


# In[13]:

ridge.score(X_test, y_test)


# In[14]:

#MSE

from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, pred_test)


# In[16]:

#RandomForestRegressor is imported here, but a Ridge model is fit below

from sklearn.ensemble import RandomForestRegressor
x_train, x_test, y_train, y_test = train_test_split(sal_munged, y, test_size=0.3)
x_train = x_train.reshape(-1, x_train.shape[1])

regr = Ridge().fit(x_train, y_train)


### MODEL PERFORMANCE ###

# The Mean Squared Error
print("Mean Squared Error, training data: %d" % np.mean((regr.predict(x_train) - y_train) ** 2))
print("Mean Squared Error, test data: %d" % np.mean((regr.predict(x_test) - y_test) ** 2))
print(30 * "* ")

# Variance score
print("Variance score, training data: %.2f" % regr.score(x_train, y_train))
print("Variance score, test data: %.2f" % regr.score(x_test, y_test))
print(30 * "* ")

### GRAPHS: DISTRIBUTION OF ERROR ###
print("Distribution of prediction error on training data:")
predError = regr.predict(x_train) - y_train
plt.hist(predError)
plt.xlim(-80000, 80000)
plt.show()

print("Distribution of prediction error on test data:")
predError = regr.predict(x_test) - y_test
plt.hist(predError)
plt.xlim(-80000, 80000)
plt.show()
Example #41
0
    def TrainModel(self):
        self.browser.clear()
        X_train, X_test, y_train, y_test = self.X_train, self.X_test, self.y_train, self.y_test
        X_train1, X_test1, y_train1, y_test1 = X_train.values, X_test.values, y_train.values, y_test.values

        y_train2 = y_train1.reshape(-1, 1)
        y_test2 = y_test1.reshape(-1, 1)

        scalerX = preprocessing.StandardScaler().fit(X_train1)
        scalery = preprocessing.StandardScaler().fit(y_train2)

        X_train3 = scalerX.transform(X_train1)
        X_test3 = scalerX.transform(X_test1)
        y_train3 = scalery.transform(y_train2)
        y_test3 = scalery.transform(y_test2)

        self.browser.append("Load Dataset")
        self.browser.append("")
        self.browser.append("")

        # LinearRegression Model
        lm = LinearRegression()
        lm.fit(X_train, y_train)
        y_pred_lm = lm.predict(X_test)
        acc_lm_train = round(lm.score(X_train, y_train) * 100, 2)
        acc_lm_test = round(lm.score(X_test, y_test) * 100, 2)
        self.browser.append("<LinearRegression Model>")
        self.browser.append("Train acc : " + str(acc_lm_train) + "%")
        self.browser.append("Test acc : " + str(acc_lm_test) + "%")
        self.browser.append("")
        #time.sleep(3)

        # Ridge Regression Model
        ridge = Ridge(alpha=0.1)
        ridge.fit(X_train, y_train)
        y_pred_ridge = ridge.predict(X_test)
        acc_ridge_train = round(ridge.score(X_train, y_train) * 100, 2)
        acc_ridge_test = round(ridge.score(X_test, y_test) * 100, 2)
        self.browser.append("<Ridge Regression Model>")
        self.browser.append("Train acc : " + str(acc_ridge_train) + "%")
        self.browser.append("Test acc : " + str(acc_ridge_test) + "%")
        self.browser.append("Used Coefficient : " +
                            str(np.sum(ridge.coef_ != 0)))
        self.browser.append("")
        #time.sleep(3)

        # Lasso Regression Model
        lasso = Lasso(alpha=0.1, max_iter=100000)
        lasso.fit(X_train, y_train)
        y_pred_lasso = lasso.predict(X_test)
        acc_lasso_train = round(lasso.score(X_train, y_train) * 100, 2)
        acc_lasso_test = round(lasso.score(X_test, y_test) * 100, 2)
        self.browser.append("<Lasso Regression Model>")
        self.browser.append("Train acc : " + str(acc_lasso_train) + "%")
        self.browser.append("Test acc : " + str(acc_lasso_test) + "%")
        self.browser.append("Used Coefficient : " +
                            str(np.sum(lasso.coef_ != 0)))
        self.browser.append("")

        # SGD Regression
        sgd = SGDRegressor(loss="squared_loss",
                           penalty=None,
                           random_state=42,
                           max_iter=100000)
        sgd.fit(X_train3, y_train3)
        y_pred_sgd = sgd.predict(X_test3)
        acc_sgd_train = round(sgd.score(X_train3, y_train3) * 100, 2)
        acc_sgd_test = round(sgd.score(X_test3, y_test3) * 100, 2)
        self.browser.append("<Stochastic Gradient Descent Regression>")
        self.browser.append("Train acc : " + str(acc_sgd_train) + "%")
        self.browser.append("Test acc : " + str(acc_sgd_test) + "%")
        self.browser.append("")

        # Decision Tree's
        etr = ExtraTreesRegressor()
        etr.fit(X_train, y_train)
        y_pred_etr = etr.predict(X_test)
        acc_etr_train = round(etr.score(X_train, y_train) * 100, 2)
        acc_etr_test = round(etr.score(X_test, y_test) * 100, 2)
        self.browser.append("<Extra Trees Regressor(Random Forest)>")
        self.browser.append("Train acc : " + str(acc_etr_train) + "%")
        self.browser.append("Test acc : " + str(acc_etr_test) + "%")
        self.browser.append("")

        #SVR
        svr = SVR()
        svr.fit(X_train3, y_train3)
        y_pred_svr = svr.predict(X_test3)
        acc_svr_train = round(svr.score(X_train3, y_train3) * 100, 2)
        acc_svr_test = round(svr.score(X_test3, y_test3) * 100, 2)
        self.browser.append("<Support Vector Machine>")
        self.browser.append("Train acc : " + str(acc_svr_train) + "%")
        self.browser.append("Test acc : " + str(acc_svr_test) + "%")
        self.browser.append("")

        models = pd.DataFrame({
            'Model': [
                'LinearRegression', 'Ridge Regression', 'Lasso Regression',
                'SGD Regression', 'Extra Trees Regressor',
                'Support Vector Machine'
            ],
            'Score': [
                acc_lm_test, acc_ridge_test, acc_lasso_test, acc_sgd_test,
                acc_etr_test, acc_svr_test
            ]
        })

        models = models.sort_values(by='Score', ascending=True)
        models = PandasModelTrainData(models)
        self.tableView = QTableView()
        self.tableView.setSortingEnabled(True)
        self.tableView.setModel(models)
        self.tableView.setGeometry(850, 100, 320, 400)
        self.tableView.setColumnWidth(0, 200)
        self.tableView.sortByColumn(1, Qt.DescendingOrder)
        self.tableView.setWindowTitle("Accuracy")
        self.tableView.show()
def ridge_reg(X,Y,data_file,p=False):
    """
    Does ridge regression on the data provided

    Inputs
    ------
    X :         Columns of the pandas dataframe that contain the data for each
                of the descriptors to be used

    Y :         Column of the pandas dataframe that contains the values to be
                predicted

    data_file : String containing the name of the file the model statistics will
                be stored in, where the RMSE and R-Squared values for each model
                will be stored

    Outputs
    -------
    coefs :     Contains a list of the coefficient for each descriptor used
    """

    X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=3)

    high_score = 0
    alpha_ = 0
    #coefs = np.zeros(19)

    rr0001 = Ridge(alpha=0.001)
    rr0001.fit(X_train, y_train)
    Ridge_train_score0001 = rr0001.score(X_train,y_train)
    Ridge_test_score0001 = rr0001.score(X_test, y_test)
    high_score = Ridge_test_score0001
    alpha_ = 0.001
    coefs = rr0001.coef_
    pred = rr0001.predict(X_test)
    rmse = np.sqrt(MSE(y_test, pred))

    rr001 = Ridge(alpha=0.01)
    rr001.fit(X_train, y_train)
    Ridge_train_score001 = rr001.score(X_train,y_train)
    Ridge_test_score001 = rr001.score(X_test, y_test)
    if(Ridge_test_score001 > high_score):
        high_score = Ridge_test_score001
        alpha_ = 0.01
        coefs = rr001.coef_
        pred = rr001.predict(X_test)
        rmse = np.sqrt(MSE(y_test, pred))

    rr01 = Ridge(alpha=0.1)
    rr01.fit(X_train, y_train)
    Ridge_train_score01 = rr01.score(X_train,y_train)
    Ridge_test_score01 = rr01.score(X_test, y_test)
    if(Ridge_test_score01 > high_score):
        high_score = Ridge_test_score01
        alpha_ = 0.1
        coefs = rr01.coef_
        pred = rr01.predict(X_test)
        rmse = np.sqrt(MSE(y_test, pred))

    rr10 = Ridge(alpha=10)
    rr10.fit(X_train, y_train)
    Ridge_train_score10 = rr10.score(X_train,y_train)
    Ridge_test_score10 = rr10.score(X_test, y_test)
    if(Ridge_test_score10 > high_score):
        high_score = Ridge_test_score10
        alpha_ = 10
        coefs = rr10.coef_
        pred = rr10.predict(X_test)
        rmse = np.sqrt(MSE(y_test, pred))

    rr100 = Ridge(alpha=100)
    rr100.fit(X_train, y_train)
    Ridge_train_score100 = rr100.score(X_train,y_train)
    Ridge_test_score100 = rr100.score(X_test, y_test)
    if(Ridge_test_score100 > high_score):
        high_score = Ridge_test_score100
        alpha_ = 100
        coefs = rr100.coef_
        pred = rr100.predict(X_test)
        rmse = np.sqrt(MSE(y_test, pred))

    rr1000 = Ridge(alpha=1000)
    rr1000.fit(X_train, y_train)
    Ridge_train_score1000 = rr1000.score(X_train,y_train)
    Ridge_test_score1000 = rr1000.score(X_test, y_test)
    if(Ridge_test_score1000 > high_score):
        high_score = Ridge_test_score1000
        alpha_ = 1000
        coefs = rr1000.coef_
        pred = rr1000.predict(X_test)
        rmse = np.sqrt(MSE(y_test, pred))

    data_file.write('\n\t\tRidge Regression Score with alpha=%f: \t%f' % (alpha_, high_score))
    data_file.write('\n\t\t\tRMSE: \t\t%f' % (rmse))

    if(p==True):
        print('\n\t\tRidge Regression Score with alpha=%f: \t%f' % (alpha_, high_score))
        print('\n\t\tRMSE: \t\t%f' % (rmse))

    return np.concatenate((rr001.coef_, rr10.coef_, rr100.coef_, rr1000.coef_), axis=0), np.array(coefs)
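# A more compact sketch of the same alpha search as in ridge_reg above, assuming a
# train/test split (X_train, X_test, y_train, y_test) and MSE (mean_squared_error)
# imported as in that function.
best = {"alpha": None, "score": None, "coefs": None, "rmse": None}
for alpha in [0.001, 0.01, 0.1, 10, 100, 1000]:
    rr = Ridge(alpha=alpha).fit(X_train, y_train)
    test_score = rr.score(X_test, y_test)
    if best["alpha"] is None or test_score > best["score"]:
        best = {"alpha": alpha, "score": test_score, "coefs": rr.coef_,
                "rmse": np.sqrt(MSE(y_test, rr.predict(X_test)))}
print("Ridge Regression Score with alpha=%f: \t%f" % (best["alpha"], best["score"]))
print("RMSE: \t\t%f" % best["rmse"])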
Example #43
0
    X1 = X_train_reduced[train]
    Y1 = Y_train_raw[train]

    X2 = X_train_reduced[test]
    Y2 = Y_train_raw[test]

    ## Train Classifiers on fold
    rdg_clf = Ridge(alpha=0.5)
    rdg_clf.fit(X1, Y1)
    lso_clf = Lasso(alpha=0.6257)
    lso_clf.fit(X1, Y1)
    svr_clf = LinearSVR(C=1e3)
    svr_clf.fit(X1, Y1)

    ## Score Classifiers on fold
    rdg_clf_score = rdg_clf.score(X2, Y2)
    lso_clf_score = lso_clf.score(X2, Y2)
    svr_clf_score = svr_clf.score(X2, Y2)

    print "Ridge:  ", rdg_clf_score
    print "Lasso:  ", lso_clf_score
    print "SVR_RBF:  ", svr_clf_score


## Train final Classifiers
# clf = Ridge(alpha=.5)
clf = LinearSVR(C=1e3)
clf.fit(X_train_reduced, Y_train_raw)
Y_predicted = clf.predict(X_test_reduced)

## Save results to csv
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

train = train.fillna(0.0)
test = test.fillna(0.0)

x_train = df
y_train = x_train.nota.values
del x_train['nota']


hyperparams = {'alpha':[0.0005, 0.0014, 0.0006, 0.00061, 0.000612, 0.000613001, 0.000614, 0.00061401, 0.00061402, 0.00061403, 0.0006104 ]}
gs = GridSearchCV(estimator=Ridge(normalize=True), param_grid=hyperparams)
gs.fit(x_train, y_train)
pred = pd.Series(gs.predict(test))
err = gs.score(x_train, y_train)
print('Result:')
print('Best parameter: ',gs.best_params_)
print('Best score: ',gs.best_score_)
print('Training set R^2 (gs.score): ', err)
print('\n')

ridge2 = Ridge(alpha = 0.0005, normalize=True)	
ridge2.fit(x_train, y_train)
print(ridge2.score(x_train, y_train))
result = pd.DataFrame(ridge2.predict(test), index = test.index, columns=['nota'])
print(result)
#result = result.drop_duplicates(subset='atleta_id', keep="last")
#result['atleta_id'] = result['atleta_id'].apply(lambda x:str(x))
result.to_csv('submission.csv')

#################
#Regularization
##################

#Ridge regression (L2) Penalty (alpha Regularization Parameter)
#Ridge Regression leads to dense solutions, in which most coefficients are non-zero

from sklearn.linear_model import Ridge
ridge_models = {}
training_scores = []
test_scores = []

for alpha in [100, 10, 1, .01]:
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    training_scores.append(ridge.score(X_train, y_train))
    test_scores.append(ridge.score(X_test, y_test))
    ridge_models[alpha] = ridge

plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [100, 10, 1, .01])
plt.legend(loc="best")


#Lasso (L1) Penalty (alpha Regularization Parameter)
#LASSO leads to sparse solutions, driving most coefficients to zero
from sklearn.linear_model import Lasso

lasso_models = {}
training_scores = []
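# The Lasso sweep is cut off in this excerpt; a hedged sketch of the analogous loop,
# reusing the X_train/X_test split above and counting non-zero coefficients to show
# the sparsity described in the comment (np is assumed to be numpy).
test_scores = []
for alpha in [100, 10, 1, .01]:
    lasso = Lasso(alpha=alpha, max_iter=100000).fit(X_train, y_train)
    training_scores.append(lasso.score(X_train, y_train))
    test_scores.append(lasso.score(X_test, y_test))
    lasso_models[alpha] = lasso
    print("alpha=%s -> non-zero coefficients: %d" % (alpha, np.sum(lasso.coef_ != 0)))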
Example #46
0
# # errors
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# # -------------------------------------

# --- RIDGE REGRESSION ---  #
boston_rr = Ridge()
boston_rr.fit(X_train, y_train)
print("Coefficients: ", boston_rr.coef_)
print("Intercept: ", boston_rr.intercept_)

# R for train and test set
print('R2 for train: ', boston_rr.score(X_train, y_train))
print('R2 for test: ', boston_rr.score(X_test, y_test))

# ridge regression - prediction
y_pred = boston_rr.predict(X_test)
df = pd.DataFrame({'actual': y_test, 'pred': y_pred})
print(df)

print(pd.DataFrame(boston_rr.coef_))

# errors
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# -------------------------------------
Example #47
0
np.random.seed(42)
x = np.linspace(0, 20, 21)
y = 5 * x + 2 + np.random.normal(0.0, 20.0, 21)

# Hint: if you get a shape error from scikit, try:
X = x.reshape(21, 1)

poly = PolynomialFeatures(30)
Xpoly = poly.fit_transform(X)
Xscaled = MinMaxScaler().fit_transform(Xpoly)

Xtrain, Xtest, ytrain, ytest = train_test_split(Xscaled, y, random_state=42)

m = Ridge(alpha=0.1)
m.fit(Xtrain, ytrain)
m.score(Xtrain, ytrain)
m.score(Xtest, ytest)
ypred = m.predict(Xscaled)

plt.bar(range(31), m.coef_)

plt.plot(Xtrain[:, 1], ytrain, 'bo')
plt.plot(Xtest[:, 1], ytest, 'kx')
plt.plot(Xscaled[:, 1], ypred, 'r-')
plt.axis([0.0, 1.0, 0.0, 140.0])

plt.plot(Xtrain[:, 1], ytrain, 'bo')
plt.plot(Xtest[:, 1], ytest, 'kx')
plt.plot(x, y, 'bo')
plt.plot(x, ypred, 'r-')
plt.axis([2.0, 20.0, 20.0, 140.0])
print("accuracy",ac1)
y_pred1=model2.predict(X_test)
print("prediction",y_pred1)
#VISUALIZATION
plt.scatter(x1,y1,color='red')
plt.plot(x1,model2.predict(pol_reg.fit_transform(x1)),color='blue')
plt.tittle("Truth or bbluff (linear regression)")
plt.xlabel("squarfit_living")
plt.ylabel("price")
plt.show()
#-------------The model above is overfitted--------------------

#To avoid overfitting, ridge regression is needed
#Apply ridge regression
from sklearn.linear_model import Ridge
ridmodel=Ridge(alpha=0.000000000000005,normalize=True)
ridmodel.fit(X_train,y_train)
rid_pre=ridmodel.predict(X_test)
print(rid_pre)
ac2=ridmodel.score(X_test,y_test)
print("accuracy",ac2)
#Data visiulization
plt.scatter(x1,y1,color='red')
plt.plot(x1,ridmodel.predict(pol_reg.fit_transform(x1)),color='blue')
plt.tittle("Truth or bbluff (linear regression)")
plt.xlabel("squarfit_living")
plt.ylabel("price")
plt.show()


Example #49
0
train_x,text_x,train_y,text_y = cross_validation.train_test_split(X1,y1,train_size=0.5,random_state=1)

#f_fold = StratifiedKFold(y=y1,n_folds=10,random_state=1)
f_fold = KFold(len(y1),n_folds=10,random_state=0)

score = []
mean_square_score_train= []
mean_square_score_test = []
r2_score_train = []
r2_score_test = []

train_stuff=[]
test_stuff=[]
for k, (train,text) in enumerate(f_fold):
    predictor.fit(X1[train],y1[train])
    c = predictor.score(X1[text],y1[text])
    score.append(c)
    mean_square_score_train.append(mean_squared_error(y1[train],predictor.predict(X1[train])))
    mean_square_score_test.append(mean_squared_error(y1[text],predictor.predict(X1[text])))
    r2_score_train.append(r2_score(y1[train],predictor.predict(X1[train])))
    r2_score_test.append(r2_score(y1[text],predictor.predict(X1[text])))
    print "percentage within 7 days error for training data  " + str(
        sum(abs(predictor.predict(X1[train]) - y1[train]) < 7) / len(X1[train]) * 100)
    print "percentage within 7 days error for testing data   " + str(
        sum(abs(predictor.predict(X1[text]) - y1[text]) < 7) / len(X1[text]) * 100)
    print "-------------------"
    train_stuff.append(sum(abs(predictor.predict(X1[train]) - y1[train]) < 7) / len(X1[train]) * 100)
    test_stuff.append(sum(abs(predictor.predict(X1[text]) - y1[text]) < 7) / len(X1[text]) * 100)


Example #50
0
#model generation and prediction
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

#Linear Regression
clfreg = LinearRegression(n_jobs=1)
clfreg.fit(X_train,y_train)
y_pred = clfreg.predict(X_test)
confidencereg = clfreg.score(X_test,y_test)

#Ridge Regression
rr = Ridge(alpha=0.01)
rr.fit(X_train,y_train)
y_pred_ridge = rr.predict(X_test)
confidenceridge = rr.score(X_test,y_test)

#Lasso Regression
ls = Lasso()
ls.fit(X_train,y_train)
y_pred_lasso = ls.predict(X_test)
confidencelasso = ls.score(X_test,y_test)

#plotting learning curves for linear regression
import matplotlib.pyplot as plt
plt.plot(y_test[:100])
plt.plot(y_pred[:100])
plt.legend(['Actual', 'Linear Predicted'], loc='upper right')
plt.show()

 def RidgReg(self):
     r=Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=None)
     r=r.fit(self.exec_data_X,self.exec_data_Y)
     print("Score for Ridge Regression",end=" ")
     print(r.score(self.exec_data_X,self.exec_data_Y))
                           param_grid=parameters,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1,
                           verbose=1)
grid_search = grid_search.fit(X_poly[:, 1:], y_train)
best_mse = grid_search.best_score_
best_parameters = grid_search.best_params_

ridgeReg = Ridge(fit_intercept=False,
                 normalize=True,
                 alpha=0.01,
                 tol=1e-5,
                 max_iter=13000,
                 solver='auto')
ridgeReg.fit(X_poly, y_train)
y_pred = ridgeReg.predict(X_poly)
sums = (y_pred - y_train)**2
sums = (np.sum(sums)) / len(y_pred)
score = ridgeReg.score(X_poly, y_train)
print(f'Training error {round(sums * (10**3),3) }')
print(f'Training Score {round(score,3)} \n')

prediction = cross_val_predict(ridgeReg, X_poly, y_train, cv=5)
sums = (prediction - y_train)**2
sums = (np.sum(sums)) / len(prediction)
accuracies = cross_val_score(estimator=ridgeReg, X=X_poly, y=y_train, cv=5)

print(f'Validation error {round(sums * (10**3),3) }')
print(f'Validation Score {round(accuracies.mean(),3)} \n')
Example #53
0
def main():
    usage = 'usage: %prog [options] <fasta> <scores>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='canonical_kmers', default=False, action='store_true', help='Count canonical k-mers [Default: %default]')
    parser.add_option('--alpha', dest='alpha', default=None, type='float', help='Regularization alpha parameter. Will choose via CV if not specified [Default: %default]')
    parser.add_option('-c', dest='cv_folds', default=0, type='int', help='Cross-validate with this many folds [Default: %default]')
    parser.add_option('--epsilon', dest='epsilon', default=None, type='float', help='Regularization epsilon parameter. Will choose via CV if not specified [Default: %default]')
    parser.add_option('-g', dest='gaps', default=0, type='int', help='Gaps in k-mers string kernel [Default: %default]')
    parser.add_option('-k', dest='k', default=4, type='int', help='K-mer size for string kernel [Default: %default]')
    parser.add_option('-l', dest='length', default=False, action='store_true', help='Add log2 sequence length as an attribute [Default: %default]')
    parser.add_option('-m', dest='method', default='ols', help='Regression method [Default: %default]')
    parser.add_option('-o', dest='output_file', default='seq_regr.txt', help='Output file [Default: %default]')
    parser.add_option('-w', dest='whiten', default=False, action='store_true', help='Whiten the sequence scores [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide fasta file and scores file')
    else:
        fasta_file = args[0]
        scores_file = args[1]

    ##################################################
    # convert sequences to feature representations
    ##################################################
    seq_vectors = fasta_string_kernel(fasta_file, options.k, options.gaps, options.canonical_kmers)

    if options.length:
        add_length_feature(seq_vectors, fasta_file)

    ##################################################
    # read scores
    ##################################################
    seq_scores = {}

    scores_in = open(scores_file)

    try:
        line = scores_in.readline()
        a = line.split()
        seq_scores[a[0]] = float(a[1])
    except:
        # possible header line
        pass

    for line in scores_in:
        a = line.split()
        seq_scores[a[0]] = float(a[1])


    ##################################################
    # make scikit-learn data structures
    ##################################################
    # naive approach: fill in a dense matrix from the per-sequence k-mer dictionaries
    kmers = set()
    for kmer_vec in seq_vectors.values():
        kmers |= set(kmer_vec.keys())

    kmers_sort = sorted(kmers)

    seq_headers = sorted(seq_vectors.keys())

    X = np.array([[seq_vectors[header].get(kmer,0) for kmer in kmers_sort] for header in seq_headers])
    y = np.array([seq_scores[header] for header in seq_headers])

    if options.whiten:
        y = preprocessing.scale(y)

    ##################################################
    # decide method
    ##################################################
    if options.method.lower() == 'ols':
        model = LinearRegression()

    elif options.method.lower() == 'pls':
        model = PLSRegression(n_components=2)

    elif options.method.lower() == 'ridge':
        if options.alpha:
            # model = Ridge(alpha=options.alpha)
            model = RidgeCV(alphas=[options.alpha], store_cv_values=True)
        else:
            #model = RidgeCV(alphas=[0.0001, 0.0002, 0.0004, 0.0008, .0016, 0.0032, 0.0064, .0128], store_cv_values=True)
            model = RidgeCV(alphas=[0.0004, 0.0008, 0.0016, 0.0032], store_cv_values=True)

    elif options.method.lower() == 'svm':
        if options.alpha:
            svm_c = len(y) / options.alpha
        else:
            svm_c = 100
        if options.epsilon:
            svm_eps = options.epsilon
        else:
            svm_eps = 0.5

        model = SVR(kernel='linear', degree=3, C=svm_c, epsilon=svm_eps)

    elif options.method.lower() == 'gp':
        model = GaussianProcess()

    else:
        print >> sys.stderr, 'Method not recognized.'
        exit(1)


    ##################################################
    # learn model
    ##################################################
    model.fit(X, y)

    ss_tot = sum(np.square(y - np.mean(y)))

    if options.method.lower() == 'ridge':
        for i in range(len(model.alphas)):
            score_cv = (1.0 - sum(model.cv_values_[:,i])/ss_tot)
            print >> sys.stderr, 'RidgeCV alpha=%.5f score=%f' % (model.alphas[i], score_cv)

    ##################################################
    # cross-validate
    ##################################################
    if options.cv_folds > 0:
        scores = []
        ss_reg = 0

        if options.method.lower() == 'ridge':
            model_cv = Ridge(alpha=model.alpha_)
        else:
            model_cv = copy.copy(model)

        kf = KFold(len(y), n_folds=options.cv_folds, shuffle=True)
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

            # learn on train
            model_cv.fit(X[train], y[train])

            # score on test
            scores.append(model_cv.score(X_test, y_test))

            ss_reg += sum(np.square(y_test - model_cv.predict(X_test)))

        score_cv = 1 - ss_reg / ss_tot


    ##################################################
    # output model information
    ##################################################
    model_out = open(options.output_file, 'w')

    print('Score\t%.3f' % model.score(X, y), file=model_out)
    if options.cv_folds > 0:
        print >> model_out, 'ScoreCV\t%.3f' % score_cv
        if options.method.lower() == 'ridge' and options.alpha:
            score_cv = (1.0 - sum(model.cv_values_)/ss_tot)
            print('ScoreCV\t%.3f' % score_cv, file=model_out)

    for i in range(len(kmers_sort)):
        if options.method.lower() == 'pls':
            coef_i = model.coefs[i]
        else:
            coef_i = model.coef_[i]

        print('%s\t%f' % (kmers_sort[i], coef_i), file=model_out)

    model_out.close()
import mglearn
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split


X, y = mglearn.datasets.load_extended_boston()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LinearRegression().fit(X_train, y_train)

ridge = Ridge().fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

# Training set score: 0.89
# Test set score: 0.75

ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test)))
# Training set score: 0.79
# Test set score: 0.64

ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge01.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge01.score(X_test, y_test)))
# Training set score: 0.93
Example #55
0
X=Salesdf[['perishable', 'item_nbr', 'store_nbr', 'cluster']]
y=Salesdf[["unit_sales"]]

reg=linear_model.LinearRegression()
cv_results=cross_val_score(reg,X_train,y_train,cv=5)
print(cv_results)
print(np.mean(cv_results))
print(np.std(cv_results))
#Using cross validation of score 5


ridge = Ridge(alpha=0.1, normalize = True)
ridge.fit(X_train,y_train)
ridge_pred=ridge.predict(X_test)
ridge.score(X_test,y_test)
#The score is very close to that of the linear model above, which suggests the model also holds up
# under ridge regularization
#Ridge penalizes the OLS loss by adding the square of each coefficient multiplied by alpha.
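# A minimal sketch of that penalized objective: residual sum of squares plus alpha times
# the sum of squared coefficients. (With normalize=True sklearn applies the penalty to
# rescaled features, so the numbers below are illustrative only.)
rss = np.sum((np.ravel(y_train) - np.ravel(ridge.predict(X_train))) ** 2)
l2_penalty = ridge.alpha * np.sum(ridge.coef_ ** 2)
print("RSS: %.2f  L2 penalty: %.2f  ridge objective: %.2f" % (rss, l2_penalty, rss + l2_penalty))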
print("number of test samples:", x_test.shape[0])
print("number of training samples:", x_train.shape[0])

# ### Question 9
# Create and fit a Ridge regression object using the training data, set the regularization parameter to 0.1, and calculate the R^2 using the test data.
#

# In[26]:

from sklearn.linear_model import Ridge

# In[27]:

RidgeModel = Ridge(alpha=0.1)
RidgeModel.fit(x_train, y_train)
RidgeModel.score(x_test, y_test)

# ### Question 10
# Perform a second order polynomial transform on both the training data and testing data. Create and fit a Ridge regression object using the training data, set the regularisation parameter to 0.1, and calculate the R^2 utilising the test data provided. Take a screenshot of your code and the R^2.

# In[28]:

pr = PolynomialFeatures(degree=2)
x_train_pr = pr.fit_transform(x_train[features])
x_test_pr = pr.fit_transform(x_test[features])

RidgeModel = Ridge(alpha=0.1)
RidgeModel.fit(x_train_pr, y_train)
RidgeModel.score(x_test_pr, y_test)

# <p>Once you complete your notebook you will have to share it. Select the icon on the top right a marked in red in the image below, a dialogue box should open, and select the option all&nbsp;content excluding sensitive code cells.</p>
Example #57
0
print("Minimum Error for Ridge Model: ", minimum_error)
print("Minimum Error for Lasso Model: ", minimum_error_lasso)
def ord_to_char(v, p=None):
    return chr(int(v))
    
#Picking up Ridge Model & figuring 10 most useful and 10 least useful parameters for Housing Price Prediction
ridgeReg = Ridge()
ridgeReg.fit(X,Y)
coef = pd.Series(ridgeReg.coef_, index = X.columns)
relevant_Coeff = coef.sort_values().tail(10)
irrelevant_Coeff = coef.sort_values().head(10)

#Plots
plt.figure(figsize=(20,10))
relevant_Coeff.plot(kind = "barh", title="Most Relevant Aspects of a House")

plt.figure(figsize=(20,10))
irrelevant_Coeff.plot(kind = 'barh', title="Least Relevant Aspects of a House")

#Remaining Feature Set
plt.figure(figsize= (50,10))
preds = pd.DataFrame({"Predicted":ridgeReg.predict(X), "true":Y})
preds["Difference"] = preds["true"] - preds["Predicted"]
preds.plot(x = "Predicted", y = "Difference",kind = "scatter", title = "Residual Features")

print (ridgeReg.score(X,Y))

preds = np.expm1(ridgeReg.predict(X_test)) #Exponential function used to balance out log(x + 1) 
solution = pd.DataFrame({"id":test_DF.Id, "SalePrice":preds})
solution.to_csv("ridge_sol.csv", index = False)
Example #58
0
    X_final, Y_final = select_Y(final, 19)
    X_final = select_atributes(X_final, vektors[10])
    X_test, Y_test = select_Y(test, 19)
    X_test = select_atributes(X_test, vektors[10])
    poly = preprocessing.PolynomialFeatures(2)
    X_final = poly.fit_transform(X_final)
    X_test = poly.fit_transform(X_test)

    scaler = StandardScaler()
    scaler = scaler.fit(X_final)
    X_final = scaler.transform(X_final)
    X_test = scaler.transform(X_test)

    trained = RDG.fit(X_final, Y_final)
    Y_predict = RDG.predict(X_test)
    print(RDG.score(X_final, Y_final))
    print(RDG.score(X_test, Y_test))
    Y_mean = np.mean(Y_final)
    r2 = mtrcs.r2_score(Y_test, Y_predict)
    mae = mtrcs.mean_absolute_error(Y_test, Y_predict)
    mse = mtrcs.mean_squared_error(Y_test, Y_predict)
    mae_predict = np.mean(np.abs(Y_test - Y_mean))
    mse_predict = np.mean(np.power(np.abs(Y_test - Y_mean), 2))
    msg = "%20s: %10f %10f %10f %10f %10f %10f %10f" % (
        "Testing results r2score,MAE,MSE,MAE diff", r2, mae, mse, mae_predict,
        mse_predict, mae_predict - mae, mse_predict - mse)
    print(msg)
    X_plot = select_atributes(final, vektor)
    for i in range(len(header)):
        print(header[i])
        print(RDG.coef_[0, i + 1])
# train[:,1:] = log10(nbaData[:,1:])

regression = Ridge(alpha=0.05)



kf = KFold(len(train),k=10)
avgResiduSum = 0
avgVar = 0
for tr, e in kf:
    regression.fit(train[tr,1:],train[tr,0])

    avgResiduSum += mean((regression.predict(train[e,1:]) - train[e,0]) ** 2)

    # Explained variance score: 1 is perfect prediction
    avgVar +=regression.score(train[e,1:] , train[e,0])

print '############'
print 'Evaluation Phase'
avgResiduSum = avgResiduSum/len(kf)
print("Average Residual sum of squares: %.2f" % avgResiduSum )
avgVar = avgVar/len(kf)
print('Average Variance score: %.2f' % avgVar)

print '############'
print 'Testing Phase'
regression.fit(train[:,1:],train[:,0])
print("Residual sum of squares: %.2f"
      % mean((regression.predict(nba15test_scaled[:,1:]) - nba15test_scaled[:,0]) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regression.score(nba15test_scaled[:,1:] , nba15test_scaled[:,0]))
X = df[features]
Y = df['price']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=1)


print("number of test samples:", x_test.shape[0])
print("number of training samples:",x_train.shape[0])

#Question 9
#Create and fit a Ridge regression object using the training data,
#set the regularization parameter to 0.1, and calculate the R^2 using the test data.
from sklearn.linear_model import Ridge

RidgeModel = Ridge(alpha = 0.1)
RidgeModel.fit(x_train, y_train)
RidgeModel.score(x_test, y_test)

#Question 10
#Perform a second order polynomial transform on both the training data and testing data. 
#Create and fit a Ridge regression object using the training data, set the regularisation parameter to 0.1, 
#and calculate the R^2 utilising the test data provided. Take a screenshot of your code and the R^2.

SecondOrderPolynomialTransform = PolynomialFeatures(degree=2)
x_train_transformed = SecondOrderPolynomialTransform.fit_transform(x_train)
x_test_transformed = SecondOrderPolynomialTransform.fit_transform(x_test)

NewRidgeModel = Ridge(alpha = 0.1)
NewRidgeModel.fit(x_train_transformed, y_train)
NewRidgeModel.score(x_test_transformed, y_test)