Code Example #1
File: models.py Project: nhu2000/PriceMyRental
def ridge_regressor(df):
    """
    INPUT: Pandas dataframe
    OUTPUT: R^2 and Mean Absolute Error performance metrics, feature coefficients
    """
    y = df.pop("price").values
    X = df.values
    feature_names = df.columns
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)

    clf = Ridge(alpha=1.0)
    clf.fit(xtrain, ytrain)

    score = clf.score(xtest, ytest)
    feat_imps = clf.coef_
    ypredict = clf.predict(xtest)
    mae = np.mean(np.absolute(ytest - ypredict))
    mae_percent = np.mean(np.absolute(ytest - ypredict) / ytest)
    return (
        "R^2 is ",
        score,
        "MAE is ",
        mae,
        "MAE percent is ",
        mae_percent,
        "Feature coefficients are ",
        list(zip(feature_names, feat_imps)),
    )
Code Example #2
def compute_linear_model(mfs, measures):
    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn import linear_model

    # try different ones
    clf = Ridge(alpha = 1.0)
    #clf = RidgeCV(alphas=[0.1, 1.0, 10.0])
    #clf = linear_model.LinearRegression()

    # explain fexp using BMD + the MFS data
    fexp = measures[:, -1]

    bmd = measures[:, 0]
    bmd = bmd.reshape((bmd.shape[0], 1))

    #print "BMD: ", bmd
    #print "FEXP: ", fexp
    #print "MFS; ", mfs

    #PCA
    #from sklearn.decomposition import PCA
    #pca = PCA(n_components=12)
    #pca.fit(mfs)
    #mfs_pca = pca.transform(mfs)

    X = np.hstack((bmd, mfs))
    clf.fit(X, fexp)

    # Results
    #print "Coefs:", clf.coef_
    print "Score (R^2):", clf.score(X, fexp)
Code Example #3
File: enetTests.py Project: abbylyons/181practicals
def ridgereg(a):
    print("Doing ridge regression")
    clf = Ridge(alpha=a)
    clf.fit(base_X, base_Y)
    print ("Score = %f" % clf.score(base_X, base_Y))
    clf_pred = clf.predict(X_test)
    write_to_file("ridge.csv", clf_pred)
Code Example #4
File: leonardi_TP4.py Project: laiaga/TPSM1
def comparaison_ridge_lasso(X,Y):
    X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=random.seed())
    clf_lasso = Lasso(selection='random', random_state=random.seed())
    clf_ridge = Ridge()
    clf_lasso.fit(X_train,Y_train)
    clf_ridge.fit(X_train,Y_train)
    score_lasso=clf_lasso.score(X_test,Y_test)
    score_ridge=clf_ridge.score(X_test,Y_test)
    print("Precision de Lasso={:3.2f}% \nPrecision de Ridge={:3.2f}%\n".format(score_lasso*100,score_ridge*100))
Code Example #5
def test_huber_better_r2_score():
    # Test that huber returns a better r2 score than non-outliers
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=100)
    huber.fit(X, y)
    linear_loss = np.dot(X, huber.coef_) + huber.intercept_ - y
    mask = np.abs(linear_loss) < huber.epsilon * huber.scale_
    huber_score = huber.score(X[mask], y[mask])
    huber_outlier_score = huber.score(X[~mask], y[~mask])

    # The Ridge regressor should be influenced by the outliers and hence
    # give a worse score on the non-outliers as compared to the huber regressor.
    ridge = Ridge(fit_intercept=True, alpha=0.01)
    ridge.fit(X, y)
    ridge_score = ridge.score(X[mask], y[mask])
    ridge_outlier_score = ridge.score(X[~mask], y[~mask])
    assert_greater(huber_score, ridge_score)

    # The huber model should also fit poorly on the outliers.
    assert_greater(ridge_outlier_score, huber_outlier_score)
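A note on the data: make_regression_with_outliers is a private helper from scikit-learn's test suite, not a public API. A minimal stand-in, assuming all we need is Gaussian regression data with a fraction of corrupted targets, could look like this (the corruption scheme below is an assumption, not scikit-learn's exact helper):

import numpy as np
from sklearn.datasets import make_regression

def make_regression_with_outliers(n_samples=50, n_features=20):
    # ordinary regression data...
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           noise=0.05, random_state=0)
    # ...with roughly 10% of the targets shifted far away from the trend
    rng = np.random.RandomState(0)
    n_outliers = max(1, n_samples // 10)
    y[:n_outliers] += rng.normal(20, 3, n_outliers)
    return X, y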
Code Example #6
File: parkinson.py Project: Matafight/MatPyUtil
def training_predict_ridge(df):

    results =[]
    # repeat independently 10 times
    for train,test in KFold(len(df),n_folds = 10,shuffle = True):
        para = process_ridge(df.T[train].T)
        clf = Ridge(alpha = para)
        clf.fit(df[predictors].T[train].T,df[target1].T[train].values.ravel())

        sc = clf.score(df[predictors].T[test].T,df[target1].T[test].values.ravel())
        results.append(sc)
    return results
Code Example #7
File: leonardi_TP4.py Project: laiaga/TPSM1
def test_alpha_opti(X,Y,nb_tests):
    score_lasso=0
    score_ridge=0
    score_lasso_opti=0
    score_ridge_opti=0
    for i in range(0,nb_tests):
        X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=random.seed())
        clf_lasso = Lasso(selection='random', random_state=random.seed())
        clf_ridge = Ridge()
        clf_lasso.fit(X_train,Y_train)
        clf_ridge.fit(X_train,Y_train)
        score_lasso+=clf_lasso.score(X_test,Y_test)
        score_ridge+=clf_ridge.score(X_test,Y_test)
        clf_lasso_opti = Lasso(selection='random', random_state=random.seed(),alpha=0.1)
        clf_ridge_opti = Ridge(alpha=0.1)
        clf_lasso_opti.fit(X_train,Y_train)
        clf_ridge_opti.fit(X_train,Y_train)
        score_lasso_opti+=clf_lasso_opti.score(X_test,Y_test)
        score_ridge_opti+=clf_ridge_opti.score(X_test,Y_test)
    print("Lasso (opti - non-opti) : {:3.3f}%".format(100*(score_lasso_opti-score_lasso)/nb_tests))
    print("Ridge (opti - non-opti) : {:3.3f}%".format(100*(score_ridge_opti-score_ridge)/nb_tests))
Code Example #8
File: analyze.py Project: nOkuda/reviewlabeljob
def _regression_surface(
        userdata,
        switch_indiceses,
        corpus,
        filename):
    """Analyze data and make plot of document position and length vs. labeling
    time.
    """
    doclengths = []
    positions = []
    times = []
    for user, data in userdata.items():
        curdoclengths = _get_doclengths_for_user(userdata, user, corpus)
        switch_indices = switch_indiceses[user]
        user_times = _build_data_times(user, data)
        for i in range(1, len(switch_indices)):
            if switch_indices[i] - switch_indices[i-1] == 16:
                doclengths.extend(
                    curdoclengths[switch_indices[i-1]:switch_indices[i]])
                positions.extend(np.arange(1, 17))
                times.extend(
                    user_times[switch_indices[i-1]:switch_indices[i]])
    doclengths = np.array(doclengths)
    positions = np.array(positions)
    times = np.array(times)
    model_inputs = np.stack((doclengths, positions), axis=-1)
    ridge_model = Ridge()
    ridge_model.fit(model_inputs, times)
    r2 = ridge_model.score(model_inputs, times)
    fig, axis = plt.subplots(1, 1)
    xdata = np.arange(1, 17)
    for doclength in [30, 50, 100, 200, 500, 1000]:
        inputs = np.stack((np.array([doclength]*len(xdata)), xdata), axis=-1)
        ydata = ridge_model.predict(inputs)
        axis.plot(
            xdata,
            ydata,
            linewidth=2,
            label=str(doclength))
        # apparently, all of the lines go down by 6.02762577314 from first
        # labeling time to 16th
        # axis.annotate(str(ydata[0] - ydata[-1]), (xdata[-1], ydata[-1]))
    box = axis.get_position()
    axis.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    legend = axis.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    legend.set_title('Document length (in tokens)')
    axis.set_title('$R^2=$'+str(r2))
    axis.set_xlabel('Document order')
    axis.set_ylabel('Time (seconds)')
    fig.savefig(filename, bbox_inches='tight')
Code Example #9
File: model.py Project: andrew-wm-arthur/DragRace
def regress( X, y, iterations = 10 ):
    ridge_model = Ridge( alpha=.1).fit(X,y)
    print("within sample R^2: "+str(ridge_model.score(X,y)))
    print('\n')

    linear_scores = []
    kernel_scores = []
    for i in range(iterations):
        ( X_train,
          X_test,
          y_train,
          y_test 
        ) = cross_validation.train_test_split( X, y, random_state=randint(0,100))

        model = Ridge( alpha=10.0 )
        model.fit(X_train,y_train)
        linear_scores.append(model.score(X_test,y_test))

    print ( 'linear scores:\tmean = '+
            str(np.average(linear_scores))+
            '\tstd dev = '+
            str(np.std(linear_scores))
          )
Code Example #10
File: test_sag.py Project: AlexisMignon/scikit-learn
def test_sag_regressor():
    """tests if the sag regressor performs well"""
    xmin, xmax = -5, 5
    n_samples = 20
    tol = .001
    max_iter = 20
    alpha = 0.1
    rng = np.random.RandomState(0)
    X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)

    # simple linear function without noise
    y = 0.5 * X.ravel()

    clf1 = Ridge(tol=tol, solver='sag', max_iter=max_iter,
                 alpha=alpha * n_samples)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)
    score1 = clf1.score(X, y)
    score2 = clf2.score(X, y)
    assert_greater(score1, 0.99)
    assert_greater(score2, 0.99)

    # simple linear function with noise
    y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()

    clf1 = Ridge(tol=tol, solver='sag', max_iter=max_iter,
                 alpha=alpha * n_samples)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)
    score1 = clf1.score(X, y)
    score2 = clf2.score(X, y)
    assert_greater(score1, 0.5)
    assert_greater(score2, 0.5)
Code Example #11
File: model.py Project: jgershen/sportsball
def build_model(train_file, test_file, attr_file, model_out, predictions_out, algorithm='ridge'):
  classifiers = ['ridge', 'linear', 'lasso', 'rf', 'en']
  if algorithm not in classifiers:
    raise NotImplementedError("only implemented algorithms: " + str(classifiers))

  train_data = pd.read_pickle(train_file)
  attrs = read_attrs(attr_file)

  target_attr = attrs[0]
  usable_attrs = attrs[1:]

  if algorithm == 'ridge':
    clf = Ridge()
  elif algorithm == 'linear':
    clf = LinearRegression()
  elif algorithm == 'lasso':
    clf = Lasso()
  elif algorithm == 'en':
    clf = ElasticNet()
  else:
    clf = RandomForestRegressor()

  clf.fit(train_data[usable_attrs], train_data[target_attr])

  test_data = pd.read_pickle(test_file)
  predictions = clf.predict(test_data[usable_attrs])
  errors = predictions - test_data[target_attr]

  prediction_results = test_data[[target_attr] + usable_attrs].copy()
  prediction_results['predicted'] = predictions
  prediction_results.to_pickle(predictions_out)

  print "Modeling '%s'" % target_attr
  print "   Train:", train_file, '(%d examples)' % len(train_data)
  print "   Test:", test_file, '(%d examples)' % len(test_data)
  print "Algorithm:", algorithm

  if hasattr(clf, 'coef_'):
    print 'Coefficients:'
    for i,c in enumerate(clf.coef_):
      print '    %-20s' % usable_attrs[i] + ':', '%20.4f' % c

  print 'MSE  : %10.4f' % np.mean(errors ** 2)
  print 'medSE: %10.4f' % np.median(errors ** 2)
  print 'SSE  : %10.4f' % np.sum(errors ** 2)
  print 'Variance score: %.4f' % clf.score(test_data[usable_attrs], test_data[target_attr])

  pickle.dump(clf, open(model_out, 'wb'))
Code Example #12
File: lime_base.py Project: marcotcr/lime
 def forward_selection(self, data, labels, weights, num_features):
     """Iteratively adds features to the model"""
     clf = Ridge(alpha=0, fit_intercept=True, random_state=self.random_state)
     used_features = []
     for _ in range(min(num_features, data.shape[1])):
         max_ = -100000000
         best = 0
         for feature in range(data.shape[1]):
             if feature in used_features:
                 continue
             clf.fit(data[:, used_features + [feature]], labels,
                     sample_weight=weights)
             score = clf.score(data[:, used_features + [feature]],
                               labels,
                               sample_weight=weights)
             if score > max_:
                 best = feature
                 max_ = score
         used_features.append(best)
     return np.array(used_features)
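The method above is bound to a LIME explainer instance (it reads self.random_state). As a rough, self-contained illustration of the same greedy idea, here is a standalone sketch on synthetic data; the function name and the toy data are made up for the demo, this is not LIME's API:

import numpy as np
from sklearn.linear_model import Ridge

def greedy_forward_selection(data, labels, weights, num_features):
    # standalone adaptation of forward_selection above
    clf = Ridge(alpha=0, fit_intercept=True)
    used = []
    for _ in range(min(num_features, data.shape[1])):
        best_score, best = -np.inf, 0
        for feature in range(data.shape[1]):
            if feature in used:
                continue
            cols = used + [feature]
            clf.fit(data[:, cols], labels, sample_weight=weights)
            score = clf.score(data[:, cols], labels, sample_weight=weights)
            if score > best_score:
                best_score, best = score, feature
        used.append(best)
    return np.array(used)

rng = np.random.RandomState(0)
X = rng.randn(200, 6)
y = 3 * X[:, 2] - 2 * X[:, 4] + 0.1 * rng.randn(200)
print(greedy_forward_selection(X, y, np.ones(200), num_features=2))  # expected: [2 4]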
Code Example #13
def run_full_example(df, ridge_alpha=1.0, test_set_fraction=0.5):
    
    #convert Pandas DataFrame to a feature matrix
    X,y,col_names = data_frame_to_matrix(df, 'energy', ['weather'])

    #split into training and test sets
    Xtrain,Xtest,ytrain,ytest = train_test_split(X, y, test_size=test_set_fraction)
    print('# of training samples: {}'.format(len(ytrain)))
    print('# of test samples: {}'.format(len(ytest)))
    print('alpha: {:.2f}'.format(ridge_alpha))
    print('')

    #create a Ridge object
    rr = Ridge(alpha=ridge_alpha)

    #fit the training data
    rr.fit(Xtrain, ytrain)

    #print out the weights and their names
    for weight,cname in zip(rr.coef_, col_names):
        print "{}: {:.6f}".format(cname, weight)
    print "Intercept: {:.6f}".format(rr.intercept_)
    print ''

    #compute the prediction on the test set
    ypred = rr.predict(Xtest)

    #compute the sum-of-squares error on the test set, which is
    #proportional to the log likelihood
    sqerr = np.sum((ytest - ypred)**2) / len(ytest)
    print('Normalized Sum-of-squares Error: {:.3f}'.format(sqerr))

    #compute the sum-of-squares error for a model that is just
    #comprised of the mean on the training set
    sqerr_mean_only = np.sum((ytest - ytrain.mean())**2) / len(ytest)
    print('Normalized Sum-of-squares Error for mean-only: {:.3f}'.format(sqerr_mean_only))

    #print out the R-squared on the test set
    r2 = rr.score(Xtest, ytest)
    print "R-squared: {:.2f}".format(r2)
    print ''    
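A side note on the two error figures above: since R^2 = 1 - SSE/SSE_mean, the printed R-squared should come out close to 1 - sqerr/sqerr_mean_only. It is not exactly equal, because score() centers on the test-set mean while sqerr_mean_only uses the training-set mean. A quick check, reusing the variables inside the function:

    # approximate reconstruction of R^2 from the two normalized errors above
    r2_approx = 1.0 - sqerr / sqerr_mean_only
    print('R-squared (reconstructed): {:.2f}'.format(r2_approx))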
Code Example #14
    def _random_search(self, random_iter, x, y):
        # Default Values
        alpha = 1.0
        best_score = -sys.maxsize

        if random_iter > 0:
            sys.stdout.write("Do a random search %d times" % random_iter)
            param_dist = {"alpha": uniform(loc=0.0001, scale=10-0.0001)}
            param_list = [{"alpha": alpha}, ]
            param_list.extend(list(ParameterSampler(param_dist,
                                                    n_iter=random_iter-1,
                                                    random_state=self._rng)))
            for idx, d in enumerate(param_list):
                rr = Ridge(alpha=d["alpha"],
                           fit_intercept=True,
                           normalize=False,
                           copy_X=True,
                           max_iter=None,
                           tol=0.001,
                           solver='auto')

                train_x, test_x, train_y, test_y = \
                    train_test_split(x, y, test_size=0.5,
                                     random_state=self._rng)
                rr.fit(train_x, train_y)
                sc = rr.score(test_x, test_y)
                # Tiny output
                m = "."
                if idx % 10 == 0:
                    m = "#"
                if sc > best_score:
                    m = "<"
                    best_score = sc
                    alpha = d['alpha']
                sys.stdout.write(m)
                sys.stdout.flush()
            sys.stdout.write("Using alpha: %f\n" % alpha)
        return alpha
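The hand-rolled loop above (sample alpha, split, fit, keep the best score) is essentially what scikit-learn's RandomizedSearchCV automates, with cross-validation in place of the single split. A minimal sketch of the same search; the toy X and y are made up for the demo:

import numpy as np
from scipy.stats import uniform
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

# same search space as above: alpha ~ Uniform(0.0001, 10)
param_dist = {"alpha": uniform(loc=0.0001, scale=10 - 0.0001)}
search = RandomizedSearchCV(Ridge(), param_distributions=param_dist,
                            n_iter=20, cv=5, random_state=0)

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = X.dot(rng.randn(5)) + 0.1 * rng.randn(100)
search.fit(X, y)
print("Using alpha: %f" % search.best_params_["alpha"])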
Code Example #15
def apply_ridge(X_train, Y_train, alpha=None):
    alphas = [alpha]
    if not alpha:
        alphas = sorted({a for a in [0.1, 1.0/3.0, 1.0, 10.0/3.0, 10.0] if a})
    ALPHA_VALS = {}
    for a in alphas:
        model = Ridge(alpha=a, 
                      fit_intercept=True, 
                      normalize=False, 
                      copy_X=True, 
                      max_iter=None, 
                      tol=0.001, 
                      solver='auto')
        # sample_weights = [ 1.0/float(len(Y)) for x in Y ]
        model.fit( X_train, Y_train )# , sample_weight=sample_weights)
        R2 = model.score(X_train, Y_train)
        L2 = dot(model.coef_,model.coef_)
        ALPHA_VALS [a ] = [ a, R2, L2, [x for x in model.coef_] ]
        print "ALPHA: %.2f \t R^2=%7.4f \t L2_NORM(THETA)=%10.2f \t THETA[1:N]=%s" % ( a, R2, L2, model.coef_ )
    # A = sorted([ ALPHA_VALS[x] for x in ALPHA_VALS [ a, R2, L2, model.coef_[:] ], key=lambda x: x[1], reversed=True )
    Theta = [ float( model.intercept_ ) , ]
    Theta.extend( [ float( x ) for x in model.coef_])
    ( model, Theta, J, SCORE ) = performance_analysis( model, Theta, X_train, Y_train, debug=1 )
    return ( model, Theta, J, SCORE )
Code Example #16
def example4():
    #generate the dataset 
    df = generate_solar_data(num_samples=1000)

    #convert Pandas DataFrame to a feature matrix
    X,y,col_names = data_frame_to_matrix(df, 'energy', ['weather'])

    #split into training and test sets
    Xtrain,Xtest,ytrain,ytest = train_test_split(X, y, test_size=0.5)

    #create a Ridge object
    rr = Ridge()

    #fit the training data
    rr.fit(Xtrain, ytrain)

    #print out the weights and their names
    for weight,cname in zip(rr.coef_, col_names):
        print "{}: {:.6f}".format(cname, weight)
    print "Intercept: {:.6f}".format(rr.intercept_)

    #print out the R-squared on the test set
    r2 = rr.score(Xtest, ytest)
    print "R-squared: {:.2f}".format(r2)
Code Example #17
print('Crime dataset')
print('linear model intercept: {}'.format(linreg.intercept_))
print('linear model coeff:\n{}'.format(linreg.coef_))
print('R-squared score (training): {:.3f}'.format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(linreg.score(X_test, y_test)))

# ridge regression approach --------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state = 0)

linridge = Ridge(alpha=20.0).fit(X_train, y_train)

print('Crime dataset')
print('ridge regression linear model intercept: {}'.format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'.format(linridge.coef_))
print('R-squared score (training): {:.3f}'.format(linridge.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(linridge.score(X_test, y_test)))
print('Number of non-zero features: {}'.format(np.sum(linridge.coef_ != 0)))

# ridge regression with normalization approach --------------------------------

scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state = 0)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

print('Crime dataset')
Code Example #18
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

# Let's build a linear regression on the Boston dataset
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X,y=mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Linreg = LinearRegression()
lr = Linreg.fit(X_train, y_train)
print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

# Ridge regression-------------------------------------------------------------
from sklearn.linear_model import Ridge
ridge=Ridge().fit(X_train,y_train)
print('Training set score : {}'.format(ridge.score(X_train,y_train)))
print('Test set score : {}'.format(ridge.score(X_test,y_test)))

"""
The Ridge model makes a trade-off between the simplicity of the model (near-zero
coefficients) and its performance on the training set. How much importance the
model places on simplicity versus training set performance can be specified by the
user, using the alpha parameter. In the previous example, we used the default parameter
alpha=1.0. There is no reason why this will give us the best trade-off, though.
The optimum setting of alpha depends on the particular dataset we are using.
Increasing alpha forces coefficients to move more toward zero, which decreases
training set performance but might help generalization. For example
"""
ridge10=Ridge(alpha=10).fit(X_train,y_train)
print('Training set score : {}'.format(ridge10.score(X_train,y_train)))
print('Test set score : {}'.format(ridge10.score(X_test,y_test)))
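Since the best alpha is dataset-dependent, a quick sweep over a few candidates makes the trade-off visible. A minimal sketch, assuming the X_train/X_test split of the extended Boston data from above:

for a in [0.01, 0.1, 1, 10, 100]:
    r = Ridge(alpha=a).fit(X_train, y_train)
    print('alpha={:<6} train score: {:.2f}  test score: {:.2f}'.format(
        a, r.score(X_train, y_train), r.score(X_test, y_test)))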
Code Example #19

y = np.array(y_list)

# =============================================================================

# PERFORM ML PREDICTION

np.random.seed(0)
split_idxs = np.random.permutation(len(X))

# Split Data (Training Testing)
X_train = X[split_idxs[:-500]]
y_train = y[split_idxs[:-500]]
X_test = X[split_idxs[-500:]]
y_test = y[split_idxs[-500:]]

# Ridge Regression
from sklearn.linear_model import Ridge
clf = Ridge(alpha=1.0)
clf.fit(X_train,y_train)

predictions = clf.predict(X_test)
print(predictions)
print(clf.score(X_test,y_test))

text_file = open("Log.txt", "a")
ts = time.time()
text_file.write("Data collected from %s" % datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S\n'))
text_file.write("Prediction Score: %f\n\n" % clf.score(X_test,y_test))
text_file.close()
Code Example #20
    avg_train_score = 0
    avg_test_score = 0

    target_data_file = "targets_%s.dat" % target
    print "Starting to train a model to predict %s..." % target.replace('_', ' ')
    target_matrix = cPickle.load(open('2013-04-20 183207/' + target_data_file, 'r'))
    print "Converting targets to CSR Matrix to make life easier..."
    target_matrix = np.array(target_matrix)

    kf = KFold(len(target_matrix), n_folds=3, indices=True, shuffle=True)
    for train_index, test_index in kf:
        print "Beginning Fold"
        kfold_train = feature_matrix[train_index]
        kfold_test = feature_matrix[test_index]
        kfold_train_target = target_matrix[train_index]
        kfold_test_target = target_matrix[test_index]
        #clf = SGDRegressor(n_iter=1000, shuffle=True)
        clf = Ridge()
        clf.fit(kfold_train, kfold_train_target)

        score_train = clf.score(kfold_train, kfold_train_target)
        score_test = clf.score(kfold_test, kfold_test_target)

        print "R^2 Score On Training Data:", score_train
        avg_train_score += score_train
        print "R^2 Score On Validation Data:", score_test
        avg_test_score += score_test
    avg_train_score = avg_train_score/3.0
    avg_test_score = avg_test_score/3.0
    print "Average Score on Training Data:", avg_train_score
    print "Average Score on Testing Data:", avg_test_score
Code Example #21
                      cv=10)
search.fit(Xs, ys)
search.best_params_

# In[17]:

######Ridge
X_train, X_test, y_train, y_test = train_test_split(Xs,
                                                    ys,
                                                    test_size=0.2,
                                                    random_state=10)
ridge = Ridge(alpha=1, normalize=False)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
# # Compute and print R^2 and RMSE
print("R^2: {}".format(ridge.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(" Test Root Mean Squared Error: {}".format(rmse))

# In[30]:

y0_pred = ridge.predict(X_test)
y1_pred = ridge.predict(X_train)
# # Compute and print R^2 and RMSE
print("R^2: {}".format(ridge.score(X_test, y_test)))
rmse0 = np.sqrt(mean_squared_error(y_test, y0_pred))
rmse = np.sqrt(mean_squared_error(y_train, y1_pred))
print("Root Mean Squared Error for Test: {}".format(rmse0))
print("Root Mean Squared Error for Train: {}".format(rmse))

# In[164]:
Code Example #22
class Regressor():
    """
    Wraps scikit-learn regressors.


    Parameters
    ----------

    strategy : string, default = "LightGBM" (if installed else "XGBoost")
        The choice for the regressor.
        Available strategies = "LightGBM" (if installed), "XGBoost",
        "RandomForest", "ExtraTrees", "Tree", "Bagging", "AdaBoost" or "Linear"

    **params : parameters of the corresponding regressor.
        Examples : n_estimators, max_depth...

    """
    def __init__(self, **params):

        if ("strategy" in params):
            self.__strategy = params["strategy"]
        else:
            if (lgbm_installed):
                self.__strategy = "LightGBM"
            else:
                self.__strategy = "XGBoost"

        self.__regress_params = {}

        self.__regressor = None
        self.__set_regressor(self.__strategy)
        self.__col = None

        self.set_params(**params)
        self.__fitOK = False

    def get_params(self, deep=True):

        params = {}
        params["strategy"] = self.__strategy
        params.update(self.__regress_params)

        return params

    def set_params(self, **params):

        self.__fitOK = False

        if 'strategy' in params.keys():
            self.__set_regressor(params['strategy'])

            for k, v in self.__regress_params.items():
                if k not in self.get_params().keys():
                    warnings.warn("Invalid parameter for regressor " +
                                  str(self.__strategy) +
                                  ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)

        for k, v in params.items():
            if (k == "strategy"):
                pass
            else:
                if k not in self.__regressor.get_params().keys():
                    warnings.warn("Invalid parameter for regressor " +
                                  str(self.__strategy) +
                                  ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)
                    self.__regress_params[k] = v

    def __set_regressor(self, strategy):

        self.__strategy = strategy

        if (strategy == 'RandomForest'):
            self.__regressor = RandomForestRegressor(n_estimators=400,
                                                     max_depth=10,
                                                     max_features='sqrt',
                                                     bootstrap=True,
                                                     n_jobs=-1,
                                                     random_state=0)

        elif (strategy == 'XGBoost'):
            self.__regressor = XGBRegressor(n_estimators=500,
                                            max_depth=6,
                                            learning_rate=0.05,
                                            colsample_bytree=0.8,
                                            colsample_bylevel=1.,
                                            subsample=0.9,
                                            nthread=-1,
                                            seed=0)

        elif (strategy == "LightGBM"):
            if (lgbm_installed):
                self.__regressor = LGBMRegressor(n_estimators=500,
                                                 learning_rate=0.05,
                                                 colsample_bytree=0.8,
                                                 subsample=0.9,
                                                 nthread=-1,
                                                 seed=0)
            else:
                warnings.warn(
                    "Package lightgbm is not installed. Model LightGBM will be "
                    "replaced by XGBoost")
                self.__strategy = "XGBoost"
                self.__regressor = XGBRegressor(n_estimators=500,
                                                max_depth=6,
                                                learning_rate=0.05,
                                                colsample_bytree=0.8,
                                                colsample_bylevel=1.,
                                                subsample=0.9,
                                                nthread=-1,
                                                seed=0)

        elif (strategy == 'ExtraTrees'):
            self.__regressor = ExtraTreesRegressor(n_estimators=400,
                                                   max_depth=10,
                                                   max_features='sqrt',
                                                   bootstrap=True,
                                                   n_jobs=-1,
                                                   random_state=0)

        elif (strategy == 'Tree'):
            self.__regressor = DecisionTreeRegressor(
                criterion='mse',
                splitter='best',
                max_depth=None,
                min_samples_split=2,
                min_samples_leaf=1,
                min_weight_fraction_leaf=0.0,
                max_features=None,
                random_state=0,
                max_leaf_nodes=None,
                presort=False)

        elif (strategy == "Bagging"):
            self.__regressor = BaggingRegressor(base_estimator=None,
                                                n_estimators=500,
                                                max_samples=.9,
                                                max_features=.85,
                                                bootstrap=False,
                                                bootstrap_features=False,
                                                n_jobs=-1,
                                                random_state=0)

        elif (strategy == "AdaBoost"):
            self.__regressor = AdaBoostRegressor(base_estimator=None,
                                                 n_estimators=400,
                                                 learning_rate=.05,
                                                 random_state=0)

        elif (strategy == "Linear"):
            self.__regressor = Ridge(alpha=1.0,
                                     fit_intercept=True,
                                     normalize=False,
                                     copy_X=True,
                                     max_iter=None,
                                     tol=0.001,
                                     solver='auto',
                                     random_state=0)

        else:
            raise ValueError(
                "Strategy invalid. Please choose between 'LightGBM' "
                "(if installed), 'XGBoost', 'RandomForest', 'ExtraTrees', "
                "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")

    def fit(self, df_train, y_train):
        """

        Fits Regressor.

        Parameters
        ----------

        df_train : pandas dataframe of shape = (n_train, n_features)
        The train dataset with numerical features.

        y_train : pandas series of shape = (n_train, )
        The target for regression tasks.


        Returns
        -------
        self

        """

        # sanity checks
        if ((type(df_train) != pd.SparseDataFrame)
                and (type(df_train) != pd.DataFrame)):
            raise ValueError("df_train must be a DataFrame")

        if (type(y_train) != pd.core.series.Series):
            raise ValueError("y_train must be a Series")

        self.__regressor.fit(df_train.values, y_train)
        self.__col = df_train.columns
        self.__fitOK = True

        return self

    def feature_importances(self):
        """
        Computes feature importances. Regressor must be fitted before.

        Parameters
        ----------

        None

        Returns
        -------

        importance : dict
            Dictionary containing a measure of feature importance (value)
            for each feature (key).

        """

        if self.__fitOK:

            if (self.get_params()["strategy"] in ["Linear"]):

                importance = {}
                f = np.abs(self.get_estimator().coef_)

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in [
                    "LightGBM", "XGBoost", "RandomForest", "ExtraTrees", "Tree"
            ]):

                importance = {}
                f = self.get_estimator().feature_importances_

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["AdaBoost"]):

                importance = {}
                norm = self.get_estimator().estimator_weights_.sum()

                try:
                    # XGB, RF, ET, Tree and AdaBoost
                    # TODO: Refactor this part
                    f = sum(
                        weight * est.feature_importances_
                        for weight, est in zip(
                            self.get_estimator().estimator_weights_,
                            self.get_estimator().estimators_)) / norm  # noqa

                except Exception:
                    f = sum(weight * np.abs(est.coef_) for weight, est in zip(
                        self.get_estimator().estimator_weights_,
                        self.get_estimator().estimators_)) / norm  # noqa

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["Bagging"]):

                importance = {}
                importance_bag = []

                for i, b in enumerate(self.get_estimator().estimators_):

                    d = {}

                    try:
                        # XGB, RF, ET, Tree and AdaBoost
                        f = b.feature_importances_
                    except Exception:
                        f = np.abs(b.coef_)  # Linear

                    estimator = self.get_estimator()
                    items = enumerate(estimator.estimators_features_[i])
                    for j, c in items:
                        d[self.__col[c]] = f[j]

                    importance_bag.append(d.copy())

                for i, col in enumerate(self.__col):
                    vals = [k[col] for k in importance_bag
                            if col in k and k[col] != 0]
                    importance[col] = np.mean(vals)

            else:

                importance = {}

            return importance

        else:

            raise ValueError("You must call the fit function before !")

    def predict(self, df):
        '''

        Predicts the target.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
        The dataset with numerical features.


        Returns
        -------
        y : array of shape = (n, )
        The target to be predicted.

        '''

        try:
            if not callable(getattr(self.__regressor, "predict")):
                raise ValueError("predict attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            return self.__regressor.predict(df.values)

        else:
            raise ValueError("You must call the fit function before !")

    def transform(self, df):
        '''

        Transforms df.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
        The dataset with numerical features.


        Returns
        -------
        df_transform : pandas dataframe of shape = (n, n_selected_features)
        The transformed dataset with its most important features.

        '''

        try:
            if not callable(getattr(self.__regressor, "transform")):
                raise ValueError("transform attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            return self.__regressor.transform(df.values)
        else:
            raise ValueError("You must call the fit function before !")

    def score(self, df, y, sample_weight=None):
        """

        Returns the coefficient of determination R^2 of the prediction.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        y : pandas series of shape = (n,)
            The target for regression tasks.

        Returns
        -------
        score : float
        R^2 of self.predict(df) wrt. y.

        """

        try:
            if not callable(getattr(self.__regressor, "score")):
                raise ValueError("score attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame)
                    and (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            if (type(y) != pd.core.series.Series):
                raise ValueError("y must be a Series")

            return self.__regressor.score(df.values, y, sample_weight)
        else:
            raise ValueError("You must call the fit function before !")

    def get_estimator(self):
        return copy(self.__regressor)
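A brief usage sketch for the wrapper above, with the Ridge-backed "Linear" strategy; the toy DataFrame and Series are made up for the demo:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df_train = pd.DataFrame(rng.randn(100, 3), columns=["a", "b", "c"])
y_train = pd.Series(2 * df_train["a"] - df_train["c"] + 0.1 * rng.randn(100))

reg = Regressor(strategy="Linear", alpha=0.5)  # alpha is forwarded to the underlying Ridge
reg.fit(df_train, y_train)
print(reg.score(df_train, y_train))    # R^2 on the training data
print(reg.feature_importances())       # |coef_| per column for the "Linear" strategy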
Code Example #23
# create and train a few models
lr = LinearRegression(normalize=True)
lr.fit(X_train, Y_train)

lasso = Lasso(alpha=0.01)
lasso.fit(X_train, Y_train)

ridge = Ridge(alpha=0.1)
ridge.fit(X_train, Y_train)

rfr = RandomForestRegressor()
rfr.fit(X_train, Y_train)

mlp = MLPRegressor(hidden_layer_sizes=(200,), max_iter=1000)
mlp.fit(X_train, Y_train)

# print model R^2 scores for comparison

acc_lr = lr.score(X_test, Y_test)
acc_lasso = lasso.score(X_test, Y_test)
acc_ridge = ridge.score(X_test, Y_test)
acc_rfr = rfr.score(X_test, Y_test)
acc_mlp = mlp.score(X_test, Y_test)

print "LinearRegression: ", acc_lr
print "Lasso: ", acc_lasso
print "Ridge: ", acc_ridge
print "RandomForestRegressor: ", acc_rfr
print "MLPRegressor: ", acc_mlp
Code Example #24
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=3)
print(len(X_test), len(y_test))
lr = LinearRegression()
lr.fit(X_train, y_train)
rr = Ridge(alpha=0.01)
# The higher the alpha value, the more the coefficients are restricted; with a
# low alpha the coefficients are barely restricted, and ridge regression then
# resembles plain linear regression.
rr.fit(X_train, y_train)
rr100 = Ridge(alpha=100)  # a high alpha value, for comparison
rr100.fit(X_train, y_train)
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
Ridge_train_score = rr.score(X_train, y_train)
Ridge_test_score = rr.score(X_test, y_test)
Ridge_train_score100 = rr100.score(X_train, y_train)
Ridge_test_score100 = rr100.score(X_test, y_test)
print "linear regression train score:", train_score
print "linear regression test score:", test_score
print "ridge regression train score low alpha:", Ridge_train_score
print "ridge regression test score low alpha:", Ridge_test_score
print "ridge regression train score high alpha:", Ridge_train_score100
print "ridge regression test score high alpha:", Ridge_test_score100
# plt.plot(rr.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markers
# plt.plot(rr100.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Ridge; $\alpha = 100$') # alpha here is for transparency
# plt.plot(lr.coef_,alpha=0.4,linestyle='none',marker='o',markersize=7,color='green',label='Linear Regression')
# plt.xlabel('Coefficient Index',fontsize=16)
# plt.ylabel('Coefficient Magnitude',fontsize=16)
# plt.legend(fontsize=13,loc=4)
Code Example #25
# Code starts here
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
r2_lasso = lasso.score(X_test, y_test)
print(r2_lasso)

# --------------
from sklearn.linear_model import Ridge

# Code starts here
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
r2_ridge = ridge.score(X_test, y_test)
print(r2_ridge)
# Code ends here

# --------------
from sklearn.model_selection import cross_val_score

#Code starts here
regressor = LinearRegression()

# Initiate cross validation score
score = cross_val_score(regressor, X_train, y_train, scoring='r2', cv=10)
print(score)
#calculate mean of the score
mean_score = np.mean(score)
Code Example #26
def create_model(df, y, X, X_train, X_test, y_train, y_test, degree,
                 random_state, test_size, alpha):

    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    ss = StandardScaler()
    ss.fit(X_train)

    X_train_scaled = ss.transform(X_train)
    X_test_scaled = ss.transform(X_test)

    linreg_norm = LinearRegression()
    linreg_norm.fit(X_train_scaled, y_train)

    X_cat = df[['Month', 'Origin', 'Dest']]
    X_train_cat, X_test_cat, y_train, y_test = train_test_split(
        X_cat, y, test_size=test_size, random_state=random_state)
    # OneHotEncode Categorical variables
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(X_train_cat)

    X_train_ohe = ohe.transform(X_train_cat)
    X_test_ohe = ohe.transform(X_test_cat)

    columns = ohe.get_feature_names(input_features=X_train_cat.columns)
    cat_train_df = pd.DataFrame(X_train_ohe.todense(), columns=columns)
    cat_test_df = pd.DataFrame(X_test_ohe.todense(), columns=columns)
    X_train_all = pd.concat([pd.DataFrame(X_train_scaled), cat_train_df],
                            axis=1)
    X_test_all = pd.concat([pd.DataFrame(X_test_scaled), cat_test_df], axis=1)
    linreg_all = LinearRegression()
    linreg_all.fit(X_train_all, y_train)

    print('Baseline model Continuous and Categorical')
    print('Training r^2:', linreg_all.score(X_train_all, y_train))
    print('Testing r^2:', linreg_all.score(X_test_all, y_test))
    print('Training MSE:',
          mean_squared_error(y_train, linreg_all.predict(X_train_all)))
    print('Testing MSE:',
          mean_squared_error(y_test, linreg_all.predict(X_test_all)))

    print("\n")

    lasso = Lasso(alpha=alpha)  # Lasso penalizes the L1 norm of the coefficients.
    lasso.fit(X_train_all, y_train)
    print('Lasso')
    print('Training r^2:', lasso.score(X_train_all, y_train))
    print('Testing r^2:', lasso.score(X_test_all, y_test))
    print('Training MSE:',
          mean_squared_error(y_train, lasso.predict(X_train_all)))
    print('Testing MSE:', mean_squared_error(y_test,
                                             lasso.predict(X_test_all)))

    print("\n")

    ridge = Ridge(alpha=alpha)  # Ridge penalizes the L2 norm of the coefficients.
    ridge.fit(X_train_all, y_train)
    print('Ridge')
    print('Training r^2:', ridge.score(X_train_all, y_train))
    print('Testing r^2:', ridge.score(X_test_all, y_test))
    print('Training MSE:',
          mean_squared_error(y_train, ridge.predict(X_train_all)))
    print('Testing MSE:', mean_squared_error(y_test,
                                             ridge.predict(X_test_all)))

    print("\n")

    poly_features = PolynomialFeatures(degree)

    # transforms the existing features to higher degree features.
    X_train_poly = poly_features.fit_transform(X_train)

    # fit the transformed features to Linear Regression
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)

    # predicting on training data-set
    y_train_predicted = poly_model.predict(X_train_poly)

    # predicting on test data-set
    y_test_predict = poly_model.predict(poly_features.fit_transform(X_test))

    # evaluating the model on training dataset
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    r2_train = r2_score(y_train, y_train_predicted)

    # evaluating the model on test dataset
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predict))
    r2_test = r2_score(y_test, y_test_predict)

    print("\n")

    print(" Polynomial training set")

    print("MSE of training set is {}".format(rmse_train))
    print("R2 score of training set is {}".format(r2_train))

    print("\n")

    print("Polynomial test set")

    print("MSE of test set is {}".format(rmse_test))
    print("R2 score of test set is {}".format(r2_test))

    print("\n")

    print('Cross Validation for Polynomial model')

    lm = LinearRegression()

    # store scores in scores object
    # we can't use accuracy as our evaluation metric since that's only relevant for classification problems
    # RMSE is not directly available so we will use MSE
    scores = cross_val_score(lm, X_train_poly, y_train, cv=10, scoring='r2')
    mse_scores = cross_val_score(lm,
                                 X_train_poly,
                                 y_train,
                                 cv=10,
                                 scoring='neg_mean_squared_error')
    print('Cross Validation Mean r2:', np.mean(scores))
    print('Cross Validation Mean MSE:', np.mean(mse_scores))
    print('Cross Validation 10 Fold Score:', scores)
    print('Cross Validation 10 Fold mean squared error', -(mse_scores))
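As an aside to the comment above: from scikit-learn 0.22 onward, RMSE is available directly as a scoring string, so the square root no longer has to be taken by hand. A minimal sketch, assuming a recent scikit-learn:

    # 'neg_root_mean_squared_error' requires scikit-learn >= 0.22
    rmse_scores = cross_val_score(lm, X_train_poly, y_train, cv=10,
                                  scoring='neg_root_mean_squared_error')
    print('Cross Validation Mean RMSE:', -np.mean(rmse_scores))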
Code Example #27
File: m07_linear.py Project: sglee-vcanus/etc
y = boston.target

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=66,
                                                    shuffle=True,
                                                    test_size=0.2)

from sklearn.linear_model import LinearRegression, Ridge, Lasso

# models
model1 = LinearRegression()
model2 = Ridge()
model3 = Lasso()

model1.fit(x_train, y_train)
model2.fit(x_train, y_train)
model3.fit(x_train, y_train)

linear_score = model1.score(x_test, y_test)
ridge_score = model2.score(x_test, y_test)
lasso_score = model3.score(x_test, y_test)

# evaluation
print('linear_score: ', linear_score)
print('ridge_score: ', ridge_score)
print('lasso_score: ', lasso_score)

# y_pred = model1.predict(x_test)
# print(y_pred)
Code Example #28
################################################## RIDGE REGRESSION

# PARAMETER TUNING

features = ['c1','c2','c3','c4','c5','c6','c7','c8']

msk = np.random.rand(len(tf)) < 0.8
train = tf[msk].reset_index(drop=True)
test = tf[~msk].reset_index(drop=True)

row_list = []

for n in range(0,1001):
    clf = Ridge(alpha=n)
    clf.fit(train[features],train.nrtg)
    score = clf.score(test[features],test.nrtg)
    dict1 = {'alpha':n,'score':score}
    row_list.append(dict1)
    
alpha_df = pd.DataFrame(row_list)

alpha = alpha_df[alpha_df.score == alpha_df.score.max()].alpha.values[0]

# RIDGE REGRESSION

clf = Ridge(alpha=alpha)

clf.fit(tf[features],tf.nrtg)

coefficients = clf.coef_
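The tuning loop above scores each integer alpha on a single random split, which makes the chosen alpha sensitive to that one split. The same search can also be written with RidgeCV, which picks alpha by cross-validation; a minimal sketch on the same features (starting at 1 to keep every alpha strictly positive):

from sklearn.linear_model import RidgeCV

clf = RidgeCV(alphas=np.arange(1, 1001), cv=5)
clf.fit(tf[features], tf.nrtg)
print(clf.alpha_)  # the selected alpha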
Code Example #29
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import mglearn

X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
ridge = Ridge().fit(X_train, y_train)
print("[default value of alpha]")
print("training set score: %f" % ridge.score(X_train, y_train))
print("test set score: %f" % ridge.score(X_test, y_test))

# Model with high value of alpha (regularization parameter)
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("[alpha 10]")
print("training set score: %f" % ridge10.score(X_train, y_train))
print("test set score: %f" % ridge10.score(X_test, y_test))

# Model with low value of alpha
ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print("[alpha 0.1]")
print("training set score: %f" % ridge01.score(X_train, y_train))
print("test set score: %f" % ridge01.score(X_test, y_test))

plt.title("ridge_coefficients")
plt.plot(ridge.coef_, 'o', label="Ridge alpha=1")
plt.plot(ridge10.coef_, 'o', label="Ridge alpha=10")
plt.plot(ridge01.coef_, 'o', label="Ridge alpha=0.1")
plt.ylim(-25, 25)
plt.legend()
plt.show()

Code Example #30

pdx = wine_quality[all_colnms]
pdy = wine_quality["quality"]

x_train,x_test,y_train,y_test = train_test_split(pdx,pdy,train_size = 0.7,random_state=42)

alphas = [1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0]

initrsq = 0

print ("\nRidge Regression: Best Parameters\n")
for alph in alphas:
    ridge_reg = Ridge(alpha=alph) 
    ridge_reg.fit(x_train,y_train)    
    tr_rsqrd = ridge_reg.score(x_train,y_train)
    ts_rsqrd = ridge_reg.score(x_test,y_test)    

    if ts_rsqrd > initrsq:
        print ("Lambda: ",alph,"Train R-Squared value:",round(tr_rsqrd,5),"Test R-squared value:",round(ts_rsqrd,5))
        initrsq = ts_rsqrd

# Coefficients of ridge regression for the best alpha value
ridge_reg = Ridge(alpha=0.001) 
ridge_reg.fit(x_train,y_train) 
 

print ("\nRidge Regression coefficient values of Alpha = 0.001\n")
for i in range(11):
    print (all_colnms[i],": ",ridge_reg.coef_[i])
Code Example #31
File: 岭回归.py Project: HanKin2015/ACM
#plt.show()

data=pd.read_csv('ridge.csv')
# plot the traffic counts
plt.plot(data['TRAFFIC_COUNT'])
plt.show()

X=data[data.columns[1:5]]  # feature columns
y=data['TRAFFIC_COUNT']  # traffic counts (the values to predict)
poly=PolynomialFeatures(5)  # after testing, degree 5 works fairly well
# X becomes the generated polynomial features
X=poly.fit_transform(X)
# split all data into training and test sets; test_size is the test fraction, random_state the random seed
train_set_X, test_set_X, train_set_y, test_set_y = cross_validation.train_test_split(X,y,test_size=0.3,random_state=0)
# create a ridge regression instance
clf=Ridge(alpha=1.0,fit_intercept = True)
# call fit to train the regressor on the training set
clf.fit(train_set_X,train_set_y)
# compute the goodness of fit of the regression on the test set; clf.score returns 0.7375
# goodness of fit evaluates the fit: at most 1, with no lower bound; it is 0 when the model outputs the same value for every input
clf.score(test_set_X,test_set_y)

start=200  # next, plot the fitted curve over the range 200 to 300
end=300
y_pre=clf.predict(X)  # fitted values from calling predict
time=np.arange(start,end)
plt.plot(time,y[start:end],'b', label="real")
plt.plot(time,y_pre[start:end],'r', label='predict')
# show the real data (blue) and the fitted curve (red)
plt.legend(loc='upper left')  # set the legend position
plt.show()
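The goodness-of-fit remarks in the comments (R^2 is at most 1, has no lower bound, and equals 0 when the model always outputs the same value) are easy to verify with sklearn.metrics.r2_score; a small sketch on toy numbers:

from sklearn.metrics import r2_score

y_true = [1.0, 2.0, 3.0, 4.0]
print(r2_score(y_true, y_true))                # perfect fit -> 1.0
print(r2_score(y_true, [2.5, 2.5, 2.5, 2.5]))  # constant at the mean -> 0.0
print(r2_score(y_true, [4.0, 3.0, 2.0, 1.0]))  # worse than the mean -> negative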
Code Example #32
        # Initialize scikit-learn ridge regression model
        model_ridge_scikit = RidgeRegression(alpha=alpha)

        # Trains scikit-learn ridge regression model
        model_ridge_scikit.fit(x_poly_train, y_train)

        print('Results for scikit-learn RidgeRegression model with alpha={}'.
              format(alpha))

        # Test model on training set
        score_mse_ridge_scikit_train = score_mean_squared_error(
            model_ridge_scikit, x_poly_train, y_train)
        print('Training set mean squared error: {:.4f}'.format(
            score_mse_ridge_scikit_train))

        score_r2_ridge_scikit_train = model_ridge_scikit.score(
            x_poly_train, y_train)
        print('Training set r-squared scores: {:.4f}'.format(
            score_r2_ridge_scikit_train))

        # Save MSE and R-squared training scores
        scores_mse_ridge_scikit_train.append(score_mse_ridge_scikit_train)
        scores_r2_ridge_scikit_train.append(score_r2_ridge_scikit_train)

        # Test model on validation set
        score_mse_ridge_scikit_val = score_mean_squared_error(
            model_ridge_scikit, x_poly_val, y_val)
        print('Validation set mean squared error: {:.4f}'.format(
            score_mse_ridge_scikit_val))

        score_r2_ridge_scikit_val = model_ridge_scikit.score(x_poly_val, y_val)
        print('Validation set r-squared scores: {:.4f}'.format(
            score_r2_ridge_scikit_val))
Code Example #33
print ("Linear regression (order 5) score is: {0}".format(lr_5_model.score(X_test_poly, y_test)))

plt.plot(xx, yy_poly)
plt.plot(X_test, y_test, "o")
plt.ylim([0, 30])
plt.title("Linear regression (order 5) result")
plt.show()


ridge_model = Ridge(alpha=1, normalize=False)
ridge_model.fit(X_train_poly, y_train)
yy_ridge = ridge_model.predict(xx_poly)

# Todo: write to report
print ("Ridge regression (order 5) score is: {0}".format(ridge_model.score(X_test_poly, y_test)))
print ("y2= {0} + {1} x + {2} x*x + {3} x*x*x + {4} x*x*x*x +{5} x*x*x*x*x".
       format(ridge_model.intercept_[0], ridge_model.coef_[0][0], ridge_model.coef_[0][1], ridge_model.coef_[0][2],
              ridge_model.coef_[0][3], ridge_model.coef_[0][4]))

plt.plot(xx, yy_ridge)
plt.plot(X_test, y_test, "o")
plt.ylim([0, 30])
plt.title("Ridge regression (order 5) result")
plt.show()

# Compare
# 1. The model with the highest score is: Ridge model (order 5)
# 2. Ridge model can prevent over-fitting: yes
# 3. Ridge model is nearly equivalent to LR model (order 5) if alpha=0: yes
# 4. A larger alpha results in a larger coefficient for x*x*x*x*x: no
Code Example #34
print("R-Squared Value for Training Set: {:.3f}".format(
    linreg.score(X_train, y_train)))
print("R-Squared Value for Test Set: {:.3f}".format(
    linreg.score(X_test, y_test)))

# KNeighborsRegressor
knnreg = KNeighborsRegressor(n_neighbors=2)
knnreg.fit(X_train, y_train)

print('R-squared train score: {:.3f}'.format(knnreg.score(X_train, y_train)))
print('R-squared test score: {:.3f}'.format(knnreg.score(X_test, y_test)))

# Ridge
ridge = Ridge()
ridge.fit(X_train, y_train)

print('R-squared score (training): {:.3f}'.format(ridge.score(
    X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(ridge.score(X_test, y_test)))

# Lasso
lasso = Lasso(max_iter=10000)
lasso.fit(X_train, y_train)

print('R-squared score (training): {:.3f}'.format(lasso.score(
    X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(lasso.score(X_test, y_test)))

lasso = Lasso(alpha=100, max_iter=10000)
lasso.fit(train_processed, train['revenue'])
results = lasso.predict(test_processed)
results_2 = np.exp(results)
print(results_2)
Code Example #35
File: lab3_4.py Project: romax2000/data-processing
'''
In ridge regression the coefficients (w) are chosen not only for how well
they predict on the training data; they are also fitted subject to an
additional constraint. We want the magnitude of the coefficients to be as
small as possible. In other words, all entries of w should be close to
zero, which means each feature should have as little influence on the
outcome as possible.
'''

from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train, y_train)
print("Правильность на обучающем наборе: {:.2f}".format(ridge.score(X_train, y_train)))
print("Правильность на тестовом наборе: {:.2f}".format(ridge.score(X_test, y_test)))

'''
Increasing alpha forces the coefficients to shrink toward values close to
zero, which lowers performance on the training set but may improve the
model's ability to generalize.
'''
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("Правильность на обучающем наборе: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Правильность на тестовом наборе: {:.2f}".format(ridge10.score(X_test, y_test)))

# With very small values of alpha the constraint on the coefficients is barely
# enforced, and we end up with a model resembling plain linear regression
ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print("Правильность на обучающем наборе: {:.2f}".format(ridge01.score(X_train, y_train)))
Code Example #36
#Evaluate the model
plt.figure(figsize=(15, 10))

ft_importances_lm.plot(kind='barh')
plt.show()

#R2 Value

print("RSquare Value for Simple Regresssion TEST data is-")
print(np.round(lm.score(features_test, labels_test) * 100, 2))

print("RSquare Value for Lasso Regresssion TEST data is-")
print(np.round(lm_lasso.score(features_test, labels_test) * 100, 2))

print("RSquare Value for Ridge Regresssion TEST data is-")
print(np.round(lm_ridge.score(features_test, labels_test) * 100, 2))

print("RSquare Value for Elastic Net Regresssion TEST data is-")
print(np.round(lm_elastic.score(features_test, labels_test) * 100, 2))

#Predict on test and training data

predict_test_lm = lm.predict(features_test)
predict_test_lasso = lm_lasso.predict(features_test)
predict_test_ridge = lm_ridge.predict(features_test)
predict_test_elastic = lm_elastic.predict(features_test)

# Print the loss functions - MSE & MAE

Code Example #37

import numpy as np
from sklearn import metrics
def prediction_ridge(X_train, Y_train, X_test, Y_test, alpha, normalize):

    # Print shapes of the training and testing data sets
    #print ("Shapes of the training and testing data sets")
    #print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    #Create our regression object

    lreg = Ridge(alpha=alpha, normalize=normalize)

    # fit the ridge regression, on the training data only
    lreg.fit(X_train,Y_train)

    #print("The estimated intercept coefficient is %.2f " %lreg.intercept_)
    #print("The number of coefficients used was %d " % len(lreg.coef_))



    # Set a DataFrame from the Facts
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]


    # Set a new column lining up the coefficients from the linear regression
    coeff_df["Coefficient"] = pd.Series(lreg.coef_)


    # Show
    #coeff_df

    #highest correlation between a fact and fraction votes
    #print ("Highest correlation fact: %s is %.9f" % (cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"], coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]) )

    #sns_plot = sns.jointplot(coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"Fraction Votes",pd.merge(X_test,pd.DataFrame(Y_test), right_index=True, left_index=True),kind="scatter")


    #Predictions on training and testing sets
    pred_train = lreg.predict(X_train)
    pred_test = lreg.predict(X_test)

    # The mean square error
    #print("Fit a model X_train, and calculate MSE with Y_train: %.6f"  % np.mean((Y_train - pred_train) ** 2))
    #print("Fit a model X_train, and calculate MSE with X_test and Y_test: %.6f"  %np.mean((Y_test - pred_test) ** 2))

    #Explained variance score: 1 is perfect prediction
    #print("Variance score: %.2f" % lreg.score(X_test, Y_test))

    result={}
    result["method"]="Ridge %.3f  " %alpha
    if normalize :
        result["normalize"]="Y"
    else:
        result["normalize"]="N"
    result["X_train_shape"]=X_train.shape
    result["Y_train_shape"]=Y_train.shape
    result["X_test_shape"]=X_test.shape
    result["Y_test_shape"]=Y_test.shape
    result["intercept"]=lreg.intercept_
    result["num_coef"]=len(lreg.coef_)
    result["max_fact"]=cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"]
    result["max_fact_value"]=coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]
    result["MSE_train"]=np.mean((Y_train - pred_train) ** 2)
    result["MSE_test"]=np.mean((Y_test - pred_test) ** 2)
    result["variance"]=lreg.score(X_test, Y_test)
    return pred_test,coeff_df,pred_train,result
コード例 #38
0
ファイル: ridge_basic.py プロジェクト: hjkim666/ml_basic

def load_extended_boston():
    boston = load_boston()
    X = MinMaxScaler().fit_transform(boston.data)
    X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
    return X, boston.target


X, y = load_extended_boston()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train, y_train)
print("train accuracy: {:.2f}".format(ridge.score(X_train, y_train)))
print("test accuracy: {:.2f}".format(ridge.score(X_test, y_test)))

ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("train accuracy: {:.2f}".format(ridge10.score(X_train, y_train)))
print("test accuracy: {:.2f}".format(ridge10.score(X_test, y_test)))

ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print("train accuracy: {:.2f}".format(ridge01.score(X_train, y_train)))
print("test accuracy: {:.2f}".format(ridge01.score(X_test, y_test)))

lr = LinearRegression().fit(X_train, y_train)

plt.plot(ridge10.coef_, '^', label="Ridge alpha=10")
plt.plot(ridge.coef_, 's', label="Ridge alpha=1")
plt.plot(ridge01.coef_, 'v', label="Ridge alpha=0.1")
コード例 #39
0
from sklearn.linear_model import Ridge

ridge = Ridge()

ridge.fit(X_train, y_train)


# In[12]:

pred_test = ridge.predict(X_test)
pred_test


# In[13]:

ridge.score(X_test, y_test)


# In[14]:

#MSE

from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, pred_test)


# In[16]:

#RandomForestRegressor

from sklearn.ensemble import RandomForestRegressor
コード例 #40
0
x_train, x_test, y_train, y_test = train_test_split(sal_munged, y, test_size=0.3)
x_train = x_train.reshape(-1, x_train.shape[1])

regr = Ridge().fit(x_train, y_train)


### MODEL PERFORMANCE ###

# The Mean Squared Error
print("Mean Squared Error, training data: %d" % np.mean((regr.predict(x_train) - y_train) ** 2))
print("Mean Squared Error, test data: %d" % np.mean((regr.predict(x_test) - y_test) ** 2))
print(30 * "* ")

# Variance score
print("Variance score, training data: %.2f" % regr.score(x_train, y_train))
print("Variance score, test data: %.2f" % regr.score(x_test, y_test))
print(30 * "* ")

### GRAPHS: DISTRIBUTION OF ERROR ###
print("Distribution of prediction error on training data:")
predError = regr.predict(x_train) - y_train
plt.hist(predError)
plt.xlim(-80000, 80000)
plt.show()

print("Distribution of prediction error on test data:")
predError = regr.predict(x_test) - y_test
plt.hist(predError)
plt.xlim(-80000, 80000)
plt.show()
コード例 #41
0
    def TrainModel(self):
        self.browser.clear()
        X_train, X_test, y_train, y_test = self.X_train, self.X_test, self.y_train, self.y_test
        X_train1, X_test1, y_train1, y_test1 = X_train.values, X_test.values, y_train.values, y_test.values

        y_train2 = y_train1.reshape(-1, 1)
        y_test2 = y_test1.reshape(-1, 1)

        scalerX = preprocessing.StandardScaler().fit(X_train1)
        scalery = preprocessing.StandardScaler().fit(y_train2)

        X_train3 = scalerX.transform(X_train1)
        X_test3 = scalerX.transform(X_test1)
        y_train3 = scalery.transform(y_train2)
        y_test3 = scalery.transform(y_test2)

        self.browser.append("Load Dataset")
        self.browser.append("")
        self.browser.append("")

        # LinearRegression Model
        lm = LinearRegression()
        lm.fit(X_train, y_train)
        y_pred_lm = lm.predict(X_test)
        acc_lm_train = round(lm.score(X_train, y_train) * 100, 2)
        acc_lm_test = round(lm.score(X_test, y_test) * 100, 2)
        self.browser.append("<LinearRegression Model>")
        self.browser.append("Train acc : " + str(acc_lm_train) + "%")
        self.browser.append("Test acc : " + str(acc_lm_test) + "%")
        self.browser.append("")
        #time.sleep(3)

        # Ridge Regression Model
        ridge = Ridge(alpha=0.1)
        ridge.fit(X_train, y_train)
        y_pred_ridge = ridge.predict(X_test)
        acc_ridge_train = round(ridge.score(X_train, y_train) * 100, 2)
        acc_ridge_test = round(ridge.score(X_test, y_test) * 100, 2)
        self.browser.append("<Ridge Regression Model>")
        self.browser.append("Train acc : " + str(acc_ridge_train) + "%")
        self.browser.append("Test acc : " + str(acc_ridge_test) + "%")
        self.browser.append("Used Coefficient : " +
                            str(np.sum(ridge.coef_ != 0)))
        self.browser.append("")
        #time.sleep(3)

        # Lasso Regression Model
        lasso = Lasso(alpha=0.1, max_iter=100000)
        lasso.fit(X_train, y_train)
        y_pred_lasso = lasso.predict(X_test)
        acc_lasso_train = round(lasso.score(X_train, y_train) * 100, 2)
        acc_lasso_test = round(lasso.score(X_test, y_test) * 100, 2)
        self.browser.append("<Lasso Regression Model>")
        self.browser.append("Train acc : " + str(acc_lasso_train) + "%")
        self.browser.append("Test acc : " + str(acc_lasso_test) + "%")
        self.browser.append("Used Coefficient : " +
                            str(np.sum(lasso.coef_ != 0)))
        self.browser.append("")

        # SGD Regression
        sgd = SGDRegressor(loss="squared_loss",
                           penalty=None,
                           random_state=42,
                           max_iter=100000)
        sgd.fit(X_train3, y_train3)
        y_pred_sgd = sgd.predict(X_test3)
        acc_sgd_train = round(sgd.score(X_train3, y_train3) * 100, 2)
        acc_sgd_test = round(sgd.score(X_test3, y_test3) * 100, 2)
        self.browser.append("<Stochastic Gradient Descent Regression>")
        self.browser.append("Train acc : " + str(acc_sgd_train) + "%")
        self.browser.append("Test acc : " + str(acc_sgd_test) + "%")
        self.browser.append("")

        # Decision Tree's
        etr = ExtraTreesRegressor()
        etr.fit(X_train, y_train)
        y_pred_etr = etr.predict(X_test)
        acc_etr_train = round(etr.score(X_train, y_train) * 100, 2)
        acc_etr_test = round(etr.score(X_test, y_test) * 100, 2)
        self.browser.append("<Extra Trees Regressor(Random Forest)>")
        self.browser.append("Train acc : " + str(acc_etr_train) + "%")
        self.browser.append("Test acc : " + str(acc_etr_test) + "%")
        self.browser.append("")

        #SVR
        svr = SVR()
        svr.fit(X_train3, y_train3)
        y_pred_svr = svr.predict(X_test3)
        acc_svr_train = round(svr.score(X_train3, y_train3) * 100, 2)
        acc_svr_test = round(svr.score(X_test3, y_test3) * 100, 2)
        self.browser.append("<Support Vector Machine>")
        self.browser.append("Train acc : " + str(acc_svr_train) + "%")
        self.browser.append("Test acc : " + str(acc_svr_test) + "%")
        self.browser.append("")

        models = pd.DataFrame({
            'Model': [
                'LinearRegression', 'Ridge Regression', 'Lasso Regression',
                'SGD Regression', 'Extra Trees Regressor',
                'Support Vector Machine'
            ],
            'Score': [
                acc_lm_test, acc_ridge_test, acc_lasso_test, acc_sgd_test,
                acc_etr_test, acc_svr_test
            ]
        })

        models = models.sort_values(by='Score', ascending=True)
        models = PandasModelTrainData(models)
        self.tableView = QTableView()
        self.tableView.setSortingEnabled(True)
        self.tableView.setModel(models)
        self.tableView.setGeometry(850, 100, 320, 400)
        self.tableView.setColumnWidth(0, 200)
        self.tableView.sortByColumn(1, Qt.DescendingOrder)
        self.tableView.setWindowTitle("Accuracy")
        self.tableView.show()
コード例 #42
0
def ridge_reg(X,Y,data_file,p=False):
    """
    Does ridge regression on the data provided

    Inputs
    ------
    X :         Columns of the pandas dataframe that contains the data for each
                of the descriptors to be used

    Y :         Column of the pandas dataframe that contains the values to be
                predicted

    data_file : String containing the name of the file the model statistics will
                be stored in, where the RMSE and R-Squared values for each model
                will be stored

    Outputs
    -------
    coefs :     Contains a list of the coefficient for each descriptor used
    """

    X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=3)

    high_score = 0
    alpha_ = 0
    #coefs = np.zeros(19)

    rr0001 = Ridge(alpha=0.001)
    rr0001.fit(X_train, y_train)
    Ridge_train_score0001 = rr0001.score(X_train,y_train)
    Ridge_test_score0001 = rr0001.score(X_test, y_test)
    high_score = Ridge_test_score0001
    alpha_ = 0.001
    coefs = rr0001.coef_
    pred = rr0001.predict(X_test)
    rmse = np.sqrt(MSE(y_test, pred))

    rr001 = Ridge(alpha=0.01)
    rr001.fit(X_train, y_train)
    Ridge_train_score001 = rr001.score(X_train,y_train)
    Ridge_test_score001 = rr001.score(X_test, y_test)
    if(Ridge_test_score001 > high_score):
        high_score = Ridge_test_score001
        alpha_ = 0.01
        coefs = rr001.coef_
        pred = rr001.predict(X_test)
        rmse = np.sqrt(MSE(y_test, pred))

    rr01 = Ridge(alpha=0.1)
    rr01.fit(X_train, y_train)
    Ridge_train_score01 = rr01.score(X_train,y_train)
    Ridge_test_score01 = rr01.score(X_test, y_test)
    if(Ridge_test_score01 > high_score):
        high_score = Ridge_test_score01
        alpha_ = 0.1
        coefs = rr01.coef_
        pred = rr01.predict(X_test)
        rmse = np.sqrt(MSE(y_test, pred))

    rr10 = Ridge(alpha=10)
    rr10.fit(X_train, y_train)
    Ridge_train_score10 = rr10.score(X_train,y_train)
    Ridge_test_score10 = rr10.score(X_test, y_test)
    if(Ridge_test_score10 > high_score):
        high_score = Ridge_test_score10
        alpha_ = 10
        coefs = rr10.coef_
        pred = rr10.predict(X_test)
        rmse = np.sqrt(MSE(y_test, pred))

    rr100 = Ridge(alpha=100)
    rr100.fit(X_train, y_train)
    Ridge_train_score100 = rr100.score(X_train,y_train)
    Ridge_test_score100 = rr100.score(X_test, y_test)
    if(Ridge_test_score100 > high_score):
        high_score = Ridge_test_score100
        alpha_ = 100
        coefs = rr100.coef_
        pred = rr100.predict(X_test)
        rmse = np.sqrt(MSE(y_test, pred))

    rr1000 = Ridge(alpha=1000)
    rr1000.fit(X_train, y_train)
    Ridge_train_score1000 = rr1000.score(X_train,y_train)
    Ridge_test_score1000 = rr1000.score(X_test, y_test)
    if(Ridge_test_score1000 > high_score):
        high_score = Ridge_test_score1000
        alpha_ = 1000
        coefs = rr1000.coef_
        pred = rr1000.predict(X_test)
        rmse = np.sqrt(MSE(y_test, pred))

    data_file.write('\n\t\tRidge Regression Score with alpha=%f: \t%f' % (alpha_, high_score))
    data_file.write('\n\t\t\tRMSE: \t\t%f' % (rmse))

    if(p==True):
        print('\n\t\tRidge Regression Score with alpha=%f: \t%f' % (alpha_, high_score))
        print('\n\t\tRMSE: \t\t%f' % (rmse))

    return np.concatenate((rr001.coef_, rr10.coef_, rr100.coef_, rr1000.coef_), axis=0), np.array(coefs)
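
# The repetitive alpha search above can be written more compactly with RidgeCV,
# which selects alpha by cross-validation (a sketch, not a drop-in replacement
# for ridge_reg -- the X_train/y_train/X_test/y_test split is assumed as above):
from sklearn.linear_model import RidgeCV

rcv = RidgeCV(alphas=[0.001, 0.01, 0.1, 10, 100, 1000]).fit(X_train, y_train)
print('RidgeCV chose alpha=%f with test R^2=%f' % (rcv.alpha_, rcv.score(X_test, y_test)))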
コード例 #43
0
ファイル: pca_svr.py プロジェクト: abrinkmacmu/ML_Project2
    X1 = X_train_reduced[train]
    Y1 = Y_train_raw[train]

    X2 = X_train_reduced[test]
    Y2 = Y_train_raw[test]

    ## Train Classifiers on fold
    rdg_clf = Ridge(alpha=0.5)
    rdg_clf.fit(X1, Y1)
    lso_clf = Lasso(alpha=0.6257)
    lso_clf.fit(X1, Y1)
    svr_clf = LinearSVR(C=1e3)
    svr_clf.fit(X1, Y1)

    ## Score Classifiers on fold
    rdg_clf_score = rdg_clf.score(X2, Y2)
    lso_clf_score = lso_clf.score(X2, Y2)
    svr_clf_score = svr_clf.score(X2, Y2)

    print "Ridge:  ", rdg_clf_score
    print "Lasso:  ", lso_clf_score
    print "SVR_RBF:  ", svr_clf_score


## Train final Classifiers
# clf = Ridge(alpha=.5)
clf = LinearSVR(C=1e3)  # LinearSVR takes no gamma parameter
clf.fit(X_train_reduced, Y_train_raw)
Y_predicted = clf.predict(X_test_reduced)

## Save results to csv
コード例 #44
0
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

train = train.fillna(0.0)
test = test.fillna(0.0)

x_train = df
y_train = x_train.nota.values
del x_train['nota']


hyperparams = {'alpha':[0.0005, 0.0014, 0.0006, 0.00061, 0.000612, 0.000613001, 0.000614, 0.00061401, 0.00061402, 0.00061403, 0.0006104 ]}
gs = GridSearchCV(estimator=Ridge(normalize=True), param_grid=hyperparams)
gs.fit(x_train, y_train)
pred = pd.Series(gs.predict(test))
err = gs.score(x_train, y_train)
print('Result:')
print('Best parameter: ',gs.best_params_)
print('Best score: ',gs.best_score_)
print('R^2 on the training data: ', err)
print('\n')

ridge2 = Ridge(alpha=0.0005, normalize=True)
ridge2.fit(x_train, y_train)
print(ridge2.score(x_train, y_train))
result = pd.DataFrame(ridge2.predict(test), index = test.index, columns=['nota'])
print(result)
#result = result.drop_duplicates(subset='atleta_id', keep="last")
#result['atleta_id'] = result['atleta_id'].apply(lambda x:str(x))
result.to_csv('submission.csv')

コード例 #45
0
#################
#Regularization
##################

#Ridge regression (L2) Penalty (alpha Regularization Parameter)
#Ridge Regression leads to dense solutions, in which most coefficients are non-zero

from sklearn.linear_model import Ridge
ridge_models = {}
training_scores = []
test_scores = []

for alpha in [100, 10, 1, .01]:
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    training_scores.append(ridge.score(X_train, y_train))
    test_scores.append(ridge.score(X_test, y_test))
    ridge_models[alpha] = ridge

plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [100, 10, 1, .01])
plt.legend(loc="best")


#Lasso (L1) Penalty (alpha Regularization Parameter)
#LASSO leads to sparse solutions, driving most coefficients to zero
from sklearn.linear_model import Lasso

lasso_models = {}
training_scores = []
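
# The dense-versus-sparse contrast described in the comments can be checked by
# counting nonzero coefficients (a small sketch; assumes numpy as np and the
# X_train/y_train split from above):
dense_check = Ridge(alpha=1.0).fit(X_train, y_train)
sparse_check = Lasso(alpha=1.0).fit(X_train, y_train)
print("nonzero coefficients, Ridge:", np.sum(dense_check.coef_ != 0))  # typically all
print("nonzero coefficients, Lasso:", np.sum(sparse_check.coef_ != 0))  # typically few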
コード例 #46
0
ファイル: boston.py プロジェクト: mkorczynska/UM
# # errors
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# # -------------------------------------

# --- RIDGE REGRESSION ---  #
boston_rr = Ridge()
boston_rr.fit(X_train, y_train)
print("Coefficients: ", boston_rr.coef_)
print("Intercept: ", boston_rr.intercept_)

# R^2 for train and test set
print('R2 for train: ', boston_rr.score(X_train, y_train))
print('R2 for test: ', boston_rr.score(X_test, y_test))

# ridge regression - prediction
y_pred = boston_rr.predict(X_test)
df = pd.DataFrame({'actual': y_test, 'pred': y_pred})
print(df)

print(pd.DataFrame(boston_rr.coef_))

# errors
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# -------------------------------------
コード例 #47
0
np.random.seed(42)
x = np.linspace(0, 20, 21)
y = 5 * x + 2 + np.random.normal(0.0, 20.0, 21)

# Hint: if you get a shape error from scikit, try:
X = x.reshape(21, 1)

poly = PolynomialFeatures(30)
Xpoly = poly.fit_transform(X)
Xscaled = MinMaxScaler().fit_transform(Xpoly)

Xtrain, Xtest, ytrain, ytest = train_test_split(Xscaled, y, random_state=42)

m = Ridge(alpha=0.1)
m.fit(Xtrain, ytrain)
m.score(Xtrain, ytrain)
m.score(Xtest, ytest)
ypred = m.predict(Xscaled)

plt.bar(range(31), m.coef_)

plt.plot(Xtrain[:, 1], ytrain, 'bo')
plt.plot(Xtest[:, 1], ytest, 'kx')
plt.plot(Xscaled[:, 1], ypred, 'r-')
plt.axis([0.0, 1.0, 0.0, 140.0])

plt.plot(Xtrain[:, 1], ytrain, 'bo')
plt.plot(Xtest[:, 1], ytest, 'kx')
plt.plot(x, y, 'bo')
plt.plot(x, ypred, 'r-')
plt.axis([2.0, 20.0, 20.0, 140.0])
print("accuracy",ac1)
y_pred1=model2.predict(X_test)
print("prediction",y_pred1)
#VISIULIZATION
plt.scatter(x1,y1,color='red')
plt.plot(x1,model2.predict(pol_reg.fit_transform(x1)),color='blue')
plt.tittle("Truth or bbluff (linear regression)")
plt.xlabel("squarfit_living")
plt.ylabel("price")
plt.show()
#-------------Above model is overfitted--------------------

#to avoid over fitting ridge regression require
#apply ridge regression
from sklearn.linear_model import Ridge
ridmodel=Ridge(alpha=0.000000000000005,normalize=True)
ridmodel.fit(X_train,y_train)
rid_pre=ridmodel.predict(X_test)
print(rid_pre)
ac2=ridmodel.score(X_test,y_test)
print("accuracy",ac2)
#Data visiulization
plt.scatter(x1,y1,color='red')
plt.plot(x1,ridmodel.predict(pol_reg.fit_transform(x1)),color='blue')
plt.tittle("Truth or bbluff (linear regression)")
plt.xlabel("squarfit_living")
plt.ylabel("price")
plt.show()


コード例 #49
0
train_x,text_x,train_y,text_y = cross_validation.train_test_split(X1,y1,train_size=0.5,random_state=1)

#f_fold = StratifiedKFold(y=y1,n_folds=10,random_state=1)
f_fold = KFold(len(y1),n_folds=10,random_state=0)

score = []
mean_square_score_train= []
mean_square_score_test = []
r2_score_train = []
r2_score_test = []

train_stuff=[]
test_stuff=[]
for k, (train,text) in enumerate(f_fold):
    predictor.fit(X1[train],y1[train])
    c = predictor.score(X1[text],y1[text])
    score.append(c)
    mean_square_score_train.append(mean_squared_error(y1[train],predictor.predict(X1[train])))
    mean_square_score_test.append(mean_squared_error(y1[text],predictor.predict(X1[text])))
    r2_score_train.append(r2_score(y1[train],predictor.predict(X1[train])))
    r2_score_test.append(r2_score(y1[text],predictor.predict(X1[text])))
    print "percentage within 7 days error for training data  " + str(
        sum(abs(predictor.predict(X1[train]) - y1[train]) < 7) / len(X1[train]) * 100)
    print "percentage within 7 days error for testing data   " + str(
        sum(abs(predictor.predict(X1[text]) - y1[text]) < 7) / len(X1[text]) * 100)
    print "-------------------"
    train_stuff.append(sum(abs(predictor.predict(X1[train]) - y1[train]) < 7) / len(X1[train]) * 100)
    test_stuff.append(sum(abs(predictor.predict(X1[text]) - y1[text]) < 7) / len(X1[text]) * 100)


コード例 #50
0
#model generation and prediction
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

#Linear Regression
clfreg = LinearRegression(n_jobs=1)
clfreg.fit(X_train,y_train)
y_pred = clfreg.predict(X_test)
confidencereg = clfreg.score(X_test,y_test)

#Ridge Regression
rr = Ridge(alpha=0.01)
rr.fit(X_train,y_train)
y_pred_ridge = rr.predict(X_test)
confidenceridge = rr.score(X_test,y_test)

#Lasso Regression
ls = Lasso()
ls.fit(X_train,y_train)
y_pred_lasso = ls.predict(X_test)
confidencelasso = ls.score(X_test,y_test)

#plotting actual vs. predicted values for linear regression
import matplotlib.pyplot as plt
plt.plot(y_test[:100])
plt.plot(y_pred[:100])
plt.legend(['Actual', 'Linear Predicted'], loc='upper right')
plt.show()

コード例 #51
0
 def RidgReg(self):
     r=Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=None)
     r=r.fit(self.exec_data_X,self.exec_data_Y)
     print("Score for Ridge Regression",end=" ")
     print(r.score(self.exec_data_X,self.exec_data_Y))
コード例 #52
0
                           param_grid=parameters,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1,
                           verbose=1)
grid_search = grid_search.fit(X_poly[:, 1:], y_train)
best_mse = grid_search.best_score_
best_parameters = grid_search.best_params_

ridgeReg = Ridge(fit_intercept=False,
                 normalize=True,
                 alpha=0.01,
                 tol=1e-5,
                 max_iter=13000,
                 solver='auto')
ridgeReg.fit(X_poly, y_train)
y_pred = ridgeReg.predict(X_poly)
sums = (y_pred - y_train)**2
sums = (np.sum(sums)) / len(y_pred)
score = ridgeReg.score(X_poly, y_train)
print(f'Training error {round(sums * (10**3),3) }')
print(f'Training Score {round(score,3)} \n')

prediction = cross_val_predict(ridgeReg, X_poly, y_train, cv=5)
sums = (prediction - y_train)**2
sums = (np.sum(sums)) / len(prediction)
accuracies = cross_val_score(estimator=ridgeReg, X=X_poly, y=y_train, cv=5)

print(f'Validation error {round(sums * (10**3),3) }')
print(f'Validation Score {round(accuracies.mean(),3)} \n')
コード例 #53
0
def main():
    usage = 'usage: %prog [options] <fasta> <scores>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='canonical_kmers', default=False, action='store_true', help='Count canonical k-mers [Default: %default]')
    parser.add_option('--alpha', dest='alpha', default=None, type='float', help='Regularization alpha parameter. Will choose via CV if not specified [Default: %default]')
    parser.add_option('-c', dest='cv_folds', default=0, type='int', help='Cross-validate with this many folds [Default: %default]')
    parser.add_option('--epsilon', dest='epsilon', default=None, type='float', help='Regularization epsilon parameter. Will choose via CV if not specified [Default: %default]')
    parser.add_option('-g', dest='gaps', default=0, type='int', help='Gaps in k-mers string kernel [Default: %default]')
    parser.add_option('-k', dest='k', default=4, type='int', help='K-mer size for string kernel [Default: %default]')
    parser.add_option('-l', dest='length', default=False, action='store_true', help='Add log2 sequence length as an attribute [Default: %default]')
    parser.add_option('-m', dest='method', default='ols', help='Regression method [Default: %default]')
    parser.add_option('-o', dest='output_file', default='seq_regr.txt', help='Output file [Default: %default]')
    parser.add_option('-w', dest='whiten', default=False, action='store_true', help='Whiten the sequence scores [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide fasta file and scores file')
    else:
        fasta_file = args[0]
        scores_file = args[1]

    ##################################################
    # convert sequences to feature representations
    ##################################################
    seq_vectors = fasta_string_kernel(fasta_file, options.k, options.gaps, options.canonical_kmers)

    if options.length:
        add_length_feature(seq_vectors, fasta_file)

    ##################################################
    # read scores
    ##################################################
    seq_scores = {}

    scores_in = open(scores_file)

    try:
        line = scores_in.readline()
        a = line.split()
        seq_scores[a[0]] = float(a[1])
    except:
        # possible header line
        pass

    for line in scores_in:
        a = line.split()
        seq_scores[a[0]] = float(a[1])


    ##################################################
    # make scikit-learn data structures
    ##################################################
    # naive dense-matrix construction from the per-sequence k-mer dictionaries
    kmers = set()
    for kmer_vec in seq_vectors.values():
        kmers |= set(kmer_vec.keys())

    kmers_sort = sorted(kmers)

    seq_headers = sorted(seq_vectors.keys())

    X = np.array([[seq_vectors[header].get(kmer,0) for kmer in kmers_sort] for header in seq_headers])
    y = np.array([seq_scores[header] for header in seq_headers])

    if options.whiten:
        y = preprocessing.scale(y)

    ##################################################
    # decide method
    ##################################################
    if options.method.lower() == 'ols':
        model = LinearRegression()

    elif options.method.lower() == 'pls':
        model = PLSRegression(n_components=2)

    elif options.method.lower() == 'ridge':
        if options.alpha:
            # model = Ridge(alpha=options.alpha)
            model = RidgeCV(alphas=[options.alpha], store_cv_values=True)
        else:
            #model = RidgeCV(alphas=[0.0001, 0.0002, 0.0004, 0.0008, .0016, 0.0032, 0.0064, .0128], store_cv_values=True)
            model = RidgeCV(alphas=[0.0004, 0.0008, 0.0016, 0.0032], store_cv_values=True)

    elif options.method.lower() == 'svm':
        if options.alpha:
            svm_c = len(y) / options.alpha
        else:
            svm_c = 100
        if options.epsilon:
            svm_eps = options.epsilon
        else:
            svm_eps = 0.5

        model = SVR(kernel='linear', degree=3, C=svm_c, epsilon=svm_eps)

    elif options.method.lower() == 'gp':
        model = GaussianProcess()

    else:
        print('Method not recognized.', file=sys.stderr)
        exit(1)


    ##################################################
    # learn model
    ##################################################
    model.fit(X, y)

    ss_tot = sum(np.square(y - np.mean(y)))

    if options.method.lower() == 'ridge':
        for i in range(len(model.alphas)):
            score_cv = (1.0 - sum(model.cv_values_[:,i])/ss_tot)
            print('RidgeCV alpha=%.5f score=%f' % (model.alphas[i], score_cv), file=sys.stderr)

    ##################################################
    # cross-validate
    ##################################################
    if options.cv_folds > 0:
        scores = []
        ss_reg = 0

        if options.method.lower() == 'ridge':
            model_cv = Ridge(alpha=model.alpha_)
        else:
            model_cv = copy.copy(model)

        kf = KFold(len(y), n_folds=options.cv_folds, shuffle=True)
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

            # learn on train
            model_cv.fit(X[train], y[train])

            # score on test
            scores.append(model_cv.score(X_test, y_test))

            ss_reg += sum(np.square(y_test - model_cv.predict(X_test)))

        score_cv = 1 - ss_reg / ss_tot


    ##################################################
    # output model information
    ##################################################
    model_out = open(options.output_file, 'w')

    print('Score\t%.3f' % model.score(X, y), file=model_out)
    if options.cv_folds > 0:
        print('ScoreCV\t%.3f' % score_cv, file=model_out)
        if options.method.lower() == 'ridge' and options.alpha:
            score_cv = (1.0 - sum(model.cv_values_)/ss_tot)
            print('ScoreCV\t%.3f' % score_cv, file=model_out)

    for i in range(len(kmers_sort)):
        if options.method.lower() == 'pls':
            coef_i = model.coefs[i]
        else:
            coef_i = model.coef_[i]

        print('%s\t%f' % (kmers_sort[i], coef_i), file=model_out)

    model_out.close()
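
# The dense matrix construction flagged as naive above can also be done with
# scikit-learn's DictVectorizer, sketched here (seq_vectors and seq_headers are
# the names used inside main; this is an aside, not part of the original script):
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)  # dict-of-counts -> dense feature matrix
X_alt = dv.fit_transform([seq_vectors[h] for h in seq_headers])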
コード例 #54
0
import mglearn
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split


X, y = mglearn.datasets.load_extended_boston()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LinearRegression().fit(X_train, y_train)

ridge = Ridge().fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

# Training set score: 0.89
# Test set score: 0.75

ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test)))
# Training set score: 0.79
# Test set score: 0.64

ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge01.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge01.score(X_test, y_test)))
# Training set score: 0.93
コード例 #55
0
X=Salesdf[['perishable', 'item_nbr', 'store_nbr', 'cluster']]
y=Salesdf[["unit_sales"]]

reg=linear_model.LinearRegression()
cv_results=cross_val_score(reg,X_train,y_train,cv=5)
print(cv_results)
print(np.mean(cv_results))
print(np.std(cv_results))
#Using cross validation of score 5


ridge = Ridge(alpha=0.1, normalize = True)
ridge.fit(X_train,y_train)
ridge_pred=ridge.predict(X_test)
ridge.score(X_test,y_test)
#The score is close to that of the linear model built above, which suggests the
#model holds up under ridge regularization.
#Ridge penalizes the loss function by adding alpha times the square of each
#coefficient to the OLS loss.
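
# A minimal sketch of the objective described above (illustration only, not from
# the original script): ridge minimizes the OLS loss plus alpha * ||w||^2.
def ridge_objective(X, y, w, alpha):
    ols_loss = np.sum((y - X @ w) ** 2)   # ordinary least-squares term
    l2_penalty = alpha * np.sum(w ** 2)   # shrinks every coefficient toward zero
    return ols_loss + l2_penalty
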
コード例 #56
0
print("number of test samples:", x_test.shape[0])
print("number of training samples:", x_train.shape[0])

# ### Question 9
# Create and fit a Ridge regression object using the training data, set the regularization parameter to 0.1, and calculate the R^2 using the test data.
#

# In[26]:

from sklearn.linear_model import Ridge

# In[27]:

RidgeModel = Ridge(alpha=0.1)
RidgeModel.fit(x_train, y_train)
RidgeModel.score(x_test, y_test)

# ### Question 10
# Perform a second order polynomial transform on both the training data and testing data. Create and fit a Ridge regression object using the training data, set the regularisation parameter to 0.1, and calculate the R^2 utilising the test data provided. Take a screenshot of your code and the R^2.

# In[28]:

pr = PolynomialFeatures(degree=2)
x_train_pr = pr.fit_transform(x_train[features])
x_test_pr = pr.fit_transform(x_test[features])

RidgeModel = Ridge(alpha=0.1)
RidgeModel.fit(x_train_pr, y_train)
RidgeModel.score(x_test_pr, y_test)

# <p>Once you complete your notebook you will have to share it. Select the icon on the top right a marked in red in the image below, a dialogue box should open, and select the option all&nbsp;content excluding sensitive code cells.</p>
コード例 #57
0
ファイル: HP-RLR.py プロジェクト: WWbigdata902/Kaggle
print("Minimum Error for Ridge Model: ", minimum_error)
print("Minimum Error for Lasso Model: ", minimum_error_lasso)
def ord_to_char(v, p=None):
    return chr(int(v))
    
#Picking up Ridge Model & figuring 10 most useful and 10 least useful parameters for Housing Price Prediction
ridgeReg = Ridge()
ridgeReg.fit(X,Y)
coef = pd.Series(ridgeReg.coef_, index = X.columns)
relevant_Coeff = coef.sort_values().tail(10)
irrelevant_Coeff = coef.sort_values().head(10)

#Plots
plt.figure(figsize=(20,10))
relevant_Coeff.plot(kind = "barh", title="Most Relevant Aspects of a House")

plt.figure(figsize=(20,10))
irrelevant_Coeff.plot(kind = 'barh', title="Least Relevant Aspects of a House")

#Remaining Feature Set
plt.figure(figsize= (50,10))
preds = pd.DataFrame({"Predicted":ridgeReg.predict(X), "true":Y})
preds["Difference"] = preds["true"] - preds["Predicted"]
preds.plot(x = "Predicted", y = "Difference",kind = "scatter", title = "Residual Features")

print (ridgeReg.score(X,Y))

preds = np.expm1(ridgeReg.predict(X_test)) #Exponential function used to balance out log(x + 1) 
solution = pd.DataFrame({"id":test_DF.Id, "SalePrice":preds})
solution.to_csv("ridge_sol.csv", index = False)
コード例 #58
0
    X_final, Y_final = select_Y(final, 19)
    X_final = select_atributes(X_final, vektors[10])
    X_test, Y_test = select_Y(test, 19)
    X_test = select_atributes(X_test, vektors[10])
    poly = preprocessing.PolynomialFeatures(2)
    X_final = poly.fit_transform(X_final)
    X_test = poly.fit_transform(X_test)

    scaler = StandardScaler()
    scaler = scaler.fit(X_final)
    X_final = scaler.transform(X_final)
    X_test = scaler.transform(X_test)

    trained = RDG.fit(X_final, Y_final)
    Y_predict = RDG.predict(X_test)
    print(RDG.score(X_final, Y_final))
    print(RDG.score(X_test, Y_test))
    Y_mean = np.mean(Y_final)
    r2 = mtrcs.r2_score(Y_test, Y_predict)
    mae = mtrcs.mean_absolute_error(Y_test, Y_predict)
    mse = mtrcs.mean_squared_error(Y_test, Y_predict)
    mae_predict = np.mean(np.abs(Y_test - Y_mean))
    mse_predict = np.mean(np.power(np.abs(Y_test - Y_mean), 2))
    msg = "%20s: %10f %10f %10f %10f %10f %10f %10f" % (
        "Testing results r2score,MAE,MSE,MAE diff", r2, mae, mse, mae_predict,
        mse_predict, mae_predict - mae, mse_predict - mse)
    print(msg)
    X_plot = select_atributes(final, vektor)
    for i in range(len(header)):
        print(header[i])
        print(RDG.coef_[0, i + 1])
コード例 #59
0
# train[:,1:] = log10(nbaData[:,1:])

regression = Ridge(alpha=0.05)



kf = KFold(len(train),k=10)
avgResiduSum = 0
avgVar = 0
for tr, e in kf:
    regression.fit(train[tr,1:],train[tr,0])

    avgResiduSum += mean((regression.predict(train[e,1:]) - train[e,0]) ** 2)

    # Explained variance score: 1 is perfect prediction
    avgVar +=regression.score(train[e,1:] , train[e,0])

print('############')
print('Evaluation Phase')
avgResiduSum = avgResiduSum/len(kf)
print("Average Residual sum of squares: %.2f" % avgResiduSum )
avgVar = avgVar/len(kf)
print('Average Variance score: %.2f' % avgVar)

print('############')
print('Testing Phase')
regression.fit(train[:,1:],train[:,0])
print("Residual sum of squares: %.2f"
      % mean((regression.predict(nba15test_scaled[:,1:]) - nba15test_scaled[:,0]) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regression.score(nba15test_scaled[:,1:] , nba15test_scaled[:,0]))
コード例 #60
0
X = df[features]
Y = df['price']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=1)


print("number of test samples:", x_test.shape[0])
print("number of training samples:",x_train.shape[0])

#Question 9
#Create and fit a Ridge regression object using the training data,
#set the regularization parameter to 0.1, and calculate the R^2 using the test data.
from sklearn.linear_model import Ridge

RidgeModel = Ridge(alpha = 0.1)
RidgeModel.fit(x_train, y_train)
RidgeModel.score(x_test, y_test)

#Question 10
#Perform a second order polynomial transform on both the training data and testing data. 
#Create and fit a Ridge regression object using the training data, set the regularisation parameter to 0.1, 
#and calculate the R^2 utilising the test data provided. Take a screenshot of your code and the R^2.

SecondOrderPolynomialTransform = PolynomialFeatures(degree=2)
x_train_transformed = SecondOrderPolynomialTransform.fit_transform(x_train)
x_test_transformed = SecondOrderPolynomialTransform.fit_transform(x_test)

NewRidgeModel = Ridge(alpha = 0.1)
NewRidgeModel.fit(x_train_transformed, y_train)
NewRidgeModel.score(x_test_transformed, y_test)