Code example #1
File: cv_trainer.py Project: cypressf/robo-cv
def find_best_fit(bagfiles):
    '''Crop ROS bag files at beginning and end to only use the portion
    in which the robot is moving.
    Fit the Ridge regression to these cropped bag files.
    '''
    clf = Ridge(alpha=1.0)  # TODO: auto-calibrate alpha (it's easy using a scikit-learn one-liner)
    image_data = []
    cmd_vel_data = []
    for bagfile_path in bagfiles:
        most_recent_cmd_vel = None
        bag = rosbag.Bag(bagfile_path)
        for topic, msg, t in bag.read_messages(topics=['/camera/image_raw/compressed', '/cmd_vel']):
            if topic == "/cmd_vel" and ((most_recent_cmd_vel is not None) or msg.linear.x>0):
                if most_recent_cmd_vel is None and msg.linear.x > 0:
                    most_recent_cmd_vel = msg
                elif most_recent_cmd_vel is not None and msg.linear.x == 0 and msg.angular.z == 0:
                    most_recent_cmd_vel = None
            elif topic == "/camera/image_raw/compressed" and most_recent_cmd_vel is not None:
                np_arr = np.frombuffer(msg.data, np.uint8)  # np.fromstring is deprecated for binary data
                cv_image = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)  # modern name for cv2.CV_LOAD_IMAGE_COLOR
                image_data.append(extract_data(cv_image))
                cmd_vel_data.append(twist_to_nparray(most_recent_cmd_vel))

    clf.fit(image_data, cmd_vel_data)
    return clf
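The TODO above asks for auto-calibrated alpha; a minimal sketch of that scikit-learn one-liner using RidgeCV (the alpha grid is illustrative, not from the original project):

from sklearn.linear_model import RidgeCV

# Drop-in replacement for Ridge(alpha=1.0); selects alpha by built-in cross-validation
clf = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0])
clf.fit(image_data, cmd_vel_data)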
Code example #2
File: test_brr.py Project: GunnarEcon/fancyimpute
def test_brr_like_sklearn():
    n = 10000
    d = 10
    sigma_sqr = 5
    X = np.random.randn(n, d)
    beta_true = np.random.random(d)
    y = np.dot(X, beta_true) + np.sqrt(sigma_sqr) * np.random.randn(n)
    X_tr = X[:n // 2, :]
    y_tr = y[:n // 2]
    X_ts = X[n // 2:, :]
    #  y_ts = y[n // 2:]

    # prediction with my own bayesian ridge
    lambda_reg = 1
    brr = BayesianRidgeRegression(lambda_reg,
                                  add_ones=True,
                                  normalize_lambda=False)
    brr.fit(X_tr, y_tr)
    y_ts_brr = brr.predict(X_ts)

    # let's compare to scikit-learn's ridge regression
    rr = Ridge(lambda_reg)
    rr.fit(X_tr, y_tr)
    y_ts_rr = rr.predict(X_ts)

    assert np.mean(np.abs(y_ts_brr - y_ts_rr)) < 0.001, \
        "Predictions are different from sklearn's ridge regression."
Code example #3
File: test_sag.py Project: AlexisMignon/scikit-learn
def test_sag_regressor_computed_correctly():
    """tests if the sag regressor is computed correctly"""
    alpha = .1
    n_features = 10
    n_samples = 40
    max_iter = 50
    tol = .000001
    fit_intercept = True
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    y = np.dot(X, w) + 2.
    step_size = get_step_size(X, alpha, fit_intercept, classification=False)

    clf1 = Ridge(fit_intercept=fit_intercept, tol=tol, solver='sag',
                 alpha=alpha * n_samples, max_iter=max_iter)
    clf2 = clone(clf1)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    spweights1, spintercept1 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=max_iter,
                                          dloss=squared_dloss,
                                          fit_intercept=fit_intercept)

    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=max_iter,
                                          dloss=squared_dloss, sparse=True,
                                          fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(),
                              spweights1.ravel(),
                              decimal=3)
    assert_almost_equal(clf1.intercept_, spintercept1, decimal=1)
Code example #4
def reg_skl_ridge(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    ridge = Ridge(alpha=param["alpha"], normalize=True)
    ridge.fit(X_tr, y_reg_tr)
    pred = ridge.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
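Note: the normalize parameter used here was deprecated in scikit-learn 1.0 and removed in 1.2. A sketch of the recommended replacement, a StandardScaler pipeline (not numerically identical, since normalize scaled by per-feature norms):

from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scaling now happens inside the estimator pipeline instead of via normalize=True
ridge = make_pipeline(StandardScaler(), Ridge(alpha=param["alpha"]))
ridge.fit(X_tr, y_reg_tr)
pred = ridge.predict(X_cv)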
Code example #5
    def fit(self, X, Y, weights=None, context_transform=True):
        """ Trains policy by weighted maximum likelihood.

        .. note:: This call changes this policy (self)

        Parameters
        ----------
        X: array-like, shape (n_samples, context_dims)
            Context vectors

        Y: array-like, shape (n_samples, weight_dims)
            Low-level policy parameter vectors

        weights: array-like, shape (n_samples,)
            Weights of individual samples (should depend on the obtained
            reward)
        """
        # Kernel approximation
        self.nystroem = Nystroem(
            kernel=self.kernel,
            gamma=self.gamma,
            coef0=self.coef0,
            n_components=np.minimum(X.shape[0], self.n_components),
            random_state=self.random_state,
        )
        self.X = self.nystroem.fit_transform(X)
        if self.bias:
            self.X = np.hstack((self.X, np.ones((self.X.shape[0], 1))))
        if self.normalize:
            self.X /= np.abs(self.X).sum(1)[:, None]

        # Standard ridge regression
        ridge = Ridge(alpha=self.alpha, fit_intercept=False)
        ridge.fit(self.X, Y, sample_weight=weights)
        self.W = ridge.coef_
Code example #6
File: test_sag.py Project: AlexisMignon/scikit-learn
def test_regressor_matching():
    n_samples = 10
    n_features = 5

    rng = np.random.RandomState(10)
    X = rng.normal(size=(n_samples, n_features))
    true_w = rng.normal(size=n_features)
    y = X.dot(true_w)

    alpha = 1.
    n_iter = 100
    fit_intercept = True

    step_size = get_step_size(X, alpha, fit_intercept, classification=False)
    clf = Ridge(fit_intercept=fit_intercept, tol=.00000000001, solver='sag',
                alpha=alpha * n_samples, max_iter=n_iter)
    clf.fit(X, y)

    weights1, intercept1 = sag_sparse(X, y, step_size, alpha, n_iter=n_iter,
                                      dloss=squared_dloss,
                                      fit_intercept=fit_intercept)
    weights2, intercept2 = sag(X, y, step_size, alpha, n_iter=n_iter,
                               dloss=squared_dloss,
                               fit_intercept=fit_intercept)

    assert_array_almost_equal(weights1, clf.coef_, decimal=10)
    assert_array_almost_equal(intercept1, clf.intercept_, decimal=10)
    assert_array_almost_equal(weights2, clf.coef_, decimal=10)
    assert_array_almost_equal(intercept2, clf.intercept_, decimal=10)
Code example #7
File: test_sag.py Project: AlexisMignon/scikit-learn
def test_sag_pobj_matches_ridge_regression():
    """tests if the sag pobj matches ridge reg"""
    n_samples = 100
    n_features = 10
    alpha = 1.0
    n_iter = 100
    fit_intercept = False
    rng = np.random.RandomState(10)
    X = rng.normal(size=(n_samples, n_features))
    true_w = rng.normal(size=n_features)
    y = X.dot(true_w)

    clf1 = Ridge(fit_intercept=fit_intercept, tol=.00000000001, solver='sag',
                 alpha=alpha, max_iter=n_iter, random_state=42)
    clf2 = clone(clf1)
    clf3 = Ridge(fit_intercept=fit_intercept, tol=.00001, solver='lsqr',
                 alpha=alpha, max_iter=n_iter, random_state=42)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)
    clf3.fit(X, y)

    pobj1 = get_pobj(clf1.coef_, alpha, X, y, squared_loss)
    pobj2 = get_pobj(clf2.coef_, alpha, X, y, squared_loss)
    pobj3 = get_pobj(clf3.coef_, alpha, X, y, squared_loss)

    assert_array_almost_equal(pobj1, pobj2, decimal=4)
    assert_array_almost_equal(pobj1, pobj3, decimal=4)
    assert_array_almost_equal(pobj3, pobj2, decimal=4)
Code example #8
File: scorer.py Project: jonpiffle/amr_discourse
class OrderScorer(Scorer):

    def __init__(self):
        self.classifier = Ridge(alpha=0.1)
        self.cache_filename = 'subgraph_order_scorer_reg.pickle'

    def train(self, train_instances, train_labels, update_cache=True,
              sample_weight=None):
        """
        Trains a scorer to score the quality of an ordering of sentences
        Loads from cache if available
        """
        self.classifier.fit(train_instances, train_labels, sample_weight=sample_weight)
        if update_cache:
            pickle.dump(self.classifier, open(self.cache_filename, 'wb'))

    def test(self, test_instances, test_labels):
        """ Uses test set to evaluate the performance of the scorer and print it out """
        scores = self.classifier.predict(test_instances)
        # TODO: print report

    def load(self):
        if os.path.exists(self.cache_filename):
            self.classifier = pickle.load(open(self.cache_filename, 'rb'))
        else:
            raise Exception("No classifier exists! Must call train with update_cache=True") 

    def evaluate(self, test_instance):
        """ Applies the scoring function to a given test instance """
        return self.classifier.predict([test_instance])[0]
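The test method above leaves its report as a TODO; a minimal sketch of one way to fill it in, assuming the usual regression metrics are what is wanted (the original does not specify the report format):

from sklearn.metrics import mean_squared_error, r2_score  # at module top

def test(self, test_instances, test_labels):
    """ Uses test set to evaluate the performance of the scorer and print it out """
    scores = self.classifier.predict(test_instances)
    print("MSE: %.4f" % mean_squared_error(test_labels, scores))
    print("R^2: %.4f" % r2_score(test_labels, scores))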
Code example #9
    def _make_forecast(self, model, name, alpha=None, l1_ratio=None):
        """
        Output: DataFrame

        Train on the holdout set and make predictions for the next week
        """
        X_hold = self.hold_set[self.hold_set.columns[1:]]
        if 'lyft' in self.filename:
            y_hold = self.hold_set['avg_est_price']
        else:
            y_hold = self.hold_set['avg_price_est']
        if name.split("_")[0] == "ridgecv":
            model = Ridge(alpha=alpha)
        elif name.split("_")[0] == "lassocv":
            model = Lasso(alpha=alpha)
        elif name.split("_")[0] == "elasticnetcv":
            model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        model.fit(X_hold, y_hold)
        self.X_forecast = X_hold.copy()
        # assumes weekofyear is increasing
        self.X_forecast['weekofyear'] = self.X_forecast['weekofyear'].apply(lambda x: x+1)
        self.X_forecast.index = self.X_forecast.index + pd.Timedelta(days=7)
        self.y_forecast = model.predict(self.X_forecast)
        self.y_forecast = pd.DataFrame(self.y_forecast, index=self.X_forecast.index, columns=['y_forecast'])
        self.y_forecast = pd.concat([self.X_forecast, self.y_forecast], axis=1)
        saved_filename = "rideshare_app/data/{}_forecast.csv".format(name)
        self.y_forecast.to_csv(saved_filename)
        print "saved prediction values to {}".format(saved_filename)
Code example #10
def training(X,Y,X_test, pca='kpca', regressor='ridge', dim=50):
    # X and Y are numpy arrays
    print 'Input data and label shape: ', X.shape, Y.shape

    if pca == 'nopca': return simpleTraining(X, Y, X_test, regressor)

    model, P = getProjectionMatrixPCA(Y, dim) if pca=='pca' else getProjectionMatrixKPCA(dim)
    Y_train = np.dot(Y, P) if pca=='kpca' else np.dot(Y,P.transpose())


    regressors = []
    for i in range(dim):
        print 'at regressor number: ', i
        reg = Ridge() if regressor=='ridge' else SVR()
        y = [x[i] for x in Y_train]
        reg.fit(X, y)
        regressors.append(reg)

    Z_pred = []
    for reg in regressors:
        Z_pred.append(reg.predict(X_test))
    print 'prediction shapes:' , len(Z_pred), len(Z_pred[0])
    Z_pred = np.array(Z_pred)
    Y_pred = np.dot(P, Z_pred).transpose() if pca=='kpca' else np.dot(Z_pred.transpose(), P)
    return model, regressors, Y_pred
Code example #11
File: estimators.py Project: Patechoc/labs-untested
class LogisticRegressionSeparator(BaseEstimator):

    def get_params(self, deep=True):
        return {}

    def fit(self, X, y):
        # lets predict which users will spend anything later
        classes = y - X[:, 0]
        classes = np.where(classes > 0.1, 1, 0)

        self.classifier = LogisticRegression(
                class_weight='balanced')

        self.classifier.fit(X, classes)
        results = self.classifier.predict(X)
        results = results == 1

        self.estimator = Ridge(alpha=0.05)
        self.estimator.fit(X[results], y[results])

    def predict(self, X):
        y = X[:,0].reshape(X.shape[0])
        labels = (self.classifier.predict(X) == 1)
        y[labels] = self.estimator.predict(X[labels])
        return y
Code example #12
File: scikit.py Project: niangaotuantuan/webmining
def train_single_model(train_data, train_labels, algo):
	"""
	Train the model for a single label dimension
	"""
	if algo == 'svr_rbf':
		"""
		SVM regression, RBF kernel
		"""
		svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
		svr_rbf.fit(train_data, train_labels)
		return svr_rbf

	if algo == 'svr_lin':
		"""
		SVM regression, linear
		"""
		svr_lin = SVR(kernel='linear')
		svr_lin.fit(train_data, train_labels)
		return svr_lin

	if algo == 'ridge':
		"""
		Ridge regression
		"""
		clf = Ridge(alpha = 0.5)
		clf.fit(train_data, train_labels)
		return clf

	# No matching algorithm
	print "unimplemented model type"
	return None
Code example #13
 def regression_weight(self, matched_data):
     converted_data = {}
     for i, data in enumerate(matched_data):
         if i==0:
             for key in data.keys():
                 try:
                     value = float(data[key])
                     converted_data[key] = [value]
                 except ValueError:
                     pass
         else:
             for key in data.keys():
                 if key in converted_data:
                     converted_data[key].append(float(data[key]))
     sorted_key = sorted(converted_data.keys())
     input_key = [key for key in sorted_key if key != self.main_key.lower()]
     x = [] 
     for key in input_key:
         # normalization
         numpy_data = normalization(np.array(converted_data[key]))      
         x.append(numpy_data)
     x = np.array(x).T
     y = normalization(np.array(converted_data[self.main_key.lower()]))
     regressor = Ridge(alpha=1.0, normalize=True)
     regressor.fit(x,y)
     sorted_result = np.array(input_key)[np.argsort(np.array(regressor.coef_))]
     sorted_result = sorted_result[::-1]
     coefficient = sorted(regressor.coef_, reverse = True)
     return [(sorted_result[i], coefficient[i]) for i in range(len(sorted_result))]
Code example #14
def ridge_regression(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
   :param train_x: train
   :param train_y: text
   :param pred_x: test set to predict
   :param review_id: takes in a review id
   :param v_curve: run the model for validation curve
   :param l_curve: run the model for learning curve
   :param get_model: run the model
   :return:the predicted values,learning curve, validation curve
   """
    lin = Ridge(alpha=0.5)
    if get_model:
        print "Fitting Ridge..."
        lin.fit(train_x, np.log(train_y+1))
        gbr_pred = np.exp(lin.predict(pred_x))- 1
        for i in range(len(gbr_pred)):
            if gbr_pred[i] < 0:
                gbr_pred[i] = 0
        Votes = gbr_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        submission_lin= np.concatenate((Id,Votes),axis=1)
        np.savetxt("submission_ridge.csv", submission_lin,header="Id,Votes", delimiter=',',fmt="%s, %0.2f", comments='')
    if v_curve:
        print "Working on Validation Curves"
        plot_validation_curve(Ridge(), "Validation Curve for Ridge Regression", train_x, np.log(train_y+1.0),
                              param_name="alpha", param_range=[0.1,0.2,0.5,1,10])
    if l_curve:
        print "Working on Learning Curves"
        plot_learning_curve(Ridge(), "Learning Curve for Ridge Regression", train_x, np.log(train_y+1.0))
Code example #15
def ridgeRegression(X,y):

    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Ridge Regression")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)

    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    ridgeRegression = Ridge(alpha=1e-11,solver="cholesky")
    ridgeRegression.fit(scaled_Xp,y)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0,2,0.01)
    dummyX = dummyX.reshape((dummyX.shape[0],1))
    dummyXp = polynomialFeatures.fit_transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = ridgeRegression.predict(scaled_dummyXp)

    outputFILE = 'plot-ridgeRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h = 6.0, w = 10.0)
    ax.axis([0,2,0,15])
    ax.scatter(X,y,color="black",s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
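The PolynomialFeatures -> StandardScaler -> Ridge sequence above can also be written as a single scikit-learn pipeline; a sketch with the same hyperparameters:

from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

model = make_pipeline(
    PolynomialFeatures(degree=40, include_bias=False),
    StandardScaler(),
    Ridge(alpha=1e-11, solver="cholesky"),
)
model.fit(X, y)
dummyY = model.predict(dummyX)  # transforms are re-applied automatically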
Code example #16
File: knn.py Project: jefftn/kaggle-twitter
def knn_twice(k):
	knn1 = neighbors.KNeighborsRegressor(n_neighbors=k)
	knn1.fit(trainf,trainlab)
	print 'here'
	tim = time.time();

	n = len(train)/1000
	pred1 = []
	for i in range(0,n):
		pred1.extend(knn1.predict(trainf[(i*1000):((i+1)*(1000))]))
		print(i)
	pred1.extend(knn1.predict(trainf[67000:67946]))
	print "time: " + str(time.time() - tim)
	#knn = neighbors.KNeighborsRegressor(n_neighbors=k)
	#knn.fit(pred1,trainlab)
	ridge = Ridge(alpha=1.0)
	ridge.fit(pred1, trainlab)

	n = 10
	pred2 = []
	for i in range(0,n):
		pred2.extend(knn1.predict(testf[(i*1000):((i+1)*(1000))].toarray()))
		print(i)	

	n = 10
	pred = []
	for i in range(0,n):
		pred.extend(ridge.predict(pred2[(i*1000):((i+1)*(1000))]))
		print(i)	

	#RMSE:
	testlab = np.array(test.ix[:,4:])
	err = format(np.sqrt(np.sum(np.array(np.array(pred-testlab)**2)/ (testf.shape[0]*24.0))))
	return err
Code example #17
def forecast_future_attention(train_index, test_index, alpha):
    """Forecast future attention via train dataset index and test dataset index."""
    m, n = len(train_index), len(test_index)
    x_train_predict = attention_data[train_index, :num_train]
    x_test_predict = attention_data[test_index, :num_train]
    for i in xrange(num_train, age):
        if with_share == 1:
            x_train = np.hstack((x_train_predict, share_data[train_index, :i + 1]))
            x_test = np.hstack((x_test_predict, share_data[test_index, :i + 1]))
            norm = np.hstack((x_train[:, :i], attention_data[train_index, i].reshape(m, 1), share_data[train_index, :i + 1]))
        else:
            x_train = x_train_predict
            x_test = x_test_predict
            norm = np.hstack((x_train[:, :i], attention_data[train_index, i].reshape(m, 1)))
        x_train_norm = x_train / np.sum(norm, axis=1)[:, None]
        y_train = np.ones(m, )

        # == == == == == == == == Training with Ridge Regression == == == == == == == == #
        predictor = Ridge(fit_intercept=False, alpha=alpha)
        predictor.fit(x_train_norm, y_train)

        # == == == == == == == == Iteratively add forecasted value to x matrix == == == == == == == == #
        predict_train_value = (predictor.predict(x_train) - np.sum(x_train, axis=1)).reshape(m, 1)
        predict_train_value[predict_train_value < 0] = 0
        x_train_predict = np.hstack((x_train_predict, predict_train_value))
        predict_test_value = (predictor.predict(x_test) - np.sum(x_test, axis=1)).reshape(n, 1)
        predict_test_value[predict_test_value < 0] = 0
        x_test_predict = np.hstack((x_test_predict, predict_test_value))
    return x_test_predict[:, num_train: age]
Code example #18
File: resourcesModel.py Project: pkravik/kaggle
def bowFitAndPrediction(predictData, textSeries, outcome,typeModel='binary'):
    print "Bag of words for %s" % (textSeries.name)
    
    if typeModel == 'continuous':
        bowModel = Ridge(alpha = 0.001)
    else:
        bowModel = LogisticRegression(penalty='l2',dual=False,tol=0.0001,fit_intercept=True, C=1, intercept_scaling=1, class_weight=None, random_state=423) 
    
    
    vectorizer = getFeatures(textSeries)
    
    X_train = vectorizer.transform(predictData)
        
    #Outcomes
    Y_train = outcome
    
    #Logistic regression, not sure if best
    bowModel.fit(X_train,Y_train)
    
    #Comment out later, fitting on CV data
    
    if typeModel == 'continuous':
        predict = bowModel.predict(X_train)
        yhat = predict
    else:
        predict = bowModel.predict_proba(X_train)
        yhat = predict[:,1]
    
    
    return (yhat, vectorizer, bowModel)
Code example #19
File: ridge1.py Project: daxiongshu/bnp
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=0
         
        for j in range(m):
            clf=xgb_classifier(eta=0.05,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=500,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred*(1.0/(j+1))

            print j,llfun(y_test_cv,yqq)

        #y_pred/=m;
        clf=Ridge()#RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv))
        y_pred=clf.predict(X_test_cv)
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print xx[-1]#,y_pred.shape
        break

    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred,yreal,idx#np.mean(xx)
Code example #20
File: RidgePredict.py Project: BrandynD/CyVerse
 def RidgeRegression(self, filename, outputFile):
     pheno, geno = self.inputParse(filename)
     for row in geno:
         if len(row) % 2 != 0:
             return "Rows are not even."
     maxGeno = max(geno)
     allGeno = list(set(maxGeno))
     encoder = [i for i in range(len(allGeno))]
     lengthGeno = len(geno)
     length = len(geno)
     lenInnerGeno = len(geno[0])
     genoMake = [0 for x in range(len(allGeno))]
     dictionary = dict(zip(allGeno, encoder))
     for i in range(length):
         for x in range(lenInnerGeno):
             geno[i][x] = dictionary[geno[i][x]]
     phenoNaN = []
     for i in range(len(pheno)):
         if pheno[i] == 'NaN':
             phenoNaN.append(i)
     phenoNaN.reverse()
     for i in phenoNaN:
         del pheno[i]
     genoMiss = []
     for i in range(len(geno)):
         if i not in phenoNaN:
             genoMiss.append(geno[i])
     pheno = [float(i) for i in pheno]
     alpha = self.alphaOptimization(genoMiss, pheno)
     clf = Ridge(alpha=alpha)
     clf.fit(genoMiss, pheno)
     predicted = clf.predict(geno)
     predicted = np.transpose(predicted)
     np.savetxt(outputFile, np.transpose(predicted))
Code example #21
def cross_valid(X,Y,n_fold):
	clf = Ridge(alpha=1.0)
	total_mean_square = 0
	total_coef = 0
	Y_np = np.array(Y)
	n_samples, n_features = len(X), len(X[0])
	kf_Y = cross_validation.KFold(n_samples, n_fold)
	index = []
	preds = []
	truths = []
	for train_index, test_index in kf_Y:
		X_train, X_test = X[train_index], X[test_index]
		y_train, y_test = Y_np[train_index], Y_np[test_index]
		

		clf.fit(X_train,y_train)
		y_pred = clf.predict(X_test)
		index += test_index.tolist()
		preds += map(lambda x: 1 if x > 0.5 else 0 ,y_pred.tolist())
		truths += y_test.tolist()
		#print "predict:",map(lambda x: 1 if x > 0.5 else 0,y_pred)
		#print "original:",y_test

		total_mean_square += mean_squared_error(y_test,y_pred) 
		total_coef += clf.coef_
	
		#print 'Coefficient of the prediction (pearsonr): ' , pearsonr(y_pred,y_test) 
	print 'All Coefficient of the prediction (pearsonr): ' , pearsonr(truths,preds) 
	print 'Average mean squared error is: ' , total_mean_square / n_fold

	diff_count = sum([abs(truth - pred) for truth, pred in zip(truths, preds)])
	acc =  100-1.* diff_count/len(truths)*100
	print 'prediction accuracy is %f'%(acc)
	return [total_coef, index , preds]
Code example #22
File: strfs.py Project: choldgraf/LaSP
def fit_strf_ridge(input, output, lags, alpha=1.0, verbose=False):

    #convert the input into a toeplitz-like matrix
    if verbose:
        nt,nf = input.shape
        nelems = nt*nf*len(lags)
        mem = (nelems*8.) / 1024.**2
        print '[fit_strf_ridge] estimated size of toeplitz matrix: %d MB' % mem
    stime = time.time()
    A = make_toeplitz(input, lags, include_bias=False)
    etime = time.time() - stime
    if verbose:
        print '[fit_strf_ridge] Time to make Toeplitz matrix: %d seconds' % etime

    #fit the STRF
    stime = time.time()

    #rr = Ridge(alpha=alpha, copy_X=False, fit_intercept=True)
    rr = Ridge(alpha=alpha, fit_intercept=True)
    rr.fit(A, output)
    etime = time.time() - stime
    if verbose:
        print '[fit_strf_ridge] Time to fit STRF: %d seconds' % etime

    #reshape the STRF so that it makes sense
    nt = input.shape[0]
    nf = input.shape[1]
    d = len(lags)
    strf = np.array(rr.coef_).reshape([nf, d])
    bias = rr.intercept_

    return strf,bias
Code example #23
File: models.py Project: nhu2000/PriceMyRental
def ridge_regressor(df):
    """
    INPUT: Pandas dataframe
    OUTPUT: R^2 and Mean Absolute Error performance metrics, feature coefficients
    """
    y = df.pop("price").values
    X = df.values
    feature_names = df.columns
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)

    clf = Ridge(alpha=1.0)
    clf.fit(xtrain, ytrain)

    score = clf.score(xtest, ytest)
    feat_imps = clf.coef_
    ypredict = clf.predict(xtest)
    mae = np.mean(np.absolute(ytest - ypredict))
    mae_percent = np.mean(np.absolute(ytest - ypredict) / ytest)
    return (
        "R^2 is ",
        score,
        "MAE is ",
        mae,
        "MAE percent is ",
        mae_percent,
        "Feature coefficients are ",
        zip(feature_names, feat_imps),
    )
Code example #24
def Ridge_model(train_linear, test_linear):
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(train_linear_fea, train_linear_tar)
    ridgecv_score = ridgecv.score(train_linear_fea, train_linear_tar)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge =Ridge(normalize = True)
    ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000)
    #ridge.set_params(alpha=6,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    write_pkl(ridgecv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/ridge_params.pkl')
    return test_prediction_ridge
    
    
Code example #25
File: ransac.py Project: Ambier/arsenal
class RidgeRegressionModel(LinearLeastSquaresModel):
    def __init__(self, input_columns, output_columns, debug=False):

        self.alpha = 0.0000000001
        self.m = Ridge(alpha=self.alpha)

        super(RidgeRegressionModel, self).__init__(input_columns, output_columns, debug=debug)

    def fit(self, data):
        A = numpy.vstack([data[:,i] for i in self.input_columns]).T
        B = numpy.vstack([data[:,i] for i in self.output_columns]).T

        self.m.fit(A, B)

        return self.m.coef_   #m.intercept_

    def get_error(self, data, model):
        A = numpy.vstack([data[:,i] for i in self.input_columns]).T
        B = numpy.vstack([data[:,i] for i in self.output_columns]).T
        B_fit = scipy.dot(A, model)
        err_per_point = numpy.sum((B-B_fit)**2, axis=1) # sum squared error per row

        norm = numpy.sqrt(model*model)
        assert norm.shape == (1,1)
        regularizer = 1.0*norm[0,0]

        return err_per_point - regularizer
Code example #26
File: problem5.py Project: eprym/EE-239AS
def ridge_regression(data,target,alphas):
    plt.figure()
    mean_rmses=[]
    kf=KFold(len(target),10,True,None)
    for alpha0 in alphas:
        rmses=[]
        clf=Ridge(alpha=alpha0,normalize=True,solver='svd')
        for train_index, test_index in kf:
            data_train,data_test=data[train_index],data[test_index]
            target_train,target_test=target[train_index],target[test_index]
            clf.fit(data_train,target_train)
            rmse=sqrt(np.mean((clf.predict(data_test)-target_test)**2))
            rmses.append(rmse)
            
        mean_rmses.append(np.mean(rmses))
        x0=np.arange(1,11)
        plt.plot(x0,rmses,label='alpha='+str(alpha0),marker='o')
        
    lr = linear_model.LinearRegression(normalize = True)
    rmses = []
    for train_index, test_index in kf:
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = target[train_index], target[test_index]
        lr.fit(data_train, target_train)
        rmse = sqrt(np.mean((lr.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
    mean_rmses.append(np.mean(rmses))
    x0=np.arange(1,11)
    plt.plot(x0,rmses,label='linear',marker='*')
    
    plt.title("RMSE comparison between different alpha values of Ridge regularization")
    plt.legend()
    plt.show()
#    print(mean_rmses)
    return mean_rmses
Code example #27
File: model.py Project: nOkuda/reviewlabeljob
def _check_ridge_model(featureses, labels):
    """Plot ridge regression predictions"""
    for tfidf_count in FEATURES_SIZES:
        test_points = []
        for i in range(16):
            tmp = [i, 100]
            tmptmp = [0] * tfidf_count
            if tmptmp:
                tmp.extend(tmptmp)
            test_points.append(tmp)
        test_points = np.array(test_points)
        limit = tfidf_count + 2
        model = Ridge()
        model.fit(featureses[:, :limit], labels)
        predictions = model.predict(test_points)
        plt.plot(
            predictions,
            label=str(tfidf_count),
            linestyle=next(LINECYCLER),
            linewidth=3)
        # plt.text(test_points[-1, 0], predictions[-1], str(tfidf_count))
    plt.legend()
    plt.xlabel('Document order')
    plt.ylabel('Time (seconds)')
    plt.savefig('ridge_predictions.pdf')
Code example #28
File: onlinelearn.py Project: imclab/predictimdb
def traverse_movies_ridge():
	LBMAP = getLBMap()
	DMAP = createEmpty()

	P_ERRORS, ERRORS = [], []

	training_data, training_response = [], []

	for i in range(len(data)):

		movie = data[i]
		m_rev = movie['revenue']

		myvector = vectorizeMovie(movie, LBMAP, DMAP)

		if i > 100:
			model = Ridge(alpha = .5)
			model.fit(training_data, training_response)
			raw = math.fabs(model.predict(myvector) - m_rev)
			ERRORS.append(raw)
			#P_ERRORS.append(round(raw/m_rev, 4))
		
		training_data.append(myvector)
		training_response.append(m_rev)

		DMAP = update(movie, DMAP)

	#print 'all', avg_float_list(P_ERRORS)
	print 'all', avg_float_list(ERRORS)
Code example #29
def compute_linear_model(mfs, measures):
    from sklearn.linear_model import Ridge
    from sklearn import linear_model

    # try different ones
    clf = Ridge(alpha = 1.0)
    #clf = RidgeCV(alphas=[0.1, 1.0, 10.0])
    #clf = linear_model.LinearRegression()

    # explain fexp using BMD + the MFS data
    fexp = measures[:, measures.shape[1]-1]

    bmd = measures[:, 0]
    bmd = bmd.reshape((bmd.shape[0], 1))

    #print "BMD: ", bmd
    #print "FEXP: ", fexp
    #print "MFS; ", mfs

    #PCA
    #from sklearn.decomposition import PCA
    #pca = PCA(n_components=12)
    #pca.fit(mfs)
    #mfs_pca = pca.transform(mfs)

    X = np.hstack((bmd, mfs))
    clf.fit(X, fexp)

    # Results
    #print "Coefs:", clf.coef_
    print "Score (R^2):", clf.score(X, fexp)
Code example #30
File: scratch.py Project: mdozmorov/genomerunner_web
def impute_age():
    X, P = gfa.platform_expression("GPL96")
    model = impute.KNNImputer()
    Xi = model.fit_transform(X, axis=1)

    age = array(P["age"].tolist())
    Xm = Xi.as_matrix()
    ix = array((age >= 10) & (age <= 120)).nonzero()[0]
    np.random.shuffle(ix)
    Xm = Xm[ix, :]
    age = age[ix]

    n_train = 2000
    n_test = 500
    # clf = SVR(C=1e-5, epsilon=1)
    # clf = LinearRegression()
    clf = Ridge()
    # clf = SimpleRegressor()
    # clf = Lasso()
    clf.fit(Xm[:n_train, :], age[:n_train])
    y = age[n_train : (n_train + n_test)]
    y_hat = clf.predict(Xm[n_train : (n_train + n_test)])
    dy = y - y_hat

    bias_tr = y_hat.mean() - age.mean()
    print("\nBias (vs train):\t\t", bias_tr)
    print("Bias (vs test):\t\t\t", dy.mean())
    print("Mean error:\t\t\t", fabs(dy).mean())
    print("Mean error (bias corrected):\t", fabs(dy - bias_tr).mean())
    print("MSE:\t\t\t\t", np.power(dy, 2).mean())
Code example #31
    mask[(points[0]).astype(np.int), (points[1]).astype(np.int)] = 1
    mask = ndimage.gaussian_filter(mask, sigma=l / n_pts)
    res = np.logical_and(mask > mask.mean(), mask_outer)
    return np.logical_xor(res, ndimage.binary_erosion(res))


# Generate synthetic images, and projections
l = 128
proj_operator = build_projection_operator(l, l // 7)
data = generate_synthetic_data()
proj = proj_operator * data.ravel()[:, np.newaxis]
proj += 0.15 * np.random.randn(*proj.shape)

# Reconstruction with L2 (Ridge) penalization
rgr_ridge = Ridge(alpha=0.2)
rgr_ridge.fit(proj_operator, proj.ravel())
rec_l2 = rgr_ridge.coef_.reshape(l, l)

# Reconstruction with L1 (Lasso) penalization
# the best value of alpha was determined using cross validation
# with LassoCV
rgr_lasso = Lasso(alpha=0.001)
rgr_lasso.fit(proj_operator, proj.ravel())
rec_l1 = rgr_lasso.coef_.reshape(l, l)

plt.figure(figsize=(8, 3.3))
plt.subplot(131)
plt.imshow(data, cmap=plt.cm.gray, interpolation='nearest')
plt.axis('off')
plt.title('original image')
plt.subplot(132)
Code example #32
house_lasso_reg = Lasso(alpha=1.0, max_iter=100000, normalize=True, tol=0.0001)
house_lasso_reg = house_lasso_reg.fit(house_train.drop("SalePrice", axis=1),
                                      house_train["SalePrice"])
# Predict using the model
house_lasso_pred = house_lasso_reg.predict(house_test.drop("SalePrice",
                                                           axis=1))

# In[ ]:

#L2 (Ridge) Regularization
house_ridge_reg = Ridge(alpha=1.0,
                        max_iter=100000,
                        normalize=True,
                        solver='lsqr',
                        tol=0.001)
house_ridge_reg = house_ridge_reg.fit(house_train.drop("SalePrice", axis=1),
                                      house_train["SalePrice"])
#Predict using the model
house_ridge_pred = house_ridge_reg.predict(house_test.drop("SalePrice",
                                                           axis=1))

#####################
###Cross-Validation Ridge
house_ridge_CV_reg = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0),
                             normalize=True,
                             cv=10)
house_ridge_CV_reg = house_ridge_CV_reg.fit(
    house_train.drop("SalePrice", axis=1), house_train["SalePrice"])
# Predict using the model
house_ridge_CV_reg_pred = house_ridge_CV_reg.predict(
    house_test.drop("SalePrice", axis=1))
Code example #33
    scores_mse_ridge_scikit_train = []
    scores_r2_ridge_scikit_train = []
    scores_mse_ridge_scikit_val = []
    scores_r2_ridge_scikit_val = []
    scores_mse_ridge_scikit_test = []
    scores_r2_ridge_scikit_test = []

    alphas = [1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0]

    for alpha in alphas:

        # Initialize scikit-learn ridge regression model
        model_ridge_scikit = RidgeRegression(alpha=alpha)

        # Trains scikit-learn ridge regression model
        model_ridge_scikit.fit(x_poly_train, y_train)

        print('Results for scikit-learn RidgeRegression model with alpha={}'.
              format(alpha))

        # Test model on training set
        score_mse_ridge_scikit_train = score_mean_squared_error(
            model_ridge_scikit, x_poly_train, y_train)
        print('Training set mean squared error: {:.4f}'.format(
            score_mse_ridge_scikit_train))

        score_r2_ridge_scikit_train = model_ridge_scikit.score(
            x_poly_train, y_train)
        print('Training set r-squared scores: {:.4f}'.format(
            score_r2_ridge_scikit_train))
Code example #34

if __name__ == "__main__":
    x, y = GetData_x_y('resources/abalone.txt')
    weights = ridgeTest(x, y)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.xlabel('log(lambda)')
    plt.ylabel('regression coff')
    x_range = [i - 10 for i in range(numTestpts)]
    ax.plot(x_range, weights)
    plt.show()

    print('*********** Fitting with the sklearn ridge regression ****************')
    clf = Ridge(alpha=.5)
    clf.fit(x, y)
    print(clf.coef_)
    print(clf.intercept_)
    print(
        clf.predict(
            np.array([1, 0.455, 0.365, 0.095, 0.514, 0.2245, 0.101, 0.15, 1])))

    print("******************用自己编写的岭回归代码进行拟合******************")
    print(weights[15][0:len(weights[0]) - 1])
    print(weights[15][-1])
    print(
        np.dot(
            weights[15],
            np.array([1, 0.455, 0.365, 0.095, 0.514, 0.2245, 0.101, 0.15,
                      1]).T))
Code example #35
dict_vect_matrix = dict_vect.fit_transform(lal)
print(dict_vect_matrix, dict_vect_matrix.toarray(), sep='\n\n')
vectorizer_feats = DictVectorizer()
X_train_feats = vectorizer_feats.fit_transform(
    X_train[feats].fillna('-').T.to_dict().values())
X_valid_feats = vectorizer_feats.transform(
    X_valid[feats].fillna('-').T.to_dict().values())
X_test_feats = vectorizer_feats.transform(
    X_test[feats].fillna('-').T.to_dict().values())

X_train_new = scipy.sparse.hstack(
    (X_train_title, X_train_title2, X_train_feats))
X_valid_new = scipy.sparse.hstack(
    (X_valid_title, X_valid_title2, X_valid_feats))
X_test_new = scipy.sparse.hstack((X_test_title, X_test_title2, X_test_feats))
'''model1 = Ridge(alpha=0.1, random_state=1)
model1.fit(X_train_new, y_train)
train_pred1 = model1.predict(X_train_new)
valid_pred1 = model1.predict(X_valid_new)
print(mean_squared_error(y_train, train_pred1), mean_squared_error(y_valid, valid_pred1))

model2 = Ridge(alpha=1.0, random_state=1)
model2.fit(X_train_new, y_train)
train_pred2 = model2.predict(X_train_new)
valid_pred2 = model2.predict(X_valid_new)
print(mean_squared_error(y_train, train_pred2), mean_squared_error(y_valid, valid_pred2))
'''
model = Ridge(random_state=17)
train_data = scipy.sparse.vstack((X_train_new, X_valid_new))
model.fit(train_data, y)
print(mean_squared_error(y_valid, model.predict(X_valid_new)))
Code example #36
def regression(trainfile,
               testfile,
               resultsfile,
               learner,
               weightdata=False,
               writefile=True):
    start_time = time.time()
    X, y = readdata2(False, trainfile)
    X = np.array(X)
    y = np.array(y)
    #print X,y
    bestalpha = -1
    bestscore = -1e200
    bestalphaSE = -1
    bestscoreSE = 1e200
    ncv = 10
    if (len(y) < 20):
        ncv = len(y)

    if (learner == 'Ridge' or learner == 'Lasso'):
        alphalist = np.append(np.logspace(-7, 1, 10), [0])
    elif (learner == 'BayesianRidge'):
        alphalist = [0]

    for alpha in alphalist:
        kf = KFold(len(y), n_folds=ncv)
        nd = 0
        MSE_v = 0.0
        loglkl = 0.0
        loglkl1 = 0.0
        for train_index, test_index in kf:
            X_train, X_v = X[train_index], X[test_index]
            y_train, y_v = y[train_index], y[test_index]
            if weightdata:
                cond = np.abs(X_train[:, 0] - y_train) < 0.004
                print float(np.count_nonzero(cond)) / y_train.shape[0]
            if (learner == 'Ridge'):
                reg1 = Ridge(alpha=alpha)
            elif (learner == 'BayesianRidge'):
                reg1 = BayesianRidge()
            elif (learner == 'Lasso'):
                reg1 = Lasso(alpha=alpha)
            reg1.fit(X_train, y_train)
            predytrain = reg1.predict(X_train)

            predy_v = reg1.predict(X_v)
            MSE_v += np.dot(np.array(predy_v - y_v),
                            np.array(predy_v - y_v))  #/float(len(y_v))
            if (learner != 'BayesianRidge'):
                STD = math.sqrt(
                    np.dot(np.array(y_train - predytrain),
                           np.array(y_train - predytrain)) /
                    float(len(predytrain) - 1.0)) + 1e-12  # to avoid problems
                loglkl += loglklnormal(y_v, predy_v, STD)
        if (learner != 'BayesianRidge'):
            if (loglkl > bestscore):
                bestscore = loglkl
                bestalpha = alpha

        if (MSE_v < bestscoreSE):
            bestscoreSE = MSE_v
            bestalphaSE = alpha
    #print bestalpha,bestscore
    #print bestalphaSE,bestscoreSE

    # retrain on all the dataset
    if (learner == 'Ridge'):
        reg = Ridge(alpha=bestalpha)
    elif (learner == 'BayesianRidge'):
        reg = BayesianRidge(compute_score=True)
    elif (learner == 'Lasso'):
        reg = Lasso(alpha=bestalpha)
    #reg=Ridge(alpha=bestalpha)
    reg.fit(X, y)
    predy = reg.predict(X)
    vartrain = np.dot(np.array(predy - y),
                      np.array(predy - y)) / float(len(y) - 1.0) + 1e-12
    #print vartrain
    Xtest, ytest = readdata2(False, testfile)

    SEtest = 0.0
    RMSEtest = 0.0
    loglklTest = 0

    if len(ytest) > 0:
        ypred = reg.predict(Xtest)
        SEtest = np.dot(np.array(ypred - ytest), np.array(ypred - ytest))
        print SEtest / float(len(ytest))
        my = sum(ytest) / float(len(ytest))
        vv = [(yy - my) * (yy - my) for yy in ytest]
        #    print (1-reg.score(Xtest, ytest))*sum(vv)
        RMSEtest = math.sqrt(SEtest / float(len(ytest)))
        loglklTest = loglklnormal(ytest, ypred, math.sqrt(vartrain))
    #print bestscore,loglklTest
    #print reg.alpha_
    #print reg.intercept_,reg.coef_
    #print "HERE"
    if writefile:
        param = open(resultsfile, 'w')
        #print "Writing to: ",resultsfile
        param.write("bias and coefficients,")
        param.write(str(reg.intercept_))
        for c in reg.coef_:
            param.write("," + str(c))
        param.write("\n")
        param.write("STD train,")
        param.write(str(math.sqrt(vartrain)))
        param.write("\n")
        param.write("sum loglikelihood CV train,")
        if (learner == 'BayesianRidge'):
            param.write(str(reg.scores_[-1]))
        else:
            param.write(str(bestscore))
        param.write("\n")
        param.write("sum loglikelihood test,")
        param.write(str(loglklTest))
        param.write("\n")
        param.write("sum squared error CV train,")
        param.write(str(bestscoreSE))
        param.write("\n")
        param.write("sum squared error test,")
        param.write(str(SEtest))
        param.write("\n")
        print RMSEtest
        # param.write("Root MSE (STD) CV train,")
        # param.write(str(math.sqrt(bestscore)))
        # param.write("\n")
        # param.write("squared error sum (score) CV train,")
        # param.write(str(bestscore*len(y)))
        # param.write("\n")
        # param.write("squared error sum test,"+ str(SEtest))
        # param.write("\n")
        # param.write("Root MSE test,"+ str(RMSEtest))
        # param.write("\n")
        param.write(str(type(reg)) + ",")
        param.write(str(bestalpha) + ",")
        param.write(str(bestalphaSE))
        param.close()
        print("--- %s seconds ---" % (time.time() - start_time))
        # save model
        #joblib.dump(reg, 'model.pkl')
        #clf = joblib.load('model.pkl')
    return reg
Code example #37
# --------------
from sklearn.linear_model import Lasso

# Code starts here
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
r2_lasso = lasso.score(X_test, y_test)
print(r2_lasso)

# --------------
from sklearn.linear_model import Ridge

# Code starts here
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
r2_ridge = ridge.score(X_test, y_test)
print(r2_ridge)
# Code ends here

# --------------
from sklearn.model_selection import cross_val_score

#Code starts here
regressor = LinearRegression()

# Initiate cross validation score
score = cross_val_score(regressor, X_train, y_train, scoring='r2', cv=10)
print(score)
#calculate mean of the score
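The example is cut off after the final comment; a minimal completion consistent with it:

# mean of the 10 cross-validation scores
mean_score = score.mean()
print(mean_score)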
Code example #38
        IQR = test_Q3 - test_Q1
        exist_outlier = [
            str(x[0]) for x in (temp_y < (test_Q1 - 1.5 * IQR))
            | (temp_y > (test_Q3 + 1.5 * IQR))
        ]
        if 'True' in exist_outlier:
            temp_x = temp_x[~((temp_y < (test_Q1 - 1.5 * IQR)) |
                              (temp_y >
                               (test_Q3 + 1.5 * IQR)))].reshape(-1, 1)
            temp_y = temp_y[~((temp_y < (test_Q1 - 1.5 * IQR)) |
                              (temp_y >
                               (test_Q3 + 1.5 * IQR)))].reshape(-1, 1)

        # Build a Ridge Regressor
        temp_Ridge = Ridge(alpha=k, normalize=True)
        temp_Ridge.fit(temp_x, temp_y)
        temp_y_pred = temp_Ridge.predict(temp_x)
        Ridge_predict = temp_Ridge.predict(Year).reshape(len(Year), ).tolist()
        incor_pred = sum(1 for pred in Ridge_predict if pred < 0)
        Ridge_vary_alpha[k].append({
            'areaId':
            areaId,
            'mse':
            metrics.mean_squared_error(temp_y, temp_y_pred),
            'r2':
            metrics.r2_score(temp_y, temp_y_pred),
            'num_incor':
            incor_pred
        })

num_incor_alpha = defaultdict(int)
Code example #39
plt.figure(figsize=(8, 4))
plt.subplot(121)
plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.subplot(122)
plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)

save_fig("ridge_regression_plot")
plt.show()
print()

from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=1, solver="cholesky", random_state=42)
ridge_reg.fit(X, y)
print('ridge_reg.predict([[1.5]]) = {0}'.format(ridge_reg.predict([[1.5]])))
print()

sgd_reg = SGDRegressor(max_iter=50,
                       tol=-np.infty,
                       penalty="l2",
                       random_state=42)
sgd_reg.fit(X, y.ravel())
print('sgd_reg.predict([[1.5]]) = {0}'.format(sgd_reg.predict([[1.5]])))
print()

from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=1, solver="sag", random_state=42)
ridge_reg.fit(X, y)
Code example #40
def part2():
    scaler = preprocessing.StandardScaler()

    #Splitting the training and testing data
    train_ratio = 0.3
    num_rows = data.shape[0]
    train_set_size = int(num_rows * train_ratio)

    train_data = data.iloc[:train_set_size]
    test_data = data.iloc[train_set_size:]

    train_features = train_data.drop(['TARGET_D'], axis=1, inplace=False)
    standardize_train_features = scaler.fit_transform(train_features)
    train_features = pd.DataFrame(standardize_train_features)
    train_labels = train_data.loc[:,['TARGET_D']]

    test_features = test_data.drop(['TARGET_D'], axis=1, inplace=False)
    standardize_test_features = scaler.fit_transform(test_features)
    test_features = pd.DataFrame(standardize_test_features)
    test_labels = test_data.loc[:,['TARGET_D']]

    kFoldsX = train_features
    kFoldsY = train_labels


    maeTrain_list = []
    maeValidation_list = []
    size = int(len(train_features)/5)
    lambdaValues = range(-3,11)

    #CV with 5 folds for each lambda value
    for l in lambdaValues:
        start = 0
        maeTrain = 0
        maeValidation = 0
        for k in range(5):
            CVx = kFoldsX[start:(k+1)*size]
            CVy = kFoldsY[start:(k+1)*size]
            trainX = pd.concat([kFoldsX[:start], kFoldsX[(k+1)*size:]])
            trainY = pd.concat([kFoldsY[:start], kFoldsY[(k+1)*size:]])
            start += size

            trainRidge = Ridge(alpha=(10 ** l))
            trainRidge.fit(trainX, trainY)
            CVPredict = trainRidge.predict(CVx)
            trainPredict = trainRidge.predict(trainX)

            maeTrain += float(np.mean(abs(trainY - trainPredict)))
            maeValidation += float(np.mean(abs(CVy - CVPredict)))
        print("lambda = 10^{}, TRAIN MAE: {}".format(l, maeTrain/5))
        print("lambda = 10^{}, CV MAE: {}".format(l, maeValidation/5))
        maeTrain_list.append(maeTrain/5)
        maeValidation_list.append(maeValidation/5)

    #plot graph
    plt.plot(lambdaValues, maeTrain_list, label ='MAE Training Data')
    plt.plot(lambdaValues, maeValidation_list, label= 'Validation Data')
    plt.xlabel('Lambda')
    plt.ylabel('MAE')
    plt.legend()
    plt.title('Ridge Regression Graph')
    plt.show()

    print("The best value for lambda is 10^4")
    testRidge = Ridge(alpha=(10 ** 4))
    testRidge.fit(train_features, train_labels)
    testPredict = testRidge.predict(test_features)
    newMAE = float(np.mean(abs(test_labels - testPredict)))
    print("The MAE value with lambda 10^4 is: {}".format(newMAE))
    print("The MAE value is significantly decreased compared to the first part.")
Code example #41
File: parc.py Project: j-cap/Data-Driven-Basics
        def fit_coeff(ii):
            # Given current cluster of points indexed by vector ii,
            # compute ridge regression/softmax regression problems, one per target

            alphaj = alpha * Nk[j] / N
            if not np.all(categorical):
                # Initialize ridge regressor
                ridge = Ridge(alpha=alphaj,
                              fit_intercept=True,
                              normalize=False)
                # Initialize softmax regressor (for logistic regression)

            h = 0
            for i in range(ny):
                if not categorical[i]:
                    ridge.fit(X[ii, :], Yt[ii, i])
                    a[j][:, h] = ridge.coef_
                    b[j, h] = ridge.intercept_
                    h += 1
                else:
                    softmax_reg = softmax_regs[j][i]
                    softmax_reg.C = 0.5 / alphaj

                    tot_elems = cat_values[i]  # categories in entire dataset
                    elems = np.unique(
                        Yt[ii, i])  # categories in this cluster (ordered)
                    n_elems = len(elems)
                    if n_elems < numcat[i]:
                        # Possibly missing category values in this cluster still require their
                        # corresponding a,b coefficients/intercepts to be optimized.
                        # Therefore, we introduce here fake data points whose values
                        # equals the missing values (so to maintain coef/intercept order)
                        # and with zero weight
                        dn = numcat[i] - n_elems
                        softmax_weights = np.ones(Nk[j] + dn)
                        softmax_weights[0:dn] = 0.0
                        fake_values = np.setdiff1d(tot_elems,
                                                   elems,
                                                   assume_unique=True)

                        softmax_reg.fit(np.vstack((np.zeros(
                            (dn, nx)), X[ii, :])),
                                        np.vstack((fake_values.reshape(-1, 1),
                                                   Yt[ii,
                                                      i].reshape(Nk[j],
                                                                 1))).ravel(),
                                        sample_weight=softmax_weights)
                    else:
                        # no category is missing
                        softmax_reg.fit(X[ii, :], Yt[ii, i].ravel())

                    if numcat[i] == 2:
                        # binary target

                        # In this case LogisticRegression only returns one coeff_ and intercept_ value.
                        # LogisticRegression associates +coeff_/+intercept_ with **second** category (True),
                        # -coeff_/-intercept_ with **first** category (False). As category numbers are
                        # ordered from smallest to largest, the smallest value corresponds to False.

                        a[j][:, h] = -softmax_reg.coef_
                        b[j, h] = -softmax_reg.intercept_
                        h += 1
                        a[j][:, h] = softmax_reg.coef_
                        b[j, h] = softmax_reg.intercept_
                        h += 1

                        ##########
                        # DEBUG
                        ##########
                        # Y_pred = softmax_reg.predict(X[ii, :])
                        # from sklearn.metrics import accuracy_score
                        # print(accuracy_score(Y[ii, i], Y_pred))
                        ##########

                    else:
                        # multi-category softmax, each category has its own coeff_/intercept_
                        for t in range(numcat[i]):
                            a[j][:, h] = softmax_reg.coef_[t, :]
                            b[j, h] = softmax_reg.intercept_[t]
                            h += 1  # update coefficient/intercept index
            return
Code example #42
# print(X[:10])
# print(Y[:10])
X['Memory'] = X['Memory'].apply(lambda x: float(str(x)[:-1]))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=3)
print len(X_test), len(y_test)
lr = LinearRegression()
lr.fit(X_train, y_train)
rr = Ridge(
    alpha=0.01
)  # the higher the alpha value, the more the coefficients are restricted; with a low alpha the
# coefficients are barely restricted and ridge behaves much like plain linear regression
rr.fit(X_train, y_train)
rr100 = Ridge(alpha=100)  # for comparison with a high alpha value
rr100.fit(X_train, y_train)
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
Ridge_train_score = rr.score(X_train, y_train)
Ridge_test_score = rr.score(X_test, y_test)
Ridge_train_score100 = rr100.score(X_train, y_train)
Ridge_test_score100 = rr100.score(X_test, y_test)
print "linear regression train score:", train_score
print "linear regression test score:", test_score
print "ridge regression train score low alpha:", Ridge_train_score
print "ridge regression test score low alpha:", Ridge_test_score
print "ridge regression train score high alpha:", Ridge_train_score100
print "ridge regression test score high alpha:", Ridge_test_score100
# plt.plot(rr.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markers
Code example #43
plt.show()

plt.scatter(X[:, 2], y[:, 0])
plt.title("y vs x_3: no relationship")
plt.show()

######################################
# Sklearn approach

from sklearn.linear_model import Ridge

start = time.time()

# Training
clf = Ridge(alpha=1.0)
log_train = clf.fit(X, y)

b = clf.coef_[0]

print("%.2f sec." % (time.time() - start), end=' - ')
print("Coefficients for x_1, x_2, x_3 are %.3f, %.3f, %.3f, respectively" %
      (b[0], b[1], b[2]))

# Testing
y_hat = clf.predict(X)
print("R square: %.3f" % r2_score(
    y, y_hat))  # R^2 (coefficient of determination) regression score function.

######################################
# xgboost approach
Code example #44
File: regression.py Project: irmaR/llm
if len(y) < 20:
    ncv = len(y)

for alpha in [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]:
    kf = KFold(n_splits=ncv)  # modern sklearn.model_selection API
    MSE_v = 0.0
    loglkl = 0.0
    for train_index, test_index in kf.split(X):
        X_train, X_v = X[train_index], X[test_index]
        y_train, y_v = y[train_index], y[test_index]
        reg1 = Ridge(alpha=alpha)
        reg1.fit(X_train, y_train)
        predytrain = reg1.predict(X_train)
        # unbiased std of the training residuals; the epsilon avoids division by zero
        STD = math.sqrt(np.dot(np.array(y_train - predytrain),
                               np.array(y_train - predytrain))
                        / float(len(predytrain) - 1.0)) + 1e-10
        predy_v = reg1.predict(X_v)
        MSE_v += np.dot(np.array(predy_v - y_v), np.array(predy_v - y_v))
        loglkl += loglklnormal(y_v, predy_v, STD)
        # equivalently:
        # loglkl += (-np.dot(np.array(predy_v - y_v), np.array(predy_v - y_v)) / STD**2 / 2.0
        #            + len(y_v) / 2.0 * math.log(1 / STD**2)
        #            - len(y_v) / 2.0 * math.log(2.0 * math.pi))
    if loglkl > bestscore:
        bestscore = loglkl
        bestalpha = alpha
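The helper loglklnormal is defined outside this excerpt; judging from the commented-out expansion kept in the loop above, a consistent sketch would be:

import math
import numpy as np

def loglklnormal(y, pred, std):
    # Gaussian log-likelihood of the residuals under N(0, std**2),
    # matching the commented-out expansion above
    resid = np.asarray(y) - np.asarray(pred)
    n = len(resid)
    return (-np.dot(resid, resid) / (2.0 * std ** 2)
            + n / 2.0 * math.log(1.0 / std ** 2)
            - n / 2.0 * math.log(2.0 * math.pi))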
コード例 #45
0
def fit_regression_model(reisende):
    X, y = preprocess(reisende)
    model = Ridge(alpha=10, fit_intercept=False)
    model.fit(X, y)
    return model
コード例 #46
0
def myliner():
    '''
    Predict house prices directly with linear regression.
    :return: None
    '''
    # Load the data
    lb = load_boston()

    # Split the dataset into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(lb.data,
                                                        lb.target,
                                                        test_size=0.25)

    print(y_train, y_test)

    # Standardization
    # Both the features and the target must be standardized; instantiate one scaler for each
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # Target values
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))  # the scaler expects 2-D data

    # Estimator prediction
    # Normal-equation (closed-form) solution
    lr = LinearRegression()

    lr.fit(x_train, y_train)

    print(lr.coef_)

    # Save the trained model
    #joblib.dump(lr,'./test.pkl')
    # Load the model back
    #model = joblib.load('./test.pkl')

    # Predict house prices for the test set
    y_lr_predict = std_y.inverse_transform(lr.predict(x_test))

    print("正规方程测试集里面每个样本的预测价格:", y_lr_predict)

    # y_test has to be converted back to its pre-standardization scale
    print("Normal-equation mean squared error:",
          mean_squared_error(std_y.inverse_transform(y_test), y_lr_predict))

    print("*" * 100)

    # Gradient descent for house price prediction
    sgd = SGDRegressor()

    sgd.fit(x_train, y_train.ravel())  # SGDRegressor expects a 1-D target

    print(sgd.coef_)

    # Predict house prices for the test set
    y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))

    print("SGD predictions for each test sample:", y_sgd_predict)

    print("梯度下降测的均方误差:",
          mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))

    # Ridge regression for house price prediction
    rd = Ridge(alpha=1.0)

    rd.fit(x_train, y_train)

    print(rd.coef_)

    # Predict house prices for the test set
    y_rd_predict = std_y.inverse_transform(rd.predict(x_test))

    print("岭回归测试集里面每个样本的预测价格:", y_rd_predict)

    print("岭回归测的均方误差:",
          mean_squared_error(std_y.inverse_transform(y_test), y_rd_predict))

    return None
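As an aside, the manual scale/inverse-scale bookkeeping above can be folded into the estimator itself with TransformedTargetRegressor; a sketch, assuming x_train and y_train are the raw, unscaled arrays:

from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

tt = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), Ridge(alpha=1.0)),
    transformer=StandardScaler())
tt.fit(x_train, y_train)      # target scaling is handled internally
y_pred = tt.predict(x_test)   # predictions come back on the original scale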
コード例 #47
0
# start a training run
root_run = Run.start_logging(workspace=ws, history_name=run_history_name)

# list of alpha values from 0.0 to 0.95 with a 0.05 interval
alphas = np.arange(0.0, 1.0, 0.05)

print('start sequential parameter sweep...')

# try a bunch of alpha values in a ridge (regularized linear) regression model
for alpha in alphas:
    print('try alpha value of {0:.2f}'.format(alpha))
    # create a bunch of child runs
    with root_run.child_run("alpha" + str(alpha)) as run:
        # More data science stuff
        reg = Ridge(alpha=alpha)
        reg.fit(data["train"]["X"], data["train"]["y"])
        # TODO save model
        preds = reg.predict(data["test"]["X"])
        mse = mean_squared_error(preds, data["test"]["y"])
        # End train and eval

        # log alpha, mean_squared_error and feature names in run history
        run.log("alpha", alpha)
        run.log("mse", mse)
        run.log_list("columns", columns)

        with open(model_file_name, "wb") as file:
            joblib.dump(value=reg, filename=file)

        # upload the serialized model into run history record
        run.upload_file(name="outputs/" + model_file_name,
コード例 #48
0
ファイル: m07_linear.py プロジェクト: sglee-vcanus/etc
y = boston.target

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=66,
                                                    shuffle=True,
                                                    test_size=0.2)

from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Models
model1 = LinearRegression()
model2 = Ridge()
model3 = Lasso()

model1.fit(x_train, y_train)
model2.fit(x_train, y_train)
model3.fit(x_train, y_train)

linear_score = model1.score(x_test, y_test)
ridge_score = model2.score(x_test, y_test)
lasso_score = model3.score(x_test, y_test)

# Evaluation
print('linear_score: ', linear_score)
print('ridge_score: ', ridge_score)
print('lasso_score: ', lasso_score)

# y_pred = model1.predict(x_test)
# print(y_pred)
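A single train/test split can be noisy; a cross-validated comparison of the same three models is only a few more lines (a sketch):

from sklearn.model_selection import cross_val_score

for name, model in [('linear', model1), ('ridge', model2), ('lasso', model3)]:
    scores = cross_val_score(model, x_train, y_train, cv=5)  # r2 by default
    print(name, 'cv r2: %.4f (+/- %.4f)' % (scores.mean(), scores.std()))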
コード例 #49
0

# Make a Ridge model and run k-fold validation.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
model = None

for train_ids, valid_ids in cv.split(X_train):
    model = Ridge(
        solver='auto',
        fit_intercept=True,
        alpha=0.5,
        max_iter=100,
        tol=0.05)

    model.fit(X_train[train_ids], y_train[train_ids])
    y_pred_valid = model.predict(X_train[valid_ids])
    rmsle = get_rmsle(y_pred_valid, y_train[valid_ids])
    print(f'valid rmsle: {rmsle:.5f}')
    break
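get_rmsle is not defined in this excerpt; one common definition, assuming the targets are on their raw (non-log) scale, is:

import numpy as np

def get_rmsle(y_pred, y_true):
    # root mean squared logarithmic error; log1p guards against zero values
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))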


# Predict on test set.
test = pd.read_table(input_folder + 'test.tsv')
X_test = extract_test_features(test, train_vectorizer)

test_ids = test['test_id'].values
y_pred_test = model.predict(X_test)  # predict on the full test feature matrix
print(y_pred_test)

result = pd.DataFrame(
コード例 #50
0
def accuracy_on_crimes():
    logger.info("Finding datasets...")
    directory = os.fsencode('input/Crimes_Workload')
    directory_sub = os.fsencode('input/Subqueries/')
    patterns = {'gauss-gauss': '*x-gauss*-length-gauss*',
                'gauss-uni': '*x-gauss*-length-uniform*',
                'uni-gauss': '*x-uniform*-length-gauss*',
                'uni-uni': '*x-uniform*-length-uniform*'}
    train_datasets = {}
    test_datasets = {}
    sub_datasets = {}

    for p in patterns:
        res = [os.fsdecode(n) for n in os.listdir(directory) if fnmatch.fnmatch(os.fsdecode(n), patterns[p])]
        train_datasets[p] = res[0] if res[0].startswith('train') else res[1]
        test_datasets[p] = res[0] if res[0].startswith('test') else res[1]
        sub_datasets[p] = [os.fsdecode(n) for n in os.listdir(directory_sub) if fnmatch.fnmatch(os.fsdecode(n), patterns[p])][0]

    res_eval = {'model': [],
                'dataset': [],
                'aggregate_name': [],
                'kl': [],
                'r2': [],
                'md': [],
                'nrmse': []}
    #Main
    for p in patterns:
        logger.info('Beginning Evaluation for {0}'.format(p))
        logger.info('Loading Datasets...')

        test_df = pd.read_csv('/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(test_datasets[p]), index_col=0)
        train_df = pd.read_csv('/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(train_datasets[p]), index_col=0)
        sub = np.load('/home/fotis/dev_projects/explanation_framework/input/Subqueries/{0}'.format(sub_datasets[p]))

        logger.info('Finished loading\nCommencing Evaluation')
        aggregates = ['count','sum_','avg']
        agg_map = {'count' :4, 'sum_':5, 'avg':6}
        for agg in aggregates:
            logger.info("Evaluating Aggregates : {0}".format(agg))
            X_train = train_df[['x','y','x_range','y_range']].values
            y_train = train_df[agg].values
            sc = StandardScaler()
            sc.fit(X_train)
            X_train = sc.transform(X_train)
            #Training Models
            logger.info("Model Training Initiation\n=====================")
            kmeans = KMeans()
            lr = Ridge()

            lsnr = PR(lr)
            lsnr.fit(X_train,y_train)

            lr_global = LinearRegression()
            lr_global.fit(X_train, y_train)

            logger.info("Accuracy Evaluation on Test set\n=====================")
            for i in range(1000):
                #Obtain query from test-set
                dataset = p
                printProgressBar(i, 1000,prefix = 'Progress:', suffix = 'Complete', length = 50)

                q = test_df.iloc[i].values[:4].reshape(1,-1)
                q = sc.transform(q)
                #Obtain subquery perturbations for query q from the test set
                q1 = sub[i]
                X = q1[:,:4]
                y = q1[:,agg_map[agg]]
                X = sc.transform(X)
                # Train local model (Should be the best out of the 3)
                lr = LinearRegression()
                lr.fit(X,y)
                y_hat = lr.predict(X)
                metrics_for_model('local',dataset,agg,y_hat,X, y, lr,res_eval)

                #Obtain metrics for our model
                y_hat_s = lsnr.get_model(q).predict(X)
                metrics_for_model('ours',dataset,agg,y_hat_s,X,y,lsnr.get_model(q) ,res_eval)


                #Obtain metrics for global
                y_hat_g = lr_global.predict(X)
                metrics_for_model('global',dataset,agg,y_hat_g,X,y,lr_global,res_eval)
            logger.info("Finished Queries")
    eval_df = pd.DataFrame(res_eval)
    eval_df.to_csv('output/Accuracy/evaluation_results_linear.csv')
コード例 #51
0
y = file['y']

lamda = [0.1, 1, 10, 100, 1000]
avg = []

for l in lamda:

    clf = Ridge(alpha=l)
    kf = KFold(10)
    RMSE = []  # reset per lambda so earlier folds do not leak into this average
    # kf.split yields 10 folds, each a pair of arrays:
    # the indices for the training set and the indices for the test set
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        RMSE.append(mean_squared_error(y_pred, y_test)**0.5)
    avg.append(np.mean(RMSE))

# output results
d = {'error': avg}
output = pd.DataFrame(d)
output.to_csv('task_1a_output.csv', index=False, header=False)
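The loop above can also be compressed with cross_val_score, which clones and refits the estimator per fold (a sketch; the 'neg_root_mean_squared_error' scorer needs scikit-learn >= 0.22):

from sklearn.model_selection import cross_val_score

for l in lamda:
    scores = cross_val_score(Ridge(alpha=l), X, y, cv=10,
                             scoring='neg_root_mean_squared_error')
    print(l, -scores.mean())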
コード例 #52
0
    n_informative=1,  # number of informative features, i.e. features actually used
                      # to build the linear model that generates the output (default = 10)
    n_targets=1,      # dimension of the y output vector associated with a sample;
                      # the output is a scalar by default (default = 1)
    noise=5,          # standard deviation of the gaussian noise applied to the output (default = 0.0)
    coef=True,        # if True, the coefficients of the underlying linear model are also returned (default = False)
    random_state=1    # determines random number generation for dataset creation
)

#With a small alpha this is nearly identical to linear regression. A high alpha increases bias.
rr = Ridge(alpha=1)
rr.fit(X, y)
w = rr.coef_[0]
plt.scatter(X, y)
#regression line; nearly the same as linear regression since alpha is small
plt.plot(X, X * w, c='red')
plt.show()

#Increasing alpha gives us a less steep slope which increases bias a little to hopefully
#decrease variance by a lot
rr = Ridge(alpha=10)
rr.fit(X, y)
w = rr.coef_[0]
plt.scatter(X, y)
plt.plot(X, X * w, c='red')
plt.show()
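To see the shrinkage directly, the fitted slope can be printed for a few alpha values (a quick sketch reusing X and y from above):

for a in (0.1, 1, 10, 100):
    slope = Ridge(alpha=a).fit(X, y).coef_[0]
    print("alpha=%6.1f  slope=%.4f" % (a, slope))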
コード例 #53
0
temp = np.delete(temp, np.argwhere(temp[:, 1] <= 0.1632), 0)

X_embedding = temp[:, 0:n_efeatures]
X_train = temp[:, n_efeatures:n_efeatures + n_features]
y_train = temp[:, -1]

print(X_embedding.shape)
print(y_train.shape)
print(X_train.shape)
n_samples = y_train.shape[0]

n_train = int(n_samples * 0.8)

x_t = X_train[0:n_train]
x_e = X_embedding[0:n_train]

y_t_test = y_train[n_train:]
x_t_test = X_train[n_train:]
x_e_test = X_embedding[n_train:]

ridge_reg = Ridge(alpha=1, solver="cholesky")
# ridge_reg = Lasso(alpha=0.1)  # Lasso alternative that was tried here
ridge_reg.fit(x_t, x_e)
x_e_test_predict = ridge_reg.predict(x_t_test)
print(np.linalg.norm(x_e_test - x_e_test_predict)**2 / x_e_test.shape[0])

plot_embedding(x_e_test,
               y_t_test.ravel().tolist(), "SLE_for_digital(alpha=0.02)")
plot_embedding(x_e_test_predict,
               y_t_test.ravel().tolist(), "Ridge_for_SLE(alpha=0.02)")
plt.show()
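Note that Ridge handles the 2-D target x_e natively, fitting one coefficient vector per embedding dimension:

# one row of coefficients per output column of x_e
print(ridge_reg.coef_.shape)  # expected: (n_efeatures, n_features)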
コード例 #54
0
def create_model(df, y, X, X_train, X_test, y_train, y_test, degree,
                 random_state, test_size, alpha):

    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    ss = StandardScaler()
    ss.fit(X_train)

    X_train_scaled = ss.transform(X_train)
    X_test_scaled = ss.transform(X_test)

    linreg_norm = LinearRegression()
    linreg_norm.fit(X_train_scaled, y_train)

    X_cat = df[['Month', 'Origin', 'Dest']]
    X_train_cat, X_test_cat, y_train, y_test = train_test_split(
        X_cat, y, test_size=test_size, random_state=random_state)
    # OneHotEncode Categorical variables
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(X_train_cat)

    X_train_ohe = ohe.transform(X_train_cat)
    X_test_ohe = ohe.transform(X_test_cat)

    columns = ohe.get_feature_names(input_features=X_train_cat.columns)
    cat_train_df = pd.DataFrame(X_train_ohe.todense(), columns=columns)
    cat_test_df = pd.DataFrame(X_test_ohe.todense(), columns=columns)
    X_train_all = pd.concat([pd.DataFrame(X_train_scaled), cat_train_df],
                            axis=1)
    X_test_all = pd.concat([pd.DataFrame(X_test_scaled), cat_test_df], axis=1)
    linreg_all = LinearRegression()
    linreg_all.fit(X_train_all, y_train)

    print('Baseline model Continuous and Categorical')
    print('Training r^2:', linreg_all.score(X_train_all, y_train))
    print('Testing r^2:', linreg_all.score(X_test_all, y_test))
    print('Training MSE:',
          mean_squared_error(y_train, linreg_all.predict(X_train_all)))
    print('Testing MSE:',
          mean_squared_error(y_test, linreg_all.predict(X_test_all)))

    print("\n")

    lasso = Lasso(alpha=alpha)  # Lasso applies an L1 penalty
    lasso.fit(X_train_all, y_train)
    print('Lasso')
    print('Training r^2:', lasso.score(X_train_all, y_train))
    print('Testing r^2:', lasso.score(X_test_all, y_test))
    print('Training MSE:',
          mean_squared_error(y_train, lasso.predict(X_train_all)))
    print('Testing MSE:', mean_squared_error(y_test,
                                             lasso.predict(X_test_all)))

    print("\n")

    ridge = Ridge(alpha=alpha)  # Ridge applies an L2 penalty
    ridge.fit(X_train_all, y_train)
    print('Ridge')
    print('Training r^2:', ridge.score(X_train_all, y_train))
    print('Testing r^2:', ridge.score(X_test_all, y_test))
    print('Training MSE:',
          mean_squared_error(y_train, ridge.predict(X_train_all)))
    print('Testing MSE:', mean_squared_error(y_test,
                                             ridge.predict(X_test_all)))

    print("\n")

    poly_features = PolynomialFeatures(degree)

    # transforms the existing features to higher degree features.
    X_train_poly = poly_features.fit_transform(X_train)

    # fit the transformed features to Linear Regression
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)

    # predicting on training data-set
    y_train_predicted = poly_model.predict(X_train_poly)

    # predicting on test data-set
    y_test_predict = poly_model.predict(poly_features.transform(X_test))

    # evaluating the model on training dataset
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    r2_train = r2_score(y_train, y_train_predicted)

    # evaluating the model on test dataset
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predict))
    r2_test = r2_score(y_test, y_test_predict)

    print("\n")

    print(" Polynomial training set")

    print("MSE of training set is {}".format(rmse_train))
    print("R2 score of training set is {}".format(r2_train))

    print("\n")

    print("Polynomial test set")

    print("MSE of test set is {}".format(rmse_test))
    print("R2 score of test set is {}".format(r2_test))

    print("\n")

    print('Cross Validation for Polynomial model')

    lm = LinearRegression()

    # store scores in scores object
    # we can't use accuracy as our evaluation metric since that's only relevant for classification problems
    # RMSE is not directly available so we will use MSE
    scores = cross_val_score(lm, X_train_poly, y_train, cv=10, scoring='r2')
    mse_scores = cross_val_score(lm,
                                 X_train_poly,
                                 y_train,
                                 cv=10,
                                 scoring='neg_mean_squared_error')
    print('Cross Validation Mean r2:', np.mean(scores))
    print('Cross Validation Mean MSE:', -np.mean(mse_scores))  # the scorer returns negated MSE
    print('Cross Validation 10 Fold Score:', scores)
    print('Cross Validation 10 Fold mean squared error', -(mse_scores))
コード例 #55
0
std_x = sc.fit_transform(x)
pca = PCA()
pca_x = pca.fit_transform(std_x)

# Compute percentile rankings for pca-space features
xp = feature_percentiles(pca_x)

# Run gridsearch on ridge regression to find optimal hp's
gs = GridSearchCV(Ridge(),
                  {'alpha': [0.1, 0.3, 0.6, 1, 3, 6.0, 10, 30, 60, 100]},
                  n_jobs=1, cv=10, scoring='neg_mean_squared_error')
gs.fit(pca_x, logy)

# Store PCA component compositions and feature importance ranks
r = Ridge(**gs.best_estimator_.get_params())
r.fit(pca_x, logy)
feat_imp = np.reshape(r.coef_, (1, len(r.coef_)))
pca_comp = pca.components_
pcfi_US = pd.DataFrame(np.vstack((feat_imp, pca_comp)),
                       columns=df_us.columns[1:-1])

# Store pca-space features
pc_labels = ['PC#' + str(i) for i in range(1,xp.shape[1]+1)]
data_US = pd.DataFrame(
    np.hstack((np.reshape(df_us.values[:, 0], (xp.shape[0], 1)), xp)),
    columns=[df_us.columns[0]] + pc_labels)
data_US[geo] = pd.to_numeric(data_US[geo], downcast='integer')

# Write results files to disk
data_US.to_csv('data_' + 'US' + '.csv', index=False, float_format='%.2f')
pcfi_US.to_csv('pcfi_' + '_' + 'US' + '.csv', index=True, index_label='PC', float_format='%.5f')

print('National complete.')
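One note on the gridsearch block above: with the default refit=True, gs.best_estimator_ is already fitted on the full data, so the explicit re-fit is only needed for a fresh estimator object, which clone gives directly:

from sklearn.base import clone

r = clone(gs.best_estimator_)  # same hyperparameters, not yet fitted
r.fit(pca_x, logy)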
コード例 #56
0
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score
from fracridge import FracRidge  # FracRidge is assumed to come from the fracridge package

X, y = datasets.load_diabetes(return_X_y=True)

n_alphas = 20
rr_alphas = np.logspace(-10, 10, n_alphas)
rr_coefs = np.zeros((X.shape[-1], n_alphas))
rr_pred = np.zeros((y.shape[-1], n_alphas))
for aa in range(len(rr_alphas)):
    RR = Ridge(alpha=rr_alphas[aa], fit_intercept=True)
    RR.fit(X, y)
    rr_coefs[:, aa] = RR.coef_
    rr_pred[:, aa] = cross_val_predict(RR, X, y)

fracs = np.linspace(0, 1, n_alphas)
FR = FracRidge(fracs=fracs, fit_intercept=True)
FR.fit(X, y)
fr_pred = cross_val_predict(FR, X, y)

fig, ax = plt.subplots(1, 2)
ax[0].plot(fracs, FR.coef_.T)
ylims = ax[0].get_ylim()
ax[0].vlines(fracs, ylims[0], ylims[1], linewidth=0.5, color='gray')
ax[0].set_ylim(*ylims)

ax[1].plot(np.log(rr_alphas[::-1]), rr_coefs.T)
コード例 #57
0
 'pH', 'sulphates', 'alcohol']


pdx = wine_quality[all_colnms]
pdy = wine_quality["quality"]

x_train, x_test, y_train, y_test = train_test_split(pdx, pdy, train_size=0.7, random_state=42)

alphas = [1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0]

initrsq = 0

print ("\nRidge Regression: Best Parameters\n")
for alph in alphas:
    ridge_reg = Ridge(alpha=alph) 
    ridge_reg.fit(x_train,y_train)    
    tr_rsqrd = ridge_reg.score(x_train,y_train)
    ts_rsqrd = ridge_reg.score(x_test,y_test)    

    if ts_rsqrd > initrsq:
        print ("Lambda: ",alph,"Train R-Squared value:",round(tr_rsqrd,5),"Test R-squared value:",round(ts_rsqrd,5))
        initrsq = ts_rsqrd

# Coefficients of ridge regression at the best alpha value
ridge_reg = Ridge(alpha=0.001) 
ridge_reg.fit(x_train,y_train) 
 

print ("\nRidge Regression coefficient values of Alpha = 0.001\n")
for i in range(11):
    print (all_colnms[i],": ",ridge_reg.coef_[i])
コード例 #58
0
# ls = Lasso()
# ls.fit(X_train,y_train)
# ls_pred = ls.predict(X_test)
# print('Lasso Regression Performance:')
# print('MAE:', metrics.mean_absolute_error(y_test, ls_pred))
# print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, ls_pred)))
# print('R2_Score: ', metrics.r2_score(y_test, ls_pred))
# fig = plt.figure(figsize=(8, 5))
# sns.regplot(y_test,ls_pred,color='g')
# plt.xlabel('COA')
# plt.ylabel('Predictions')
# plt.title('Lasso Prediction Performance ')
# plt.grid()

rg = Ridge()
rg.fit(X_train, y_train)
rg_pred = rg.predict(X_test)
print('Ridge Regression Performance:')
print('MAE:', metrics.mean_absolute_error(y_test, rg_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, rg_pred)))
print('R2_Score: ', metrics.r2_score(y_test, rg_pred))
fig = plt.figure(figsize=(8, 5))
sns.regplot(y_test, rg_pred, color='g')
plt.xlabel('COA')
plt.ylabel('Predictions')
plt.title('Ridge Prediction Performance ')
plt.grid()

# rf = RandomForestRegressor(n_estimators=100)
# rf.fit(X_train,y_train)
# rf_pred = rf.predict(X_test)
コード例 #59
0
class Regressor():
    """
    Wraps scikit-learn regressors.


    Parameters
    ----------

    strategy : string, default = "LightGBM" (if installed else "XGBoost")
        The choice for the regressor.
        Available strategies = "LightGBM" (if installed), "XGBoost",
        "RandomForest", "ExtraTrees", "Tree", "Bagging", "AdaBoost" or "Linear"

    **params : parameters of the corresponding regressor.
        Examples : n_estimators, max_depth...

    """
    def __init__(self, **params):

        if ("strategy" in params):
            self.__strategy = params["strategy"]
        else:
            if (lgbm_installed):
                self.__strategy = "LightGBM"
            else:
                self.__strategy = "XGBoost"

        self.__regress_params = {}

        self.__regressor = None
        self.__set_regressor(self.__strategy)
        self.__col = None

        self.set_params(**params)
        self.__fitOK = False

    def get_params(self, deep=True):

        params = {}
        params["strategy"] = self.__strategy
        params.update(self.__regress_params)

        return params

    def set_params(self, **params):

        self.__fitOK = False

        if 'strategy' in params.keys():
            self.__set_regressor(params['strategy'])

            for k, v in self.__regress_params.items():
                if k not in self.get_params().keys():
                    warnings.warn("Invalid parameter for regressor " +
                                  str(self.__strategy) +
                                  ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)

        for k, v in params.items():
            if (k == "strategy"):
                pass
            else:
                if k not in self.__regressor.get_params().keys():
                    warnings.warn("Invalid parameter for regressor " +
                                  str(self.__strategy) +
                                  ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)
                    self.__regress_params[k] = v

    def __set_regressor(self, strategy):

        self.__strategy = strategy

        if (strategy == 'RandomForest'):
            self.__regressor = RandomForestRegressor(n_estimators=400,
                                                     max_depth=10,
                                                     max_features='sqrt',
                                                     bootstrap=True,
                                                     n_jobs=-1,
                                                     random_state=0)

        elif (strategy == 'XGBoost'):
            self.__regressor = XGBRegressor(n_estimators=500,
                                            max_depth=6,
                                            learning_rate=0.05,
                                            colsample_bytree=0.8,
                                            colsample_bylevel=1.,
                                            subsample=0.9,
                                            nthread=-1,
                                            seed=0)

        elif (strategy == "LightGBM"):
            if (lgbm_installed):
                self.__regressor = LGBMRegressor(n_estimators=500,
                                                 learning_rate=0.05,
                                                 colsample_bytree=0.8,
                                                 subsample=0.9,
                                                 nthread=-1,
                                                 seed=0)
            else:
                warnings.warn(
                    "Package lightgbm is not installed. Model LightGBM will be "
                    "replaced by XGBoost")
                self.__strategy = "XGBoost"
                self.__regressor = XGBRegressor(n_estimators=500,
                                                max_depth=6,
                                                learning_rate=0.05,
                                                colsample_bytree=0.8,
                                                colsample_bylevel=1.,
                                                subsample=0.9,
                                                nthread=-1,
                                                seed=0)

        elif (strategy == 'ExtraTrees'):
            self.__regressor = ExtraTreesRegressor(n_estimators=400,
                                                   max_depth=10,
                                                   max_features='sqrt',
                                                   bootstrap=True,
                                                   n_jobs=-1,
                                                   random_state=0)

        elif (strategy == 'Tree'):
            self.__regressor = DecisionTreeRegressor(
                criterion='mse',
                splitter='best',
                max_depth=None,
                min_samples_split=2,
                min_samples_leaf=1,
                min_weight_fraction_leaf=0.0,
                max_features=None,
                random_state=0,
                max_leaf_nodes=None,
                presort=False)

        elif (strategy == "Bagging"):
            self.__regressor = BaggingRegressor(base_estimator=None,
                                                n_estimators=500,
                                                max_samples=.9,
                                                max_features=.85,
                                                bootstrap=False,
                                                bootstrap_features=False,
                                                n_jobs=-1,
                                                random_state=0)

        elif (strategy == "AdaBoost"):
            self.__regressor = AdaBoostRegressor(base_estimator=None,
                                                 n_estimators=400,
                                                 learning_rate=.05,
                                                 random_state=0)

        elif (strategy == "Linear"):
            self.__regressor = Ridge(alpha=1.0,
                                     fit_intercept=True,
                                     normalize=False,
                                     copy_X=True,
                                     max_iter=None,
                                     tol=0.001,
                                     solver='auto',
                                     random_state=0)

        else:
            raise ValueError(
                "Strategy invalid. Please choose between 'LightGBM' "
                "(if installed), 'XGBoost', 'RandomForest', 'ExtraTrees', "
                "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")

    def fit(self, df_train, y_train):
        """

        Fits Regressor.

        Parameters
        ----------

        df_train : pandas dataframe of shape = (n_train, n_features)
        The train dataset with numerical features.

        y_train : pandas series of shape = (n_train, )
        The target for regression tasks.


        Returns
        -------
        self

        """

        # sanity checks
        if ((type(df_train) != pd.SparseDataFrame)
                and (type(df_train) != pd.DataFrame)):
            raise ValueError("df_train must be a DataFrame")

        if (type(y_train) != pd.core.series.Series):
            raise ValueError("y_train must be a Series")

        self.__regressor.fit(df_train.values, y_train)
        self.__col = df_train.columns
        self.__fitOK = True

        return self

    def feature_importances(self):
        """
        Computes feature importances. Regressor must be fitted before.

        Parameters
        ----------

        None

        Returns
        -------

        importance : dict
            Dictionary containing a measure of feature importance (value)
            for each feature (key).

        """

        if self.__fitOK:

            if (self.get_params()["strategy"] in ["Linear"]):

                importance = {}
                f = np.abs(self.get_estimator().coef_)

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in [
                    "LightGBM", "XGBoost", "RandomForest", "ExtraTrees", "Tree"
            ]):

                importance = {}
                f = self.get_estimator().feature_importances_

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["AdaBoost"]):

                importance = {}
                norm = self.get_estimator().estimator_weights_.sum()

                try:
                    # XGB, RF, ET, Tree and AdaBoost
                    # TODO: Refactor this part
                    f = sum(
                        weight * est.feature_importances_
                        for weight, est in zip(
                            self.get_estimator().estimator_weights_,
                            self.get_estimator().estimators_)) / norm  # noqa

                except Exception:
                    f = sum(weight * np.abs(est.coef_) for weight, est in zip(
                        self.get_estimator().estimator_weights_,
                        self.get_estimator().estimators_)) / norm  # noqa

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["Bagging"]):

                importance = {}
                importance_bag = []

                for i, b in enumerate(self.get_estimator().estimators_):

                    d = {}

                    try:
                        # XGB, RF, ET, Tree and AdaBoost
                        f = b.feature_importances_
                    except Exception:
                        f = np.abs(b.coef_)  # Linear

                    estimator = self.get_estimator()
                    items = enumerate(estimator.estimators_features_[i])
                    for j, c in items:
                        d[self.__col[c]] = f[j]

                    importance_bag.append(d.copy())

                for i, col in enumerate(self.__col):
                    values = [k[col] if col in k else 0 for k in importance_bag]
                    nonzero = [v for v in values if v != 0]
                    # mean importance over the bags in which the feature was used;
                    # np.mean over a filter object fails on Python 3
                    importance[col] = np.mean(nonzero) if nonzero else 0.0

            else:

                importance = {}

            return importance

        else:

            raise ValueError("You must call the fit function before !")

    def predict(self, df):
        '''

        Predicts the target.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
        The dataset with numerical features.


        Returns
        -------
        y : array of shape = (n, )
        The target to be predicted.

        '''

        try:
            if not callable(getattr(self.__regressor, "predict")):
                raise ValueError("predict attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            return self.__regressor.predict(df.values)

        else:
            raise ValueError("You must call the fit function before !")

    def transform(self, df):
        '''

        Transforms df.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
        The dataset with numerical features.


        Returns
        -------
        df_transform : pandas dataframe of shape = (n, n_selected_features)
        The transformed dataset with its most important features.

        '''

        try:
            if not callable(getattr(self.__regressor, "transform")):
                raise ValueError("transform attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) & (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            return self.__regressor.transform(df.values)
        else:
            raise ValueError("You must call the fit function before !")

    def score(self, df, y, sample_weight=None):
        """

        Returns the coefficient of determination R^2 of the prediction.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        y : pandas series of shape = (n,)
            The target values for regression tasks.

        Returns
        -------
        score : float
        R^2 of self.predict(df) wrt. y.

        """

        try:
            if not callable(getattr(self.__regressor, "score")):
                raise ValueError("score attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame)
                    and (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            if (type(y) != pd.core.series.Series):
                raise ValueError("y must be a Series")

            return self.__regressor.score(df.values, y, sample_weight)
        else:
            raise ValueError("You must call the fit function before !")

    def get_estimator(self):
        return copy(self.__regressor)
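A hypothetical usage of this wrapper (df_train, df_test and y_train are assumed to be the pandas objects that fit requires):

reg = Regressor(strategy="Linear", alpha=10.0)
reg.fit(df_train, y_train)
print(reg.feature_importances())   # absolute ridge coefficients per column
y_pred = reg.predict(df_test)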
コード例 #60
0
################################################## RIDGE REGRESSION

# PARAMETER TUNING

features = ['c1','c2','c3','c4','c5','c6','c7','c8']

msk = np.random.rand(len(tf)) < 0.8
train = tf[msk].reset_index(drop=True)
test = tf[~msk].reset_index(drop=True)

row_list = []

for n in range(0, 1001):
    clf = Ridge(alpha=n)
    clf.fit(train[features], train.nrtg)
    score = clf.score(test[features], test.nrtg)
    row_list.append({'alpha': n, 'score': score})

alpha_df = pd.DataFrame(row_list)

alpha = alpha_df[alpha_df.score == alpha_df.score.max()].alpha.values[0]

# RIDGE REGRESSION

clf = Ridge(alpha=alpha)

clf.fit(tf[features], tf.nrtg)

coefficients = clf.coef_
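To make the final coefficients readable, they can be paired with their feature names:

for name, coef in zip(features, coefficients):
    print('%s: %.4f' % (name, coef))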