def solveSingle(self,inputDF,outputDict,rho,beta_target):
     I,J,V,Y=[],[],[],[]
     fd = {} # mapping feature names to consecutive integers, starting with 0
     for i,(id, x) in enumerate(inputDF.items()):
         l = outputDict.get(id)
         for k,v in x.items():
             I.append(i)
             J.append(k)
             V.append(v)
             upd(fd, k)  # helper (defined elsewhere): assigns k the next consecutive integer id in fd if unseen
         Y.append(l)
     J = [fd[k] for k in J]  # translate feature names to their integer column ids
     X = sparse.coo_matrix((V,(I,J)),shape=(I[-1]+1,len(fd)))
     fd_reverse = [k for k,v in sorted(fd.items(), key = lambda t: t[1])]
     # y_new = y - X . beta_target
     # converting a proximal least square problem to a ridge regression
     ZmUl = np.array([beta_target.get(k,0) for k in fd_reverse])
     y_new = np.array(Y) - X * ZmUl
     ridge = Ridge(alpha=rho, fit_intercept=False)
     ret = ridge.fit(X,y_new)
     #ret = self.lr.fit(X,y_new)
     # ordered list of feature names according to their integer ids in fd
     #raise ValueError('fd_reverse = %s \n X = %s \n J = %s \n I = %s \n V = %s \n Y = %s \n y_new = %s \n ret.coef_ = %s \n ZmUl = %s \n'\
     #            %(str(fd_reverse), str(X), str(J), str(I), str(V), str(Y), str(y_new), str(ret.coef_), str(ZmUl)))
     return dict(zip(fd_reverse, (ret.coef_ + ZmUl).tolist()))
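The comment above compresses a short derivation; spelled out, the proximal step solved here is

\[
\min_\beta \;\|y - X\beta\|_2^2 + \rho\,\|\beta - \beta_{\text{target}}\|_2^2 .
\]

Substituting \(\delta = \beta - \beta_{\text{target}}\) turns this into an ordinary ridge problem on the shifted target,

\[
\min_\delta \;\|(y - X\beta_{\text{target}}) - X\delta\|_2^2 + \rho\,\|\delta\|_2^2 ,
\]

which is exactly the `Ridge(alpha=rho, fit_intercept=False)` fit on `y_new`; the code then recovers \(\beta = \beta_{\text{target}} + \hat\delta\) via `ret.coef_ + ZmUl`.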
Example #2
def test_brr_like_sklearn():
    n = 10000
    d = 10
    sigma_sqr = 5
    X = np.random.randn(n, d)
    beta_true = np.random.random(d)
    y = np.dot(X, beta_true) + np.sqrt(sigma_sqr) * np.random.randn(n)
    X_tr = X[:n // 2, :]
    y_tr = y[:n // 2]
    X_ts = X[n // 2:, :]
    #  y_ts = y[n / 2:]

    # prediction with my own bayesian ridge
    lambda_reg = 1
    brr = BayesianRidgeRegression(lambda_reg,
                                  add_ones=True,
                                  normalize_lambda=False)
    brr.fit(X_tr, y_tr)
    y_ts_brr = brr.predict(X_ts)

    # let's compare to scikit-learn's ridge regression
    rr = Ridge(lambda_reg)
    rr.fit(X_tr, y_tr)
    y_ts_rr = rr.predict(X_ts)

    assert np.mean(np.abs(y_ts_brr - y_ts_rr)) < 0.001, \
        "Predictions are different from sklearn's ridge regression."
Example #3
def test_sag_regressor_computed_correctly():
    """tests if the sag regressor is computed correctly"""
    alpha = .1
    n_features = 10
    n_samples = 40
    max_iter = 50
    tol = .000001
    fit_intercept = True
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    y = np.dot(X, w) + 2.
    step_size = get_step_size(X, alpha, fit_intercept, classification=False)

    clf1 = Ridge(fit_intercept=fit_intercept, tol=tol, solver='sag',
                 alpha=alpha * n_samples, max_iter=max_iter)
    clf2 = clone(clf1)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    spweights1, spintercept1 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=max_iter,
                                          dloss=squared_dloss,
                                          fit_intercept=fit_intercept)

    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=max_iter,
                                          dloss=squared_dloss, sparse=True,
                                          fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(),
                              spweights1.ravel(),
                              decimal=3)
    assert_almost_equal(clf1.intercept_, spintercept1, decimal=1)
Example #4
def ridgeReg(alpha):
    n_samples, n_features = 10, 5
    y = np.random.randn(n_samples)
    X = np.random.randn(n_samples, n_features)
    clf = Ridge(alpha=alpha)  # use the alpha argument rather than a hard-coded value
    res = clf.fit(X, y)
    return res
Example #5
    def fit(self, X, Y, weights=None, context_transform=True):
        """ Trains policy by weighted maximum likelihood.

        .. note:: This call changes this policy (self)

        Parameters
        ----------
        X: array-like, shape (n_samples, context_dims)
            Context vectors

        Y: array-like, shape (n_samples, weight_dims)
            Low-level policy parameter vectors

        weights: array-like, shape (n_samples,)
            Weights of individual samples (should depend on the obtained
            reward)
        """
        # Kernel approximation
        self.nystroem = Nystroem(
            kernel=self.kernel,
            gamma=self.gamma,
            coef0=self.coef0,
            n_components=np.minimum(X.shape[0], self.n_components),
            random_state=self.random_state,
        )
        self.X = self.nystroem.fit_transform(X)
        if self.bias:
            self.X = np.hstack((self.X, np.ones((self.X.shape[0], 1))))
        if self.normalize:
            self.X /= np.abs(self.X).sum(1)[:, None]

        # Standard ridge regression
        ridge = Ridge(alpha=self.alpha, fit_intercept=False)
        ridge.fit(self.X, Y, sample_weight=weights)
        self.W = ridge.coef_
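The prediction path of this policy class is not shown in the snippet; a minimal sketch of evaluating the learned linear policy mean, assuming the same `nystroem`, `bias`, and `normalize` settings as in `fit` (the method name `mean_policy` is a hypothetical addition):

    def mean_policy(self, x):
        """Hypothetical helper: mean low-level parameters for a single context x."""
        f = self.nystroem.transform(x.reshape(1, -1))
        if self.bias:
            f = np.hstack((f, np.ones((1, 1))))
        if self.normalize:
            f /= np.abs(f).sum(1)[:, None]
        return f.dot(self.W.T).ravel()  # W was set to ridge.coef_ in fit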
Example #6
class OrderScorer(Scorer):

    def __init__(self):
        self.classifier = Ridge(alpha=0.1)
        self.cache_filename = 'subgraph_order_scorer_reg.pickle'

    def train(self, train_instances, train_labels, update_cache=True,
              sample_weight=None):
        """
        Trains a scorer to score the quality of an ordering of sentences
        Loads from cache if available
        """
        self.classifier.fit(train_instances, train_labels, sample_weight=sample_weight)
        if update_cache:
            pickle.dump(self.classifier, open(self.cache_filename, 'wb'))

    def test(self, test_instances, test_labels):
        """ Uses test set to evaluate the performance of the scorer and print it out """
        scores = self.classifier.predict(test_instances)
        # TODO: print report

    def load(self):
        if os.path.exists(self.cache_filename):
            self.classifier = pickle.load(open(self.cache_filename, 'rb'))
        else:
            raise Exception("No classifier exists! Must call train with update_cache=True") 

    def evaluate(self, test_instance):
        """ Applies the scoring function to a given test instance """
        return self.classifier.predict([test_instance])[0]
Example #7
def test_regressor_matching():
    n_samples = 10
    n_features = 5

    rng = np.random.RandomState(10)
    X = rng.normal(size=(n_samples, n_features))
    true_w = rng.normal(size=n_features)
    y = X.dot(true_w)

    alpha = 1.
    n_iter = 100
    fit_intercept = True

    step_size = get_step_size(X, alpha, fit_intercept, classification=False)
    clf = Ridge(fit_intercept=fit_intercept, tol=1e-11, solver='sag',
                alpha=alpha * n_samples, max_iter=n_iter)
    clf.fit(X, y)

    weights1, intercept1 = sag_sparse(X, y, step_size, alpha, n_iter=n_iter,
                                      dloss=squared_dloss,
                                      fit_intercept=fit_intercept)
    weights2, intercept2 = sag(X, y, step_size, alpha, n_iter=n_iter,
                               dloss=squared_dloss,
                               fit_intercept=fit_intercept)

    assert_array_almost_equal(weights1, clf.coef_, decimal=10)
    assert_array_almost_equal(intercept1, clf.intercept_, decimal=10)
    assert_array_almost_equal(weights2, clf.coef_, decimal=10)
    assert_array_almost_equal(intercept2, clf.intercept_, decimal=10)
def training(X,Y,X_test, pca='kpca', regressor='ridge', dim=50):
    # X and Y are numpy arrays
    print 'Input data and label shape: ', X.shape, Y.shape

    if pca == 'nopca': return simpleTraining(X, Y, X_test, regressor)

    model, P = getProjectionMatrixPCA(Y, dim) if pca=='pca' else getProjectionMatrixKPCA(dim)
    Y_train = np.dot(Y, P) if pca=='kpca' else np.dot(Y,P.transpose())


    regressors = []
    for i in range(dim):
        print 'at regressor number: ', i
        reg = Ridge() if regressor=='ridge' else SVR()
        y = [x[i] for x in Y_train]
        reg.fit(X, y)
        regressors.append(reg)

    Z_pred = []
    for reg in regressors:
        Z_pred.append(reg.predict(X_test))
    print 'prediction shapes:' , len(Z_pred), len(Z_pred[0])
    Z_pred = np.array(Z_pred)
    Y_pred = np.dot(P, Z_pred).transpose() if pca=='kpca' else np.dot(Z_pred.transpose(), P)
    return model, regressors, Y_pred
Example #9
class LogisticRegressionSeparator(BaseEstimator):

    def get_params(self, deep=True):
        return {}

    def fit(self, X, y):
        # let's predict which users will spend anything later
        classes = y - X[:, 0]
        classes = np.where(classes > 0.1, 1, 0)

        self.classifier = LogisticRegression(
                class_weight='balanced')

        self.classifier.fit(X, classes)
        results = self.classifier.predict(X)
        results = results == 1

        self.estimator = Ridge(alpha=0.05)
        self.estimator.fit(X[results], y[results])

    def predict(self, X):
        y = X[:, 0].copy()  # copy so assigning predictions below doesn't mutate X
        labels = (self.classifier.predict(X) == 1)
        y[labels] = self.estimator.predict(X[labels])
        return y
Example #10
def train_single_model(train_data, train_labels, algo):
	"""
	Train the model for a single label dimension
	"""
	if algo == 'svr_rbf':
		"""
		SVM regression, RBF kernel
		"""
		svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
		svr_rbf.fit(train_data, train_labels)
		return svr_rbf

	if algo == 'svr_lin':
		"""
		SVM regression, linear
		"""
		svr_lin = SVR(kernel='linear')
		svr_lin.fit(train_data, train_labels)
		return svr_lin

	if algo == 'ridge':
		"""
		Ridge regression
		"""
		clf = Ridge(alpha = 0.5)
		clf.fit(train_data, train_labels)
		return clf

	# No matching algorithm
	print "unimplemented model type"
	return None
Example #11
 def regression_weight(self, matched_data):
     converted_data = {}
     for i, data in enumerate(matched_data):
         if i==0:
             for key in data.keys():
                 try:
                     value = float(data[key])
                     converted_data[key] = [value]
                 except ValueError:
                     pass
         else:
             for key in data.keys():
                 if key in converted_data:
                     converted_data[key].append(float(data[key]))
     sorted_key = sorted(converted_data.keys())
     input_key = [key for key in sorted_key if key != self.main_key.lower()]
     x = [] 
     for key in input_key:
         # normalization
         numpy_data = normalization(np.array(converted_data[key]))      
         x.append(numpy_data)
     x = np.array(x).T
     y = normalization(np.array(converted_data[self.main_key.lower()]))
     regressor = Ridge(alpha=1.0, normalize=True)
     regressor.fit(x,y)
     sorted_result = np.array(input_key)[np.argsort(np.array(regressor.coef_))]
     sorted_result = sorted_result[::-1]
     coefficient = sorted(regressor.coef_, reverse = True)
     return [(sorted_result[i], coefficient[i]) for i in range(len(sorted_result))]
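The `normalization` helper used above is not included in this example; a minimal sketch, assuming simple min-max scaling (the actual implementation may differ):

def normalization(values):
    """Hypothetical min-max scaler: maps a 1-D numpy array into [0, 1]."""
    lo, hi = values.min(), values.max()
    if hi == lo:
        return np.zeros_like(values, dtype=float)  # constant column: all zeros
    return (values - lo) / (hi - lo)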
Example #12
def regression_NumMosquitos(Xtr, ytr, Xte):
    from sklearn.linear_model import Ridge, RidgeCV
    #model_nm = RidgeCV(alphas=range(200, 401, 10), cv=5)
    model_nm = Ridge(alpha = 340)
    model_nm = model_nm.fit(Xtr, ytr)
    results_nm = model_nm.predict(Xte)
    return results_nm
Example #13
def ridgeRegression(X,y):

    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Ridge Regression")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)

    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    ridgeRegression = Ridge(alpha=1e-11,solver="cholesky")
    ridgeRegression.fit(scaled_Xp,y)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0,2,0.01)
    dummyX = dummyX.reshape((dummyX.shape[0],1))
    dummyXp = polynomialFeatures.transform(dummyX)  # reuse the already-fitted transformer
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = ridgeRegression.predict(scaled_dummyXp)

    outputFILE = 'plot-ridgeRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h = 6.0, w = 10.0)
    ax.axis([0,2,0,15])
    ax.scatter(X,y,color="black",s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
def forecast_future_attention(train_index, test_index, alpha):
    """Forecast future attention via train dataset index and test dataset index."""
    m, n = len(train_index), len(test_index)
    x_train_predict = attention_data[train_index, :num_train]
    x_test_predict = attention_data[test_index, :num_train]
    for i in xrange(num_train, age):
        if with_share == 1:
            x_train = np.hstack((x_train_predict, share_data[train_index, :i + 1]))
            x_test = np.hstack((x_test_predict, share_data[test_index, :i + 1]))
            norm = np.hstack((x_train[:, :i], attention_data[train_index, i].reshape(m, 1), share_data[train_index, :i + 1]))
        else:
            x_train = x_train_predict
            x_test = x_test_predict
            norm = np.hstack((x_train[:, :i], attention_data[train_index, i].reshape(m, 1)))
        x_train_norm = x_train / np.sum(norm, axis=1)[:, None]
        y_train = np.ones(m, )

        # == == == == == == == == Training with Ridge Regression == == == == == == == == #
        predictor = Ridge(fit_intercept=False, alpha=alpha)
        predictor.fit(x_train_norm, y_train)

        # == == == == == == == == Iteratively add forecasted value to x matrix == == == == == == == == #
        predict_train_value = (predictor.predict(x_train) - np.sum(x_train, axis=1)).reshape(m, 1)
        predict_train_value[predict_train_value < 0] = 0
        x_train_predict = np.hstack((x_train_predict, predict_train_value))
        predict_test_value = (predictor.predict(x_test) - np.sum(x_test, axis=1)).reshape(n, 1)
        predict_test_value[predict_test_value < 0] = 0
        x_test_predict = np.hstack((x_test_predict, predict_test_value))
    return x_test_predict[:, num_train: age]
Example #15
def ridge_regression(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
   :param train_x: train
   :param train_y: text
   :param pred_x: test set to predict
   :param review_id: takes in a review id
   :param v_curve: run the model for validation curve
   :param l_curve: run the model for learning curve
   :param get_model: run the model
   :return:the predicted values,learning curve, validation curve
   """
    lin = Ridge(alpha=0.5)
    if get_model:
        print "Fitting Ridge..."
        lin.fit(train_x, np.log(train_y+1))
        gbr_pred = np.exp(lin.predict(pred_x))- 1
        for i in range(len(gbr_pred)):
            if gbr_pred[i] < 0:
                gbr_pred[i] = 0
        Votes = gbr_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        submission_lin= np.concatenate((Id,Votes),axis=1)
        np.savetxt("submission_ridge.csv", submission_lin,header="Id,Votes", delimiter=',',fmt="%s, %0.2f", comments='')
    if v_curve:
        print "Working on Validation Curves"
        plot_validation_curve(Ridge(), "Validation Curve for Ridge Regression", train_x, np.log(train_y+1.0),
                              param_name="alpha", param_range=[0.1,0.2,0.5,1,10])
    if l_curve:
        print "Working on Learning Curves"
        plot_learning_curve(Ridge(), "Learning Curve for Linear Regression", train_x, np.log(train_y+1.0))
Example #16
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m = 0  # m = 0 disables the xgboost averaging loop below

        for j in range(m):
            clf=xgb_classifier(eta=0.05,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=500,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred*(1.0/(j+1))

            print j,llfun(y_test_cv,yqq)

        #y_pred/=m;
        clf = Ridge()  # alternative: RandomForestClassifier(n_jobs=-1, n_estimators=100, max_depth=100)
        clf.fit(X_train_cv,(y_train_cv))
        y_pred=clf.predict(X_test_cv)
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print xx[-1]#,y_pred.shape
        break

    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred,yreal,idx#np.mean(xx)
Example #17
def knn_twice(k):
	knn1 = neighbors.KNeighborsRegressor(n_neighbors=k)
	knn1.fit(trainf,trainlab)
	print 'here'
	tim = time.time()

	n = len(train) // 1000
	pred1 = []
	for i in range(0,n):
		pred1.extend(knn1.predict(trainf[(i*1000):((i+1)*(1000))]))
		print(i)
	pred1.extend(knn1.predict(trainf[67000:67946]))
	print "time: " + str(time.time() - tim)
	#knn = neighbors.KNeighborsRegressor(n_neighbors=k)
	#knn.fit(pred1,trainlab)
	ridge = Ridge(alpha=1.0)
	ridge.fit(pred1, trainlab)

	n = 10
	pred2 = []
	for i in range(0,n):
		pred2.extend(knn1.predict(testf[(i*1000):((i+1)*(1000))].toarray()))
		print(i)	

	n = 10
	pred = []
	for i in range(0,n):
		pred.extend(ridge.predict(pred2[(i*1000):((i+1)*(1000))]))
		print(i)	

	#RMSE:
	testlab = np.array(test.ix[:,4:])
	err = format(np.sqrt(np.sum(np.array(np.array(pred-testlab)**2)/ (testf.shape[0]*24.0))))
	return err
def cross_valid(X,Y,n_fold):
	clf = Ridge(alpha=1.0)
	total_mean_square = 0
	total_coef = 0
	Y_np = np.array(Y)
	n_samples, n_features = len(X), len(X[0])
	kf_Y = cross_validation.KFold(n_samples, n_fold)
	index = []
	preds = []
	truths = []
	for train_index, test_index in kf_Y:
		X_train, X_test = X[train_index], X[test_index]
		y_train, y_test = Y_np[train_index], Y_np[test_index]
		

		clf.fit(X_train,y_train)
		y_pred = clf.predict(X_test)
		index += test_index.tolist()
		preds += map(lambda x: 1 if x > 0.5 else 0 ,y_pred.tolist())
		truths += y_test.tolist()
		#print "predict:",map(lambda x: 1 if x > 0.5 else 0,y_pred)
		#print "original:",y_test

		total_mean_square += mean_squared_error(y_test,y_pred) 
		total_coef += clf.coef_
	
		#print 'Coefficient of the prediction (pearsonr): ' , pearsonr(y_pred,y_test) 
	print 'All Coefficient of the prediction (pearsonr): ' , pearsonr(truths,preds) 
	print 'Average mean squared error is: ' , total_mean_square / n_fold

	diff_count = sum([abs(truth - pred) for truth, pred in zip(truths, preds)])
	acc = 100 - 1. * diff_count / len(truths) * 100
	print 'prediction accuracy is %f'%(acc)
	return [total_coef, index , preds]
Example #19
    def __init__(self, penalty='l1', dual=None, C=None, alpha=None):
        # dual and C are accepted for interface compatibility but unused here
        self.l1 = (penalty == "l1")
        if self.l1:
            Lasso.__init__(self, alpha=alpha)
        else:
            Ridge.__init__(self, alpha=alpha)
Example #20
def impute_age():
    X, P = gfa.platform_expression("GPL96")
    model = impute.KNNImputer()
    Xi = model.fit_transform(X, axis=1)

    age = array(P["age"].tolist())
    Xm = Xi.as_matrix()
    ix = array((age >= 10) & (age <= 120)).nonzero()[0]
    np.random.shuffle(ix)
    Xm = Xm[ix, :]
    age = age[ix]

    n_train = 2000
    n_test = 500
    # clf = SVR(C=1e-5, epsilon=1)
    # clf = LinearRegression()
    clf = Ridge()
    # clf = SimpleRegressor()
    # clf = Lasso()
    clf.fit(Xm[:n_train, :], age[:n_train])
    y = age[n_train : (n_train + n_test)]
    y_hat = clf.predict(Xm[n_train : (n_train + n_test)])
    dy = y - y_hat

    bias_tr = y_hat.mean() - age.mean()
    print("\nBias (vs train):\t\t", bias_tr)
    print("Bias (vs test):\t\t\t", dy.mean())
    print("Mean error:\t\t\t", fabs(dy).mean())
    print("Mean error (bias corrected):\t", fabs(dy - bias_tr).mean())
    print("MSE:\t\t\t\t", np.power(dy, 2).mean())
Example #21
 def RidgeRegression(self,filename,outputFile):
     pheno,geno = self.inputParse(filename)
     for row in geno:
         if len(row) % 2 != 0:
             return "Rows are not even."
     maxGeno = max(geno)
     allGeno = list(set(maxGeno))
     encoder = [i for i in range(len(allGeno))]
     length = len(geno)
     lenInnerGeno = len(geno[0])
     genoMake = [0 for x in range(len(allGeno))]
     dictionary = dict(zip(allGeno,encoder))
     for i in range(length):
         for x in range(lenInnerGeno):
             geno[i][x] = dictionary[geno[i][x]]
     phenoNaN = []
     for i in range(len(pheno)):
         if pheno[i] == 'NaN':
             phenoNaN.append(i)
     phenoNaN.reverse()
     for i in phenoNaN:
         del pheno[i]
     genoMiss = []
     for i in range(len(geno)):
         if i not in phenoNaN:
             genoMiss.append(geno[i])
     pheno = [float(i) for i in pheno]    
     alpha = self.alphaOptimization(genoMiss,pheno)
     clf = Ridge(alpha = alpha)
     clf.fit(genoMiss,pheno)
     predicted = clf.predict(geno)
     predicted = np.transpose(predicted)
     np.savetxt(outputFile,np.transpose(predicted))
Example #22
def traverse_movies_ridge():
	LBMAP = getLBMap()
	DMAP = createEmpty()

	P_ERRORS, ERRORS = [], []

	training_data, training_response = [], []

	for i in range(len(data)):

		movie = data[i]
		m_rev = movie['revenue']

		myvector = vectorizeMovie(movie, LBMAP, DMAP)

		if i > 100:
			model = Ridge(alpha = .5)
			model.fit(training_data, training_response)
			raw = math.fabs(model.predict([myvector])[0] - m_rev)
			ERRORS.append(raw)
			#P_ERRORS.append(round(raw/m_rev, 4))
		
		training_data.append(myvector)
		training_response.append(m_rev)

		DMAP = update(movie, DMAP)

	#print 'all', avg_float_list(P_ERRORS)
	print 'all', avg_float_list(ERRORS)
Example #23
class RidgeRegressionModel(LinearLeastSquaresModel):
    def __init__(self, input_columns, output_columns, debug=False):

        self.alpha = 0.0000000001
        self.m = Ridge(alpha=self.alpha)

        super(RidgeRegressionModel, self).__init__(input_columns, output_columns, debug=debug)

    def fit(self, data):
        A = numpy.vstack([data[:,i] for i in self.input_columns]).T
        B = numpy.vstack([data[:,i] for i in self.output_columns]).T

        self.m.fit(A, B)

        return self.m.coef_   #m.intercept_

    def get_error(self, data, model):
        A = numpy.vstack([data[:,i] for i in self.input_columns]).T
        B = numpy.vstack([data[:,i] for i in self.output_columns]).T
        B_fit = numpy.dot(A, model)
        err_per_point = numpy.sum((B-B_fit)**2, axis=1) # sum squared error per row

        norm = numpy.sqrt(model*model)
        assert norm.shape == (1,1)
        regularizer = 1.0*norm[0,0]

        return err_per_point - regularizer
Example #24
def add_weekly_overall_trends(df_shop, regressor, trend_name, coeff_name, target='pays_count'):
    biweek_max = df_shop.biweek_id.max()

    df_shop[trend_name] = np.nan
    df_shop[coeff_name] = np.nan

    for m in range(biweek_max - 1, 0, -1):
        train_idx = df_shop.biweek_id >= m
        test_idx = df_shop.biweek_id == (m - 1)

        df_train = df_shop[train_idx]

        y = df_train[target]
        not_null = ~y.isnull()
        if not_null.sum() < 7:
            continue

        x = -df_train[regressor]
        x_not_null = x[not_null].values.reshape(-1, 1)
        y = y[not_null].values
        lr = Ridge(alpha=1).fit(x_not_null, y)

        if m == biweek_max - 1:
            x = x.values.reshape(-1, 1)
            df_shop.loc[train_idx, trend_name] = lr.predict(x)
            df_shop.loc[train_idx, coeff_name] = lr.coef_[0]

        df_test = df_shop[test_idx]
        x = -df_test[regressor].values.reshape(-1, 1)

        df_shop.loc[test_idx, trend_name] = lr.predict(x)
        df_shop.loc[test_idx, coeff_name] = lr.coef_[0]
Example #25
def _check_ridge_model(featureses, labels):
    """Plot ridge regression predictions"""
    for tfidf_count in FEATURES_SIZES:
        test_points = []
        for i in range(16):
            tmp = [i, 100]
            tmptmp = [0] * tfidf_count
            if tmptmp:
                tmp.extend(tmptmp)
            test_points.append(tmp)
        test_points = np.array(test_points)
        limit = tfidf_count + 2
        model = Ridge()
        model.fit(featureses[:, :limit], labels)
        predictions = model.predict(test_points)
        plt.plot(
            predictions,
            label=str(tfidf_count),
            linestyle=next(LINECYCLER),
            linewidth=3)
        # plt.text(test_points[-1, 0], predictions[-1], str(tfidf_count))
    plt.legend()
    plt.xlabel('Document order')
    plt.ylabel('Time (seconds)')
    plt.savefig('ridge_predictions.pdf')
Example #26
def add_overall_trend_feature(df_shop, target='pays_count'):
    biweek_max = df_shop.biweek_id.max()

    trend_name = 'trend_overall'
    coeff_name = 'trend_overall_coeff'
    df_shop[trend_name] = np.nan
    df_shop[coeff_name] = np.nan

    for m in range(biweek_max - 1, 0, -1):
        train_idx = df_shop.biweek_id >= m
        test_idx = df_shop.biweek_id == (m - 1)

        df_train = df_shop[train_idx]

        y = df_train[target]
        not_null = ~y.isnull()
        if not_null.sum() <= 7:
            continue

        x = df_train.days_from_beginning
        x_not_null = x[not_null].values.reshape(-1, 1)
        y = y[not_null].values
        lr = Ridge(alpha=1).fit(x_not_null, y)

        if m == biweek_max - 1:
            x = x.values.reshape(-1, 1)
            df_shop.loc[train_idx, trend_name] = lr.predict(x)
            df_shop.loc[train_idx, coeff_name] = lr.coef_[0]

        df_test = df_shop[test_idx]
        x = df_test.days_from_beginning.values.reshape(-1, 1)

        df_shop.loc[test_idx, trend_name] = lr.predict(x)
        df_shop.loc[test_idx, coeff_name] = lr.coef_[0]
Example #27
def add_window_trend_overall_features(df_shop, target='pays_count'):
    biweek_max = df_shop.biweek_id.max()

    for biweeks_past in [2, 3, 4, 5, 6, 12, 18]:
        trend_name = 'trend_%d' % biweeks_past
        trend_coef_name = 'trend_coef_%d' % biweeks_past
        df_shop[trend_name] = np.nan
        df_shop[trend_coef_name] = np.nan

        for m in range(biweek_max, biweeks_past, -1):
            m_past = m - biweeks_past
            train_idx = (df_shop.biweek_id >= m_past) & (df_shop.biweek_id <= m)
            test_idx = df_shop.biweek_id == (m_past - 1)

            df_rolling_train = df_shop[train_idx]
            df_rolling_test = df_shop[test_idx]

            y = df_rolling_train[target]
            not_null = ~y.isnull()
            if not_null.sum() <= 7:
                continue
        
            x = df_rolling_train.days_from_beginning
            x_not_null = x[not_null].values.reshape(-1, 1)
            y = y[not_null].values
            lr = Ridge(alpha=1).fit(x_not_null, y)

            if m == biweek_max:
                x = x.values.reshape(-1, 1)
                df_shop.loc[train_idx, trend_name] = lr.predict(x)
                df_shop.loc[train_idx, trend_coef_name] = lr.coef_[0]

            x_val = df_rolling_test.days_from_beginning.values.reshape(-1, 1)
            df_shop.loc[test_idx, trend_name] = lr.predict(x_val)
            df_shop.loc[test_idx, trend_coef_name] = lr.coef_[0]
Example #28
def ridge_regressor(df):
    """
    INPUT: Pandas dataframe
    OUTPUT: R^2 and Mean Absolute Error performance metrics, feature coefficients
    """
    y = df.pop("price").values
    X = df.values
    feature_names = df.columns
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)

    clf = Ridge(alpha=1.0)
    clf.fit(xtrain, ytrain)

    score = clf.score(xtest, ytest)
    feat_imps = clf.coef_
    ypredict = clf.predict(xtest)
    mae = np.mean(np.absolute(ytest - ypredict))
    mae_percent = np.mean(np.absolute(ytest - ypredict) / ytest)
    return (
        "R^2 is ",
        score,
        "RMSE is ",
        rmse,
        "MAE percent is ",
        mae_percent,
        "Feature coefficients are ",
        zip(feature_names, feat_imps),
    )
Example #29
def ridge_regression(data,target,alphas):
    plt.figure()
    mean_rmses=[]
    kf=KFold(len(target),10,True,None)
    for alpha0 in alphas:
        rmses=[]
        clf=Ridge(alpha=alpha0,normalize=True,solver='svd')
        for train_index, test_index in kf:
            data_train,data_test=data[train_index],data[test_index]
            target_train,target_test=target[train_index],target[test_index]
            clf.fit(data_train,target_train)
            rmse=sqrt(np.mean((clf.predict(data_test)-target_test)**2))
            rmses.append(rmse)
            
        mean_rmses.append(np.mean(rmses))
        x0=np.arange(1,11)
        plt.plot(x0,rmses,label='alpha='+str(alpha0),marker='o')
        
    lr = linear_model.LinearRegression(normalize = True)
    rmses = []
    for train_index, test_index in kf:
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = target[train_index], target[test_index]
        lr.fit(data_train, target_train)
        rmse = sqrt(np.mean((lr.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
    mean_rmses.append(np.mean(rmses))
    x0=np.arange(1,11)
    plt.plot(x0,rmses,label='linear',marker='*')
    
    plt.title("RMSE comparison between different alpha values of Ridge regularization")
    plt.legend()
    plt.show()
#    print(mean_rmses)
    return mean_rmses
Example #30
def fit_strf_ridge(input, output, lags, alpha=1.0, verbose=False):

    #convert the input into a toeplitz-like matrix
    if verbose:
        nt,nf = input.shape
        nelems = nt*nf*len(lags)
        mem = (nelems*8.) / 1024.**2
        print '[fit_strf_ridge] estimated size of toeplitz matrix: %d MB' % mem
    stime = time.time()
    A = make_toeplitz(input, lags, include_bias=False)
    etime = time.time() - stime
    if verbose:
        print '[fit_strf_ridge] Time to make Toeplitz matrix: %d seconds' % etime

    #fit the STRF
    stime = time.time()

    #rr = Ridge(alpha=alpha, copy_X=False, fit_intercept=True)
    rr = Ridge(alpha=alpha, fit_intercept=True)
    rr.fit(A, output)
    etime = time.time() - stime
    if verbose:
        print '[fit_strf_ridge] Time to fit STRF: %d seconds' % etime

    #reshape the STRF so that it makes sense
    nt = input.shape[0]
    nf = input.shape[1]
    d = len(lags)
    strf = np.array(rr.coef_).reshape([nf, d])
    bias = rr.intercept_

    return strf,bias
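The `make_toeplitz` helper is external to this snippet; a minimal sketch of a lagged design matrix, with columns ordered so that `rr.coef_.reshape([nf, d])` groups all lags of one feature together (this column ordering is an assumption):

def make_toeplitz(stim, lags, include_bias=False):
    """Hypothetical lagged design: column f*len(lags)+j holds feature f shifted by lags[j]."""
    nt, nf = stim.shape
    d = len(lags)
    A = np.zeros((nt, nf * d))
    for f in range(nf):
        for j, lag in enumerate(lags):
            if lag >= 0:
                A[lag:, f * d + j] = stim[:nt - lag, f]   # response at t sees stim at t - lag
            else:
                A[:lag, f * d + j] = stim[-lag:, f]       # negative lags look forward in time
    if include_bias:
        A = np.hstack((A, np.ones((nt, 1))))
    return A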
Example #31
cat_cols = all_columns[kinds == 'O']

num_pipe = Pipeline([('si', SimpleImputer(strategy='mean')),
                     ('ss', StandardScaler())])
ct = ColumnTransformer([('num', num_pipe, num_cols)])

X_num_tf = ct.fit_transform(train)

#------------------------------------------------------------------------------
ct = ColumnTransformer([('cat', cat_pipe, cat_cols),
                        ('num', num_pipe, num_cols)])

X = ct.fit_transform(train)

#------------------------------------------------------------------------------
ml_pipe = Pipeline([('transform', ct), ('ridge', Ridge())])

ml_pipe.fit(train, y)
ml_pipe.score(train, y)

#-------------------------------------------------------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=123)
cross_val_score(ml_pipe, train, y, cv=kf).mean()

#-------------------------------------------------------------------------------
param_grid = {
    'transform__num__si__strategy': ['mean', 'median'],
    'ridge__alpha': [.001, 0.1, 1.0, 5, 10, 50, 100, 1000]
}

gs = GridSearchCV(ml_pipe, param_grid, cv=kf, return_train_score=True)
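The grid search object is built but never run in this snippet; a minimal continuation, assuming the same `train` and `y` used above:

gs.fit(train, y)          # runs 5-fold CV over the imputation strategy and alpha grid
print(gs.best_params_)    # e.g. which imputer strategy and alpha won
print(gs.best_score_)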
Example #32
from sklearn.impute import SimpleImputer

numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0)
            ), ('scaler', StandardScaler())])

categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='NA')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

ct = ColumnTransformer(
    transformers=[('numeric', numeric_transformer,
                   continuous), ('cats', categorical_transformer, cats)])

lr_pipe = make_pipeline(ct, LinearRegression())
ridge_pipe = make_pipeline(ct, Ridge())
lasso_pipe = make_pipeline(ct, Lasso())
elastic_pipe = make_pipeline(ct, ElasticNet())

### Fitting

# Only using subset of categorical variables

X_train[cats].dtypes

X_train = X_train.loc[:, continuous + cats]

X_train.shape
X_test.shape

X_test = X_test.loc[:, continuous + cats]
Example #33
svr.fit(X_train_scaled, y_train)
svr_preds = svr.predict(X_val_scaled)
print('average error: $', round(np.sqrt(mean_squared_error(y_val, svr_preds)), 2), 'RMSE')
print('average percent error:', round(MAPE(y_val, svr_preds), 2), '%')

##Bayesian Model:
from sklearn.linear_model import BayesianRidge
br_reg = BayesianRidge()
br_reg.fit(X_train_scaled, y_train)
br_preds = br_reg.predict(X_val_scaled)
print('average error: $', round(np.sqrt(mean_squared_error(y_val, br_preds)), 2), 'RMSE')
print('average percent error:', round(MAPE(y_val, br_preds), 2), '%')

##Ridge Regression:
from sklearn.linear_model import Ridge
rid_reg = Ridge()
rid_reg.fit(X_train_scaled, y_train)
rid_preds = rid_reg.predict(X_val_scaled)
print('average error: $', round(np.sqrt(mean_squared_error(y_val, rid_preds)), 2), 'RMSE')
print('average percent error:', round(MAPE(y_val, rid_preds), 2), '%')

##Elastic Net:
from sklearn.linear_model import ElasticNet
en_reg = ElasticNet()
en_reg.fit(X_train_scaled, y_train)
en_preds = en_reg.predict(X_val_scaled)
print('average error: $', round(np.sqrt(mean_squared_error(y_val, en_preds)), 2), 'RMSE')
print('average percent error:', round(MAPE(y_val, en_preds), 2), '%')

#Plot the predictions:
import matplotlib.pyplot as plt
Example #34
os.makedirs('./outputs', exist_ok=True)

boston_data = datasets.load_boston()

run = Run.get_context()
client = ExplanationClient.from_run(run)

X_train, X_test, y_train, y_test = train_test_split(boston_data.data,
                                                    boston_data.target,
                                                    test_size=0.2,
                                                    random_state=0)

alpha = 0.5
# Use Ridge algorithm to create a regression model
reg = Ridge(alpha)
model = reg.fit(X_train, y_train)

preds = reg.predict(X_test)
run.log('alpha', alpha)

model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)
# save model in the outputs folder so it automatically gets uploaded
with open(model_file_name, 'wb') as file:
    joblib.dump(value=reg,
                filename=os.path.join('./outputs/', model_file_name))

# Explain predictions on your local machine
tabular_explainer = TabularExplainer(model,
                                     X_train,
                                     features=boston_data.feature_names)
Example #35
# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from ridge_display_plot import display_plot

# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

# Create a ridge regressor: ridge
ridge = Ridge(normalize=True)

# Compute scores over range of alphas
for alpha in alpha_space:
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha

    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)

    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))

    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))

# Display the plot
display_plot(ridge_scores, ridge_scores_std)
Example #36
train = sample_feature[continuous_feature_names + ['price']]

train_X = train[continuous_feature_names]
train_y = train['price']

# Apply log(x + 1) to the target to bring its distribution closer to normal
train_y_ln = np.log(train_y + 1)

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

LinearRegressionModel = LinearRegression(normalize=True)
LinearRegressionModel = LinearRegressionModel.fit(train_X, train_y_ln)

RidgeModel = Ridge(normalize=True)
RidgeModel = RidgeModel.fit(train_X, train_y_ln)

LassoModel = Lasso().fit(train_X, train_y_ln)

# Non-linear models

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

DecisionTreeModel = DecisionTreeRegressor().fit(train_X, train_y_ln)
RandomForestModel = RandomForestRegressor().fit(train_X, train_y_ln)
GradientBoostingModel = GradientBoostingRegressor().fit(train_X, train_y_ln)
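Because the models above were fit on log(price + 1), their predictions come back on the log scale; a minimal sketch of mapping them back (the expm1 step is an assumption, not shown in the original snippet):

pred_price = np.expm1(RidgeModel.predict(train_X))  # inverse of log(x + 1)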

f = open('./models/LinearRegressionModel.pkl', 'xb')
Example #37
def train_model(data, ridge_args):
    reg_model = Ridge(**ridge_args)
    reg_model.fit(data["train"]["X"], data["train"]["y"])
    return reg_model
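A minimal usage sketch, assuming a data dict shaped the way `train_model` expects (the synthetic arrays are illustrative):

import numpy as np

X = np.random.randn(100, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * np.random.randn(100)
data = {"train": {"X": X, "y": y}}
model = train_model(data, {"alpha": 0.5})
print(model.coef_)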
Example #38
plt.title("Relationship between RM and Price")
plt.savefig('./RMxPRICE.png', dpi=400)

plt.scatter(bos.PTRATIO, bos.PRICE)
plt.xlabel("Pupil-to-Teacher Ratio (PTRATIO)")
plt.ylabel("Housing Price")
plt.title("Relationship between PTRATIO and Price")
plt.savefig('./PTRATIOxPRICE.png', dpi=400)

# We drop the price from the original dataset as it is the target
X = data.drop('PRICE', axis = 1)

# Train model 

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

model = Ridge()

parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}

ridge_regressor = GridSearchCV(model, parameters, scoring = 'neg_mean_squared_error', cv = 5)

ridge_regressor.fit(X, data.PRICE)


# Checking the error (in this case, Mean Squared Error) of the best model found

MSE = mean_squared_error(data.PRICE, ridge_regressor.predict(X))
print(MSE)
def get_stocks(st):

    # Find one record of data from the mongo database
    # @TODO: YOUR CODE HERE!

    session = Session(engine)
    stocks = session.execute("select * from stocks ")
    #return render_template("index.html", listings=listings)
    # Return template and data

    resdata = [{}]

    responsedata = {'respdata': resdata}
    session.close()

    print('Hello this is test')
    df = pd.read_csv("static/data/" + st + ".csv")
    # Drop the null columns where all values are null
    df = df.dropna(axis='columns', how='all')
    # Drop the null rows
    # This is for the MinMax Linear Regression model
    print(df.head())
    df = df.dropna()
    print(df.head())
    y = df["Open"].values.reshape(-1, 1)
    diff = df['Close'] - df["Open"]
    diff_locations = []
    for i in diff:
        if (i < 0):
            diff_locations.append(0)
        else:
            diff_locations.append(1)
    df['diff'] = pd.DataFrame(diff_locations)
    #X = df[['High', 'Low', 'Close', 'Volume','diff']]
    X = df[['High', 'Low', 'Close', 'Volume', 'diff']]
    print(X)
    print(y)
    print(X.shape, y.shape)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    X_minmax = MinMaxScaler().fit(X_train)
    y_minmax = MinMaxScaler().fit(y_train)

    X_train_minmax = X_minmax.transform(X_train)
    X_test_minmax = X_minmax.transform(X_test)
    y_train_minmax = y_minmax.transform(y_train)
    y_test_minmax = y_minmax.transform(y_test)
    model2 = LinearRegression()
    model2.fit(X_train_minmax, y_train_minmax)
    print(f"Testing Data Score: {model2.score(X_test_minmax, y_test_minmax)}")
    minmax_predict = model2.score(X_test_minmax, y_test_minmax)
    print(minmax_predict)

    #This is standard scalar transformation
    X_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    y_train_scaled = y_scaler.transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)
    model = LinearRegression()
    model.fit(X_train_scaled, y_train_scaled)
    predictions = model.predict(X_test_scaled)
    scallar_MSE = mean_squared_error(y_test_scaled, predictions)
    scallar_r2 = model.score(X_test_scaled, y_test_scaled)
    plt.scatter(model.predict(X_train_scaled),
                model.predict(X_train_scaled) - y_train_scaled,
                c="blue",
                label="Training Data")
    plt.scatter(model.predict(X_test_scaled),
                model.predict(X_test_scaled) - y_test_scaled,
                c="orange",
                label="Testing Data")
    #plt.legend()
    plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
    plt.title("Residual Plot")
    #plt.show()
    pwd = os.getcwd()
    print(pwd)
    #p = Path(os.getcwd()+"\static\images")
    plt.savefig("static/images/" + st + ".png")
    f = open("static/images/" + st + ".png")
    plt.close()
    f.close()

    #Lasso model
    ### BEGIN SOLUTION
    lasso = Lasso(alpha=.01).fit(X_train_scaled, y_train_scaled)

    lasso_predictions = lasso.predict(X_test_scaled)

    lasso_MSE = mean_squared_error(y_test_scaled, lasso_predictions)
    lasso_r2 = lasso.score(X_test_scaled, y_test_scaled)
    ### END SOLUTION

    print(f"Lasso MSE: {lasso_MSE}, R2: {lasso_r2}")

    #Ridge model
    ridgeVal = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)

    ridge_predictions = ridgeVal.predict(X_test_scaled)

    ridge_MSE = mean_squared_error(y_test_scaled, ridge_predictions)
    ridge_r2 = ridgeVal.score(X_test_scaled, y_test_scaled)
    print(f"ridge MSE: {ridge_MSE}, R2: {ridge_r2}")

    #elasticNet
    elasticnet = ElasticNet(alpha=.01).fit(X_train_scaled, y_train_scaled)

    elasticnet_predictions = elasticnet.predict(X_test_scaled)

    elasticnet_MSE = mean_squared_error(y_test_scaled, elasticnet_predictions)
    elasticnet_r2 = elasticnet.score(X_test_scaled, y_test_scaled)
    print(f"elasticnet MSE: {elasticnet_MSE}, R2: {elasticnet_r2}")

    fig1 = plt.figure(figsize=(12, 6))
    axes1 = fig1.add_subplot(1, 2, 1)
    axes2 = fig1.add_subplot(1, 2, 2)

    axes1.set_title("Original Data")
    axes2.set_title("Scaled Data")

    maxx = X_train["High"].max()
    maxy = y_train.max()
    axes1.set_xlim(-maxx + 1, maxx + 1)
    axes1.set_ylim(-maxy + 1, maxy + 1)

    axes2.set_xlim(-2, 2)
    axes2.set_ylim(-2, 2)
    set_axes(axes1)
    set_axes(axes2)

    axes1.scatter(X_train["High"], y_train)
    axes2.scatter(X_train_scaled[:, 0], y_train_scaled[:])

    p = Path(os.getcwd() + "/static/images")
    #q = p / "axes2"+st+".png"
    #if (q.exists()):
    fig1.savefig("static/images/axes2" + st + ".png")
    f = open("static/images/axes2" + st + ".png")
    plt.close()
    f.close()
    #else:
    #    fig1.savefig("static/images/axes2"+st+".png")
    #    plt.close()

    return render_template("indexStocks.html",
                           stocks=stocks,
                           responsedata=responsedata,
                           init_page="initpage",
                           sel_stk=st,
                           minmax_predict=minmax_predict,
                           scallar_MSE=scallar_MSE,
                           scallar_r2=scallar_r2,
                           lasso_MSE=lasso_MSE,
                           lasso_r2=lasso_r2,
                           ridge_MSE=ridge_MSE,
                           ridge_r2=ridge_r2,
                           elasticnet_MSE=elasticnet_MSE,
                           elasticnet_r2=elasticnet_r2)
mean_cols = all_dummy_df.mean()
all_dummy_df = all_dummy_df.fillna(mean_cols)
numeric_cols = all_df.columns[all_df.dtypes!="object"]
numeric_cols_means = all_dummy_df.loc[:,numeric_cols].mean()
numeric_col_std = all_dummy_df.loc[:,numeric_cols].std()
all_dummy_df.loc[:,numeric_cols] = (all_dummy_df.loc[:,numeric_cols]-numeric_cols_means)/numeric_col_std
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
X_train = dummy_train_df.values
X_test = dummy_test_df.values
alphas = np.logspace(-3,2,50)
test_scores = []
for alpha in alphas:
    clf = Ridge(alpha)
    test_score = np.sqrt(-cross_val_score(clf,X_train,y_train,cv=10,scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
import matplotlib.pyplot as plt
# %matplotlib inline
# print(np.array(test_scores).shape)
plt.plot(alphas,test_scores)
plt.title("Alpha vs Error")
plt.show()
from sklearn.ensemble import RandomForestRegressor
max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for max_feat in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
from sklearn.model_selection import GridSearchCV

if __name__ == "__main__":
    # read the data with pandas
    data = pd.read_csv('10.Advertising.csv')  # columns: TV, Radio, Newspaper, Sales
    # x = data[['TV', 'Radio', 'Newspaper']]
    x = data[['TV', 'Radio']]
    y = data['Sales']
    # print x
    # print y

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        random_state=2017)
    # model = Lasso()
    model = Ridge()  # Ridge seems to give a slightly lower MSE
    alpha_can = np.logspace(-3, 2, 10)
    lasso_model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    lasso_model.fit(x_train, y_train)
    print 'Best hyperparameters:', lasso_model.best_params_

    y_hat = lasso_model.predict(np.array(x_test))
    print 'Best model score:', lasso_model.score(x_test, y_test)
    mse = np.average((y_hat - np.array(y_test))**2)  # Mean Squared Error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    print mse, rmse

    # t = np.arange(len(x_test))
    # mpl.rcParams['font.sans-serif'] = [u'simHei']
    # mpl.rcParams['axes.unicode_minus'] = False
    # plt.plot(t, y_test, 'r-', linewidth=2, label=u'ground truth')
def upload_get_stocks(st):

    # Find one record of data from the mongo database
    # @TODO: YOUR CODE HERE!

    #cr = csv.reader(open("https://query1.finance.yahoo.com/v7/finance/download/"+st+"?period1=1454112000&period2=1611964800&interval=1d&events=history&includeAdjustedClose=true","rb"))

    #data = pd.read_csv('https://example.com/passkey=wedsmdjsjmdd')

    #df = pd.read_csv("static/data/"+st+".csv")

    #with open("static/data/"+st+".csv", "wt") as fp:
    #    writer = csv.writer(fp)
    #    # writer.writerow(["your", "header", "foo"])  # write header
    #    writer.writerows(data)

    #dateval = datetime.date.strtime("%D")
    #print(dateval)
    session = Session(engine)
    stock = session.execute("select * from stocks where symbol='" + st + "'")  # note: string-built SQL is injection-prone; bound parameters are safer
    #return render_template("index.html", listings=listings)
    # Return template and data

    if (stock.rowcount == 0):
        data = pd.read_csv(
            "https://query1.finance.yahoo.com/v7/finance/download/" + st +
            "?period1=1454112000&period2=1611964800&interval=1d&events=history&includeAdjustedClose=true",
            sep=',')

        data.to_csv("static/data/" + st + ".csv", index=False, header=True)

        print(data)
        session.execute("INSERT INTO stocks VALUES ('" + st + "', '" + st +
                        " Corp')")
        session.execute("commit")

    stocks = session.execute("select * from stocks")

    resdata = [{}]

    responsedata = {'respdata': resdata}
    session.close()

    print('Hello this is test')
    data = pd.read_csv("static/data/" + st + ".csv")
    df = data
    # Drop the null columns where all values are null
    df = df.dropna(axis='columns', how='all')
    # Drop the null rows
    # This is for the MinMax Linear Regression model
    print(df.head())
    df = df.dropna()
    print(df.head())
    y = df["Open"].values.reshape(-1, 1)
    diff = df['Close'] - df["Open"]
    diff_locations = []
    for i in diff:
        if (i < 0):
            diff_locations.append(0)
        else:
            diff_locations.append(1)
    df['diff'] = pd.DataFrame(diff_locations)
    #X = df[['High', 'Low', 'Close', 'Volume','diff']]
    X = df[['High', 'Low', 'Close', 'Volume', 'diff']]
    print(X)
    print(y)
    print(X.shape, y.shape)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    X_minmax = MinMaxScaler().fit(X_train)
    y_minmax = MinMaxScaler().fit(y_train)

    X_train_minmax = X_minmax.transform(X_train)
    X_test_minmax = X_minmax.transform(X_test)
    y_train_minmax = y_minmax.transform(y_train)
    y_test_minmax = y_minmax.transform(y_test)
    model2 = LinearRegression()
    model2.fit(X_train_minmax, y_train_minmax)
    print(f"Testing Data Score: {model2.score(X_test_minmax, y_test_minmax)}")
    minmax_predict = model2.score(X_test_minmax, y_test_minmax)
    print(minmax_predict)

    #This is standard scalar transformation
    X_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    y_train_scaled = y_scaler.transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)
    model = LinearRegression()
    model.fit(X_train_scaled, y_train_scaled)
    predictions = model.predict(X_test_scaled)
    scallar_MSE = mean_squared_error(y_test_scaled, predictions)
    scallar_r2 = model.score(X_test_scaled, y_test_scaled)
    plt.scatter(model.predict(X_train_scaled),
                model.predict(X_train_scaled) - y_train_scaled,
                c="blue",
                label="Training Data")
    plt.scatter(model.predict(X_test_scaled),
                model.predict(X_test_scaled) - y_test_scaled,
                c="orange",
                label="Testing Data")
    #plt.legend()
    plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
    plt.title("Residual Plot")
    #plt.show()
    pwd = os.getcwd()
    print(pwd)
    #p = Path(os.getcwd()+"\static\images")
    plt.savefig("static/images/" + st + ".png")
    f = open("static/images/" + st + ".png")
    plt.close()
    f.close()

    #Lasso model
    ### BEGIN SOLUTION
    lasso = Lasso(alpha=.01).fit(X_train_scaled, y_train_scaled)

    lasso_predictions = lasso.predict(X_test_scaled)

    lasso_MSE = mean_squared_error(y_test_scaled, lasso_predictions)
    lasso_r2 = lasso.score(X_test_scaled, y_test_scaled)
    ### END SOLUTION

    print(f"Lasso MSE: {lasso_MSE}, R2: {lasso_r2}")

    #Ridge model
    ridgeVal = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)

    ridge_predictions = ridgeVal.predict(X_test_scaled)

    ridge_MSE = mean_squared_error(y_test_scaled, ridge_predictions)
    ridge_r2 = ridgeVal.score(X_test_scaled, y_test_scaled)
    print(f"ridge MSE: {ridge_MSE}, R2: {ridge_r2}")

    #elasticNet
    elasticnet = ElasticNet(alpha=.01).fit(X_train_scaled, y_train_scaled)

    elasticnet_predictions = elasticnet.predict(X_test_scaled)

    elasticnet_MSE = mean_squared_error(y_test_scaled, elasticnet_predictions)
    elasticnet_r2 = elasticnet.score(X_test_scaled, y_test_scaled)
    print(f"elasticnet MSE: {elasticnet_MSE}, R2: {elasticnet_r2}")

    fig1 = plt.figure(figsize=(12, 6))
    axes1 = fig1.add_subplot(1, 2, 1)
    axes2 = fig1.add_subplot(1, 2, 2)

    axes1.set_title("Original Data")
    axes2.set_title("Scaled Data")

    maxx = X_train["High"].max()
    maxy = y_train.max()
    axes1.set_xlim(-maxx + 1, maxx + 1)
    axes1.set_ylim(-maxy + 1, maxy + 1)

    axes2.set_xlim(-2, 2)
    axes2.set_ylim(-2, 2)
    set_axes(axes1)
    set_axes(axes2)

    axes1.scatter(X_train["High"], y_train)
    axes2.scatter(X_train_scaled[:, 0], y_train_scaled[:])

    p = Path(os.getcwd() + "/static/images")
    #q = p / "axes2"+st+".png"
    #if (q.exists()):
    fig1.savefig("static/images/axes2" + st + ".png")
    f = open("static/images/axes2" + st + ".png")
    plt.close()
    f.close()
    #else:
    #    fig1.savefig("static/images/axes2"+st+".png")
    #    plt.close()

    return render_template("indexStocks.html",
                           stocks=stocks,
                           responsedata=responsedata,
                           init_page="initpage",
                           sel_stk=st,
                           minmax_predict=minmax_predict,
                           scallar_MSE=scallar_MSE,
                           scallar_r2=scallar_r2,
                           lasso_MSE=lasso_MSE,
                           lasso_r2=lasso_r2,
                           ridge_MSE=ridge_MSE,
                           ridge_r2=ridge_r2,
                           elasticnet_MSE=elasticnet_MSE,
                           elasticnet_r2=elasticnet_r2)
Example #43
Variable encoding:
Class labels: LabelEncoder
Unordered (nominal) feature encoding: OneHotEncoder
Ordered (ordinal) discrete feature encoding: OrdinalEncoder
'''
titanic = titanic[['Sex', 'Age', 'Embarked','Pclass', 'Survived']]
titanic.head()

enc = OneHotEncoder(categories = 'auto')
oneHot_ret = enc.fit_transform(titanic[['Sex','Embarked']]).toarray()

print(enc.get_feature_names())

titanic_new = pd.concat([titanic, pd.DataFrame(oneHot_ret, columns = enc.get_feature_names())], axis = 'columns').drop(['Sex', 'Embarked'], axis='columns')
titanic_new


x = pd.DataFrame(np.random.uniform(1, 20, 20).reshape((-1, 2)), columns=['x', 'y'])
binarizer = Binarizer(threshold=10)  # renamed from `bin` to avoid shadowing the builtin
x_bin = binarizer.fit_transform(x)

kbd = KBinsDiscretizer(n_bins = 5, encode='onehot-dense', strategy='kmeans')
x_kbd = kbd.fit_transform(x)
print(x_kbd)
kbd.n_bins_
kbd.bin_edges_


from sklearn.linear_model import Ridge, Lasso, ElasticNet
Ridge()
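OrdinalEncoder is listed above but not demonstrated; a minimal sketch with an explicit category order (the example categories are illustrative):

from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder(categories=[['low', 'medium', 'high']])
ord_enc.fit_transform([['low'], ['high'], ['medium']])  # -> [[0.], [2.], [1.]]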
        break
    model_list.append(model_n)
    model_collection = np.vstack((model_collection, X_tr[:, model_n]))

    print model_list
    print cur_mn

print len(model_list)

#choose top12 models
model_list2 = model_list[0:12]
test_fin = test[model_list2]
train_fin = train[model_list2]

#select model for stacking
clf = Ridge(alpha=3.0)
clf.fit(train_fin, y)

pred1 = clf.predict(test_fin)
pred1[pred1 < 1.] = 1.
pred1[pred1 > 3.] = 3.

#saved_results
pd.DataFrame({
    "id": id_test,
    "relevance": pred1
}).to_csv(MODELS_DIR + "/submissions_ensemble_n_models_from_m_11_04_2016.csv",
          index=False)

#X_new=train_fin
#import statsmodels.api as sm
Example #45
            f.write(text_id + "," + top_imgs_str + "\n")
            if img_id in list(top_imgs):
                count += 1
        print("count", count)


train = pickle.load(open('train.pkl', 'rb'))
test = pickle.load(open('test.pkl', 'rb'))

print(train['captions'][0])
print(train['tags'][0])

# print(train['image_2048'].shape)

print("=== fit model  ===")
ridge = Ridge()
t = time.time()
ridge.fit(train['image_2048'], train['captions'])
print(time.time() - t)

print("=== test training data ===")
predict_caps = ridge.predict(train['image_2048'])
print(predict_caps.shape)
ranking(train['captions'], predict_caps, train['image_id'],
        'training_test_answer.csv')

print("=== test testing data ===")
predict_caps = ridge.predict(test['image_2048'])
print(predict_caps.shape)
#ranking(predict_image_features, test['image_2048'], test['image_id'], 'answer.csv')
ranking(test['captions'], predict_caps, test['image_id'], 'answer.csv')
Example #46
def baggingMyLasso(trainX, trainY, train_prediction_start, testX, testY, test_prediction_start,
                   look_ahead, bag_size=47, Nestimators=50, samp_size=0.95,
                   market=1,every=0,lookbackMethod=1, bagModels_times=50,
                   RandomLasso=False,modelCombination='lassoLasso',otherFeats=False,
                   top_ret_Feats=6, topFeats=3):
    '''
    :param modelCombination: used only when RandomLasso is True;
        one of 'lassoLasso', 'lassoRidge', 'LassoSimpleLinear'
    :return:
    '''
    # if RandomLasso=True: we choose variables that appear in at least 40% of the models

    # use 47 of estimators to do prediction, repeat 50 times
    # computing the best alpha for lasso:
    #alphas = np.linspace(0.005, 1, 1000)

    model = LassoCV(cv=10, eps=0.0001, fit_intercept=True, normalize=False,
                    random_state=None).fit(trainX, trainY)
    print('Best Lasso alpha for lookahead %d: ' % look_ahead, model.alpha_)

    if RandomLasso:
        svr = BaggingRegressor(Lasso(alpha=model.alpha_,
                                     fit_intercept=True,
                                     normalize=False,
                                     random_state=None),
                               n_estimators=200,
                               max_samples=0.95,
                               bootstrap=False,
                               random_state=None,
                               n_jobs=-1)
        svr = svr.fit(trainX, trainY)
        ###################################
        coef_matrix = np.zeros((Nestimators, trainX.shape[1]))
        for i in range(Nestimators):
            coef_matrix[i, :] = \
                [1 if svr.estimators_[i].coef_[j] != 0 else 0 for j in range(trainX.shape[1])]

        coef_freq = np.sum(coef_matrix, axis=0) / Nestimators
        #num_coefs = trainX.shape[1] + 1
        # plt.bar(np.arange(1, num_coefs), coef_freq, color='skyblue')
        # plt.ylabel('Freq selected')
        # plt.xlabel('Feature')
        # plt.ylim(0, 1.3)
        # plt.text(2, 1.1, 'lookahead %d' % lookahead)
        # plt.xlim(0, num_coefs)
        # plt.show()
        if not otherFeats:

            ret_feat_inds=[i for i in range(len(coef_freq)) if coef_freq[i]>=0.60]
            print('Number of Features with more than 60 percent frequency: ', len(ret_feat_inds))
            if len(ret_feat_inds) >= top_ret_Feats:
                trainX = trainX[:, ret_feat_inds]
                testX = testX[:, ret_feat_inds]
            else:
                print('Fewer than %d features meet the frequency requirement; keeping the %d most frequent.'
                      % (top_ret_Feats, top_ret_Feats))
                order = coef_freq.argsort()
                trainX = trainX[:, order[-top_ret_Feats:]]
                testX = testX[:, order[-top_ret_Feats:]]

        else:
            # `lookback` (the number of return-lag features at the front of
            # the feature matrix) is assumed to be defined in the enclosing
            # scope of the original source.
            ret_feat_freqs = coef_freq[:lookback]
            otherFeat_freqs = coef_freq[lookback:]
            o_F_inds = np.arange(lookback, trainX.shape[1], 1)

            ret_feat_inds = np.array([i for i in range(len(ret_feat_freqs)) if ret_feat_freqs[i] >= 0.60])

            other_feat_inds = np.array(
                [i for i in range(lookback,len(coef_freq),1) if coef_freq[i] >= 0.60])

            print('Number of return Features with more than 60 percent frequency: ', len(ret_feat_inds))
            print('Number of Other Features with more than 60 percent frequency: ', len(other_feat_inds))
            if len(ret_feat_inds) < top_ret_Feats:
                ret_order = ret_feat_freqs.argsort()
                ret_order=ret_order[-top_ret_Feats:]
            else:
                ret_order=ret_feat_inds
            print(ret_order, 'return feat indices')

            if len(other_feat_inds)<topFeats:
                otherFeat_order = otherFeat_freqs.argsort()
                o_F_inds=o_F_inds[otherFeat_order]
                otherFeat_order = o_F_inds[-topFeats:]
            else:
                otherFeat_order=other_feat_inds
            print(otherFeat_order, 'other feat indices')

            ret_order=list(ret_order)
            ret_order.extend(list(otherFeat_order))
            trainX = trainX[:, ret_order]
            testX = testX[:, ret_order]

        print(trainX.shape,testX.shape)
        if modelCombination=='lassoLasso':
            #alphas = np.linspace(0, 1, 200)
            model = LassoCV(cv=10, eps=0.0001,fit_intercept=True, normalize=False,
                            random_state=None).fit(trainX, trainY)
            print('Best Lasso alpha for look-ahead %d after feature selection:' % look_ahead, model.alpha_)
            svr = BaggingRegressor(Lasso(alpha=model.alpha_, fit_intercept=True, normalize=False,
                                 random_state=None), n_estimators=50,
                           max_samples=0.95, bootstrap=False, random_state=None, n_jobs=-1)
        elif modelCombination=='LassoSimpleLinear':
            svr = BaggingRegressor(LinearRegression(fit_intercept=True, normalize=False), n_estimators=50,
                               max_samples=0.95, bootstrap=False, random_state=None, n_jobs=-1)

        elif modelCombination=='LassoRidge':

            alphas = np.linspace(0.0001, 10, 200)
            # cv=None uses RidgeCV's efficient leave-one-out CV (generalized cross-validation)
            model_CV = RidgeCV(cv=None, alphas=alphas, fit_intercept=True,
                               normalize=False).fit(trainX, trainY)
            print('Best Ridge alpha for look-ahead %d after feature selection (LassoRidge):' %
                  look_ahead, model_CV.alpha_)
            svr = BaggingRegressor(Ridge(alpha=model_CV.alpha_, normalize=False, fit_intercept=True),
                                     n_estimators=Nestimators,
                                     max_samples=samp_size, bootstrap=False, random_state=None, n_jobs=-1)

    else:
        svr = BaggingRegressor(Lasso(alpha=model.alpha_, fit_intercept=True, normalize=False,
                                     random_state=None), n_estimators=50,
                               max_samples=0.95, bootstrap=False, random_state=None, n_jobs=-1)

    svr = svr.fit(trainX, trainY)
    ##
    coef_matrix = np.zeros((Nestimators, trainX.shape[1]))
    for i in range(Nestimators):
        coef_matrix[i, :] = [1 if svr.estimators_[i].coef_[j] != 0 else 0 for j in range(trainX.shape[1])]

    coef_freq = np.sum(coef_matrix, axis=0) / Nestimators
    num_coefs = trainX.shape[1] + 1
    # plt.bar(np.arange(1, num_coefs), coef_freq, color='skyblue')
    # plt.ylabel('Freq selected')
    # plt.xlabel('Feature')
    # plt.ylim(0, 1.3)
    # plt.text(2, 1.1, 'lookahead %d' % lookahead)
    # plt.xlim(0, num_coefs)
   # plt.show()
    ###
    colnamesRaw = ['dtStart']
    cln = [i for i in range(1, Nestimators * 2 + 3, 1)]
    colnamesRaw.extend(cln)
    colnamesBagged = ['dtStart']
    cln1 = [i for i in range(1, bagModels_times * 2 + 3, 1)]
    colnamesBagged.extend(cln1)

    # (date, trainY, true_lab, pred_labs...)
    trainRs = np.zeros((trainX.shape[0], bagModels_times * 2 + 3))
    trainRs_raw = np.zeros((trainX.shape[0], Nestimators * 2 + 3))
    trainRs[:, 0] = train_prediction_start
    trainRs_raw[:, 0] = train_prediction_start
    trainRs[:, 1] = trainY
    trainRs_raw[:, 1] = trainY
    trainRs[:, 2] = [1 if trainY[i] > 0 else 0 for i in range(len(trainY))]
    trainRs_raw[:, 2] = [1 if trainY[i] > 0 else 0 for i in range(len(trainY))]
    #
    testRs = np.zeros((testX.shape[0], bagModels_times * 2 + 3))
    testRs_raw = np.zeros((testX.shape[0], Nestimators * 2 + 3))
    testRs[:, 0] = test_prediction_start
    testRs_raw[:, 0] = test_prediction_start
    testRs[:, 1] = testY
    testRs_raw[:, 1] = testY
    testRs[:, 2] = [1 if testY[i] > 0 else 0 for i in range(len(testY))]
    testRs_raw[:, 2] = [1 if testY[i] > 0 else 0 for i in range(len(testY))]
    for i in range(Nestimators):
        trainRs_raw[:, i + 3] = svr.estimators_[i].predict(trainX)

        testRs_raw[:, i + 3] = svr.estimators_[i].predict(testX)

        trainRs_raw[:, i + Nestimators + 3] = \
            [1 if trainRs_raw[j, i + 3] > 0 else 0 for j in range(len(trainRs_raw[:, i + 3]))]
        testRs_raw[:, i + Nestimators + 3] = \
            [1 if testRs_raw[j, i + 3] > 0 else 0 for j in range(len(testRs_raw[:, i + 3]))]

    # aggregating results!
    model_inds = [j for j in range(3, Nestimators + 3)]
    # print(model_inds)
    for i in range(bagModels_times):
        index_modelstoUse = np.random.choice(model_inds, bag_size, replace=False)
        tmp_train = trainRs_raw[:, index_modelstoUse]
        tmp_test = testRs_raw[:, index_modelstoUse]
        trainRs[:, i + 3] = np.sum(tmp_train, axis=1)
        testRs[:, i + 3] = np.sum(tmp_test, axis=1)

        trainRs[:, i + bagModels_times + 3] = \
            [1 if trainRs[j, i + 3] > 0 else 0 for j in range(len(trainRs[:, i + 3]))]
        testRs[:, i + bagModels_times + 3] = \
            [1 if testRs[j, i + 3] > 0 else 0 for j in range(len(testRs[:, i + 3]))]

    trainRs = pd.DataFrame(trainRs, columns=colnamesBagged)
    trainRs_raw = pd.DataFrame(trainRs_raw, columns=colnamesRaw)
    testRs = pd.DataFrame(testRs, columns=colnamesBagged)
    testRs_raw = pd.DataFrame(testRs_raw, columns=colnamesRaw)

    if every == 0:
        trainRs.to_csv('%d_%d_%d_Lasso_train.csv' % (market, lookbackMethod, look_ahead), index=False)
        trainRs_raw.to_csv('%d_%d_%d_Lasso_train_Raw.csv' % (market, lookbackMethod, look_ahead), index=False)
        testRs.to_csv('%d_%d_%d_Lasso_test.csv' % (market, lookbackMethod, look_ahead), index=False)
        testRs_raw.to_csv('%d_%d_%d_Lasso_test_Raw.csv' % (market, lookbackMethod, look_ahead), index=False)
    else:

        trainRs.to_csv('%d_%d_%d_every_%d_Lasso_train.csv' %
                       (market, lookbackMethod, look_ahead, every), index=False)
        trainRs_raw.to_csv('%d_%d_%d_every_%d_Lasso_train_Raw.csv' %
                           (market, lookbackMethod, look_ahead, every), index=False)
        testRs.to_csv('%d_%d_%d_every_%d_Lasso_test.csv' %
                      (market, lookbackMethod, look_ahead, every), index=False)
        testRs_raw.to_csv('%d_%d_%d_every_%d_Lasso_test_Raw.csv' %
                          (market, lookbackMethod, look_ahead, every), index=False)


    return coef_matrix, coef_freq
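
A minimal usage sketch for baggingMyLasso on synthetic data, assuming the original module's imports (numpy as np, pandas as pd, and sklearn's LassoCV, Lasso, LinearRegression, Ridge, RidgeCV, BaggingRegressor); every input below is a hypothetical placeholder:

rng = np.random.RandomState(0)
trainX_demo = rng.randn(200, 12)          # 200 samples, 12 features
trainY_demo = rng.randn(200)
testX_demo = rng.randn(60, 12)
testY_demo = rng.randn(60)
train_dates = np.arange(200)              # stand-ins for prediction-start dates
test_dates = np.arange(200, 260)

coef_matrix, coef_freq = baggingMyLasso(
    trainX_demo, trainY_demo, train_dates,
    testX_demo, testY_demo, test_dates,
    look_ahead=5)                         # writes *_Lasso_{train,test}[_Raw].csv as a side effect
print(coef_freq)                          # per-feature selection frequency across the bag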
Example #47
# --------------
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score  # used below; imported in an earlier cell of the original

# Code starts here
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)

r2_lasso = r2_score(y_test, lasso_pred)
print("r2_lasso", r2_lasso)

# --------------
from sklearn.linear_model import Ridge

# Code starts here
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

r2_ridge = r2_score(y_test, ridge_pred)
print("r2_ridge", r2_ridge)

# Code ends here

# --------------
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression  # used below; imported in an earlier cell of the original

#Code starts here
regressor = LinearRegression()

score = cross_val_score(regressor, X_train, y_train, cv=10)
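
# Summarize the per-fold R^2 scores (an addition; not part of the original cell)
print("mean r2: %.4f (+/- %.4f)" % (score.mean(), score.std()))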
Example #48
 def construct_model(self, param, ifbest_params=0):
     # param_xgb_space, param_lasso_space, and param_ridge_space referenced
     # below are module-level search-space configs in the original source.
     model_name = self.model_name
     if model_name == "randomforest":
         int_params = [
             'n_estimators', 'max_depth', 'min_samples_split',
             'min_samples_leaf'
         ]
         for item in int_params:
             param[item] = int(param[item])
         self.log.add(param, 1)
         model = RandomForestRegressor(
             n_estimators=param['n_estimators'],
             max_depth=param['max_depth'],
             min_samples_split=param['min_samples_split'],
             min_samples_leaf=param['min_samples_leaf'])
     elif model_name == 'gbregressor':
         int_params = [
             'n_estimators', 'max_depth', 'min_samples_split',
             'min_samples_leaf'
         ]
         for item in int_params:
             param[item] = int(param[item])
         self.log.add(param, 1)
         model = GradientBoostingRegressor(
             learning_rate=param['learning_rate'],
             n_estimators=param['n_estimators'],
             max_depth=param['max_depth'],
             subsample=param['subsample'],
             min_samples_split=param['min_samples_split'],
             min_samples_leaf=param['min_samples_leaf'])
     elif model_name == 'xgbregressor':
         int_params = ['max_depth', 'num_round']
         for item in int_params:
             param[item] = int(param[item])
         self.log.add(param, 1)
         model = XGBRegressor(n_estimators=param['num_round'],
                              objective=param_xgb_space['objective'],
                              learning_rate=param['eta'],
                              gamma=param['gamma'],
                              min_child_weight=param['min_child_weight'],
                              max_depth=param['max_depth'],
                              subsample=param['subsample'],
                              colsample_bytree=param['colsample_bytree'],
                              seed=param_xgb_space['seed'],
                              nthread=param_xgb_space['nthread'])
     elif model_name == 'lasso':
         model = Lasso(alpha=param['alpha'],
                       random_state=param_lasso_space['random_state'])
     elif model_name == 'ridge':
         model = Ridge(alpha=param['alpha'],
                       random_state=param_ridge_space['random_state'])
     elif model_name == 'svr':
         if ifbest_params == 0:
             model = SVR(C=param['C'],
                         gamma=param['gamma'],
                         degree=param['degree'],
                         epsilon=param['epsilon'],
                         kernel=param['kernel'])
         else:
             if (param['kernel'] == 0):
                 cur_kernel = 'rbf'
             else:
                 assert param['kernel'] == 1
                 cur_kernel = 'poly'
             model = SVR(C=param['C'],
                         gamma=param['gamma'],
                         degree=param['degree'],
                         epsilon=param['epsilon'],
                         kernel=cur_kernel)
     return model
Example #49
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=False)

    # Note that the hyperparameter spaces are defined here, during the pipeline
    # definition. They could instead be set within the classes at their definition
    # (when using custom classes), or after declaring the pipeline, using a flat
    # or nested dict.

    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
        ]),
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])
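    # Flat-dict alternative mentioned in the note above, kept commented out:
    # the exact step names are assumptions (Neuraxle derives them from the
    # wrapped steps), so they may differ in practice.
    # p.set_hyperparams_space(HyperparameterSpace({
    #     'AddFeatures__SKLearnWrapper_PCA__n_components': RandInt(1, 3),
    #     'ModelStacking__SKLearnWrapper_Ridge__alpha': LogUniform(0.7, 1.4),
    # }))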
    print("Meta-fitting on train:")
    p = p.meta_fit(X_train,
                   y_train,
                   metastep=RandomSearch(
                       n_iter=10,
                       higher_score_is_better=True,
                       validation_technique=KFoldCrossValidation(
                           scoring_function=r2_score, k_fold=10)))
    # Here is an alternative way to do it, more "pipeliney":
    # p = RandomSearch(
    #     p,
    #     n_iter=15,
    #     higher_score_is_better=True,
    #     validation_technique=KFoldCrossValidation(scoring_function=r2_score, k_fold=3)
    # ).fit(X_train, y_train)

    print("")

    print("Transforming train and test:")
    y_train_predicted = p.transform(X_train)
    y_test_predicted = p.transform(X_test)

    print("")

    print("Evaluating transformed train:")
    score_transform = r2_score(y_train, y_train_predicted)  # (y_true, y_pred) order
    print('R2 regression score:', score_transform)

    print("")

    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)  # (y_true, y_pred) order
    print('R2 regression score:', score_test)
Example #50
# Grid Search for Algorithm Tuning
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# load the diabetes datasets
dataset = datasets.load_diabetes()
# prepare a range of alpha values to test
alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(dataset.data, dataset.target)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# # Ridge Regression

# In[41]:

#ridge regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# In[44]:

ridge = Ridge()
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-2, 1, 5, 10, 20, 30, 35, 40]}
ridge_regressor = GridSearchCV(ridge,
                               parameters,
                               scoring='neg_mean_squared_error',
                               cv=5)
ridge_regressor.fit(X_train, y_train)

# In[45]:

print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)

# In[46]:

predictions = ridge_regressor.predict(X_test)
def ridge_kfold_regression(data1, y):
    # Renamed from `LinearRegression` in the original, which shadowed
    # sklearn.linear_model.LinearRegression; the body fits Ridge models over a
    # grid of alphas with K-fold CV. `Hcurstate` is a module-level
    # random-state constant in the original source.
    X_train, X_test, y_train, y_test = train_test_split(data1,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=Hcurstate)

    X_train_new = X_train.reset_index(drop=True)
    y_train_new = y_train.reset_index(drop=True)

    X_train_new = X_train_new.values
    y_train_new = y_train_new.values

    k = 5
    kf = KFold(n_splits=k, shuffle=True, random_state=Hcurstate)  # random_state requires shuffle=True on recent scikit-learn
    avg_train_acc, avg_test_acc = 0, 0

    val_arr = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]

    avgsc_lst, avgsc_train_lst, avgsc_hld_lst = [], [], []
    avgsc, avgsc_train, avgsc_hld = 0, 0, 0

    for train_index, test_index in kf.split(X_train_new):
        X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[
            test_index]
        y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[
            test_index]
        X_train_train, X_val, y_train_train, y_val = train_test_split(
            X_train_cur, y_train_cur, test_size=0.25, random_state=Hcurstate)

        # print(X_train_train.shape)
        # print(X_val.shape)

        bestPerformingModel = Ridge(alpha=1.0, random_state=Hcurstate)
        bestscore = float('inf')  # `maxintval` in the original, an undefined sentinel

        for val in val_arr:
            clf = Ridge(alpha=val, random_state=Hcurstate)
            clf = clf.fit(X_train_train, y_train_train)
            y_pred = clf.predict(X_val)
            sc = sqrt(mean_squared_error(y_pred, y_val))
            if bestscore > sc:
                bestscore = sc
                bestPerformingModel = clf

        y_pred = bestPerformingModel.predict(X_train_cur)
        bscr_train = sqrt(mean_squared_error(y_pred, y_train_cur))

        y_pred = bestPerformingModel.predict(X_test_cur)
        bscr = sqrt(mean_squared_error(y_pred, y_test_cur))

        y_pred = bestPerformingModel.predict(X_test)
        bscr_hld = sqrt(mean_squared_error(y_pred, y_test))

        avgsc_train_lst.append(bscr_train)
        avgsc_lst.append(bscr)
        avgsc_hld_lst.append(bscr_hld)

        avgsc_train = avgsc_train + bscr_train
        avgsc = avgsc + bscr
        avgsc_hld = avgsc_hld + bscr_hld

        # print(bscr_train)
        # print(bscr)
        # print(bscr_hld)

    print('5-fold train, validation, and holdout RMSE:')
    print(avgsc_train_lst)
    print(avgsc_lst)
    print(avgsc_hld_lst)

    print('Avg train, validation, and holdout RMSE:')
    print(avgsc_train / k)
    print(avgsc / k)
    print(avgsc_hld / k)

    return avgsc_train_lst, avgsc_lst, avgsc_hld_lst
Example #53
preprocessor = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), categorical_columns),
    remainder='passthrough')

##############################################################################
# To describe the dataset as a linear model we use a ridge regressor
# with a very small regularization and to model the logarithm of the WAGE.

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor

model = make_pipeline(
    preprocessor,
    TransformedTargetRegressor(regressor=Ridge(alpha=1e-10),
                               func=np.log10,
                               inverse_func=sp.special.exp10))

##############################################################################
# Processing the dataset
# ----------------------
#
# First, we fit the model.

_ = model.fit(X_train, y_train)

##############################################################################
# Then we check the performance of the computed model plotting its predictions
# on the test set and computing,
# for example, the median absolute error of the model.
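
##############################################################################
# A minimal sketch of that check (an addition to the excerpt), assuming the
# fitted pipeline `model` and the train/test split from the surrounding code.

from sklearn.metrics import median_absolute_error

y_pred = model.predict(X_test)
print("Median absolute error:", median_absolute_error(y_test, y_pred))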
Example #54
def main():
    np.random.seed(42)

    x = np.arange(0, np.pi, 0.01)  # np.math.pi is deprecated; np.pi is equivalent
    err = np.random.randn(len(x))
    y = np.sin(x) + (x / 3)**2 + 0.1 * err
    #y = np.sin(x) + 0.1 * err

    #X = np.matrix(np.column_stack([x ** i for i in range(2, 5)] + [np.sin(x)]))
    #X = np.matrix(np.column_stack([x ** i for i in range(2, 6)]))
    # `phi` (a Gaussian basis function), `standardize`, `center`, and
    # `OLSEstimator` are helpers defined elsewhere in the original source.
    X = np.matrix(np.column_stack([phi(j / 10, 0.1, x) for j in range(31)]))
    Xb = sm.add_constant(X)
    Y = np.matrix(y).T
    X_s = standardize(X)

    # ==============================================================================
    # OLS, Ridge, Lasso
    # ==============================================================================
    ols = OLSEstimator(Xb, Y)
    ols.estimate()
    ols.test()

    ridge = Ridge(alpha=10).fit(X, Y)
    y_ridge = ridge.predict(X)

    lasso = Lasso(alpha=1e-2).fit(X, Y)
    y_lasso = lasso.predict(X)

    # ==============================================================================
    # PCR
    # ==============================================================================
    u, d, vt = np.linalg.svd(X_s, full_matrices=False)
    print(d)
    v = vt.T
    M = 31
    p = v.shape[1]
    print(p)
    z = []
    theta = []
    s = np.zeros((Y.shape[0], 1))
    Y_c = center(Y)

    for i in range(M):
        z_m = X_s * v[:, i]  # N x 1
        z.append(z_m)
        theta_m = (z_m.T * Y_c / (z_m.T * z_m))[0, 0]
        theta.append(theta_m)
        s += theta_m * z_m

    y_pcr = s + Y.mean(axis=0)
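
    # Cross-check (an addition, not in the original): with M equal to the full
    # number of basis columns, PCR reproduces least squares on the standardized
    # features, so a PCA + OLS fit should closely match y_pcr.
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LinearRegression
    Z_chk = PCA(n_components=M).fit_transform(np.asarray(X_s))
    ols_chk = LinearRegression().fit(Z_chk, np.asarray(Y).ravel())
    print('max |PCR - PCA+OLS|:',
          np.max(np.abs(np.asarray(y_pcr).ravel() - ols_chk.predict(Z_chk))))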

    # ==============================================================================
    # Plot
    # ==============================================================================
    plt.rc('text', usetex=True)
    plt.rc('font', family='serif')
    # Prepare Plot
    plt.figure(figsize=(10, 6), dpi=300)
    plt.title(r"Gaussian OLS, Ridge, Lasso \& PCR", fontsize=16)
    plt.xlabel(r'$x$', fontsize=14)
    plt.ylabel(r'$y$', fontsize=14)

    # Plot with Legends
    plt.scatter(x, y, color="blue", alpha=0.1, label=r'Data')
    plt.plot(x,
             np.asarray(ols.y_hat).ravel(),
             color='r',
             alpha=0.7,
             label=r'OLS')
    plt.plot(x, y_ridge, color='g', alpha=0.7, label=r'Ridge')
    plt.plot(x, y_lasso, color='purple', alpha=0.7, label=r'Lasso')
    plt.plot(x, y_pcr, color='black', label=r'PCR(' + str(M) + ')')

    # Other options
    plt.legend(fontsize=12)
    plt.savefig("pcr_gaussian_" + str(M) + ".png", dpi=300)
Example #55
coefficient_picks("linear_regr")
regression_results(y_test, y_pred, "linear_regr")



df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).apply(np.exp)
print("Linear", df)


model = sm.OLS(y_test, X_test).fit()
print(model.summary())
# model = sm.OLS(y_train, X_train[:]).fit()
# MSEs = cross_val_score(linear_regr, X_train, y_train, scoring='neg_mean_squared_error', cv=5)

# ------------------------Ridge------------------------------------------------------
ridge = Ridge()
parameters = {'alpha': [1e-14, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
# `ridge` and `parameters` feed the grid search commented out at the bottom;
# the fit actually used here is a fixed-alpha Ridge. Note that `normalize=`
# was removed in scikit-learn 1.2; this snippet targets older releases.
ridge_regr = Ridge(alpha=0.001, normalize=True, tol=1)
ridge_regr.fit(X_train, y_train)

y_pred = ridge_regr.predict(X_test)

coefficient_picks("ridge_regr")
regression_results(y_test, y_pred, "ridge_regr")

df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).apply(np.exp)
print("Ridge ", df)

#ridge_regr = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=5)
#print("Ridge best params = ", ridge_regr.best_params_)
#print("Ridge best score = ", ridge_regr.best_score_)
Example #56
rmse = np.sqrt((((y_hat - y_true)**2).sum() / len(y_true)))
rmse, mse

plt.scatter(y_true, y_hat, s=10)
plt.xlabel("Prices: $Y_i$")
plt.ylabel(r"Predicted prices: $\hat{Y}_i$")  # raw string avoids the invalid \h escape
plt.title("Prices vs. predicted prices")  # the original's bare `plt.title` was a no-op attribute access


# Linear Regression with Ridge & Lasso regression
# 1. Import the Lasso and Ridge modules
from sklearn.linear_model import Lasso, Ridge
# Split into X_train, X_test, y_train, y_test; test size 33%, random state 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Run ridge regression with an intercept and alpha 0.5
ridge = Ridge(fit_intercept=True, alpha=0.5)
# Fit on the training data.
ridge.fit(X_train, y_train)
# Predict with ridge and store the result in y_hat.
y_hat = ridge.predict(X_test)
y_true = y_test
# Use sklearn's built-in MSE function and store it in mse.
mse = sklearn.metrics.mean_squared_error(y_hat, y_true)
# Plug directly into the RMSE formula and store it in rmse.
rmse = np.sqrt((((y_hat - y_true)**2).sum() / len(y_true)))
# Print the results
rmse, mse

# Draw a scatter plot; set xlabel and ylabel and add a title.
plt.scatter(y_true, y_hat, s=10)
plt.xlabel("Prices: $Y_i$")
Example #57
def ridge(X, y, alpha=10):

    ridge_01 = Ridge(alpha=alpha).fit(X, y)

    return ridge_01
Example #58
def ridge_model():
    from sklearn.linear_model import Ridge
    parameter = ' Ridge_model '  # was ' Elastic_model ' in the original, a mislabel
    model = Ridge(alpha=0.01)  # 0.5
    return model, parameter
Example #59
# In[8]:

from sklearn.model_selection import GridSearchCV

alphas = np.linspace(1e-4, 1, 200)  # the original used np.linspace(-1, 1, 200), but negative alphas are invalid for Lasso

lasso_grid = GridSearchCV(Lasso(), param_grid={'alpha': alphas}, cv=3)
lasso_grid.fit(X_train, y)
best_lasso = lasso_grid.best_estimator_
print(lasso_grid.best_params_)

# In[9]:

ridge_alphas = np.linspace(1, 10, 200)
ridge_grid = GridSearchCV(Ridge(), param_grid={'alpha': ridge_alphas}, cv=3)  # the original reused `alphas` here, leaving `ridge_alphas` unused
ridge_grid.fit(X_train, y)

best_ridge = ridge_grid.best_estimator_
print(ridge_grid.best_params_)

# In[10]:


def get_price_est_grid(model, X_train, X_test, y_train, alpha=0.1):
    # note: `alpha` is unused; the model arrives pre-configured
    price_model = model
    price_model.fit(X_train, y_train)
    y_hat = np.expm1(price_model.predict(X_test))
    return pd.DataFrame(y_hat, range(1461, 1461 + 1459), ['SalePrice'])

 
4) Median Income has, by far, the strongest correlation with the target variable.
Intuitively we may assume the relationship "higher median income => higher median house price",
and our intuition may indeed be correct, but it is important to remember the old statistical
saying that "correlation does NOT imply causation".

5) Average Rooms also shows some correlation with the target variable (potential
multicollinearity - point 2 of the list), and Latitude has a higher degree of correlation
with the target than Longitude does.

These are all relationships that we may want to think about and analyse, both statistically
and in our heads.
"""

# Creating the basic regression models for which gridsearch and cross validation will be used
linearReg = LinearRegression()
lassoReg = Lasso()
ridgeReg = Ridge()
elasticReg = ElasticNet()
parameters = {'alpha': np.linspace(0.1, 10, 50)}
paramElastic = {
    'alpha': np.linspace(0.1, 10, 50),
    'l1_ratio': np.linspace(0.01, 1, 10)
}
n_folds = 10
r2_scores = []
reg_names = ["Linear", "Lasso", "Ridge", "Elastic", "Catboost"]
results = {}
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=24)