def SGDRegressor_pred(X_train, X_test, y_train_normalized, y_train_mean, y_test):
    # The learning rate schedule:
    # --- constant:   eta = eta0 (stays at the initial value, eta0)
    # --- optimal:    eta = 1.0 / (t + t0)
    # --- invscaling: eta = eta0 / pow(t, power_t)  [default]
    clf = SGDRegressor(alpha=0.0001, eta0=0.001, max_iter=150,
                       fit_intercept=False, shuffle=True, verbose=0)
    clf = clf.fit(X_train, y_train_normalized)
    # Convert back to the original scale (sklearn's standardization utilities
    # could be used for both encoding and decoding instead)
    predictions_train = clf.predict(X_train) + y_train_mean
    predictions = clf.predict(X_test) + y_train_mean
    score_test = clf.score(X_test, y_test)
    return predictions, predictions_train, score_test
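# --- Added note (not part of the original snippet): a minimal, self-contained
# sketch of the three learning-rate schedules named in the comments above,
# on a made-up toy dataset; all parameter values here are illustrative only.
import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
X_toy = rng.randn(200, 3)
y_toy = X_toy @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(200)

for schedule in ("constant", "optimal", "invscaling"):
    # eta0 seeds the schedule; "optimal" derives its step size from alpha instead.
    reg = SGDRegressor(learning_rate=schedule, eta0=0.01, max_iter=1000, tol=1e-3)
    reg.fit(X_toy, y_toy)
    print(schedule, reg.score(X_toy, y_toy))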
def predict(self, df):
    # get time frame
    time_frame = settings.time_frame
    # copy of data
    df_copy = df.copy()
    from sklearn.linear_model import SGDRegressor
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    # partition data
    X_train, y_train, X_val, y_val, X_test, y_test = self.partition(df_copy)
    # normalize features
    X_train_std, X_val_std, X_test_std = self.feature_scale(X_train, X_val, X_test)
    # instance of the SGD linear regressor
    lr = SGDRegressor()
    # fit model
    lr.fit(X_train_std, y_train)
    # predictions on validation set
    predictions = lr.predict(X_val_std)
    # R^2 score
    score = lr.score(X_val_std, y_val)
    # RMSE on the validation set
    test_error = mean_squared_error(y_val, predictions) ** .5
    print(test_error)
def sgd(X, y, weight, X_test=False):
    from sklearn.linear_model import SGDRegressor
    from sklearn.preprocessing import StandardScaler
    #X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
    #    X, y, weight, test_size=0.2, random_state=0)
    clf = SGDRegressor(loss="huber", max_iter=100, penalty="l1")
    #clf = LogisticRegression(max_iter=100)
    X_train = X
    y_train = y
    scaler = StandardScaler(with_mean=False)
    scaler.fit(X_train)  # Don't cheat - fit only on training data
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)  # apply same transformation to test data
    clf.fit(X_train, y_train, sample_weight=weight)
    print(clf.score(X_train, y_train, weight))
    y_pred = clf.predict(X_test)
    import joblib  # sklearn.externals.joblib is deprecated; use joblib directly
    import scipy.io as sio
    joblib.dump(clf, 'models/sgd_.pkl')
    sio.savemat('predict_y_forward.mat', {'y': y_pred})
def predict_age():
    mask = ~np.isnan(train["Age"])
    age_train = train[mask]
    age_test = train[~mask]
    features = []
    features.append(embarked_enc.transform(age_train["Embarked"]))
    features.append(sex_enc.transform(age_train["Sex"]))
    features.append(title_enc.transform(age_train["Title"]))
    features.append(pclass_enc.transform(age_train["Pclass"]))
    age_clf = SGDRegressor()
    X = np.hstack(features)
    y = np.array(train["Age"][mask]).T
    age_clf.fit(X, y)
    features = []
    features.append(embarked_enc.transform(age_test["Embarked"]))
    features.append(sex_enc.transform(age_test["Sex"]))
    features.append(title_enc.transform(age_test["Title"]))
    features.append(pclass_enc.transform(age_test["Pclass"]))
    ages = age_clf.predict(np.hstack(features))
    j = 0
    for i in range(len(train)):
        if ~mask[i]:
            train.loc[i, "Age"] = ages[j]
            j += 1
def predictScores(trainFeatures, trainTargets, testFeatures, testItemIds, isRegression=False):
    logging.info("Feature preparation done, fitting model...")
    predicted_scores = []
    if isRegression:
        clf = SGDRegressor(penalty="l2", alpha=1e-4)
        print("trainFeatures rows::" + str(trainFeatures.shape[0]))
        print("trainTargets rows::" + str(len(trainTargets)))
        clf.fit(trainFeatures, trainTargets)
        logging.info("Predicting...")
        predicted_scores = clf.predict(testFeatures)
    else:
        clf = SGDClassifier(loss="log", penalty="l2", alpha=1e-4, class_weight="auto")
        print("trainFeatures rows::" + str(trainFeatures.shape[0]))
        print("trainTargets rows::" + str(len(trainTargets)))
        clf.fit(trainFeatures, trainTargets)
        logging.info("Predicting...")
        predicted_scores = clf.predict_proba(testFeatures).T[1]
    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    f = open(os.path.join(dataFolder, output_file), "w")
    f.write("id\n")
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
        f.write("%d\n" % (item_id))
    f.close()
class EdenRegressor(BaseEstimator, RegressorMixin):
    """Build a regressor for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 normalization=True, inner_normalization=True,
                 penalty='elasticnet', loss='squared_loss'):
        """Construct."""
        self.set_params(r, d, nbits, discrete, normalization,
                        inner_normalization, penalty, loss)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   normalization=True, inner_normalization=True,
                   penalty='elasticnet', loss='squared_loss'):
        """Setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.model = SGDRegressor(loss=loss, penalty=penalty,
                                  average=True, shuffle=True,
                                  max_iter=5, tol=None)
        self.vectorizer = Vectorizer(r=self.r, d=self.d,
                                     normalization=self.normalization,
                                     inner_normalization=self.inner_normalization,
                                     discrete=self.discrete, nbits=self.nbits)
        return self

    def transform(self, graphs):
        """Transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    def fit(self, graphs, targets, randomize=True):
        """Fit."""
        x = self.transform(graphs)
        self.model = self.model.fit(x, targets)
        return self

    def predict(self, graphs):
        """Predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    def decision_function(self, graphs):
        """decision_function."""
        return self.predict(graphs)
def gradiantDescent(trainData, testData, trainOuts, testOuts):
    clf = SGDRegressor(loss="squared_loss")
    print(clf.fit(trainData, trainOuts))
    print(clf.coef_)
    predictions = clf.predict(testData)
    print(predictions)
    misses, error = sup.crunchTestResults(predictions, testOuts, .5)
    print(1 - error)
def sgd(X_train, y_train, X_validate, y_validate, X_test, cw, alpha, regression=False):
    #cw = 2.5
    if regression:
        clf = SGDRegressor(alpha=alpha)
    else:
        #clf = SGDClassifier(class_weight={1: cw}, alpha=alpha)
        clf = SGDClassifier(class_weight={1: cw}, alpha=alpha, loss='log')
    print(clf)
    training_data_size = y_train.shape[0]
    n_iter = 3
    mb_size = 100
    iter_mb = minibatch_generator(training_data_size, mb_size=mb_size, n_iter=n_iter)
    total = 0
    n_total_batch = n_iter * training_data_size / mb_size
    t0 = time()
    recent_auc = []
    for n_batch, batch in enumerate(iter_mb):
        x, y = X_train[batch], y_train[batch]
        if regression:
            # emulate class weighting through per-sample weights
            sw = np.ones(y.shape[0])
            sw[np.where(y == 1)[0]] = cw
            clf.partial_fit(x, y, sample_weight=sw)
        else:
            clf.partial_fit(x, y, classes=[1, 0])
        total += y.shape[0]
        if (n_batch + 1) % 1000 == 0:
            if regression:
                #y_pred_validate_val = clf.decision_function(X_validate)
                y_pred_validate_val = clf.predict(X_validate)
            else:
                y_pred_validate_val = clf.predict_proba(X_validate)[:, 1]
            print('auc:%.3f, %d samples in %ds (cw: %.2f)'
                  % (AUC(y_validate, y_pred_validate_val), total, time() - t0, cw))
        if n_batch > n_total_batch - 100:
            if regression:
                y_pred_validate_val = clf.predict(X_validate)
            else:
                y_pred_validate_val = clf.predict_proba(X_validate)[:, 1]
            recent_auc.append(AUC(y_validate, y_pred_validate_val))
    latest_auc_avg = np.mean(recent_auc)
    print('cw=%.2f, avg auc of last %d batches: %.3f'
          % (cw, len(recent_auc), latest_auc_avg))
    if regression:
        return clf.predict(X_test)
    else:
        return clf.predict_proba(X_test)[:, 1]
def sgd_text_model(x_train, y_train, x_test, x_valid, cache_name, use_cache=False):
    if use_cache:
        fhand = open(cache_name, 'rb')  # pickle files must be opened in binary mode
        data_dict = pickle.load(fhand)
        return data_dict['test_pred'], data_dict['valid_pred']
    np.random.seed(seed=123)
    # the long-deprecated p/rho aliases were dropped from the argument list
    model = SGDRegressor(eta0=1000, fit_intercept=True, l1_ratio=0.15,
                         learning_rate='invscaling', loss='huber',
                         max_iter=200, penalty='l1', power_t=.1,
                         random_state=123, shuffle=True, verbose=0,
                         warm_start=False)
    model.fit(x_train, y_train)
    test_pred = model.predict(x_test)
    valid_pred = model.predict(x_valid)
    data_dict = {'test_pred': test_pred, 'valid_pred': valid_pred}
    fhand = open(cache_name, 'wb')
    pickle.dump(data_dict, fhand)
    fhand.close()
    return test_pred, valid_pred
def fit(self, U, Y):
    self.initialize()
    # learn X
    #X = self.getX(U, Y)
    X = self.getXBatched(U, Y, TSData.batchSize)
    print("Starting to train the model...")
    #clf = ElasticNet(alpha=5, l1_ratio=0.5, max_iter=50000)
    #for x1, y1 in izip(X, Y):
    #    clf.partial_fit(x1[np.newaxis, :], y1)
    # If not using a generator, materialize X
    X = np.array([i for i in X])
    #X = np.array(X)
    print(X.shape)
    print(Y.shape)
    clf = SGDRegressor(max_iter=100)
    clf.fit(X, np.ravel(Y))
    print(metrics.mean_absolute_error(clf.predict(X), Y))
    print(TSData().getScore(Y, clf.predict(X)))
    self.clf = clf
def test_multi_target_regression_partial_fit():
    X, y = datasets.make_regression(n_targets=3)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]
    references = np.zeros_like(y_test)
    half_index = 25
    for n in range(3):
        sgr = SGDRegressor(random_state=0)
        sgr.partial_fit(X_train[:half_index], y_train[:half_index, n])
        sgr.partial_fit(X_train[half_index:], y_train[half_index:, n])
        references[:, n] = sgr.predict(X_test)
    sgr = MultiOutputRegressor(SGDRegressor(random_state=0))
    sgr.partial_fit(X_train[:half_index], y_train[:half_index])
    sgr.partial_fit(X_train[half_index:], y_train[half_index:])
    y_pred = sgr.predict(X_test)
    assert_almost_equal(references, y_pred)
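# --- Added note (not part of the original test): a minimal usage sketch of the
# pattern the test above verifies - MultiOutputRegressor clones one SGDRegressor
# per target column; the data here is synthetic and illustrative only.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from sklearn.multioutput import MultiOutputRegressor

X_mo, y_mo = make_regression(n_samples=100, n_features=5, n_targets=3, random_state=0)
mor = MultiOutputRegressor(SGDRegressor(max_iter=1000, random_state=0))
mor.fit(X_mo, y_mo)
print(mor.predict(X_mo[:2]).shape)  # (2, 3): one prediction per target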
def sgd_regressor(x, y, alpha):
    kf = KFold(n_splits=3)  # modern KFold API: split() yields the index pairs
    scores = []
    for train_index, test_index in kf.split(x):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        scaler = StandardScaler()
        scaler.fit(X_train)
        x_train = scaler.transform(X_train)
        x_test = scaler.transform(X_test)
        clf = SGDRegressor(loss='squared_loss', alpha=alpha)
        clf.fit(x_train, y_train)
        scores.append(mean_squared_error(clf.predict(x_test), y_test) ** 0.5)
    # print('SGDRegressor')
    return np.mean(scores)
def predictLinearRegress(attributeList, starTargetList):
    print("\nLinear Regression")
    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)
    lr = ml.linear.linearRegress(Xtrain, Ytrain)
    yHatInitial = lr.predict(Xtest)
    print("MSE test: ", mean_squared_error(yHatInitial, Ytest))
    print("RMSE test: ", math.sqrt(mean_squared_error(yHatInitial, Ytest)))
    incorrect = 0
    total = 0
    for i, value in enumerate(yHatInitial):
        if abs(yHatInitial[i] - Ytest[i]) > 0.5:
            incorrect += 1
        total += 1
    ratioIncorrect = float(incorrect) / float(total)
    print("Ratio incorrect: " + str(ratioIncorrect))
    # prepend a bias column before gradient descent
    onesCol = np.ones((len(Xtrain), 1))
    Xtrain = np.concatenate((onesCol, Xtrain), 1)
    onesCol = np.ones((len(Xtest), 1))
    Xtest = np.concatenate((onesCol, Xtest), 1)
    m, n = np.shape(Xtrain)
    clf = SGDRegressor(loss="squared_loss")
    clf.fit(Xtrain, Ytrain)
    yHat = clf.predict(Xtest)
    print("MSE after GD: ", mean_squared_error(yHat, Ytest))
    print("RMSE after GD: ", math.sqrt(mean_squared_error(yHat, Ytest)))
    incorrect = 0
    total = 0
    for i, value in enumerate(yHat):
        if abs(yHat[i] - Ytest[i]) > 0.5:
            incorrect += 1
        total += 1
    ratioIncorrect = float(incorrect) / float(total)
    print("Ratio incorrect: " + str(ratioIncorrect))
def predictCrossValidatedScore(trainFeatures, trainTargets, trainItemIds, isRegression=False):
    logging.info("Feature preparation done, fitting model...")
    randomPermutation = random.sample(range(trainFeatures.shape[0]), trainFeatures.shape[0])
    numPointsTrain = int(trainFeatures.shape[0] * 0.5)
    dataTrainFeatures = trainFeatures[randomPermutation[:numPointsTrain]]
    dataValidationFeatures = trainFeatures[randomPermutation[numPointsTrain:]]
    dataTrainTargets = [trainTargets[i] for i in randomPermutation[:numPointsTrain]]
    dataValidationTargets = [trainTargets[i] for i in randomPermutation[numPointsTrain:]]
    predicted_scores = []
    if isRegression:
        clf = SGDRegressor(penalty="l1", alpha=1e-4)
        print("trainFeatures rows::" + str(trainFeatures.shape[0]))
        print("trainTargets rows::" + str(len(trainTargets)))
        clf.fit(dataTrainFeatures, dataTrainTargets)
        logging.info("Predicting...")
        predicted_scores = clf.predict(dataValidationFeatures)
    else:
        clf = SGDClassifier(loss="log", penalty="l2", alpha=1e-4, class_weight="auto")
        print("trainFeatures rows::" + str(trainFeatures.shape[0]))
        print("trainTargets rows::" + str(len(trainTargets)))
        clf.fit(dataTrainFeatures, dataTrainTargets)
        logging.info("Predicting...")
        predicted_scores = clf.predict_proba(dataValidationFeatures).T[1]
    error = mean_squared_error(dataValidationTargets, predicted_scores)
    print("MSE: " + str(error))  # mean squared error, not a percentage
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed
from sklearn.metrics import mean_squared_error

df = pd.read_csv("forestfires.txt", index_col=False, sep=" ")
X = df.iloc[:, 0:-1].values
Y = df.iloc[:, -1].values
normalizer = Normalizer()
X = normalizer.fit_transform(X)
k_fold_cv = KFold(n_splits=10, shuffle=True)
sgdr = SGDRegressor()
for train_index, test_index in k_fold_cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    sgdr.fit(X_train, Y_train)
    pred = sgdr.predict(X_test)
    error = mean_squared_error(Y_test, pred)
    print(error)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# OLS
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
# predicting the value
y_pred1 = lin_reg.predict(X_test)
# r2_score result
from sklearn.metrics import r2_score, mean_squared_error
r_squared1 = r2_score(y_test, y_pred1)
print("Coefficient of Determination using OLS method = ", r_squared1)

# SGD
from sklearn.linear_model import SGDRegressor
regressor = SGDRegressor(max_iter=10000, tol=1e-3)
regressor.fit(X_train, y_train)
# predicting the value
y_pred = regressor.predict(X_test)
# r2_score result
r_squared = r2_score(y_test, y_pred)
print("Coefficient of Determination using SGD method = ", r_squared)
def test_regressor_regularization(normalize, loss):
    rng = np.random.RandomState(0)
    transformer = RBFSampler(n_components=100, random_state=0, gamma=10)
    transformer.fit(X)
    X_trans = transformer.transform(X)
    if normalize:
        X_trans = StandardScaler().fit_transform(X_trans)
    y, coef = generate_target(X_trans, rng, -0.1, 0.1)
    y_train = y[:n_train]
    y_test = y[n_train:]

    # overfitting
    clf = AdamRegressor(transformer, max_iter=300, warm_start=True,
                        verbose=False, fit_intercept=True, loss=loss,
                        alpha=0.0001, intercept_decay=1e-6, random_state=0,
                        tol=0, normalize=normalize)
    clf.fit(X_train[:100], y_train[:100])
    l2 = np.mean((y_train[:100] - clf.predict(X_train[:100])) ** 2)
    assert l2 < 0.01

    # underfitting
    clf_under = AdamRegressor(transformer, max_iter=100, warm_start=True,
                              verbose=False, fit_intercept=True, loss=loss,
                              alpha=100000, random_state=0, normalize=normalize)
    clf_under.fit(X_train, y_train)
    assert np.sum(clf_under.coef_ ** 2) < np.sum(clf.coef_ ** 2)

    # l1 regularization
    clf_l1 = AdamRegressor(transformer, max_iter=100, warm_start=True,
                           verbose=False, fit_intercept=True, loss=loss,
                           alpha=1000, l1_ratio=0.9, random_state=0,
                           normalize=normalize)
    clf_l1.fit(X_train, y_train)
    assert_almost_equal(np.sum(np.abs(clf_l1.coef_)), 0)

    # comparison with sgd
    sgd = SGDRegressor(alpha=0.01, max_iter=100, eta0=1,
                       learning_rate='constant', fit_intercept=True,
                       random_state=0)
    sgd.fit(X_trans[:n_train], y_train)
    test_l2_sgd = np.mean((y_test - sgd.predict(X_trans[n_train:])) ** 2)
    clf = AdamRegressor(transformer, max_iter=100, warm_start=True,
                        verbose=False, fit_intercept=True, loss=loss,
                        alpha=0.01, random_state=0, normalize=normalize)
    clf.fit(X_train, y_train)
    test_l2 = np.mean((y_test - clf.predict(X_test)) ** 2)
    assert test_l2 <= test_l2_sgd
for j in jsondata.keys():
    if j in SubjDict:
        data[SubjDict[j]] = jsondata[j]
train_X.append(data)
train_Y.append(jsondata['Mathematics'])
f.close()

# fit/train data
train_X = numpy.array(train_X)
test_X = numpy.array(test_X)
rsmax = 0
zmax = 0
LR = SGDRegressor(epsilon=0.17, fit_intercept=False, penalty='elasticnet',
                  loss='epsilon_insensitive', random_state=692,
                  alpha=0.000001, max_iter=4).fit(train_X[:, 1:], train_Y)
test_Y = LR.predict(test_X[:, 1:])
# clip predictions to the valid score range [2, 7]
for i in range(len(test_Y)):
    if test_Y[i] < 2:
        test_Y[i] = 2
    elif test_Y[i] > 7:
        test_Y[i] = 7

#### Predict the result
if local:
    # import test output
    filename = "sample-test2.out.json"
    f = open(filename)
    z = 0
    for x in test_Y:
        y = int(f.readline())
plt.plot(X, Y, 'b.')
plt.plot(X_test, Y_test, 'r-')
plt.show()
"""

# Example 2: ridge-style regression fitted directly with gradient descent.
# penalty: which regularization term to use; alpha: the regularization strength;
# l1_ratio: the L1 mixing parameter; max_iter: number of iterations.
# SGDRegressor can stand in for Ridge Regression, Lasso, and Elastic Net.
ridge = SGDRegressor(penalty="elasticnet", alpha=0.001, l1_ratio=0.15, max_iter=1000)
# train on the data
ridge.fit(X, Y)
# print the intercept
print(ridge.intercept_)
# print the weights W
print(ridge.coef_)
X_test = np.array([[0], [2]])
# predict
Y_test = ridge.predict(X_test)
plt.plot(X, Y, 'b.')
plt.plot(X_test, Y_test, 'r-')
plt.show()
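# --- Added note (not part of the original snippet): as the comment above says,
# SGDRegressor can stand in for Ridge, Lasso, and Elastic Net by switching the
# penalty argument. A hedged sketch on toy data (values illustrative only):
import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
X_demo = rng.rand(100, 1)
y_demo = 4 + 3 * X_demo[:, 0] + rng.randn(100)

for penalty in ("l2", "l1", "elasticnet"):  # Ridge / Lasso / Elastic Net analogues
    reg = SGDRegressor(penalty=penalty, alpha=0.001, max_iter=1000)
    reg.fit(X_demo, y_demo)
    print(penalty, reg.intercept_, reg.coef_)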
X_train_poly_scaled = poly_scaler.fit_transform(X_train)
X_val_poly_scaled = poly_scaler.transform(X_val)

sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, penalty=None, eta0=0.0005,
                       warm_start=True, learning_rate="constant", random_state=42)
n_epochs = 500
train_errors, val_errors = [], []
for epoch in range(n_epochs):
    sgd_reg.fit(X_train_poly_scaled, y_train)
    y_train_predict = sgd_reg.predict(X_train_poly_scaled)
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    train_errors.append(mean_squared_error(y_train, y_train_predict))
    val_errors.append(mean_squared_error(y_val, y_val_predict))

best_epoch = np.argmin(val_errors)
best_val_rmse = np.sqrt(val_errors[best_epoch])

plt.annotate('Best model',
             xy=(best_epoch, best_val_rmse),
             xytext=(best_epoch, best_val_rmse + 1),
             ha="center",
             arrowprops=dict(facecolor='black', shrink=0.05),
             fontsize=16,
             )
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error

# A model that is first-order in its parameters or its inputs is a linear model,
# but first-order parameters do not guarantee a linear input-output relation.
# Loss function: squared error.
# Optimization methods:
#   Normal equation: w = (X.T X)^-1 X.T y - slow to solve, suited to small datasets
#   Gradient descent

# load the dataset
data = load_boston()
# split the dataset
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)
# feature engineering
trans = StandardScaler()
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)
# linear regression via SGD
estimator = SGDRegressor(eta0=0.001, max_iter=10000)
print(x_train.shape, y_train.shape)
estimator.fit(x_train, y_train)
# model evaluation: measured with mean squared error
print(estimator.coef_)       # regression coefficients
print(estimator.intercept_)  # bias
y_pred = estimator.predict(x_test)
score = mean_squared_error(y_test, y_pred)
print(score)
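# --- Added note (not part of the original snippet): a NumPy sketch of the
# normal equation quoted in the comments above, w = (X^T X)^{-1} X^T y, on
# synthetic data; fine for small datasets, slow for large ones.
import numpy as np

rng = np.random.RandomState(22)
X_ne = np.c_[np.ones(50), rng.rand(50, 2)]  # prepend a bias column
w_true = np.array([1.0, 2.0, -3.0])
y_ne = X_ne @ w_true + 0.01 * rng.randn(50)

w_hat = np.linalg.inv(X_ne.T @ X_ne) @ X_ne.T @ y_ne  # closed-form solution
print(w_hat)  # should be close to w_true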
Y_test = ss_Y.transform(Y_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))
The 1-D arrays need to be reshaped into 2-D form
"""

"""Below, linear regression and SGD regression are used for parameter estimation and prediction"""
lr = LinearRegression()
lr.fit(X_train, Y_train)
lr_Y_predict = lr.predict(X_test)
sgdr = SGDRegressor(max_iter=5)  # cap the maximum number of iterations at 5
sgdr.fit(X_train, Y_train.ravel())
sgdr_Y_predict = sgdr.predict(X_test)
"""
My Y is 2-D with shape (shapes, 1); it has to be flattened to 1-D (shapes,).
Passing y_train.ravel() to fit - i.e. changing sgdr.fit(X_train, Y_train) to
sgdr.fit(X_train, Y_train.ravel()) - makes the warning disappear.
"""

"""Next, evaluate accuracy"""
print('the value of default measurement of LinearRegression is:',
      lr.score(X_test, Y_test))
print('the value of R_squared of LinearRegression is',
      r2_score(Y_test, lr_Y_predict))
print('the mean squared error of LinearRegression is',
      mean_squared_error(ss_Y.inverse_transform(Y_test),
                         ss_Y.inverse_transform(lr_Y_predict)))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor

np.random.seed(0)
x, y = make_regression(n_samples=100, n_features=1, noise=10)
plt.scatter(x, y)

model = SGDRegressor(max_iter=1000, eta0=0.001)
model.fit(x, y)
print('Coeff R2 =', model.score(x, y))
plt.scatter(x, y)
plt.plot(x, model.predict(x), c='pink', lw=3)
from sklearn.preprocessing import StandardScaler

plt.figure()                  # create the figure
plt.title('single variable')  # figure title
plt.xlabel('x')               # x-axis label
plt.ylabel('y')               # y-axis label
plt.grid(True)                # draw grid lines

X_scaler = StandardScaler()
y_scaler = StandardScaler()
# the six base points repeated, plus a partial final block
X = [[50], [100], [150], [200], [250], [300]] * 6 + [[50], [100], [150]]
y = [[150], [200], [250], [280], [310], [330]] * 6 + [[150], [200], [250]]
X = X_scaler.fit_transform(X)
y = y_scaler.fit_transform(y)
X_test = [[40], [400]]        # used for the final sanity check
X_test = X_scaler.transform(X_test)
plt.plot(X, y, 'k.')

model = SGDRegressor()
model.fit(X, y.ravel())
y_result = model.predict(X_test)
plt.plot(X_test, y_result, 'g-')
plt.show()                    # show the figure
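# --- Added note (not part of the original snippet): y_result above is in
# standardized units; assuming the y_scaler and y_result defined above, the
# predictions can be mapped back to the original scale like this.
# inverse_transform expects a 2-D array, hence the reshape.
y_result_orig = y_scaler.inverse_transform(y_result.reshape(-1, 1))
print(y_result_orig)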
for i in range(0, N):
    temp = input().split("\t")
    dates.append(datetime.strptime(temp[0], '%m/%d/%Y %H:%M:%S'))
    try:
        stock_list.append(float(temp[1]))
    except:
        stock_list.append(np.nan)
        missing_list.append(int(i))

df_stock = pd.DataFrame({"date": dates, "price": stock_list})
missing_dates = df_stock[df_stock['price'].isnull()]['date'].values
missing_dates = missing_dates.astype('datetime64[D]').astype(int)
missing_dates = [[x] for x in missing_dates]
df_stock = df_stock.dropna()
X = [[x] for x in df_stock['date'].values]
y = df_stock['price'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, shuffle=False)
mdl = SGDRegressor(shuffle=False, max_iter=5000, learning_rate='optimal',
                   random_state=0, n_iter_no_change=30)
mdl.fit(X_train, y_train)
y_pred = mdl.predict(missing_dates)
for pred in y_pred:
    print(pred)
data = comm.bcast(data, root=0)
n_jobs = get_args().n_jobs
chunk_len = int(len(data['grid']) / n_jobs)
offset = chunk_len * rank
data_chunk = numpy.asarray(data['grid'])[offset:(offset + chunk_len)]
min_mse = 1000
best_param = {}
for param_set in data_chunk:  # renamed from `set`, which shadows the builtin
    model = SGDRegressor(alpha=param_set[1], max_iter=param_set[0],
                         random_state=settings.seed)
    model.fit(data['X'], data['y'])
    preds = model.predict(data['X_t'])
    pred = preds.reshape(len(preds))
    real = data['y_t']
    mse = mean_squared_error(real, pred)
    if mse < min_mse:
        min_mse = mse
        best_param = param_set
if rank == 0:
    result.append([min_mse, best_param])
    for i in range(1, n_jobs):
        p_res = comm.recv(source=i)
        result.append(p_res)
# In[13]:

sgd = SGDRegressor()
sgd

# In[14]:

sgd = SGDRegressor().fit(topics.values, nums.favorite_count)

# Well, that was **much** faster...

# In[15]:

predicted_favorites = sgd.predict(topics.values)
predicted_favorites

# In[16]:

np.sum(predicted_favorites >= 1)

# Well that seems more "balanced" at least.
# And it's nice to have a continuous score.

# In[17]:

np.sum(nums.favorite_count.values >= 1)
# y_test = ss_y.transform(column_or_1d(y_test))
# The array's new shape must be compatible with the original; if a dimension
# is -1, NumPy computes that dimension from the remaining ones.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)  # linear regression

from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(max_iter=5)       # at most 5 iterations
sgd.fit(x_train, y_train.ravel())    # flatten y to 1-D
sgd_y_predict = sgd.predict(x_test)  # stochastic gradient descent

print("The value of default measurement of LinearRegression is ",
      lr.score(x_test, y_test))

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
print("The value of R-squared of LinearRegression is ",
      r2_score(y_test, lr_y_predict))
print("The mean of squared error of LinearRegression is ",
      mean_squared_error(ss_y.inverse_transform(y_test),
                         ss_y.inverse_transform(lr_y_predict)))
print("The mean of absolute error of LinearRegression is ",
      mean_absolute_error(ss_y.inverse_transform(y_test),
                          ss_y.inverse_transform(lr_y_predict)))
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# File: ridge_regression.py

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor

__author__ = 'yasaka'

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

"""
ridge_reg = Ridge(alpha=1, solver='sag')
ridge_reg.fit(X, y)
print(ridge_reg.predict([[1.5]]))
print(ridge_reg.intercept_)
print(ridge_reg.coef_)
"""

sgd_reg = SGDRegressor(penalty='l2', max_iter=1000)
sgd_reg.fit(X, y.ravel())
print(sgd_reg.predict([[1.5]]))  # predict expects a 2-D array
print("W0=", sgd_reg.intercept_)
print("W1=", sgd_reg.coef_)
print('MAE:', mean_absolute_error(testing_labels, preds), '\n')

# PCA + Bayesian Ridge Regression
br = BayesianRidge()
br.fit(reduced_training_features, training_labels)
preds = br.predict(reduced_testing_features)
score = br.score(reduced_testing_features, testing_labels)
print('PCA + Bayesian Ridge Regression Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels, preds))

# Stochastic Gradient Descent Regression
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor()
sgd.fit(training_features, training_labels)
preds = sgd.predict(testing_features)
score = sgd.score(testing_features, testing_labels)
print('SGD Regression Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels, preds), '\n')

# PCA + Stochastic Gradient Descent Regression
sgd = SGDRegressor()
sgd.fit(reduced_training_features, training_labels)
preds = sgd.predict(reduced_testing_features)
score = sgd.score(reduced_testing_features, testing_labels)
print('PCA + SGD Regression Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels, preds))

# Polynomial Regression
print(submission_features.head(3))

# # Prediction

# In[58]:

X_test = np.asarray(submission_features)[:, :-2]
y_true = np.asarray(submission_features)[:, -2]
clf = SGDRegressor()
y_pred = np.zeros(len(X_test))
local_df = features[features.DATE < df2.DATE[0] - DateOffset(days=3)]
X_train = np.asarray(local_df)[:, :-2]
y_train = np.asarray(local_df)[:, -2]
clf.partial_fit(X_train, y_train)
y_pred[0] = clf.predict([X_test[0]])[0]  # predict expects a 2-D array
for i in trange(1, len(X_test)):
    local_df = features[(features.DATE > df2.DATE[i - 1]) &
                        (features.DATE < (df2.DATE[i] - DateOffset(days=3)))]
    X_train = np.asarray(local_df)[:, :-2]
    y_train = np.asarray(local_df)[:, -2]
    if X_train.shape[0] != 0:
        clf.partial_fit(X_train, y_train)
    y_pred[i] = clf.predict([X_test[i]])[0]

# In[59]:

y_pred_round = [int(math.ceil(x)) if x > 0 else 0 for x in y_pred]
# print(y_pred_round)

# # Output
from sklearn.linear_model import SGDRegressor

X = [[0, 0], [2, 1], [5, 4]]
y = [0, 2, 2]

# penalty: the regularization term; one of none, l1, l2, elasticnet.
# The l2 penalty corresponds to ridge regression; l1 corresponds to Lasso.
reg = SGDRegressor(penalty="l2", max_iter=10000)
reg.fit(X, y)

print(reg.predict([[4, 3]]))
print(reg.intercept_)
print(reg.coef_)
train_prepared = full_pipeline.fit_transform(train_set)
test_prepared = full_pipeline.transform(test_set)  # transform only: never refit on test data

from sklearn.base import clone
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(max_iter=1, tol=None, warm_start=True, penalty="l2",
                       learning_rate="constant", eta0=0.0005)  # one epoch per fit call
minimum_val_error = float("inf")
best_epoch = None
best_model = None
for epoch in range(1000):
    sgd_reg.fit(train_prepared, train_label)  # continues where it left off
    y_val_predict = sgd_reg.predict(test_prepared)
    val_error = mean_squared_error(y_val_predict, test_label)
    print(val_error)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        best_model = clone(sgd_reg)
print(best_epoch)

test = pd.read_csv("test.csv", index_col="Id")
missed_cols = ['Utilities_NoSeWa', 'Condition2_RRAe', 'Condition2_RRAn',
               'Condition2_RRNn', 'HouseStyle_2.5Fin', 'RoofMatl_ClyTile',
               'RoofMatl_Membran', 'RoofMatl_Metal', 'RoofMatl_Roll',
               'Exterior1st_ImStucc', 'Exterior1st_Stone', 'Exterior2nd_Other',
               'Heating_Floor', 'Heating_OthW', 'Electrical_Mix',
               'GarageQual_Ex', 'PoolQC_Fa', 'MiscFeature_TenC']
for col in missed_cols:
    test[col] = [0] * 1459
def stacklearning(self):
    class sparseNorm(BaseEstimator, TransformerMixin):
        def __init__(self):
            pass

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            from sklearn import preprocessing
            Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
            return Y

    fm = sgd.FMRegression(
        n_iter=4743, init_stdev=0.1, rank=100,
        l2_reg_w=0, l2_reg_V=0, step_size=0.1,
    )
    fm = sgd.FMRegression(
        n_iter=9943, init_stdev=0.1, rank=219,
        l2_reg_w=0, l2_reg_V=0.06454, step_size=0.1,
    )

    pipe = make_pipeline(sparseNorm(), fm)
    calcACC(pipe, X=X2)

    xgb = xgboost.XGBRegressor(
        n_estimators=100, max_depth=7, gamma=0, colsample_bytree=0.1
    )
    lgbm = LGBMRegressor(
        boosting_type='gbdt', num_leaves=367,
        learning_rate=0.06, feature_fraction=0.14,
        max_depth=28, min_data_in_leaf=8
    )
    rgf = RGFRegressor(
        max_leaf=1211, algorithm="RGF", test_interval=100,
        loss="LS", verbose=False, l2=0.93, min_samples_leaf=2
    )
    rf = RandomForestRegressor(
        max_depth=20, random_state=0, n_estimators=56,
        min_samples_split=2, max_features=0.21
    )
    rf = RandomForestRegressor()
    ext = ExtraTreesRegressor(
        n_estimators=384, max_features=2228,
        min_samples_split=0.01, max_depth=856, min_samples_leaf=1
    )
    svr = SVR(
        gamma=9.5367431640625e-07, epsilon=0.0009765625, C=2048.0
    )

    # test combinations
    desNew = make_pipeline(extdescriptorNew(), rf)
    morNew = make_pipeline(extMorganNew(), rf)
    kotNew = make_pipeline(extklekotaTothNew(), rf)
    macNew = make_pipeline(extMACCSNew(), rf)
    desMac = make_pipeline(extDescriptorMACCS(), rf)
    morMac = make_pipeline(extMorganMACCS(), rf)
    kotMac = make_pipeline(extKlekotaTothMACCS(), rf)
    morKotNew = make_pipeline(extMorganKlekotaTothNew(), rf)
    des = make_pipeline(extOnlyDescriptor(), rf)
    mor = make_pipeline(extOnlyMorgan(), rf)
    kot = make_pipeline(extOnlyklekotaToth(), rf)
    mac = make_pipeline(extOnlyMACCS(), rf)
    all = make_pipeline(extAll(), rf)
    allwithoutNew = make_pipeline(extAllwithoutNew(), rf)
    allwithoutMaccs = make_pipeline(extAllwithoutMaccs(), rf)
    allwithoutDes = make_pipeline(extAllwithoutDescriptor(), rf)
    testDic = {"Desc+New": desNew, "Mor+New": morNew, "kot+New": kotNew,
               "MACCS+New": macNew, "Des+MAC": desMac, "Morgan+Maccs": morMac,
               "Kot+MACCS": kotMac, "mor+kot+New": morKotNew,
               "descriptor": des, "morgan": mor, "kot": kot, "MACCS": mac,
               "All": all, "All without new": allwithoutNew,
               "All without MACCS": allwithoutMaccs,
               "All without Des": allwithoutDes}

    # 10-fold
    cv = KFold(n_splits=10, shuffle=True, random_state=0)

    # Fingerprint test
    resultDic = {}
    resultDic2 = {}
    for name, model in testDic.items():
        #model = StackingRegressor(regressors=[name], meta_regressor=rf, verbose=1)
        #calcACC(model, X=X, y=y2, name=name)
        Scores = cross_validate(model, X2, y2, cv=cv, scoring=myScoreFunc)
        RMSETmp = Scores['test_RMSE'].mean()
        CORRTmP = Scores['test_Correlation coefficient'].mean()
        resultDic.update({name: [RMSETmp, CORRTmP]})
        print(name, RMSETmp, CORRTmP)

    # stacking
    alldata = make_pipeline(extAll())
    # random forest
    # 1.1546 0.70905
    stack = StackingRegressor(regressors=[alldata], meta_regressor=rf, verbose=1)
    # Light Gradient boosting
    # 1.160732 0.703776
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=lgbm, verbose=1)
    # XGboost
    # 1.1839805 0.689571
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=xgb, verbose=1)
    # Regularized greedily forest
    # 1.17050 0.6992
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=rgf, verbose=1)

    # pls 22.808047774809697 0.6410026452910016
    i = 4
    for i in np.arange(3, 11, 1):
        pls = PLSRegression(n_components=i)
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=pls, verbose=0)
        calcACC(testmodel)
    pls = PLSRegression(n_components=4)

    # SVR
    svr = SVR(gamma=9.5367431640625 / 10000000, C=1559.4918100725592,
              epsilon=0.0009765625)
    svr = SVR(kernel='rbf', gamma=9.5367431640625e-07,
              epsilon=0.0009765625, C=2048.0)
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=svr, verbose=1)
    calcACC(svr)

    # Extratree 1.157420824123527 0.7061010221224269
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=ext, verbose=1)
    calcACC(testmodel)

    # k-NN
    nbrs = KNeighborsRegressor(3)

    ## Linear regressions
    # Stochastic Gradient Descent
    sgd = SGDRegressor(max_iter=1000)

    # Ridge
    for i in [1, 10, 100, 1000]:
        ridge = Ridge(alpha=i)
        calcACC(ridge)
    ridge = Ridge(alpha=45.50940042350705)
    calcACC(ridge)

    # multiple linear
    lin = make_pipeline(forlinear(), LinearRegression(n_jobs=-1))
    calcACC(lin)

    # stacking
    # 0.69
    testmodel = StackingRegressor(regressors=[alldata, nbrs, all],
                                  meta_regressor=rf, verbose=1)
    # 1.1532 0.70926
    testmodel = StackingRegressor(regressors=[alldata, nbrs, all, xgb, lgbm, rgf],
                                  meta_regressor=rf, verbose=1)
    # 1.16420 0.7041
    testmodel = StackingRegressor(regressors=[alldata, alldata, all],
                                  meta_regressor=rf, verbose=1)
    # 1.16379 0.7044
    stack1 = StackingRegressor(regressors=[alldata, nbrs, all, xgb, lgbm, rgf],
                               meta_regressor=rf, verbose=1)
    testmodel = StackingRegressor(regressors=[alldata, stack1, stack1],
                                  meta_regressor=rf, verbose=1)
    # 1.1535496740699531 0.7108839199109559
    pcaFeature = make_pipeline(extPCA())
    testmodel = StackingRegressor(regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf],
                                  meta_regressor=rf, verbose=1)
    # 1.181801005432221 0.6889745579620922
    testmodel = StackingRegressor(regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf],
                                  meta_regressor=lgbm, verbose=1)
    # 0.70613
    testmodel = StackingRegressor(regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf, ext],
                                  meta_regressor=xgb, verbose=1)
    # 0.71641717
    testmodel = StackingRegressor(regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf, ext],
                                  meta_regressor=rf, verbose=1)
    # 0.7146922
    testmodel = StackingRegressor(regressors=[pcaFeature, alldata, nbrs, ridge, rf, xgb, lgbm, rgf, ext],
                                  meta_regressor=rf, verbose=1)

    # new features
    pcaFeature = make_pipeline(extPCA())

    # old
    pipe1 = make_pipeline(extMACCS(), rf)
    pipe2 = make_pipeline(extMorgan(), rf)
    pipe3 = make_pipeline(extDescriptor(), rf)
    pipe4 = make_pipeline(extPCA(), rgf)
    pipe7 = make_pipeline(extDescriptor(), rgf)
    pipe8 = make_pipeline(extDescriptor(), rgf)

    xgb = xgboost.XGBRegressor()
    nbrs = KNeighborsRegressor(2)
    svr = SVR(gamma='auto', kernel='linear')
    pls = PLSRegression(n_components=4)

    extMACCSdata = make_pipeline(extMACCS())
    nbrsPipe = make_pipeline(extMorgan(), nbrs)
    pipe6 = make_pipeline(extMACCS(), rgf)
    alldata = make_pipeline(extAll())
    ave = extAverage()
    withoutdesc = make_pipeline(extMACCS())
    meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)

    #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)

    # 0.70
    stack = StackingRegressor(regressors=[pipe1, pipe2, pipe3, xgb, lgbm, rgf, rf],
                              meta_regressor=ave, verbose=1)
    #stack2 = StackingRegressor(regressors=[stack1, nbrs, svr, pls, rgf], meta_regressor=lgbm, verbose=1)

    # 0.69 ######################
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3], meta_regressor=rf, verbose=1)
    # 0.70
    stack2 = StackingRegressor(regressors=[stack1, alldata, rgf, lgbm, xgb],
                               meta_regressor=rf, verbose=1)
    # 0.71
    stack3 = StackingRegressor(regressors=[stack2, pipe1], meta_regressor=ave, verbose=1)
    ###########################

    ###########################
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3], meta_regressor=rf, verbose=1)
    stack2 = StackingRegressor(regressors=[stack1, withoutdesc, lgbm, rgf],
                               meta_regressor=rf, verbose=1)
    stack3 = StackingRegressor(regressors=[stack2, pipe1, xgb],
                               meta_regressor=ave, verbose=1)
    ###########################

    # stacking with knn
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3], meta_regressor=rf, verbose=1)
    stack2 = StackingRegressor(regressors=[stack1, nbrs, pipe1], meta_regressor=rf, verbose=1)
    #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)

    cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    St1Scores = cross_validate(stack1, X, y, cv=cv)
    St1Scores['test_score'].mean() ** (1 / 2)
    St2Scores = cross_validate(stack2, X, y, cv=cv)
    St2Scores['test_score'].mean() ** (1 / 2)
    St3Scores = cross_validate(stack3, X, y, cv=cv)
    St3Scores['test_score'].mean() ** (1 / 2)
    stackScore = cross_validate(stack, X, y, cv=cv)
    stackScore['test_score'].mean() ** (1 / 2)
    lgbmScores = cross_validate(lgbm, X, y, cv=cv)
    lgbmScores['test_score'].mean() ** (1 / 2)
    rgfScores = cross_validate(rgf, X, y, cv=cv)
    rgfScores['test_score'].mean() ** (1 / 2)
    RFScores = cross_validate(rf, X, y, cv=cv)
    RFScores['test_score'].mean() ** (1 / 2)
    scores = cross_validate(stack2, X, y, cv=cv)
    scores['test_score'].mean() ** (1 / 2)
    print("R^2 Score: %0.2f (+/- %0.2f) [%s]"
          % (scores['test_score'].mean(), scores['test_score'].std(), 'stacking'))

    stack3.fit(X, y)
    y_pred = stack3.predict(X_train)
    y_val = stack3.predict(X_test)
    #stack3.score(X_train, y_train)
    exX = preprocess(extractDf, changeList)
    valy = (10 ** (stack3.predict(exX))).tolist()
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

    stack1.fit(X, y)
    valy = (10 ** (stack1.predict(exX))).tolist()

    sgd.fit(X, y)
    valy = (10 ** (sgd.predict(exX))).tolist()

    rgfpipe = make_pipeline(extMACCS(), rf)
    rgf.fit(X, y)
    valy = (10 ** (rgf.predict(exX))).tolist()

    nbrs.fit(X, y)
    valy = (10 ** (nbrs.predict(exX))).tolist()

    pipe = make_pipeline(extMACCS(), rf)
    pipe.fit(X, y)
    valy = (10 ** (pipe.predict(exX))).tolist()

    rf.fit(X, y)
    y_pred = rf.predict(X_train)
    y_val = rf.predict(X_test)
    exX = preprocess(extractDf, changeList)
    valy = (10 ** (rf.predict(exX))).tolist()
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

    lgbm.fit(X, y)
    #y_pred = pipe1.predict(X_train)
    #y_val = pipe1.predict(X_test)
    exX = preprocess(extractDf, changeList)
    valy = (10 ** (lgbm.predict(exX))).tolist()
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
class Model(object):

    def __init__(self, params):
        self.model_class = params['class']
        self.model = {}
        self.feature_constructor = None
        self.all_possible_decisions = []
        self.X = []
        self.y = []
        self.buffer = 0

    def initialize(self):
        if self.model_class == 'scikit':
            self.model = SGDRegressor(loss='squared_loss', alpha=0.1,
                                      max_iter=10, shuffle=True, eta0=0.0001)
            self.feature_constructor = FeatureHasher(n_features=200, dtype=np.float64,
                                                     input_type='dict')  # non_negative was removed in recent sklearn
        elif self.model_class == 'lookup_table':  # was 'lookup'; every other method checks 'lookup_table'
            self.model = {}

    def clean_buffer(self):
        self.X = []
        self.y = []
        self.buffer = 0

    def return_design_matrix(self, all_decision_states, reward=None):
        if self.model_class == 'lookup_table':
            return all_decision_states, reward
        elif self.model_class == 'scikit':
            X, y = [], []
            for decision_state in all_decision_states:
                information, decision_taken = decision_state
                tr = {}
                tr['-'.join([str(information[1]), decision_taken])] = 1
                tr['-'.join([str(information[0]), decision_taken])] = 1
                tr['-'.join([str(information[0]), str(information[1]), decision_taken])] = 1
                X.append(tr)
                y.extend([reward])
            X = self.feature_constructor.transform(X).toarray()
            return X, y

    def fit(self, X, y):
        if self.model_class == 'scikit':
            # X, y = self.shuffle_data(X, y)
            self.model.partial_fit(X, y)
            print(self.model.score(X, y))
        if self.model_class == 'lookup_table':
            for decision_state in X:
                if decision_state not in self.model:
                    for d in self.all_possible_decisions:
                        self.model[(decision_state[0], d)] = DecisionState()
                self.model[decision_state].count += 1
                # incremental mean update of the value estimate
                updated_value = self.model[decision_state].value_estimate + \
                    (1.0 / self.model[decision_state].count) * \
                    (y - self.model[decision_state].value_estimate)
                self.model[decision_state].value_estimate = updated_value

    def predict(self, X):
        if self.model_class == 'scikit':
            return self.model.predict(X)
        if self.model_class == 'lookup_table':
            if X not in self.model:
                for d in self.all_possible_decisions:
                    self.model[(X[0], d)] = DecisionState()
            return self.model[X].value_estimate

    @staticmethod
    def shuffle_data(a, b):
        assert len(a) == len(b)
        p = np.random.permutation(len(a))
        return a[p], b[p]
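# --- Added note (not part of the original class): a minimal sketch of the
# FeatureHasher pattern used by return_design_matrix above - dict features
# hashed into a fixed-width vector; the keys here are made up for illustration.
from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=8, input_type='dict')
X_hashed = hasher.transform([{'ctx1-a': 1}, {'ctx1-b': 1, 'ctx2-b': 1}])
print(X_hashed.toarray())  # each dict becomes one row of width n_features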
# Import LinearRegression from sklearn.linear_model.
from sklearn.linear_model import LinearRegression
# Initialize the linear regressor LinearRegression with default settings.
lr = LinearRegression()
# Estimate parameters on the training data.
lr.fit(X_train, y_train)
# Run regression predictions on the test data.
lr_y_predict = lr.predict(X_test)

# Import SGDRegressor from sklearn.linear_model.
from sklearn.linear_model import SGDRegressor
sgdr = SGDRegressor()
sgdr.fit(X_train, y_train)
sgdr_y_predict = sgdr.predict(X_test)

"""
Evaluate this section's models with three regression metrics and the two ways
of invoking the R-squared evaluation module.
"""
# Use LinearRegression's built-in scoring module and print the result.
print('The value of default measurement of LinearRegression is',
      lr.score(X_test, y_test))

# Import r2_score, mean_squared_error and mean_absolute_error from
# sklearn.metrics for evaluating regression performance.
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Use the r2_score module and print the result.
print('The value of R-squared of LinearRegression is',
      r2_score(y_test, lr_y_predict))
# Use the mean_squared_error module and print the result.
print('The mean squared error of LinearRegression is',
      mean_squared_error(ss_y.inverse_transform(y_test),
                         ss_y.inverse_transform(lr_y_predict)))
# forget the oldest
train_data_n_frames = train_data_n_frames[1:]
render()
n_frames_reward -= rewards[0]
rewards = rewards[1:]

# try to predict the reward of every action
action = 0  # env.action_space.sample()
curr_max_reward_for_action = 0.
before_action_observation = observation
for try_action in range(env.action_space.n):
    try_data = u.concatNewStep(train_data_n_frames, observation, try_action)
    predicted = rf.predict([u.to1D(try_data)])[0]
    print(predicted)
    if predicted > curr_max_reward_for_action:
        action = try_action
        curr_max_reward_for_action = predicted
        #print(curr_max_reward_for_action)

# apply the best predicted action
observation, reward, done, info = env.step(action)
print('\naction: (' + str(action) + ') reward: ' + str(reward))
n_frames_reward += reward
rewards = np.hstack((rewards, reward))
train_data_n_frames = u.concatNewStep(train_data_n_frames, before_action_observation, action)
if done:
print('Linear Regression Train RMSE:', train_rmse)
train_r2 = r2_score(y_train, y_train_pred)
print('Linear Regression Train R^2:', train_r2)

# Predictions on the test set
y_test_pred = lin_reg.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)
print('Linear Regression Test RMSE:', test_rmse)
print('Linear Regression Test R^2:', test_r2)

# LinearRegression vs SGDRegressor
sgd_reg = SGDRegressor(random_state=1)   # create the model
sgd_reg.fit(X_train, y_train)            # train the model
y_train_pred = sgd_reg.predict(X_train)  # training-set predictions
# -> training-set RMSE, R2-score
y_test_pred = sgd_reg.predict(X_test)    # test-set predictions
# -> test-set RMSE, R2-score

# Using a Scaler -> Pipeline
pipe1 = Pipeline([('scaler', StandardScaler()),
                  ('regressor', LinearRegression())])
pipe1.fit(X_train, y_train)              # train
y_train_pred = pipe1.predict(X_train)    # train predictions
# -> Train RMSE, R2-score
y_test_pred = pipe1.predict(X_test)      # test predictions

scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
Y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
Y_train = Y_scaler.fit_transform(Y_train)
X_test = X_scaler.transform(X_test)
Y_test = Y_scaler.transform(Y_test)
print(X_train[0:5])
print(len(X_train))
print(Y_test)

clf = SGDRegressor(loss="squared_loss")
scores = cross_val_score(clf, X_train, Y_train, cv=5)
print(scores)
print(np.mean(scores))
clf.fit(X_train, Y_train)  # SGDRegressor has no fit_transform; fit is the right call
pred = clf.predict(X_test)
print(clf.score(X_test, Y_test))
# correlation(X_train, Y_train)
# feature_selection(X_train, Y_train)
scatter_plot(X_train, Y_train)
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# File: elastic_net.py

import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

__author__ = 'yasaka'

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

elastic_net = ElasticNet(alpha=0.0001, l1_ratio=0.15)
elastic_net.fit(X, y)
print(elastic_net.predict([[1.5]]))  # predict expects a 2-D array

sgd_reg = SGDRegressor(penalty='elasticnet', max_iter=1000)
sgd_reg.fit(X, y.ravel())
print(sgd_reg.predict([[1.5]]))
def main():
    inmatesMap = mapCreator()
    featureVector = createFeatureVector()
    allInmateCrimes = []
    allInmateCrimesYValues = []
    allInmates = []
    allInmateYValues = []
    for inmate in inmatesMap:
        if 'IncarcerationDate' not in inmatesMap[inmate]:
            continue
        if inmatesMap[inmate]['PrisonReleaseDate'] == '':
            inmatesMap[inmate]['PrisonReleaseDate'] = \
                inmatesMap[inmate]['IncarcerationDate'] + datetime.timedelta(days=36525)
        if (inmatesMap[inmate]["PrisonReleaseDate"] - inmatesMap[inmate]["IncarcerationDate"]).days <= 0:
            continue
        currentPerson = extractFeatures(inmatesMap[inmate], featureVector)
        sentenceLength = (inmatesMap[inmate]["PrisonReleaseDate"] -
                          inmatesMap[inmate]["IncarcerationDate"]).days
        if 'CURRENT_OFFENSES' in inmatesMap[inmate]:
            for offense in inmatesMap[inmate]['CURRENT_OFFENSES']:
                crimeDescription = "CURRENT_" + offense["adjudicationcharge_descr"]
                allInmateCrimes.append(crimeDescription)
                allInmateCrimesYValues.append(sentenceLength)
        allInmates.append(currentPerson)
        # allInmateYValues.append(inmatesMap[inmate]["prisonterm"])
        allInmateYValues.append(sentenceLength)

    X = allInmates[:10000]
    y = allInmateYValues[:10000]
    # print(testSet)
    # print(testSetY)
    sgd = SGDRegressor(loss='epsilon_insensitive', fit_intercept=True,
                       learning_rate='constant', max_iter=4, penalty='none',
                       epsilon=0)
    sgd.fit(X, y)
    sgdPredictedSetY = []
    sgdTrueSetY = []
    for i in range(10001, 20001):
        sgdTrueSetY.append(allInmateYValues[i])
        sgdPredictedSetY.append(sgd.predict(allInmates[i]))
    percentErrors = []
    print("SGD Mean absolute test error:",
          util.mean_absolute_percentage_error(sgdTrueSetY, sgdPredictedSetY, percentErrors))
    print("SGD Standard deviation:", np.std(np.array(percentErrors)))

    svr = svm.SVR()
    svr.fit(X, y)
    svrPredictedSetY = []
    svrTrueSetY = []
    for i in range(10001, 20001):
        print("true value:", allInmateYValues[i])
        print("predicted value:", svr.predict(allInmates[i]))
        print("Difference in true and predicted values:",
              allInmateYValues[i] - svr.predict(allInmates[i]))
        svrTrueSetY.append(allInmateYValues[i])
        svrPredictedSetY.append(svr.predict(allInmates[i]))
    percentErrors = []
    print("SVR Mean absolute test error:",
          util.mean_absolute_percentage_error(svrTrueSetY, svrPredictedSetY, percentErrors))
    print("SVR Standard deviation:", np.std(np.array(percentErrors)))

    # baselineTest(allInmateCrimes[:10000], allInmateCrimesYValues[:10000])
    nbAllInmates = nbTestTransform(allInmates)
    nbAllInmateYValues = nbRound(allInmateYValues)
    nbTestSet = [nbAllInmates[i] for i in range(0, 10000)]
    nbTestSetY = [nbAllInmateYValues[i] for i in range(0, 10000)]
    nb = BernoulliNB()
    nb.fit(np.array(nbTestSet), np.array(nbTestSetY))
    nbTrueSentenceLength = []
    nbTestSentenceLength = []
    for i in range(10001, 20001):
        nbTrueSentenceLength.append(nbAllInmateYValues[i] * 10.0)
        nbTestSentenceLength.append(nb.predict(nbAllInmates[i] * 10.0))
    # print(nbTrueSentenceLength)
    # print(nbTestSentenceLength)
    percentErrors = []
    print("Naive Bayes Mean absolute test error:",
          util.mean_absolute_percentage_error(nbTrueSentenceLength, nbTestSentenceLength, percentErrors))
    print("Naive Bayes standard deviation:", np.std(np.array(percentErrors)))
class SGDPolyCartPoleSolver:
    def __init__(self, n_episodes=1000, max_env_steps=None, gamma=0.9,
                 epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.005,
                 alpha=0.0001, batch_size=32, monitor=False):
        self.memory = deque(maxlen=100000)
        self.env = gym.make('CartPole-v0')
        if monitor:  # whether or not to display video
            self.env = gym.wrappers.Monitor(self.env, '../data/cartpole-1', force=True)

        # hyper-parameter setting
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.alpha = alpha
        self.n_episodes = n_episodes
        self.batch_size = batch_size
        self.feature_tuning = PolynomialFeatures(interaction_only=True)
        if max_env_steps is not None:
            self.env._max_episode_steps = max_env_steps

        # Init model
        self.model = SGDRegressor(alpha=self.alpha,
                                  learning_rate='optimal',
                                  shuffle=False,
                                  warm_start=True)

        # Initialize feature tuning
        self.feature_tuning.fit(np.reshape(np.hstack((self.env.reset(), 0)), [1, 5]))
        # Initialize model
        self.model.partial_fit(self.preprocess_state(self.env.reset(), 0), [0])

    def remember(self, state, action, reward, next_state, done):
        """In this method, the (s, a, r, s') tuple is stored in the memory"""
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state, epsilon):
        """Chooses the next action according to the model trained and the policy"""
        qsa = np.asarray([self.model.predict(self.preprocess_state(state, a))
                          for a in range(self.env.action_space.n)]).flatten()
        # exploits the current knowledge if the random number > epsilon, otherwise explores
        return self.env.action_space.sample() if (np.random.random() <= epsilon) \
            else np.argmax(qsa)

    def get_epsilon(self, episode):
        """Returns an epsilon that decays over time until a minimum epsilon
        value is reached; in this case the minimum value is returned"""
        return max(self.epsilon_min, self.epsilon * math.exp(-self.epsilon_decay * episode))

    def preprocess_state(self, state, action):
        """State and action are stacked horizontally and its features are
        combined as a polynomial to be passed as an input of the approximator"""
        # poly_state converts the horizontal stack into a combination of its parameters i.e.
        # [1, s_1, s_2, s_3, s_4, a_1, s_1 s_2, s_1 s_3, ...]
        poly_state = self.feature_tuning.transform(np.reshape(np.hstack((state, action)), [1, 5]))
        return poly_state

    def replay(self, batch_size):
        """Previously stored (s, a, r, s') tuples are replayed (that is, are
        added into the model).
        The size of the tuples added is determined by the batch_size parameter"""
        x_batch, y_batch = [], []
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))
        for state, action, reward, next_state, done in minibatch:
            qsa_s_prime = np.asarray([self.model.predict(self.preprocess_state(next_state, a))
                                      for a in range(self.env.action_space.n)])
            qsa_s = reward if done \
                else reward + self.gamma * np.max(qsa_s_prime)
            x_batch.append(self.preprocess_state(state, action)[0])
            y_batch.append(qsa_s)
        self.model.partial_fit(np.array(x_batch), np.array(y_batch))

    def run(self):
        """Main loop that controls the execution of the agent"""
        scores100 = deque(maxlen=100)
        scores = []
        for e in range(self.n_episodes):
            state = self.env.reset()
            done = False
            t = 0  # t counts the number of time-steps the pole has been kept up
            while not done:
                action = self.choose_action(state, self.get_epsilon(e))
                next_state, reward, done, _ = self.env.step(action)
                self.remember(state, action, reward, next_state, done)
                self.replay(self.batch_size)
                state = next_state
                t += 1
            scores100.append(t)
            scores.append(t)
            mean_score = np.mean(scores100)
            if e % 100 == 0:
                print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score))
        # noinspection PyUnboundLocalVariable
        print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score))
        return scores
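# --- Added note (not part of the original class): a standalone sketch of the
# epsilon schedule implemented by get_epsilon above,
# epsilon(t) = max(epsilon_min, epsilon * exp(-decay * t)); the defaults here
# mirror the constructor arguments of the class.
import math

def decayed_epsilon(episode, eps=1.0, eps_min=0.01, decay=0.005):
    return max(eps_min, eps * math.exp(-decay * episode))

for e in (0, 100, 500, 1000):
    print(e, round(decayed_epsilon(e), 4))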
plt.show()

#%% Ridge Regression closed form
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])

#%% Ridge Regression SGD
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(penalty="l2")
sgd_reg.fit(X, y.ravel())
sgd_reg.predict([[1.5]])

#%% Lasso Regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso

np.random.seed(42)
m = 20
X = 3 * np.random.rand(m, 1)
y = 1 + 0.5 * X + np.random.randn(m, 1) / 1.5
        pred = users_current.sum(axis=1)/users_current.getnnz(axis=1)
        print pred, 'pred last'
    else:
        movie_current = R_m[probe_movies[i:min(i+batch_size, probe_num)], :]
        m_mean = movie_current.sum(axis=1)/movie_current.getnnz(axis=1)
        print movie_current.getnnz(axis=1), 'sd'
        m_stdev = np.sqrt((np.sum(np.power((movie_current - m_mean*(movie_current != 0)), 2),
                                  axis=1).flatten())/movie_current.getnnz(axis=1)).T
        pred = movie_current.sum(axis=1)/movie_current.getnnz(axis=1) \
            + (u_mean - all_movie_avg)*(m_stdev/all_movie_stdev)
        print pred, 'pred last'
    '''  # end of a commented-out per-user/per-movie baseline (opened before this excerpt)

    m_mean = movie_avg[probe_movies[i:min(i + batch_size, probe_num)]]
    u_mean = np.array([u_mean]).T
    m_mean = np.array([m_mean]).T
    preding = np.concatenate((u_mean, m_mean), axis=1)
    pred = lin_model.predict(preding)
    # print(u_mean, m_mean, pred)
    given = probe_ratings[i:min(i + batch_size, probe_num)]
    # clip predictions to the centered rating range
    pred = np.maximum(
        np.minimum(pred, (5 - 3 - 0.60951619727280626) * np.ones(len(pred))),
        (1 - 3 - 0.60951619727280626) * np.ones(len(pred)))
    probe_se += np.sum(np.power((given - pred), 2))
    # print(math.sqrt(probe_se / (i + batch_size)))

print('trained avg1', math.sqrt(probe_se / probe_num))

# sgd fitter
lin_model = SGDRegressor()
lr = LinearRegression()
# Estimate parameters on the training data.
lr.fit(X_train, y_train)
# Run regression predictions on the test data.
lr_y_predict = lr.predict(X_test)

# Import SGDRegressor from sklearn.linear_model.
from sklearn.linear_model import SGDRegressor
# Initialize the linear regressor SGDRegressor with default settings.
sgdr = SGDRegressor()
# Estimate parameters on the training data.
sgdr.fit(X_train, y_train)
# Run regression predictions on the test data.
sgdr_y_predict = sgdr.predict(X_test)

# Use LinearRegression's built-in scoring module and print the result.
print('The value of default measurement of LinearRegression is',
      lr.score(X_test, y_test))

# Import r2_score, mean_squared_error and mean_absolute_error from
# sklearn.metrics for evaluating regression performance.
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Use the r2_score module and print the result.
print(
    'The value of R-squared of LinearRegression is',
    r2_score(y_test, lr_y_predict)
)

# Use the mean_squared_error module and print the result.
print('The mean squared error of LinearRegression is',
      mean_squared_error(ss_y.inverse_transform(y_test),
                         ss_y.inverse_transform(lr_y_predict)))

# Use the mean_absolute_error module and print the result.
print('The mean absolute error of LinearRegression is',
      mean_absolute_error(ss_y.inverse_transform(y_test),
                          ss_y.inverse_transform(lr_y_predict)))
    eta0=eta0, max_iter=max_iter, warm_start=True, learning_rate="constant")
rmse_val_score = []
rmse_train_score = []
model_list = []

X_train, X_val, y_train, y_val = train_test_split(
    X_train_dataset, y_train_dataset, test_size=0.2, random_state=42)
sgd_regressor.fit(X_train, y_train)

# kf = KFold(n_splits=100, shuffle=True)
# for train_index, test_index in kf.split(X_train_dataset):
for i in range(300):
    y_pred = sgd_regressor.predict(X_train)
    y_true = y_train
    rmse_train_score.append(rmse(y_pred, y_true))

    y_pred = sgd_regressor.predict(X_val)
    y_true = y_val
    rmse_val_score.append(rmse(y_pred, y_true))
    model_list.append(sgd_regressor)

    # carry the learned weights into a fresh estimator and keep training
    coef = sgd_regressor.coef_.copy()
    intercept = sgd_regressor.intercept_.copy()
    sgd_regressor = SGDRegressor(
        eta0=eta0, max_iter=max_iter, warm_start=True, learning_rate="constant")
    sgd_regressor.fit(X_train, y_train, coef_init=coef, intercept_init=intercept)
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler

plt.figure()                  # create the figure
plt.title('single variable')  # figure title
plt.xlabel('x')               # x-axis label
plt.ylabel('y')               # y-axis label
plt.grid(True)                # draw grid lines

X_scaler = StandardScaler()
y_scaler = StandardScaler()
#X = [[50], [100], [150], [200], [250], [300]]
#y = [[150], [200], [250], [280], [310], [330]]
X = [[50], [100], [150], [200], [250], [300]] * 8   # the six base points, repeated
y = [[150], [200], [250], [280], [310], [330]] * 8
X = X_scaler.fit_transform(X)
y = y_scaler.fit_transform(y)
X_test = [[40], [400]]        # used for the final sanity check
X_test = X_scaler.transform(X_test)
plt.plot(X, y, 'k.')

model = SGDRegressor()
model.fit(X, y.ravel())
y_result = model.predict(X_test)
print(y_result)
plt.plot(X_test, y_result, 'g-')
plt.show()                    # show the figure
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Method 1
ridge_reg = Ridge(alpha=1, solver="auto")  # alpha is the penalty strength; the solver is chosen automatically
ridge_reg.fit(X, y)
"""
In newer versions of sklearn all inputs must be 2-D matrices, even a single
row or column (e.g. one lone sample), so .reshape(1, -1) is needed to convert.
"""
print(ridge_reg.predict(np.array(1).reshape(1, -1)))  # prediction
print(ridge_reg.intercept_)  # intercept
print(ridge_reg.coef_)       # coefficients

# Method 2
# penalty is the regularization term, l1 or l2 (default l2);
# max_iter is the maximum number of iterations (defaults to 1000 if omitted)
sgd_reg = SGDRegressor(penalty="l2", max_iter=10000)
sgd_reg.fit(X, y.ravel())  # ravel turns the column vector y into a 1-D array
print(sgd_reg.predict(np.array(1).reshape(1, -1)))  # prediction
print(sgd_reg.intercept_)  # intercept
print(sgd_reg.coef_)       # coefficients
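# --- Added note (not part of the original snippet): a tiny demonstration of
# the reshape(1, -1) rule described in the comment above - scikit-learn
# expects 2-D input even for a single sample.
import numpy as np

x_new = np.array(1.5).reshape(1, -1)  # shape (1, 1): one sample, one feature
print(x_new.shape)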
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=44, shuffle=True)

# ----------------------------------------------------
# Applying SGDRegressor Model
SGDRegressionModel = SGDRegressor(random_state=33)
SGDRegressionModel.fit(X_train, y_train)

# Calculating Details
print('SGD Regression Train Score is : ', SGDRegressionModel.score(X_train, y_train))
print('SGD Regression Test Score is : ', SGDRegressionModel.score(X_test, y_test))
print('SGD Regression Coef is : ', SGDRegressionModel.coef_)
print('SGD Regression intercept is : ', SGDRegressionModel.intercept_)
print('-' * 25)

# ----------------------------------------------------
# Calculating Prediction
y_pred = SGDRegressionModel.predict(X_test)
print('Pred Value for SGD Regression is : ', y_pred[:5])
print('True Value for SGD Regression is : ', y_test[:5])

# ----------------------------------------------------
# Calculating Mean Absolute Error
MAEValue = mean_absolute_error(y_test, y_pred, multioutput='uniform_average')  # or 'raw_values'
print('Mean Absolute Error Value is : ', MAEValue)

# ----------------------------------------------------
# Calculating Mean Squared Error
MSEValue = mean_squared_error(y_test, y_pred, multioutput='uniform_average')  # or 'raw_values'
print('Mean Squared Error Value is : ', MSEValue)

# ----------------------------------------------------
# Calculating Median Absolute Error
MdAEValue = median_absolute_error(y_test, y_pred)
print('Median Absolute Error Value is : ', MdAEValue)
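# Added note (sketch): RMSE is often reported alongside the metrics above. In
# recent sklearn releases mean_squared_error accepts squared=False to return
# the root directly; on older versions, take the square root yourself.
RMSEValue = mean_squared_error(y_test, y_pred, squared=False)
print('Root Mean Squared Error Value is : ', RMSEValue)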
scaled = scaler.transform(X)
scaled_df = pd.DataFrame(scaled, columns=X.columns)
scaled_df[:5]

X = scaled_df
X[:5]

from sklearn.linear_model import LinearRegression
model = LinearRegression()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model.fit(X_train, y_train)
pred = model.predict(X_test)

from sklearn import metrics
metrics.r2_score(y_test, pred)

from sklearn.linear_model import SGDRegressor
mod = SGDRegressor()
mod.fit(X_train, y_train)
predict = mod.predict(X_test)
metrics.r2_score(y_test, predict)
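# Added sketch: for regressors, estimator.score() returns the same R^2 that
# metrics.r2_score computes, so the two calls above are interchangeable. A
# self-contained check on synthetic data (the names X_demo/y_demo are
# illustrative assumptions):
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 3)
y_demo = X_demo @ np.array([1.0, -2.0, 0.5]) + 0.01 * rng.randn(200)

reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=0).fit(X_demo, y_demo)
assert np.isclose(reg.score(X_demo, y_demo), r2_score(y_demo, reg.predict(X_demo)))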
pprint.pprint("Testing with test data...")
test_data = list()
test_diff = list()
predict_diff = list()
for index in test_indices:
    tmp = data[index][1:5]
    my_tmp = list()
    for item in tmp:
        my_tmp.append(float(item))
    test_data.append(my_tmp)
    test_diff.append(float(data[index][4]) - float(data[index][1]))

# prediction_results_close is used below, so this call must not stay commented out
prediction_results_close = clf.predict(test_data)
prediction_results_open = clf2.predict(test_data)
for i in range(len(prediction_results_close)):
    p_diff = prediction_results_close[i] - prediction_results_open[i]
    predict_diff.append(p_diff)

print(test_diff)
print(predict_diff)

test_inc = 0
for diff in test_diff:
    if diff > 0:
        test_inc += 1
print('{:.4f}'.format(line.slope))  # 0.29
sms['line'] = line.predict(sms['topic4'])

##########################
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(n_iter=20000)  # n_iter was renamed max_iter in newer sklearn releases
sgd = sgd.fit(sms[['topic4']], sms['vader'])
print('{:.4f}'.format(sgd.coef_[0]))  # 0.2930
sms['sgd'] = sgd.predict(sms[['topic4']])

##########################
from nlpia.models import OneNeuronRegressor
nn = OneNeuronRegressor(alpha=100, n_iter=200)
nn = nn.fit(sms[['topic4']], sms['vader'])
print(nn.W[0, 1])  # 0.29386408
sms['neuron'] = nn.predict(sms[['topic4']])

##########################
from sklearn.pipeline import Pipeline
polynomial_regression = Pipeline([
    ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
    ("lin_reg", LinearRegression()),
])
plot_learning_curves(polynomial_regression, X, y)

# Ridge Regression
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])

# Stochastic Gradient Descent
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(penalty="l2")  # the letter l, not the digits "12"
sgd_reg.fit(X, y.ravel())
sgd_reg.predict([[1.5]])

# Build classifier to detect the Iris-Virginica type
from sklearn import datasets
iris = datasets.load_iris()
iris
list(iris.keys())
X = iris["data"][:, 3:]  # iris["data"] is a matrix; [:, 3:] keeps all rows and columns from 3 onward (petal width)
X
y = (iris["target"] == 2).astype(int)  # astype(int): convert the boolean mask to 0/1 labels
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X, y)
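# Added usage sketch: with the classifier above trained on petal width alone,
# predict_proba reports the estimated probability of Iris-Virginica at a query
# width. The 1.7 cm query value is illustrative, not from the original.
print(log_reg.predict_proba([[1.7]]))  # [[P(not virginica), P(virginica)]]
print(log_reg.predict([[1.7]]))        # class decision at the 0.5 threshold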
ytrain, yval = y_train[train_index], y_train[val_index]

model = SGDRegressor(penalty='l2', loss='squared_epsilon_insensitive', max_iter=200,
                     tol=0.00001, epsilon=0.0001, learning_rate='invscaling',
                     fit_intercept=False, alpha=1e-10, l1_ratio=0.09,
                     shuffle=True, verbose=0, random_state=1001)
model.fit(Xtrain, ytrain)
sgd_scores_val = model.predict(Xval)
sgd_RMSLE = np.sqrt(mean_squared_error(yval, sgd_scores_val))
print('\n Fold %02d SGD RMSLE: %.6f' % ((i + 1), sgd_RMSLE))
sgd_y_pred = model.predict(X_test)

model = Ridge(alpha=4.75, solver='sag', fit_intercept=False,
              random_state=1001, max_iter=1000)
model.fit(Xtrain, ytrain)
ridge_scores_val = model.predict(Xval)
ridge_RMSLE = np.sqrt(mean_squared_error(yval, ridge_scores_val))
print(' Fold %02d Ridge RMSLE: %.6f' % ((i + 1), ridge_RMSLE))
ridge_y_pred = model.predict(X_test)
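# Added sketch: per-fold test predictions from two models, as above, are often
# blended by a weighted average. The 0.5/0.5 weights are an assumption for
# illustration; the original does not show how sgd_y_pred and ridge_y_pred are
# combined.
blend_y_pred = 0.5 * sgd_y_pred + 0.5 * ridge_y_pred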
for l in trainf.readlines():
    sl = l.strip().split()
    L.append(sl[0])
    Y.append(int(sl[1]))
    xx = list(map(float, sl[2:]))  # list() materializes the features (map is lazy in Python 3)
    X.append(xx)
# print(X)

clf = SGDRegressor(loss='squared_epsilon_insensitive', n_iter=1000)
clf = clf.fit(X, Y)
# print(clf)
# scores = cross_val_score(clf, X, Y)
# print(scores)
# print(clf.score(X, Y))
print(clf.coef_)
YY = clf.predict(X)
print(roc_auc_score(Y, YY))

del X

pf = open('clf-linearReg.pkl', 'wb')  # pickle requires a binary-mode file handle
s = pickle.dump(clf, pf)
pf.close()

X = []
L = []
testf = open(sys.argv[2])
for l in testf.readlines():
    sl = l.strip().split()
    L.append(sl[0])
    xx = list(map(float, sl[1:]))
    test_size=0.2, random_state=1)
test_data = pd.DataFrame.from_records(X_t)
test_data.to_csv('./Titles1/testing' + str(step) + '.csv', header=False, index=False)
with open("./Titles1/testing_label" + str(step) + ".csv", "w") as f:
    wr = csv.writer(f, delimiter="\n")
    wr.writerow(y_t)

lr.partial_fit(X, y)  # update the model incrementally without discarding its previous parameters
print('Step {} is done!\n'.format(step))

#### Test on the training dataset
## The last X and y
predictions = lr.predict(X)
print('predictions: ', predictions[0:10])
print('the true upvote: ', y[0:10])
mse = mean_squared_error(predictions, y)
print(mse)

##############################################################################
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(lr, open(filename, 'wb'))

###### load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)

mse_list = []
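# Added sketch: partial_fit as used above supports true out-of-core training.
# Below is a minimal, self-contained loop over synthetic "chunks"; the chunk
# sizes and hyperparameters are assumptions, not taken from the original script.
import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
model = SGDRegressor(learning_rate="constant", eta0=0.01, random_state=0)
for _ in range(100):                     # pretend each iteration delivers a new data chunk
    X_chunk = rng.rand(50, 4)
    y_chunk = X_chunk @ np.array([2.0, -1.0, 0.5, 3.0]) + 0.01 * rng.randn(50)
    model.partial_fit(X_chunk, y_chunk)  # one SGD pass over just this chunk
print(model.coef_)                       # should approach [2, -1, 0.5, 3]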
"""
Ordinary Least Squares with SGD
===============================

Simple ordinary least squares example with stochastic gradient descent:
we draw the linear least squares solution for a random set of points in
the plane.
"""
print(__doc__)

import numpy as np
import pylab as pl

from sklearn.linear_model import SGDRegressor

# this is our test set; it's just a straight line with some
# Gaussian noise
xmin, xmax = -5, 5
n_samples = 100
X = [[i] for i in np.linspace(xmin, xmax, n_samples)]
Y = 2 + 0.5 * np.linspace(xmin, xmax, n_samples) \
      + np.random.randn(n_samples, 1).ravel()

# fit the regressor
clf = SGDRegressor(alpha=0.1, n_iter=20)
clf.fit(X, Y)

# and plot the result
pl.scatter(X, Y, color='black')
pl.plot(X, clf.predict(X), color='blue', linewidth=3)
pl.show()
print(' ')
print(' SGD REGRESSOR:')
print(' ')

regressor_sgd = SGDRegressor(
    loss='squared_loss',
    alpha=0.1,
    penalty='l2',
    tol=1e-5,
    max_iter=100000,
)

regressor_sgd = regressor_sgd.fit(x_treino, y_treino)

y_resposta_treino = regressor_sgd.predict(x_treino)
y_resposta_teste = regressor_sgd.predict(x_teste)

print(' Metric       In-sample          Out-of-sample ')
print(' -------  -----------------  --------------- ')

mse_in = mean_squared_error(y_treino, y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in = r2_score(y_treino, y_resposta_treino)

mse_out = mean_squared_error(y_teste, y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out = r2_score(y_teste, y_resposta_teste)

print(' %7s %17.4f %15.4f ' % ('mse', mse_in, mse_out))
print(' %7s %17.4f %15.4f ' % ('rmse', rmse_in, rmse_out))
X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test, y_train, y_test)
# print(X_train)
# X_values = np.delete(raw_data, raw_data.shape[1]-1, 1)
# Y_values = raw_data[:, raw_data.shape[1]-1]

# Do not reuse these weights: sklearn works in place on the coef_init matrix!
weights_sk = np.full((1, X_train.shape[1]), 1.0)
intercept_sk = 1
weights_own = np.full((1, X_train.shape[1]), 1.0)
intercept_own = 1

sk_gdc = SGDRegressor()
# coef_init matches our own weights for comparison purposes (sklearn does not pass w_0!)
sk_gdc.fit(X_train, y_train, coef_init=weights_sk, intercept_init=intercept_sk)
print("Weights and intercept found by sk:", weights_sk, intercept_sk)

own_gdc = OwnGradientDescentRegressor(debug_output=True)
print(weights_own, weights_own.shape)
weights_own, intercept_own = own_gdc.fit(X_train, y_train, coef_init=weights_own, intercept_init=intercept_own)
print("Weights and intercept found by own:", weights_own, intercept_own)

print("Prediction with sk-learn:", sk_gdc.predict(X_test))
print("Prediction with own-imp:", own_gdc.predict(X_test))
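# Added sketch: OwnGradientDescentRegressor is not shown in this snippet. A
# minimal batch gradient-descent regressor with a compatible fit() signature
# might look like the following; the class, learning rate, and iteration count
# are hypothetical stand-ins, not the original implementation.
import numpy as np

class MinimalGDRegressor:
    def __init__(self, eta=0.01, n_iter=1000):
        self.eta = eta          # step size
        self.n_iter = n_iter    # number of full-batch gradient steps

    def fit(self, X, y, coef_init=None, intercept_init=0.0):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        w = np.zeros(X.shape[1]) if coef_init is None else np.ravel(coef_init).astype(float).copy()
        b = float(intercept_init)
        n = len(y)
        for _ in range(self.n_iter):
            err = X @ w + b - y                     # residuals of the squared-error loss
            w -= self.eta * (2.0 / n) * (X.T @ err) # gradient step on the weights
            b -= self.eta * (2.0 / n) * err.sum()   # gradient step on the intercept
        self.coef_, self.intercept_ = w, b
        return w, b  # mirrors the (weights, intercept) return used above

    def predict(self, X):
        return np.asarray(X, dtype=float) @ self.coef_ + self.intercept_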
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))  # fix 1: reshape to a column for the scaler
y_test = ss_y.transform(y_test.reshape(-1, 1))        # fix 2: same reshape on the test targets

# Linear regression model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y = lr.predict(X_test)

# Stochastic gradient descent (SGD) model
from sklearn.linear_model import SGDRegressor
sgdr = SGDRegressor(max_iter=5)  # set explicitly, otherwise a warning is raised
sgdr.fit(X_train, y_train.ravel())  # ravel() flattens the targets, otherwise a warning is raised
sgdr_y = sgdr.predict(X_test)

# Built-in scoring method of LinearRegression
print('The value of default measurement of LinearRegression is ', lr.score(X_test, y_test))

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Evaluate with r2_score
print('The value of R_squared of LinearRegression is ', r2_score(y_test, lr_y))
# Evaluate with mean_squared_error on the de-standardized targets
print('The mean squared error of LinearRegression is ',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y)))