def test_linear_regression_multiple_outcome(random_state=0):
    # Test multiple-outcome linear regressions
    X, y = make_regression(random_state=random_state)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    clf = LinearRegression(fit_intercept=True)
    clf.fit(X, Y)
    assert_equal(clf.coef_.shape, (2, n_features))
    Y_pred = clf.predict(X)

    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
def test_linear_regression_multiple_outcome(random_state=0):
    # Test multiple-outcome linear regressions
    X, y = make_regression(random_state=random_state)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    reg = LinearRegression()
    reg.fit(X, Y)
    assert reg.coef_.shape == (2, n_features)
    Y_pred = reg.predict(X)

    reg.fit(X, y)
    y_pred = reg.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
def test_linear_regression_sparse_multiple_outcome(random_state=0):
    "Test multiple-outcome linear regressions with sparse data"
    random_state = check_random_state(random_state)
    X, y = make_sparse_uncorrelated(random_state=random_state)
    X = sparse.coo_matrix(X)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    ols = LinearRegression()
    ols.fit(X, Y)
    assert_equal(ols.coef_.shape, (2, n_features))
    Y_pred = ols.predict(X)

    ols.fit(X, y.ravel())
    y_pred = ols.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
def MethodSelect(Xt, XT, Yt, YT):
    reg = LinearRegression()
    reg.fit(Xt, Yt)
    predict = reg.predict(XT)
    # dp.rmsErr is the surrounding project's RMS-error helper
    err = dp.rmsErr(predict, YT)
    if err > 100:
        # poor fit: dump one test row next to one training row for inspection
        a = XT[0]
        b = Xt[0]
        c = [a]
        pre = reg.predict(c)
        print(pre[0])
        for i in range(0, len(a)):
            print(a[i], b[i])
        print('\n\n\n')
    return err
class PredictLoss(BaseLR):
    def __init__(self, hist=30, posmax=15, lr=0.2):
        from sklearn.linear_model import LinearRegression
        from collections import deque
        self.hist = hist
        self.track = deque(maxlen=self.hist)
        self.regr = LinearRegression()
        self.poscases = 0
        self.posmax = posmax
        self.lr = lr

    def __call__(self, env):
        if len(self.track) > 5:
            # fit a line through the recent loss history, one x per loss value
            y = np.array(self.track)
            x = np.arange(len(y)).reshape(-1, 1)
            self.regr.fit(x, y)
            coef_ = self.regr.coef_[0]
            preds = self.regr.predict(x)
            fst = preds[0]
            lst = preds[-1]
            e = np.sqrt(((y - preds) ** 2).mean())
            if coef_ > 0:
                # loss is trending up; stop after posmax such rounds
                self.poscases += 1
                if self.poscases >= self.posmax:
                    raise EarlyStopException
            else:
                self.poscases -= 1
                if self.poscases < 0:
                    self.poscases = 0
            diff = np.abs(fst - lst)
            coef = np.clip(diff / e, 1e-6, 1)
            lr = self.lr * coef
            print(lr, e, diff, coef_, coef, file=open('log.txt', 'a'))
            env.model.set_param("learning_rate", lr)
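# A minimal standalone sketch of the idea used by PredictLoss above: fit a
# line to the last k losses and read off its slope (positive means the loss
# is trending up). `loss_trend` is a hypothetical helper, not part of the
# callback; it assumes only numpy and scikit-learn.
import numpy as np
from sklearn.linear_model import LinearRegression


def loss_trend(losses, k=30):
    """Slope of a line fitted to the last k losses; > 0 means rising."""
    y = np.asarray(losses[-k:], dtype=float)
    x = np.arange(len(y)).reshape(-1, 1)
    return LinearRegression().fit(x, y).coef_[0]


print(loss_trend([1.0, 0.8, 0.7, 0.65, 0.66, 0.70, 0.75]))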
def linearRegression_sales(self):
    # linear regression
    path = u'4.Advertising.csv'
    data = self.readFile(path)
    # x = data[['TV', 'Radio', 'Newspaper']]
    x = data[['TV', 'Radio']]
    y = data['Sales']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    # print(x_train, y_train)

    linreg = LinearRegression()
    model = linreg.fit(x_train, y_train)
    print(model)
    print(linreg.coef_)
    print(linreg.intercept_)

    y_hat = linreg.predict(np.array(x_test))
    mse = np.average((y_hat - y_test) ** 2)
    rmse = np.sqrt(mse)
    print(mse, rmse)

    t = np.arange(len(x_test))
    plt.plot(t, y_test, 'r-', linewidth=2, label='Test')
    plt.plot(t, y_hat, 'g-', linewidth=2, label='Predict')
    plt.grid()
    plt.legend(loc='upper right')
    plt.show()
def test_predict_hdf_dataframe(self):
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df['x']
    Y = df['y']
    # put into Omega -- assume a client with pandas, scikit learn
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.pure_python = True
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax', as_hdf=True)
    om.datasets.put(Y, 'datay', as_hdf=True)
    # have Omega fit the model then predict
    lr = LinearRegression()
    lr.fit(reshaped(X), reshaped(Y))
    pred = lr.predict(reshaped(X))
    om.models.put(lr, 'mymodel2')
    # -- using data provided locally
    #    note this is the same as
    #    om.datasets.put(X, 'foo')
    #    om.runtimes.model('mymodel2').predict('foo')
    result = om.runtime.model('mymodel2').predict('datax')
    pred2 = result.get()
    self.assertTrue(
        (pred == pred2).all(), "runtimes prediction is different")
def compare_panorama_cubic(greenery_measure="vegetation", **kwargs):
    """ Compare/plot the segmentation results of panoramic and cubic images
    to each other. Also use linear regression to determine how they relate
    to each other.
    """
    green_kwargs = select_green_model(greenery_measure)

    panorama_tiler = TileManager(cubic_pictures=False, **kwargs, **green_kwargs)
    cubic_tiler = TileManager(cubic_pictures=True, **kwargs, **green_kwargs)

    panorama_green = panorama_tiler.green_direct()
    cubic_green = cubic_tiler.green_direct()

    _remove_missing(panorama_green, cubic_green)
    x = np.arange(0, 0.8, 0.01)
    x_pano = np.array(panorama_green["green"]).reshape(-1, 1)
    y_cubic = np.array(cubic_green["green"])
    reg = LinearRegression().fit(x_pano, y_cubic)
    print(reg.score(x_pano, y_cubic))
    print(reg.coef_[0], reg.intercept_)

    plt.figure()
    plt.scatter(panorama_green["green"], cubic_green["green"])
    plt.plot(x, reg.predict(x.reshape(-1, 1)))
    plt.xlabel("panoramas")
    plt.ylabel("cubic")
    plt.xlim(0, max(0.001, max(panorama_green["green"]) * 1.1))
    plt.ylim(0, max(0.001, max(cubic_green["green"]) * 1.1))

    plot_greenery(panorama_green, show=False, title="panorama")
    plot_greenery(cubic_green, show=False, title="cubic")
    plt.show()
def get_scikit_prediction(x=np.array([1, 2, 3]), y=np.array([1, 2, 3])):
    from sklearn.linear_model import LinearRegression as ScikitLinearRegression

    # scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features)
    x = x.reshape(-1, 1)
    regression = ScikitLinearRegression()
    regression.fit(x, y)

    return regression.predict(x)
def train():
    X = np.array([[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]])
    y = np.array([10, 20, 30])
    X_test = np.array([[10, 20, 30, 40], [40, 50, 60, 70], [70, 80, 90, 100]])

    reg = LinearRegression()
    reg.fit(X, y)
    print('coef_:', reg.coef_)
    print('intercept_:', reg.intercept_)
    print('predict:', reg.predict(X_test))
def test_fit(self):
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df[['x']]
    Y = df[['y']]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a model locally, store (unfitted) in Omega
    lr = LinearRegression()
    om.models.put(lr, 'mymodel2')
    self.assertIn('mymodel2', om.models.list('*'))
    # predict locally for comparison
    lr.fit(X, Y)
    pred = lr.predict(X)
    # try predicting without fitting
    with self.assertRaises(NotFittedError):
        result = om.runtime.model('mymodel2').predict('datax')
        result.get()
    # have Omega fit the model then predict
    result = om.runtime.model('mymodel2').fit('datax', 'datay')
    result.get()
    # check the new model version metadata includes the datax/y references
    meta = om.models.metadata('mymodel2')
    self.assertIn('metaX', meta.attributes)
    self.assertIn('metaY', meta.attributes)
    # -- using data already in Omega
    result = om.runtime.model('mymodel2').predict('datax')
    pred1 = result.get()
    # -- using data provided locally
    #    note this is the same as
    #    om.datasets.put(X, 'foo')
    #    om.runtimes.model('mymodel2').predict('foo')
    result = om.runtime.model('mymodel2').fit(X, Y)
    result = om.runtime.model('mymodel2').predict(X)
    pred2 = result.get()
    # -- check the local data provided to fit was stored as intended
    meta = om.models.metadata('mymodel2')
    self.assertIn('metaX', meta.attributes)
    self.assertIn('metaY', meta.attributes)
    self.assertIn('_fitX', meta.attributes.get('metaX').get('collection'))
    self.assertIn('_fitY', meta.attributes.get('metaY').get('collection'))
    self.assertTrue(
        (pred == pred1).all(), "runtimes prediction is different(1)")
    self.assertTrue(
        (pred == pred2).all(), "runtimes prediction is different(2)")
def test_linear_regression():
    # Test LinearRegression on a simple dataset.
    # a simple dataset
    X = [[1], [2]]
    Y = [1, 2]

    clf = LinearRegression()
    clf.fit(X, Y)

    assert_array_almost_equal(clf.coef_, [1])
    assert_array_almost_equal(clf.intercept_, [0])
    assert_array_almost_equal(clf.predict(X), [1, 2])

    # test it also for degenerate input
    X = [[1]]
    Y = [0]

    clf = LinearRegression()
    clf.fit(X, Y)
    assert_array_almost_equal(clf.coef_, [0])
    assert_array_almost_equal(clf.intercept_, [0])
    assert_array_almost_equal(clf.predict(X), [0])
def test_linear_regression_sparse(random_state=0):
    # Test that linear regression also works with sparse data
    random_state = check_random_state(random_state)
    for i in range(10):
        n = 100
        X = sparse.eye(n, n)
        beta = random_state.rand(n)
        y = X * beta[:, np.newaxis]

        ols = LinearRegression()
        ols.fit(X, y.ravel())
        assert_array_almost_equal(beta, ols.coef_ + ols.intercept_)
        assert_array_almost_equal(ols.predict(X) - y.ravel(), 0)
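# A companion sketch to the sparse test above (illustrative data, not from
# the original suite): LinearRegression accepts scipy.sparse inputs, and the
# dense and CSR fits agree to numerical tolerance.
import numpy as np
from scipy import sparse
from sklearn.linear_model import LinearRegression

X_dense = np.random.RandomState(0).rand(50, 4)
y_demo = X_dense @ np.array([1.0, 2.0, 3.0, 4.0])
dense_fit = LinearRegression().fit(X_dense, y_demo)
sparse_fit = LinearRegression().fit(sparse.csr_matrix(X_dense), y_demo)
np.testing.assert_allclose(dense_fit.coef_, sparse_fit.coef_,
                           rtol=1e-4, atol=1e-6)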
def eval_linear(data_set, test_size=0.4):
    # load training data from feature matrix
    x, y = data_set.load_training_data()

    # cross validation evaluation
    model = LinearRegression(normalize=True)
    # model = RFE(model, 10)
    score = cross_val_score(model, x, y, scoring='neg_mean_squared_error')
    print('Mean squared error: {}'.format(-score))

    # to visualize:
    # split data into train and test set
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, shuffle=True, random_state=0)

    # train model on train set
    model = LinearRegression(normalize=True)
    model = model.fit(x_train, y_train)
    print(model.coef_)
    pprint(model)

    # plot train performance
    predict_train = model.predict(x_train)
    plt.figure()
    plt.title('train')
    plt.scatter(y_train, predict_train)

    # plot test performance
    predict = model.predict(x_test)
    plt.figure()
    plt.title('test')
    plt.scatter(y_test, predict)
    plt.show()
# assumes the standard alias for the wrapped scikit-learn model
from sklearn.linear_model import LinearRegression as SKLModel


class LinearRegressionImpl():
    def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
                 n_jobs=None):
        self._hyperparams = {
            'fit_intercept': fit_intercept,
            'normalize': normalize,
            'copy_X': copy_X,
            'n_jobs': n_jobs}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)
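# A hypothetical usage sketch for the wrapper above (not from the original).
# Caveat: the hyperparameter dict forwards `normalize`, which scikit-learn's
# LinearRegression only accepts before version 1.2, so this assumes such a
# version is installed.
import numpy as np

impl = LinearRegressionImpl(fit_intercept=True)
impl.fit(np.array([[0.0], [1.0], [2.0]]), np.array([1.0, 3.0, 5.0]))
print(impl.predict(np.array([[3.0]])))  # expect roughly [7.]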
def test_predict(self):
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df[['x']]
    Y = df[['y']]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a model locally, fit it, store in Omega
    lr = LinearRegression()
    lr.fit(X, Y)
    pred = lr.predict(X)
    om.models.put(lr, 'mymodel')
    self.assertIn('mymodel', om.models.list('*'))
    # have Omega predict it
    # -- using data already in Omega
    result = om.runtime.model('mymodel').predict('datax')
    pred1 = result.get()
    # -- using data provided locally
    #    note this is the same as
    #    om.datasets.put(X, 'foo')
    #    om.runtimes.model('mymodel').predict('foo')
    result = om.runtime.model('mymodel').predict(X)
    pred2 = result.get()
    self.assertTrue(
        (pred == pred1).all(), "runtimes prediction is different(1)")
    self.assertTrue(
        (pred == pred2).all(), "runtimes prediction is different(2)")
class StackedRegression(LinearModel, RegressorMixin):

    def __init__(self, weights=None, cv_train_size=None):
        estimators = []
        estimators.append(KNeighborsRegressor(n_neighbors=3))
        estimators.append(DecisionTreeRegressor())
        estimators.append(BayesianRidge())
        # estimators.append(BayesianRidge())
        self.estimators = estimators
        self.stacker = LinearRegression()
        self.weights = weights if weights is not None else {}
        self.cv_train_size = cv_train_size if cv_train_size is not None else 0.7
        self._is_fitted = False

    def fit_stack(self, X, y):
        print('fitting')
        print(X.shape)
        # train the base estimators on the head, the blender on the tail
        n_train = int(X.shape[0] * self.cv_train_size)
        for estimator in self.estimators:
            estimator.fit(X[:n_train, :], y[:n_train])
        predictions = np.concatenate(
            [np.matrix(estimator.predict(X[n_train:, :])).transpose()
             for estimator in self.estimators], axis=1)
        self.stacker.fit(predictions, y[n_train:])
        self._is_fitted = True
        print('fitted')
        print(self.stacker.coef_)

    def fit(self, X, y):
        if not self._is_fitted:
            raise NotFittedError('StackedRegression must call fit_stack before fit.')
        for estimator in self.estimators:
            estimator.fit(X, y)

    def predict(self, X):
        predictions = np.concatenate(
            [np.matrix(estimator.predict(X)).transpose()
             for estimator in self.estimators], axis=1)
        return self.stacker.predict(predictions)
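# A hypothetical exercise of StackedRegression on synthetic data (the data
# and sizes are illustrative, not from the original): fit_stack trains the
# LinearRegression blender on the holdout tail, then fit refits the base
# estimators on all rows before predicting.
import numpy as np
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=5, noise=0.1,
                                 random_state=0)
stack = StackedRegression(cv_train_size=0.7)
stack.fit_stack(X_demo, y_demo)
stack.fit(X_demo, y_demo)
print(stack.predict(X_demo[:3]))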
def test_linear_regression_sample_weights():
    rng = np.random.RandomState(0)

    for n_samples, n_features in ((6, 5), (5, 10)):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        clf = LinearRegression()
        clf.fit(X, y, sample_weight)
        coefs1 = clf.coef_

        assert_equal(clf.coef_.shape, (X.shape[1], ))
        assert_greater(clf.score(X, y), 0.9)
        assert_array_almost_equal(clf.predict(X), y)

        # Sample weight can be implemented via a simple rescaling
        # for the square loss (the scaled variables below illustrate it).
        scaled_y = y * np.sqrt(sample_weight)
        scaled_X = X * np.sqrt(sample_weight)[:, np.newaxis]
        # With these shapes the model interpolates the data exactly, so the
        # weighted and unweighted fits share the same coefficients.
        clf.fit(X, y)
        coefs2 = clf.coef_
        assert_array_almost_equal(coefs1, coefs2)
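# A direct check of the rescaling claim in the test above (sketch, not from
# the original suite). With fit_intercept=False the algebra is exact:
# minimizing sum_i w_i * (y_i - x_i . b)^2 is the same as the unweighted fit
# on rows scaled by sqrt(w_i); with an intercept, the intercept column would
# need the same scaling, so the check disables it.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(42)
X = rng.randn(20, 3)
y = rng.randn(20)
w = 1.0 + rng.rand(20)

weighted = LinearRegression(fit_intercept=False).fit(X, y, sample_weight=w)
rescaled = LinearRegression(fit_intercept=False).fit(
    X * np.sqrt(w)[:, np.newaxis], y * np.sqrt(w))
np.testing.assert_allclose(weighted.coef_, rescaled.coef_, rtol=1e-6)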
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(ages_train, net_worths_train)
print("Slope %s" % reg.coef_)
print("Intercept %s" % reg.intercept_)
print("Score = ", reg.score(ages_test, net_worths_test))

try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
plt.show()

### identify and remove the most outlier-y points
cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    cleaned_data = outlierCleaner(predictions, ages_train, net_worths_train)
except NameError:
    print("your regression object doesn't exist, or isn't named reg")
    print("can't make predictions to use in identifying outliers")
for feature, target in zip(feature_test, target_test):
    plt.scatter(feature, target, color=test_color)
for feature, target in zip(feature_train, target_train):
    plt.scatter(feature, target, color=train_color)

### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(feature_train, target_train)
print("Slope %s" % reg.coef_)
print("Intercept %s" % reg.intercept_)
print("Score = ", reg.score(feature_test, target_test))

### draw the regression line, once it's coded
try:
    plt.plot(feature_test, reg.predict(feature_test))
except NameError:
    pass

# refit on the test data to compare the slope with and without outliers
reg.fit(feature_test, target_test)
plt.plot(feature_train, reg.predict(feature_train), color="b")
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()

print("Slope2 %s" % reg.coef_)
print("Intercept2 %s" % reg.intercept_)
def get_cv_error(x_train, x_test, y_train, y_test):
    model = LinearRegression(normalize=True)
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    # mean absolute error on the held-out split
    return np.average(np.abs(y_test - predict))
def get_error(x, y):
    model = LinearRegression(normalize=True)
    model.fit(x, y)
    predict = model.predict(x)
    # mean absolute error on the training data itself
    return np.average(np.abs(y - predict))
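# Sanity-check sketch (illustrative data, not from the original helpers):
# the hand-rolled np.average(np.abs(...)) in the two functions above is
# exactly sklearn's mean_absolute_error.
import numpy as np
from sklearn.metrics import mean_absolute_error

y_true = np.array([1.0, 2.0, 3.0])
y_hat = np.array([1.5, 1.5, 2.5])
assert np.isclose(np.average(np.abs(y_true - y_hat)),
                  mean_absolute_error(y_true, y_hat))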
def draw_data_size_vs_performance_chart():
    ''' Create figure for paper '''
    paths = glob('output/data-sizes/*.results/*/results.json') + \
        glob('output/model-h2048p512-mfs-true.results/*/results.json')
    df = read_json_files(paths)

    def parse_path(val):
        if 'model-h2048p512' in val:
            return 100
        else:
            return float(re.search(r'(\d+)\.results', val).group(1))

    df['data-pct'] = df['path'].apply(parse_path)
    df['words'] = 1.8e9 * df['data-pct'] / 100
    df = df.append([{
        'words': 1e11,
        'model': 'Yuan et al. (T: SemCor)',
        'competition': 'SemEval13',
        'F1': 0.670
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: OMSTI)',
        'competition': 'SemEval13',
        'F1': 0.673
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: SemCor)',
        'competition': 'Senseval2',
        'F1': 0.736
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: OMSTI)',
        'competition': 'Senseval2',
        'F1': 0.724
    }])
    print(df)

    def get_xy(competition, model):
        sub_df = df[df['model'].str.contains(model, regex=False)]
        sub_df = sub_df.query('competition == "%s"' % competition).sort_values('words')
        return sub_df['words'], sub_df['F1']

    with PdfPages('output/data_size_vs_performance.pdf') as pdf:
        se13_semcor_handle, = plt.plot(*get_xy('SemEval13', '(T: SemCor)'),
                                       '-o', label='SemEval13 (T: SemCor)')
        se13_mun_handle, = plt.plot(*get_xy('SemEval13', '(T: SemCor+OMSTI)'),
                                    '--o', label='SemEval13 (T: OMSTI)')
        se2_semcor_handle, = plt.plot(*get_xy('Senseval2', '(T: SemCor)'),
                                      ':o', label='Senseval2 (T: SemCor)')
        se2_mun_handle, = plt.plot(*get_xy('Senseval2', '(T: SemCor+OMSTI)'),
                                   '-.o', label='Senseval2 (T: OMSTI)')
        plt.legend(handles=[
            se13_semcor_handle, se13_mun_handle,
            se2_semcor_handle, se2_mun_handle
        ], loc='lower right')
        plt.axis([1.5e7, 1.1e11, 0, 1])
        plt.ylabel('F1')
        plt.xlabel('Tokens')
        plt.xscale('log')
        pdf.savefig()
        plt.show()
        plt.close()

    # extrapolate from data: fit log10(words) as a function of F1, then ask
    # how many words would be needed to reach F1 = 0.75 and 0.8
    lr = LinearRegression()
    words, f1s = get_xy('SemEval13', 'Our LSTM (T: SemCor)')
    lr.fit(f1s.values.reshape([-1, 1]), np.log10(words.values.reshape([-1, 1])))
    print('Extrapolated data size (words):')
    print(lr.predict([[0.75], [0.8]]))
# print(boston.target)
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.2, random_state=2)

# simple linear regression
model1 = LinearRegression(normalize=True)
model1.fit(X_train, y_train)

# goodness of fit of the model
simpleScore = model1.score(X_test, y_test)
print(simpleScore)

# regression coefficients
# print(model1.coef_)
# intercept term
# print(model1.intercept_)
# print(simpleScore)

# evaluate on the test set with the mean squared error (MSE)
# fitted values of the model
y_pred = model1.predict(X_test)
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

# cross-validation
predicted = cross_val_predict(model1, boston.data, boston.target, cv=10)
print("MSE:", metrics.mean_squared_error(boston.target, predicted))

# plot
import matplotlib.pyplot as plt
plt.scatter(boston.target, predicted, color="y", marker="o")
plt.scatter(boston.target, boston.target, color="g", marker="+")
plt.show()
This next section jumbles the rows but keeps the relationship between X and
y. This is so that we can train the LinearRegression model and then test it
on data it has not seen, so that we know it is able to get the answers
right!
"""
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2)

# Create and train a regressor
clf = LinearRegression(n_jobs=-1)

# training data
clf.fit(X_train, y_train)

# test the data
accuracy = clf.score(X_test, y_test)

# predict future <forecast_col> values
forecast_set = clf.predict(X_lately)

df['Forecast'] = np.nan

# set up dates to use on the graph
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day

for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [i]

# plot it
# Splitting data into test_random_forest and train
# train_set, test_set = train_test_split(data_df, test_size=0.01,
#                                        random_state=np.random.randint(1, 1000))

# Removing all unused variables for memory management
# Separate output from inputs
y_train = data_df['time_to_failure']
x_train_seg = data_df['segment_id']
x_train = data_df.drop(['time_to_failure', 'segment_id'], axis=1)
# y_test = test_set['time_to_failure']
# x_test_seg = test_set['segment_id']
# x_test = test_set.drop(['time_to_failure'], axis=1)
# x_test = x_test.drop(['segment_id'], axis=1)

model = LinearRegression(n_jobs=4)
model.fit(x_train, y_train)

mh = ModelHolder(model, most_dependent_columns)
mh.save(model_name)
model = None
mh_new = load_model(model_name)
model, most_dependent_columns = mh_new.get()

print('Evaluating test data, transforming test data now ... ')
print('Calculating score and error .. ')
y_pred = model.predict(x_train)
print('Score', model.score(x_train, y_train))
mas = mean_absolute_error(y_train, y_pred)
print('Mean Absolute Error', mas)
data_set.loc[data_set[EMBARKED] == 'Q', EMBARKED] = 2
# print(data_set.describe())

algorithm = LinearRegression()
kf = KFold(data_set.shape[0], n_folds=3, random_state=2)

predictors = [GENDER, 'Pclass']
# predictors = [GENDER, 'Age']
predictions = []
for train, test in kf:
    train_predictors = (data_set[predictors].iloc[train, :])
    train_target = data_set['Survived'].iloc[train]
    algorithm.fit(train_predictors, train_target)
    test_prediction = algorithm.predict(data_set[predictors].iloc[test, :])
    predictions.append(test_prediction)

predictions = np.concatenate(predictions, axis=0)
# treat the regression output as a survival probability, thresholded at 0.5
predictions = predictions > 0.5
accuracy = np.sum(data_set['Survived'] == predictions) / data_set.shape[0]
print(accuracy)
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression

boston_data = load_boston()
x = boston_data['data']
y = boston_data['target']

model = LinearRegression()
model.fit(x, y)

sample_house = [[
    2.29690000e-01, 0.00000000e+00, 1.05900000e+01, 0.00000000e+00,
    4.89000000e-01, 6.32600000e+00, 5.25000000e+01, 4.35490000e+00,
    4.00000000e+00, 2.77000000e+02, 1.86000000e+01, 3.94870000e+02,
    1.09700000e+01
]]

prediction = model.predict(sample_house)
print(prediction)
class inp_reader:
    inp_features = list()
    inp_prices = list()
    features = list()

    def get_inp_features(self):
        return self.inp_features

    def get_inp_prices(self):
        return self.inp_prices

    def get_features(self):
        return self.features

    def read(self):
        F, N = map(int, input().split(' '))
        for _ in range(N):
            inp_f = list(map(float, input().strip().split()))
            self.inp_features.append(inp_f[:F:])
            self.inp_prices.append(inp_f[F::])
        questions = int(input())
        for _ in range(questions):
            self.features.append(list(map(float, input().split())))


reader = inp_reader()
reader.read()
inp_features = reader.get_inp_features()
inp_prices = reader.get_inp_prices()
features = reader.get_features()

model = LinearRegression()
model.fit(inp_features, inp_prices)
prices = model.predict(features)
for el in prices:
    print(el[0])
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.2, random_state=2)

# Adding polynomial features lets the linear regression fit the data better.
# As the polynomial degree grows, the training fit keeps improving, but
# overfitting becomes very likely.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)  # use the transformer fitted on the training split

# polynomial linear regression
model2 = LinearRegression(normalize=True)
model2.fit(X_train_poly, y_train)
mutilScore = model2.score(X_test_poly, y_test)
print(mutilScore)

# evaluate on the test set with the mean squared error (MSE)
# fitted values of the model
y_pred = model2.predict(X_test_poly)
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

# cross-validation
predicted = cross_val_predict(model2, boston.data, boston.target, cv=10)
print("MSE:", metrics.mean_squared_error(boston.target, predicted))

# plot
import matplotlib.pyplot as plt
plt.scatter(boston.target, predicted, color="y", marker="o")
plt.scatter(boston.target, boston.target, color="g", marker="+")
plt.show()
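# Why transform (not fit_transform) on the test split matters in general
# (sketch, not from the original): PolynomialFeatures happens to be
# stateless, but a stateful transformer such as StandardScaler learns
# statistics, and refitting it on the test split silently produces
# different, leaked features.
import numpy as np
from sklearn.preprocessing import StandardScaler

train = np.array([[0.0], [2.0], [4.0]])
test = np.array([[10.0], [12.0]])
scaler = StandardScaler().fit(train)
print(scaler.transform(test))                # uses train mean/std
print(StandardScaler().fit_transform(test))  # leaks test statistics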
def main():
    df = load_train_data()
    logger.info('column hash = %d', utils.column_hash(df))
    df = preprocess.drop_column(df, 'fullVisitorId')
    df = preprocess.drop_column(df, 'sessionId')
    # debug_info(df)
    y = df['totals_transactionRevenue']
    X = preprocess.drop_column(df, 'totals_transactionRevenue')
    # X, _, y, _ = utils.split_data(X, y, ratio=0.9, seed=42)
    # n_classes = 10
    n_models = 100
    y_max = y.max()
    for i in range(n_models):
        X_train, X_test, y_train, y_test = utils.split_data(X, y)
        logger.info('training')
        # y_train, quants = preprocess.make_class_target(y_train, n_classes)
        # logger.info('y_train.unique() = %s', y_train.unique())
        # logger.info('quants = %s', quants)
        # y_train = preprocess.make_class_target2(y_train, y_max, n_classes)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        logger.info('X_train.shape = %s', X_train.shape)
        # cumulative = np.cumsum(pca.explained_variance_ratio_)
        # pylab.plot(cumulative, 'r-')
        # pylab.show()
        # model = build_classifier(X_train.shape[1], n_classes)
        model = build_regressor(X_train.shape[1])
        EPOCHS = 100
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
        history = model.fit(X_train, y_train, epochs=EPOCHS, validation_split=0.1,
                            verbose=0, callbacks=[early_stop, utils.EpochCallback()])
        linear_model = LinearRegression()
        linear_model.fit(X_train, y_train)
        logger.info('predicting')
        logger.info('X_test.shape = %s', X_test.shape)
        X_test = scaler.transform(X_test)
        # y_classes = model.predict(X_test)
        # y_pred = postprocess.make_real_predictions(y_classes, quants)
        # y_pred = postprocess.make_real_predictions2(y_classes, y_max)
        y_pred = model.predict(X_test).flatten()
        y_linear_pred = linear_model.predict(X_test)
        rms = np.sqrt(mean_squared_error(y_test, y_pred))
        linear_rms = np.sqrt(mean_squared_error(y_test, y_linear_pred))
        logger.info('rms = %s', rms)
        logger.info('linear_rms = %s', linear_rms)
        # save_model(model, i, quants, scaler)
        save_model2(model, linear_model, i, y_max, scaler)
        # plot_history_classifier(history)
        plot_history_regressor(history)
        pylab.figure()
        pylab.scatter(y_pred, y_test, alpha=0.5)
        pylab.xlabel("pred")
        pylab.ylabel("test")
        hist_revenue(y_linear_pred, 'y_linear_pred')
        hist_revenue(y_pred, 'y_pred')
        hist_revenue(y_test, 'y_test')
        pylab.show()
import pandas as pd
from matplotlib.pyplot import scatter, plot, show
from sklearn.linear_model import LinearRegression

bmi_life_data = pd.read_csv('bmi_to_life_expect.csv')
x_data = bmi_life_data[['BMI']]
y_data = bmi_life_data[['Life expectancy']]
# print(x_data, y_data)

bmi_life_model = LinearRegression()
bmi_life_model.fit(x_data, y_data)

# predict expects a 2-D array: one row with one feature
laos_life_exp = bmi_life_model.predict([[50.00]])
print(laos_life_exp)

scatter(x_data, y_data)
plot(x_data, bmi_life_model.predict(x_data))
show()
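# Cross-check sketch (not from the original): a fitted LinearRegression
# predicts y = coef_ @ x + intercept_, so the BMI-50 prediction above can
# be reproduced by hand from the fitted attributes.
import numpy as np

manual = bmi_life_model.coef_ @ np.array([50.00]) + bmi_life_model.intercept_
print(manual)  # matches laos_life_exp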
def moudle_select(X, test_A, y, moudelselect, threshold=False, Rate=False):
    '''
    Function: model selection
    X : train data
    test_A : predict data
    y : result label
    predict_A : predict data
    moudelselect : which model to use
    threshold : False
    Rate : False
    modelselect:
        1, XGBRegressor
        2, ensemble.RandomForestRegressor
        3, linear_model.Lasso
        4, LinearRegression
        5, linear_model.BayesianRidge
        6, DecisionTreeRegressor
        7, ensemble.RandomForestRegressor
        8, ensemble.GradientBoostingRegressor
        9, ensemble.AdaBoostRegressor
        10, BaggingRegressor
        11, ExtraTreeRegressor
        12, SVR
        13, MLPRegressor
        other: MLPRegressor
    '''
    mse = []
    sum_mse = 0.0
    predict_A = pd.DataFrame(np.zeros((100, 10)))
    for index in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        if (moudelselect == 1):
            model = xgb.XGBRegressor(max_depth=17, min_child_weight=5,
                                     eta=0.025, gamma=0.06, subsample=1,
                                     learning_rate=0.1, n_estimators=100,
                                     silent=0, n_jobs=-1,
                                     objective='reg:linear')
        elif (moudelselect == 2):
            model = ensemble.RandomForestRegressor(
                n_estimators=25, criterion='mse', max_depth=14,
                min_samples_split=0.1, min_samples_leaf=2,
                min_weight_fraction_leaf=0.0, max_features=0.95,
                max_leaf_nodes=None, min_impurity_split=1e-07,
                bootstrap=True, oob_score=False, n_jobs=-1,
                random_state=None, verbose=0, warm_start=False)
        elif (moudelselect == 3):
            model = linear_model.Lasso(alpha=0.1, max_iter=1000, normalize=False)
        elif (moudelselect == 4):
            model = LinearRegression(fit_intercept=False, n_jobs=1, normalize=False)
        elif (moudelselect == 5):
            model = linear_model.BayesianRidge(
                alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                copy_X=True, fit_intercept=True, lambda_1=1e-06,
                lambda_2=1e-06, n_iter=500, normalize=False, tol=10,
                verbose=False)
        elif (moudelselect == 6):
            model = DecisionTreeRegressor(
                criterion='mse', splitter='best', max_depth=3,
                min_samples_split=0.1, min_samples_leaf=0.1,
                min_weight_fraction_leaf=0.1, max_features=None,
                random_state=None, max_leaf_nodes=None, presort=False)
        elif (moudelselect == 7):
            model = ensemble.RandomForestRegressor(
                n_estimators=1000, criterion='mse', max_depth=14,
                min_samples_split=0.1, min_samples_leaf=2,
                min_weight_fraction_leaf=0.0, max_features='auto',
                max_leaf_nodes=None, min_impurity_split=1e-07,
                bootstrap=True, oob_score=False, n_jobs=-1,
                random_state=None, verbose=0, warm_start=False)
        elif (moudelselect == 8):
            model = ensemble.GradientBoostingRegressor(
                n_estimators=800, learning_rate=0.1, max_depth=4,
                random_state=0, loss='ls')
        elif (moudelselect == 9):
            model = ensemble.AdaBoostRegressor(
                base_estimator=None, n_estimators=120, learning_rate=1,
                loss='linear', random_state=None)
        elif (moudelselect == 10):
            model = BaggingRegressor(
                base_estimator=None, n_estimators=500, max_samples=1.0,
                max_features=1.0, bootstrap=True)
        elif (moudelselect == 11):
            model = ExtraTreeRegressor(
                criterion='mse', splitter='random', max_depth=3,
                min_samples_split=0.1, min_samples_leaf=1,
                min_weight_fraction_leaf=0.01, max_features='auto',
                random_state=None, max_leaf_nodes=None,
                min_impurity_split=1e-07)
        elif (moudelselect == 12):
            model = SVR(kernel='rbf', degree=3, gamma='auto', coef0=0.1,
                        tol=0.001, C=1, epsilon=0.1, shrinking=True,
                        cache_size=200, verbose=False, max_iter=-1)
        elif (moudelselect == 13):
            model = MLPRegressor(
                hidden_layer_sizes=(100, ), activation='relu',
                solver='adam', alpha=0.0001, batch_size='auto',
                learning_rate='constant', learning_rate_init=0.001,
                power_t=0.5, max_iter=200, shuffle=True,
                random_state=None, tol=0.0001, verbose=False,
                warm_start=False, momentum=0.9, nesterovs_momentum=True,
                early_stopping=False, validation_fraction=0.1,
                beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        else:
            model = MLPRegressor(activation='relu', alpha=0.001,
                                 solver='lbfgs', max_iter=90,
                                 hidden_layer_sizes=(11, 11, 11),
                                 random_state=1)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print("index: ", index, mean_squared_error(y_test, y_pred))
        sum_mse += mean_squared_error(y_test, y_pred)

        if (threshold == False):
            y_predict = model.predict(test_A)
            predict_A.iloc[:, index] = y_predict
            mse.append(mean_squared_error(y_test, y_pred))
        else:
            if (mean_squared_error(y_test, y_pred) <= 0.03000):
                y_predict = model.predict(test_A)
                predict_A.iloc[:, index] = y_predict
                mse.append(mean_squared_error(y_test, y_pred))

    # if (Rate == False):
    #     mse_rate = mse / np.sum(mse)
    #     # predict_A = predict_A.iloc[:, ~(data == 0).all()]
    #     for index in range(len(mse_rate)):
    #         y += predict_A.iloc[:, index] * mse_rate[index]

    # weight each round's predictions inversely to its validation MSE rank
    y = 0.0
    mse = np.array(mse) / np.sum(mse)
    mse = pd.Series(mse)
    mse_rate_asc = mse.sort_values(ascending=False)
    mse_rate_asc = mse_rate_asc.reset_index(drop=True)
    mse_rate_desc = mse.sort_values(ascending=True)
    indexs = list(mse_rate_desc.index)
    for index in range(len(mse)):
        y += mse_rate_asc.iloc[index] * predict_A.iloc[:, indexs[index]]

    print("y_predict_mean: ", y.mean())
    print("y_predict_var: ", y.var())
    y = pd.DataFrame(y)
    y.to_excel("H:/java/python/src/machinelearning/test/predict.xlsx",
               index=False)
    predict_A.to_excel(
        "H:/java/python/src/machinelearning/test/predict_testA.xlsx",
        index=False)
    print("Average mse:", sum_mse / len(mse))
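# Hypothetical call with synthetic stand-ins (not from the original):
# moudelselect=4 picks plain LinearRegression per the docstring. test_A must
# have 100 rows to match the hard-coded 100-row predict_A buffer, and the
# function writes its output to the hard-coded Excel paths above, so this
# assumes a matching environment (and a scikit-learn version that still
# accepts `normalize`).
import numpy as np

X_demo = np.random.rand(500, 8)
y_demo = np.random.rand(500)
test_demo = np.random.rand(100, 8)
moudle_select(X_demo, test_demo, y_demo, moudelselect=4)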