def compare_panorama_cubic(greenery_measure="vegetation", **kwargs):
    """ Compare/plot the segmentation results of panoramic and cubic images
    to each other. Also use linear regression to determine how they relate
    to each other. """

    green_kwargs = select_green_model(greenery_measure)

    panorama_tiler = TileManager(cubic_pictures=False, **kwargs, **green_kwargs)
    cubic_tiler = TileManager(cubic_pictures=True, **kwargs, **green_kwargs)

    panorama_green = panorama_tiler.green_direct()
    cubic_green = cubic_tiler.green_direct()

    _remove_missing(panorama_green, cubic_green)
    x = np.arange(0, 0.8, 0.01)

    x_pano = np.array(panorama_green["green"]).reshape(-1, 1)
    y_cubic = np.array(cubic_green["green"])
    reg = LinearRegression().fit(x_pano, y_cubic)
    print(reg.score(x_pano, y_cubic))
    print(reg.coef_[0], reg.intercept_)

    plt.figure()
    plt.scatter(panorama_green["green"], cubic_green["green"])
    plt.plot(x, reg.predict(x.reshape(-1, 1)))
    plt.xlabel("panoramas")
    plt.ylabel("cubic")
    plt.xlim(0, max(0.001, max(panorama_green["green"]) * 1.1))
    plt.ylim(0, max(0.001, max(cubic_green["green"]) * 1.1))

    plot_greenery(panorama_green, show=False, title="panorama")
    plot_greenery(cubic_green, show=False, title="cubic")
    plt.show()
def test_linear_regression_sample_weights():
    # TODO: loop over sparse data as well

    rng = np.random.RandomState(0)

    # It would not work with under-determined systems
    for n_samples, n_features in ((6, 5), ):

        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        for intercept in (True, False):

            # LinearRegression with explicit sample_weight
            reg = LinearRegression(fit_intercept=intercept)
            reg.fit(X, y, sample_weight=sample_weight)
            coefs1 = reg.coef_
            inter1 = reg.intercept_

            assert_equal(reg.coef_.shape, (X.shape[1], ))  # sanity checks
            assert_greater(reg.score(X, y), 0.5)

            # Closed form of the weighted least square
            # theta = (X^T W X)^(-1) * X^T W y
            W = np.diag(sample_weight)
            if intercept is False:
                X_aug = X
            else:
                dummy_column = np.ones(shape=(n_samples, 1))
                X_aug = np.concatenate((dummy_column, X), axis=1)

            coefs2 = linalg.solve(X_aug.T.dot(W).dot(X_aug),
                                  X_aug.T.dot(W).dot(y))

            if intercept is False:
                assert_array_almost_equal(coefs1, coefs2)
            else:
                assert_array_almost_equal(coefs1, coefs2[1:])
                assert_almost_equal(inter1, coefs2[0])
def test_score(self):
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x,
                       'y': y})
    X = df[['x']]
    Y = df[['y']]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a model locally, fit it, store in Omega
    lr = LinearRegression()
    lr.fit(X, Y)
    scores = lr.score(X, Y)
    om.models.put(lr, 'mymodel')
def test_linear_regression_sample_weights():
    rng = np.random.RandomState(0)

    for n_samples, n_features in ((6, 5), (5, 10)):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        clf = LinearRegression()
        clf.fit(X, y, sample_weight)
        coefs1 = clf.coef_

        assert_equal(clf.coef_.shape, (X.shape[1], ))
        assert_greater(clf.score(X, y), 0.9)
        assert_array_almost_equal(clf.predict(X), y)

        # Sample weight can be implemented via a simple rescaling
        # for the square loss.
        scaled_y = y * np.sqrt(sample_weight)
        scaled_X = X * np.sqrt(sample_weight)[:, np.newaxis]
        clf.fit(X, y)
        coefs2 = clf.coef_

        assert_array_almost_equal(coefs1, coefs2)
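# A minimal standalone sketch (not from the test above) that checks the rescaling
# claim directly: for ordinary least squares, fitting with sample_weight w is the
# same as fitting unweighted on sqrt(w)-rescaled data. It uses fit_intercept=False,
# since with a fitted intercept the plain sqrt-weight rescaling is no longer exact.
# The synthetic data and coefficients below are illustrative assumptions.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = X @ np.array([1.5, -2.0, 0.5]) + 0.1 * rng.randn(50)
w = 1.0 + rng.rand(50)

# Weighted ordinary least squares via sample_weight.
weighted = LinearRegression(fit_intercept=False).fit(X, y, sample_weight=w)

# The same fit, expressed as unweighted least squares on sqrt(w)-rescaled data.
rescaled = LinearRegression(fit_intercept=False).fit(
    X * np.sqrt(w)[:, np.newaxis], y * np.sqrt(w))

assert np.allclose(weighted.coef_, rescaled.coef_)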
#print(boston.target)
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                     test_size=0.2, random_state=2)

# Add polynomial features so the linear regression model fits the data better.
# Adding ever more polynomial terms improves the fit on the training set,
# but easily leads to overfitting.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)  # reuse the transformer fitted on the training set

# Polynomial linear regression
model2 = LinearRegression(normalize=True)
model2.fit(X_train_poly, y_train)
mutilScore = model2.score(X_test_poly, y_test)
print(mutilScore)

# Test the model and evaluate the result with the mean squared error (MSE)
# Model predictions
y_pred = model2.predict(X_test_poly)
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

# Cross-validation
predicted = cross_val_predict(model2, boston.data, boston.target, cv=10)
print("MSE:", metrics.mean_squared_error(boston.target, predicted))

# Plot
import matplotlib.pyplot as plt
plt.scatter(boston.target, predicted, color="y", marker="o")
plt.scatter(boston.target, boston.target, color="g", marker="+")
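# A hedged sketch (not part of the original script): the cross-validation above
# refits model2 on the raw features rather than the polynomial ones. Wrapping the
# expansion and the regression in a Pipeline keeps the polynomial features inside
# each CV fold. Assumes `boston`, `cross_val_predict` and `metrics` as loaded above.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

poly_model = make_pipeline(PolynomialFeatures(degree=2, include_bias=False),
                           LinearRegression())
predicted_poly = cross_val_predict(poly_model, boston.data, boston.target, cv=10)
print("MSE:", metrics.mean_squared_error(boston.target, predicted_poly))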
boston = load_boston()
#print(boston)
# The DESCR attribute gives a detailed description of the dataset: it has 14 columns,
# the first 13 are features and the last one is the target.
#print(boston.DESCR)
# boston.data and boston.target hold the features and the target respectively
#print(boston.data)
#print(boston.target)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                     test_size=0.2, random_state=2)

# Simple linear regression
model1 = LinearRegression(normalize=True)
model1.fit(X_train, y_train)

# Goodness of fit of the model
simpleScore = model1.score(X_test, y_test)
print(simpleScore)

## Regression coefficients
#print(model1.coef_)
# Intercept
#print(model1.intercept_)
#print(simpleScore)

# Test the model and evaluate the result with the mean squared error (MSE)
# Model predictions
y_pred = model1.predict(X_test)
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

# Cross-validation
predicted = cross_val_predict(model1, boston.data, boston.target, cv=10)
print("MSE:", metrics.mean_squared_error(boston.target, predicted))
for feature, target in zip(feature_test, target_test):
    plt.scatter(feature, target, color=test_color)
for feature, target in zip(feature_train, target_train):
    plt.scatter(feature, target, color=train_color)

### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_test[0], target_test[0], color=train_color, label="train")

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(feature_train, target_train)
print("Slope %s" % reg.coef_)
print("Intercept %s" % reg.intercept_)
print("Score = ", reg.score(feature_test, target_test))

### draw the regression line, once it's coded
try:
    plt.plot(feature_test, reg.predict(feature_test))
except NameError:
    pass

reg.fit(feature_test, target_test)
plt.plot(feature_train, reg.predict(feature_train), color="b")
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()

print("Slope2 %s" % reg.coef_)
print("Intercept2 %s" % reg.intercept_)
### and n_columns is the number of features
ages = numpy.reshape(numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1))

from sklearn.model_selection import train_test_split
ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(
    ages, net_worths, test_size=0.1, random_state=42)

### fill in a regression here!  Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(ages_train, net_worths_train)
print("Slope %s" % reg.coef_)
print("Intercept %s" % reg.intercept_)
print("Score = ", reg.score(ages_test, net_worths_test))

try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
plt.show()

### identify and remove the most outlier-y points
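# A rough sketch of one way to do the outlier removal asked for in the comment above:
# drop the ~10% of training points with the largest squared residual and refit.
# The helper name and the 10% cut-off are illustrative assumptions, not part of the
# original exercise code.
import numpy as np

def drop_largest_residuals(reg, ages_train, net_worths_train, frac=0.1):
    errors = (reg.predict(ages_train) - net_worths_train) ** 2
    keep = int(len(ages_train) * (1 - frac))
    order = np.argsort(errors.ravel())[:keep]   # indices of the smallest residuals
    return ages_train[order], net_worths_train[order]

ages_cleaned, net_worths_cleaned = drop_largest_residuals(reg, ages_train, net_worths_train)
reg.fit(ages_cleaned, net_worths_cleaned)
print("Score after cleaning = ", reg.score(ages_test, net_worths_test))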
# labels
y = np.array(df['label'])

"""
This next section jumbles the rows, but keeps the relationship between X and y.
This is so that we can train the LinearRegression model, and then test it on
different data so that we know that it is able to get the answers right!
"""
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2)

# Create and train a regressor
clf = LinearRegression(n_jobs=-1)

# training data
clf.fit(X_train, y_train)

# test the data
accuracy = clf.score(X_test, y_test)

# predict future <forecast_col> values
forecast_set = clf.predict(X_lately)

df['Forecast'] = np.nan

# set up dates to use on the graph
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day

for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
# Splitting data into test_random_forest and train
# train_set, test_set = train_test_split(data_df, test_size=0.01,
#                                        random_state=np.random.randint(1, 1000))
# Removing all unused variables for memory management

# Separate output from inputs
y_train = data_df['time_to_failure']
x_train_seg = data_df['segment_id']
x_train = data_df.drop(['time_to_failure', 'segment_id'], axis=1)

# y_test = test_set['time_to_failure']
# x_test_seg = test_set['segment_id']
# x_test = test_set.drop(['time_to_failure'], axis=1)
# x_test = x_test.drop(['segment_id'], axis=1)

model = LinearRegression(n_jobs=4)
model.fit(x_train, y_train)

mh = ModelHolder(model, most_dependent_columns)
mh.save(model_name)

model = None
mh_new = load_model(model_name)
model, most_dependent_columns = mh_new.get()

print('Evaluating test data, transforming test data now ... ')
print('Calculating score and error .. ')

y_pred = model.predict(x_train)
print('Score', model.score(x_train, y_train))

mas = mean_absolute_error(y_train, y_pred)
print('Mean Absolute Error', mas)
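# A minimal alternative sketch of the same save/reload round trip using joblib
# instead of the project-specific ModelHolder/load_model helpers shown above.
# The file name is illustrative, not from the original code.
import joblib

joblib.dump((model, most_dependent_columns), 'linreg_model.joblib')
model, most_dependent_columns = joblib.load('linreg_model.joblib')
print('Score', model.score(x_train, y_train))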