def resolution_estimate(raw_data, n_spectra=25):
    slopes = []
    intercepts = []
    for i in range(n_spectra):
        mzs, intensities = read_random_spectrum(raw_data)
        peak_positions = np.array(gradient(mzs, intensities)[-1])
        intensities_at_peaks = intensities[peak_positions]
        high_intensity_threshold = np.percentile(intensities_at_peaks, 40)
        peak_positions = peak_positions[intensities[peak_positions] > high_intensity_threshold]
        resolutions = []
        for peak_pos in peak_positions:
            resolutions.append(resolution_at_peak(peak_pos, mzs, intensities))
        resolutions = np.array(resolutions)
        mzs = mzs[peak_positions]
        mzs = mzs[resolutions > 0]
        resolutions = resolutions[resolutions > 0]
        ransac = RANSACRegressor()
        ransac.fit(np.log(mzs).reshape((-1, 1)), np.log(resolutions).reshape((-1, 1)))
        slope = ransac.estimator_.coef_[0][0]
        intercept = ransac.estimator_.intercept_[0]
        slopes.append(slope)
        intercepts.append(intercept)
    slope = np.median(slopes)
    intercept = np.median(intercepts)
    return lambda mz: np.exp(intercept + slope * np.log(mz))
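# Hedged sketch (not part of the original code): the function above fits
# log(resolution) against log(mz) with RANSAC, i.e. a power law
# R(mz) = exp(b) * mz^a, then medians the per-spectrum fits. A minimal,
# self-contained illustration of that log-log fit on synthetic data:
import numpy as np
from sklearn.linear_model import RANSACRegressor

mz = np.linspace(100, 1000, 200)
res = 1e6 * mz ** -0.5 * np.random.default_rng(0).normal(1.0, 0.05, mz.size)
ransac = RANSACRegressor()
ransac.fit(np.log(mz).reshape(-1, 1), np.log(res))
slope = ransac.estimator_.coef_[0]
intercept = ransac.estimator_.intercept_
estimate = lambda m: np.exp(intercept + slope * np.log(m))
print(estimate(500.0))  # estimated resolution at m/z 500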
def ransac_fit(X, y):
    """A robust fit using RANSAC."""
    from sklearn.linear_model import LinearRegression, RANSACRegressor
    ransac = RANSACRegressor(LinearRegression(),
                             max_trials=100,
                             min_samples=50,
                             residual_metric=lambda x: np.sum(np.abs(x), axis=1),
                             residual_threshold=5.0,
                             random_state=0)
    ransac.fit(X, y)
    # Print the slope, intercept, etc.
    print('Slope: %.3f' % ransac.estimator_.coef_[0])
    print('Intercept: %.3f' % ransac.estimator_.intercept_)
    # plot inliers, outliers and the fitted line
    inlier_mask = ransac.inlier_mask_
    outlier_mask = np.logical_not(inlier_mask)
    line_X = np.arange(3, 10, 1)
    line_y_ransac = ransac.predict(line_X[:, np.newaxis])
    plt.scatter(X[inlier_mask], y[inlier_mask], c='blue', marker='o', label='Inliers')
    plt.scatter(X[outlier_mask], y[outlier_mask], c='lightgreen', marker='s', label='Outliers')
    plt.plot(line_X, line_y_ransac, color='red')
    plt.xlabel('Average number of rooms [RM]')
    plt.ylabel('Price in $1000\'s [MEDV]')
    plt.legend(loc='upper left')
    plt.show()
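# Version note (an assumption about the reader's scikit-learn): the
# residual_metric parameter used above was deprecated in scikit-learn 0.18
# and removed in 0.20; on modern versions the equivalent construction uses
# the `loss` parameter instead, e.g.:
#
#   ransac = RANSACRegressor(LinearRegression(), max_trials=100,
#                            min_samples=50, residual_threshold=5.0,
#                            loss='absolute_error',  # 'absolute_loss' before 1.0
#                            random_state=0)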
def train_RANSACRegressionModel(
    X,
    y,
    base_estimator=None,
    min_samples=None,
    residual_threshold=None,
    is_data_valid=None,
    is_model_valid=None,
    max_trials=100,
    stop_n_inliers=inf,  # `inf` assumed from `from numpy import inf`
    stop_score=inf,
    stop_probability=0.99,
    residual_metric=None,
    random_state=None,
):
    """
    Train a RANSAC regression model
    """
    model = RANSACRegressor(
        base_estimator=base_estimator,
        min_samples=min_samples,
        residual_threshold=residual_threshold,
        is_data_valid=is_data_valid,
        is_model_valid=is_model_valid,
        max_trials=max_trials,
        stop_n_inliers=stop_n_inliers,
        stop_score=stop_score,
        stop_probability=stop_probability,
        residual_metric=residual_metric,
        random_state=random_state,
    )
    model = model.fit(X, y)
    return model
def identify_linear_outliers(pts, win_size=7):
    # This runs a sliding window across the trace, performing a RANSAC
    # regression for each window. A point is considered an outlier if the
    # moving-RANSAC never considers it an inlier.
    regressor = RANSACRegressor()
    x = np.arange(win_size, dtype=np.float64)
    x = np.expand_dims(x, axis=1)
    inlier_count = np.zeros_like(pts)
    npts = len(pts)
    for i in range(npts - win_size + 1):
        y = pts[i:i + win_size]
        # RANSAC of this section of the trace
        try:
            regressor.fit(x, y)
            inlier_inds = regressor.inlier_mask_
        except ValueError:
            # no consensus -- (almost) all the points were bad
            inlier_inds = []
        # accumulate the number of times each point was an inlier
        for j, inlier in enumerate(inlier_inds):
            if inlier:
                inlier_count[i + j] += 1
    # Note: the following line will always consider the first and last points
    # outliers! However, I don't think this will matter for downstream
    # analysis. -BK
    outlier_mask = np.logical_or(inlier_count < 2, pts == 0)
    # outlier_inds = np.where(outlier_mask)[0]
    #
    # # points that are exactly zero are always considered outliers
    # outlier_inds = np.append(outlier_inds, np.where(pts == 0)[0])
    return outlier_mask
def get_outliers_by_ransac(self, table, column_indexes):
    '''
    Get outliers using RANSAC regression, which deals better with large
    outliers in the y direction, and is faster than Huber when the number
    of samples is very large. RANSAC outputs perfect precision (100%) but
    far from perfect recall (could be 50% - 60%) in our experiments.
    '''
    X = table[:, column_indexes[:-1]].astype(float)
    X = utils.enforce_columns(X)
    y = table[:, column_indexes[-1]].astype(float)
    # preprocessing doesn't make any difference for RANSAC in our experiments
    #x = preprocessing.minmax_scale(x)
    #y = preprocessing.minmax_scale(y)
    model_ransac = RANSACRegressor(LinearRegression())
    model_ransac.fit(X, y)
    inlier_mask = model_ransac.inlier_mask_
    outlier_mask = np.logical_not(inlier_mask)
    outliers = [idx for idx, val in enumerate(outlier_mask) if val]
    # map the outliers' residuals onto confidences in [0.9, 0.99]
    residuals = abs(model_ransac.predict(X) - y)
    confidences = preprocessing.minmax_scale(residuals[outliers]) * 0.09 + 0.9
    return (outliers, confidences)
def test_ransac_stop_n_inliers():
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, stop_n_inliers=2,
                                       random_state=0)
    ransac_estimator.fit(X, y)
    assert_equal(ransac_estimator.n_trials_, 1)
def test_ransac_sparse_csc():
    X_sparse = sparse.csc_matrix(X)
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, random_state=0)
    ransac_estimator.fit(X_sparse, y)
    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    ref_inlier_mask[outliers] = False
    assert_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
def test_ransac_predict():
    X = np.arange(100)[:, None]
    y = np.zeros((100,))
    y[0] = 1
    y[1] = 100
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=0.5, random_state=0)
    ransac_estimator.fit(X, y)
    assert_equal(ransac_estimator.predict(X), np.zeros(100))
def test_ransac_score():
    X = np.arange(100)[:, None]
    y = np.zeros((100,))
    y[0] = 1
    y[1] = 100
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=0.5, random_state=0)
    ransac_estimator.fit(X, y)
    assert_equal(ransac_estimator.score(X[2:], y[2:]), 1)
    assert_less(ransac_estimator.score(X[:2], y[:2]), 1)
def test_ransac_default_residual_threshold():
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       random_state=0)
    # Estimate parameters of corrupted data
    ransac_estimator.fit(X, y)
    # Ground truth / reference inlier mask
    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    ref_inlier_mask[outliers] = False
    assert_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
def fit_plane(points):
    '''
    fit a plane through a list of 3d points and return a, b, c, d that
    represents the plane as ax+by+cz+d=0
    '''
    # np.matrix is deprecated; a plain 2-D array behaves identically here
    X = np.asarray([[p[0], p[1]] for p in points])
    y = [p[2] for p in points]
    model = RANSACRegressor(LinearRegression())
    model.fit(X, y)
    # fitted z = a*x + b*y + d  ->  a*x + b*y - z + d = 0
    d = float(np.ravel(model.estimator_.intercept_)[0])
    a, b = model.estimator_.coef_.flatten()
    c = -1
    return a, b, c, d
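# Hedged usage sketch for fit_plane (synthetic data, not from the source;
# assumes numpy and the sklearn imports used by the function are in scope):
# points on z = 2x + 3y + 1 should recover a~2, b~3, c=-1, d~1.
import numpy as np
rng = np.random.default_rng(0)
pts = [(x, y, 2 * x + 3 * y + 1 + rng.normal(0, 0.01))
       for x, y in rng.uniform(-5, 5, size=(100, 2))]
print(fit_plane(pts))  # approximately (2.0, 3.0, -1, 1.0)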
def test_ransac_max_trials():
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, max_trials=0,
                                       random_state=0)
    assert_raises(ValueError, ransac_estimator.fit, X, y)

    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, max_trials=11,
                                       random_state=0)
    assert getattr(ransac_estimator, 'n_trials_', None) is None
    ransac_estimator.fit(X, y)
    assert_equal(ransac_estimator.n_trials_, 2)
def regression_information(dem, bilinear_interpolation_results):
    dem_shape = dem.shape
    # print(dem_shape)
    dem = dem.flatten()
    bilinear_interpolation_results = bilinear_interpolation_results.flatten()
    alt_data = np.column_stack((dem, bilinear_interpolation_results))
    alt_data = alt_data[np.where(alt_data[:, 0] > 0)]
    RANSAC_lr = RANSACRegressor(LinearRegression())
    RANSAC_lr.fit(alt_data[:, 0:1], alt_data[:, 1])
    predict_result = RANSAC_lr.predict(alt_data[:, 0:1]).transpose()[0]
    # print(predict_result)
    # print(predict_result.shape)
    residual = bilinear_interpolation_results - predict_result
    residual = np.reshape(residual, dem_shape)
    return RANSAC_lr, residual
def test_ransac_multi_dimensional_targets():
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, random_state=0)
    # 3-D target values
    yyy = np.column_stack([y, y, y])
    # Estimate parameters of corrupted data
    ransac_estimator.fit(X, yyy)
    # Ground truth / reference inlier mask
    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    ref_inlier_mask[outliers] = False
    assert_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
def _ransac_regression(pts, regressor):
    ransac = RANSACRegressor(regressor)
    x = np.array([a['peak_size'] for a in pts])
    y = np.array([b['relative_peak_height'] for b in pts])
    X = x[:, np.newaxis]
    ransac.fit(X, y)
    inlier_mask = ransac.inlier_mask_
    # RMSE over the inliers (square root of the MSE)
    ransac_mse = mean_squared_error(y[inlier_mask], ransac.predict(X[inlier_mask])) ** .5
    ransac_r2 = r2_score(y[inlier_mask], ransac.predict(X[inlier_mask]))
    return {
        'intercept': ransac.estimator_.intercept_,
        'r_squared': ransac_r2,
        'slope': ransac.estimator_.coef_[0],
        'sd': ransac_mse
    }
def test_ransac_none_estimator():
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, random_state=0)
    ransac_none_estimator = RANSACRegressor(None, 2, 5, random_state=0)
    ransac_estimator.fit(X, y)
    ransac_none_estimator.fit(X, y)
    assert_array_almost_equal(ransac_estimator.predict(X),
                              ransac_none_estimator.predict(X))
def test_ransac_fit_sample_weight():
    ransac_estimator = RANSACRegressor(random_state=0)
    n_samples = y.shape[0]
    weights = np.ones(n_samples)
    ransac_estimator.fit(X, y, weights)
    # sanity check
    assert_equal(ransac_estimator.inlier_mask_.shape[0], n_samples)

    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    ref_inlier_mask[outliers] = False
    # check that mask is correct
    assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)

    # check that fit(X) = fit([X1, X2, X3], sample_weight=[n1, n2, n3]) where
    # X = X1 repeated n1 times, X2 repeated n2 times and so forth
    random_state = check_random_state(0)
    X_ = random_state.randint(0, 200, [10, 1])
    y_ = np.ndarray.flatten(0.2 * X_ + 2)
    sample_weight = random_state.randint(0, 10, 10)
    outlier_X = random_state.randint(0, 1000, [1, 1])
    outlier_weight = random_state.randint(0, 10, 1)
    outlier_y = random_state.randint(-1000, 0, 1)

    X_flat = np.append(np.repeat(X_, sample_weight, axis=0),
                       np.repeat(outlier_X, outlier_weight, axis=0), axis=0)
    y_flat = np.ndarray.flatten(np.append(np.repeat(y_, sample_weight, axis=0),
                                          np.repeat(outlier_y, outlier_weight, axis=0),
                                          axis=0))
    ransac_estimator.fit(X_flat, y_flat)
    ref_coef_ = ransac_estimator.estimator_.coef_

    sample_weight = np.append(sample_weight, outlier_weight)
    X_ = np.append(X_, outlier_X, axis=0)
    y_ = np.append(y_, outlier_y)
    ransac_estimator.fit(X_, y_, sample_weight)
    assert_almost_equal(ransac_estimator.estimator_.coef_, ref_coef_)

    # check that if base_estimator.fit doesn't support
    # sample_weight, raises error
    base_estimator = Lasso()
    ransac_estimator = RANSACRegressor(base_estimator)
    assert_raises(ValueError, ransac_estimator.fit, X, y, weights)
def fit(self, angs, pts):
    print(angs.shape)
    print(pts.shape)
    model1 = RANSACRegressor(LinearRegression())
    model2 = RANSACRegressor(LinearRegression())
    model1.fit(angs[:, [0]], pts[:, 0])
    model2.fit(angs[:, [2]], pts[:, 1])
    self.m1, self.b1 = float(model1.estimator_.coef_), model1.estimator_.intercept_
    self.m2, self.b2 = float(model2.estimator_.coef_), model2.estimator_.intercept_
    print('Coefficients :')
    print(self.m1, self.b1, self.m2, self.b2)
def test_ransac_max_trials():
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, max_trials=0,
                                       random_state=0)
    assert_raises(ValueError, ransac_estimator.fit, X, y)

    # there is a 1e-9 chance it will take these many trials. No good reason
    # 1e-2 isn't enough, can still happen
    # 2 is what ransac defines as min_samples = X.shape[1] + 1
    max_trials = _dynamic_max_trials(len(X) - len(outliers), X.shape[0],
                                     2, 1 - 1e-9)
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2)
    for i in range(50):
        ransac_estimator.set_params(min_samples=2, random_state=i)
        ransac_estimator.fit(X, y)
        assert_less(ransac_estimator.n_trials_, max_trials + 1)
pd.DataFrame(data).describe()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.metrics import mean_squared_error

x_tr, x_te, y_tr, y_te = train_test_split(data, target, test_size=0.2)

# ordinary least squares
lr = LinearRegression()
lr.fit(x_tr, y_tr)
mean_squared_error(lr.predict(x_tr), y_tr)
mean_squared_error(lr.predict(x_te), y_te)

# RANSAC
Rr = RANSACRegressor()
Rr.fit(x_tr, y_tr)
mean_squared_error(Rr.predict(x_tr), y_tr)
mean_squared_error(Rr.predict(x_te), y_te)

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# decision tree
dr = DecisionTreeRegressor(max_features='sqrt')
dr.fit(x_tr, y_tr)
mean_squared_error(dr.predict(x_tr), y_tr)
mean_squared_error(dr.predict(x_te), y_te)
def test_ransac_residual_loss():
    loss_multi1 = lambda y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1)
    loss_multi2 = lambda y_true, y_pred: np.sum((y_true - y_pred) ** 2, axis=1)
    loss_mono = lambda y_true, y_pred: np.abs(y_true - y_pred)
    yyy = np.column_stack([y, y, y])

    base_estimator = LinearRegression()
    ransac_estimator0 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        loss=loss_multi1)
    ransac_estimator2 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        loss=loss_multi2)

    # multi-dimensional
    ransac_estimator0.fit(X, yyy)
    ransac_estimator1.fit(X, yyy)
    ransac_estimator2.fit(X, yyy)
    assert_array_almost_equal(ransac_estimator0.predict(X),
                              ransac_estimator1.predict(X))
    assert_array_almost_equal(ransac_estimator0.predict(X),
                              ransac_estimator2.predict(X))

    # one-dimensional
    ransac_estimator0.fit(X, y)
    ransac_estimator2.loss = loss_mono
    ransac_estimator2.fit(X, y)
    assert_array_almost_equal(ransac_estimator0.predict(X),
                              ransac_estimator2.predict(X))

    ransac_estimator3 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        loss="squared_loss")
    ransac_estimator3.fit(X, y)
    # compare against estimator3 so the string-loss variant is actually tested
    assert_array_almost_equal(ransac_estimator0.predict(X),
                              ransac_estimator3.predict(X))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor

X = df[['RM']].values
y = df['MEDV'].values

sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
# StandardScaler expects a 2-D array, so reshape y before scaling
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()

ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,
                         min_samples=50,
                         residual_metric=lambda x: np.sum(np.abs(x), axis=1),
                         residual_threshold=5.0,
                         random_state=0)
ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])
plt.scatter(X[inlier_mask], y[inlier_mask], c='blue', marker='o', label='Inliers')
y_test = np.sin(X_test)
X_test = X_test[:, np.newaxis]

y_errors = y.copy()
y_errors[::3] = 3

X_errors = X.copy()
X_errors[::3] = 3

y_errors_large = y.copy()
y_errors_large[::3] = 10

X_errors_large = X.copy()
X_errors_large[::3] = 10

estimators = [('OLS', LinearRegression()),
              ('Theil-Sen', TheilSenRegressor(random_state=42)),
              ('RANSAC', RANSACRegressor(random_state=42)),
              ('HuberRegressor', HuberRegressor())]
colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold',
          'RANSAC': 'lightgreen', 'HuberRegressor': 'black'}
linestyle = {'OLS': '-', 'Theil-Sen': '-.',
             'RANSAC': '--', 'HuberRegressor': '--'}
lw = 3
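# A hedged sketch of how this setup is typically used, modeled on the
# standard scikit-learn robust-fitting example this fragment resembles
# (x_plot, X, y, plt, make_pipeline and PolynomialFeatures are assumed from
# that surrounding context, so it is left commented out):
# for title, this_X, this_y in [('Modeling Errors Only', X, y),
#                               ('Corrupt X, Small Deviants', X_errors, y_errors)]:
#     plt.figure(figsize=(5, 4))
#     plt.plot(this_X[:, 0], this_y, 'b+')
#     for name, estimator in estimators:
#         model = make_pipeline(PolynomialFeatures(3), estimator)
#         model.fit(this_X, this_y)
#         y_plot = model.predict(x_plot[:, np.newaxis])
#         plt.plot(x_plot, y_plot, color=colors[name],
#                  linestyle=linestyle[name], linewidth=lw, label=name)
#     plt.legend(loc='upper right')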
def ransac_regressor(self):
    x_train, x_test, y_train, y_test = self.preprocessing()
    model = RANSACRegressor()
    y_pred = model.fit(x_train, y_train).predict(x_test)
    self.printing(y_test, y_pred, 'RANSAC')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import RANSACRegressor
from boston_dataset import BostonDataset

boston = BostonDataset()
RANSAC = RANSACRegressor()
df = boston.df

# Feature matrix, target vector
X = df['RM'].values.reshape(-1, 1)
y = df['MEDV'].values

# Fit model
RANSAC.fit(X, y)

# [boolean]
inlier_mask = RANSAC.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

line_X = np.arange(3, 10, 1)  # [3, 4, 5, 6, 7, 8, 9] rooms
line_y_ransac = RANSAC.predict(line_X.reshape(-1, 1))

def test():
    # print(inlier_mask)
    pass  # added so the otherwise-empty function body parses
t_pca = [t0_pca, t1_pca]
min_curvature = np.empty([2, 3])
max_curvature = np.empty([2, 3])
saddle = np.empty([2, 3])
for objIndex in [0, 1]:
    print("Time: {}".format(timer() - start))
    print("Fit polynomial")
    if regression_method.lower() in TS_SPECIFIERS:
        model = Pipeline([('poly', PolynomialFeatures(degree=order)),
                          ('regr', TheilSenRegressor())])
    elif regression_method.lower() in RANSAC_SPECIFIERS:
        model = Pipeline([('poly', PolynomialFeatures(degree=order)),
                          ('regr', RANSACRegressor())])
    else:
        model = Pipeline([('poly', PolynomialFeatures(degree=order)),
                          ('regr', LinearRegression(fit_intercept=False))])
    # z as a response to x, y (coords in pca, z-3rd component)
    model = model.fit(p_pca[objIndex][:, :2], p_pca[objIndex][:, 2])
    # coefficients of the polynomial
    if regression_method.lower() in RANSAC_SPECIFIERS:
        C = model.named_steps['regr'].estimator_.coef_
    else:
        C = model.named_steps['regr'].coef_
    print("Coefficients: ")
    print(C)
Can be used on non-linear or linear datasets; see the official docs for the
algorithm details.
2. TheilSen, a generalized-median estimator:
   suited to small datasets and to moderate outliers in X; once the number
   of features grows large enough it is no better than ordinary least squares.
   Tolerates up to 29% of X being corrupted.
3. Huber: samples whose loss exceeds the linear region are treated as outliers.
   Fastest when n_samples >> n_features.
If in doubt, just use RANSAC...
'''
rg_1 = RANSACRegressor(base_estimator=None, min_samples=None,
                       residual_threshold=None, is_data_valid=None,
                       is_model_valid=None, max_trials=100,
                       stop_n_inliers=inf, stop_score=inf,
                       stop_probability=0.99, residual_metric=None,
                       loss='absolute_loss', random_state=None)
rg_2 = TheilSenRegressor(fit_intercept=True, copy_X=True,
                         max_subpopulation=10000.0, n_subsamples=None,
                         max_iter=300, tol=0.001, random_state=None,
                         n_jobs=1, verbose=False)
# the call below was truncated in the source; completed here with the
# scikit-learn defaults for HuberRegressor
rg_3 = HuberRegressor(epsilon=1.35, max_iter=100, alpha=0.0001,
                      warm_start=False, fit_intercept=True, tol=1e-05)
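# A minimal, self-contained comparison of the three robust estimators
# described above on synthetic data with y-direction outliers (illustration
# only; the parameter choices are assumptions, not tuned values):
import numpy as np
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor, HuberRegressor

rng = np.random.default_rng(0)
X_demo = rng.uniform(0, 10, size=(200, 1))
y_demo = 3.0 * X_demo.ravel() + 2.0 + rng.normal(0, 0.5, 200)
y_demo[::10] += 50  # corrupt 10% of the targets
for est in (RANSACRegressor(random_state=0),
            TheilSenRegressor(random_state=0),
            HuberRegressor()):
    est.fit(X_demo, y_demo)
    # RANSAC exposes its coefficients via the wrapped estimator_
    coef = est.estimator_.coef_ if hasattr(est, 'estimator_') else est.coef_
    print(type(est).__name__, coef)  # each should recover a slope near 3.0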
def get_algorithm(self):
    '''
    Inputs:
        algorithm (string) - Name of the regressor to run. Follows Sklearn
        naming conventions.
        Available keys:
            ARDRegression | AdaBoostRegressor | BaggingRegressor | BayesianRidge | CCA |
            DecisionTreeRegressor | ElasticNet | ExtraTreeRegressor |
            ExtraTreesRegressor | GaussianProcessRegressor | GradientBoostingRegressor |
            HuberRegressor | KNeighborsRegressor | KernelRidge | Lars | Lasso |
            LassoLars | LinearRegression | LinearSVR | MLPRegressor | NuSVR |
            OrthogonalMatchingPursuit | PLSCanonical | PLSRegression |
            PassiveAggressiveRegressor | RANSACRegressor | RandomForestRegressor |
            Ridge | SGDRegressor | SVR | TheilSenRegressor | TransformedTargetRegressor
        Currently not supporting:
            ElasticNetCV | LarsCV | LassoCV | LassoLarsCV | LassoLarsIC |
            MultiTaskElasticNet | MultiTaskElasticNetCV | MultiTaskLasso |
            MultiTaskLassoCV | OrthogonalMatchingPursuitCV | RidgeCV |
            RadiusNeighborsRegressor
    Outputs:
    Notes:
        Scoring Metrics:
        https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    '''
    # Map each supported name to its estimator class; the long elif chain is
    # replaced by a lookup with identical behavior (unknown names return None).
    algorithms = {
        "ARDRegression": ARDRegression,
        "AdaBoostRegressor": AdaBoostRegressor,
        "BaggingRegressor": BaggingRegressor,
        "BayesianRidge": BayesianRidge,
        "CCA": CCA,
        "DecisionTreeRegressor": DecisionTreeRegressor,
        "ElasticNet": ElasticNet,
        "ExtraTreeRegressor": ExtraTreeRegressor,
        "ExtraTreesRegressor": ExtraTreesRegressor,
        "GaussianProcessRegressor": GaussianProcessRegressor,
        "GradientBoostingRegressor": GradientBoostingRegressor,
        "HuberRegressor": HuberRegressor,
        "KNeighborsRegressor": KNeighborsRegressor,
        "KernelRidge": KernelRidge,
        "Lars": Lars,
        "Lasso": Lasso,
        "LassoLars": LassoLars,
        "LinearRegression": LinearRegression,
        "LinearSVR": LinearSVR,
        "MLPRegressor": MLPRegressor,
        "NuSVR": NuSVR,
        "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit,
        "PLSCanonical": PLSCanonical,
        "PLSRegression": PLSRegression,
        "PassiveAggressiveRegressor": PassiveAggressiveRegressor,
        "RANSACRegressor": RANSACRegressor,
        "RandomForestRegressor": RandomForestRegressor,
        "Ridge": Ridge,
        "SGDRegressor": SGDRegressor,
        "SVR": SVR,
        "TheilSenRegressor": TheilSenRegressor,
        "TransformedTargetRegressor": TransformedTargetRegressor,
    }
    cls = algorithms.get(self.algorithmName)
    return cls() if cls is not None else None
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    global keras_imported

    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, '
    #           'which will increase training speed dramatically at the expense '
    #           'of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {'n_jobs': -2, 'n_estimators': 30},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2, 'n_estimators': 30},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'learning_rate': 0.1, 'warm_start': True},
        'GradientBoostingClassifier': {'learning_rate': 0.1, 'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {},
        'LGBMRegressor': {'n_estimators': 2000, 'learning_rate': 0.15,
                          'num_leaves': 8, 'lambda_l2': 0.001,
                          'histogram_pool_size': 16384},
        'LGBMClassifier': {'n_estimators': 2000, 'learning_rate': 0.15,
                           'num_leaves': 8, 'lambda_l2': 0.001,
                           'histogram_pool_size': 16384},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search == True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the
        # user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the '
              'final params that will be used to initialize the model:')
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),
        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),
        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    # Newer scikit-learn versions accept max_iter/tol for these estimators;
    # fall back to the no-argument constructors on older versions
    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier()
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(calc_feature_importance=True)

    if model_name[:12] == 'DeepLearning':
        if keras_imported == False:
            # Suppress some level of logs if TF is installed (but allow it to
            # not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except:
                pass

            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU, ThresholdedReLU, ELU
            global Sequential
            global keras_load_model
            global regularizers, optimizers
            global Activation
            global KerasRegressor, KerasClassifier

            from keras.constraints import maxnorm
            from keras.layers import Activation, Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers, optimizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print('It appears you are trying to use a library that is not available '
              'when we try to import it, or using a value for model_names that '
              'we do not recognize')
        raise (e)

    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1

    model_with_params = model_without_params.set_params(**model_params)
    return model_with_params
def fit_linreg_robust(x, y, mask=None, intercept=False, r2=True, est_method='rlm'):
    """Apply robust linear regression of y w.r.t x.

    Arguments
    ---------
        x: :class:`~numpy.ndarray` or sparse `csr_matrix`
            A vector of independent variables.
        y: :class:`~numpy.ndarray` or sparse `csr_matrix`
            A vector of dependent variables.
        intercept: bool
            If using steady state assumption for fitting, then:
            True -- the linear regression is performed with an unfixed intercept;
            False -- the linear regression is performed with a fixed zero intercept.
        est_method: str (default: `rlm`)
            The linear regression estimation method that will be used.

    Returns
    -------
        k: float
            The estimated slope.
        b: float
            The estimated intercept.
        r2: float
            Coefficient of determination or r square calculated with the extreme data points.
        all_r2: float
            The r2 calculated using all data points.
    """
    x = x.A if issparse(x) else x
    y = y.A if issparse(y) else y

    _mask = np.logical_and(~np.isnan(x), ~np.isnan(y))
    if mask is not None:
        _mask &= mask
    xx = x[_mask]
    yy = y[_mask]

    try:
        if est_method.lower() == 'rlm':
            xx_ = sm.add_constant(xx) if intercept else xx
            res = sm.RLM(yy, xx_).fit()
            k, b = res.params[::-1] if intercept else (res.params[0], 0)
        elif est_method.lower() == 'ransac':
            reg = RANSACRegressor(LinearRegression(fit_intercept=intercept),
                                  random_state=0)
            reg.fit(xx.reshape(-1, 1), yy.reshape(-1, 1))
            k, b = reg.estimator_.coef_[0, 0], \
                   (reg.estimator_.intercept_[0] if intercept else 0)
        else:
            raise ImportError(
                f"estimation method {est_method} is not implemented. "
                f"Currently supported linear regression methods include `rlm` and `ransac`."
            )
    except:
        # fall back to a closed-form least-squares fit
        if intercept:
            ym = np.mean(yy)
            xm = np.mean(xx)
            cov = np.mean(xx * yy) - xm * ym
            var_x = np.mean(xx * xx) - xm * xm
            k = cov / var_x
            b = ym - k * xm
            # # assume b is always positive
            # if b < 0:
            #     k, b = np.mean(xx * yy) / np.mean(xx * xx), 0
        else:
            # use uncentered cov and var_x
            cov = np.mean(xx * yy)
            var_x = np.mean(xx * xx)
            k = cov / var_x
            b = 0

    if r2:
        SS_tot_n, all_SS_tot_n = np.var(yy), np.var(y)
        SS_res_n, all_SS_res_n = (
            np.mean((yy - k * xx - b) ** 2),
            np.mean((y - k * x - b) ** 2),
        )
        r2, all_r2 = 1 - SS_res_n / SS_tot_n, 1 - all_SS_res_n / all_SS_tot_n
        return k, b, r2, all_r2
    else:
        return k, b
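# Hedged usage sketch for fit_linreg_robust (synthetic data, not from the
# source; assumes the function's own imports -- numpy, scipy.sparse.issparse,
# and the sklearn estimators -- are in scope):
import numpy as np
x_demo = np.linspace(0.1, 10, 100)
y_demo = 2.5 * x_demo + np.random.default_rng(1).normal(0, 0.1, 100)
y_demo[::10] = 0  # a few corrupted points for RANSAC to reject
k, b, r2_val, all_r2 = fit_linreg_robust(x_demo, y_demo, intercept=True,
                                         est_method='ransac')
print(k, b)  # slope near 2.5, intercept near 0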
def main():
    regression_name = sys.argv[1]
    datapath = sys.argv[2]
    if datapath == 'housing.data.txt':
        df = pd.read_csv('housing.data.txt', header=None, sep='\s+')
        df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                      'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
        X = df.iloc[:, :-1]
        y = df['MEDV'].values
    else:
        df = pd.read_csv('all_breakdown.csv')
        df = df.fillna(0)
        X = df.iloc[:, 1:-1]
        y = df['WIND TOTAL'].values

    y2d = y[:, np.newaxis]  # change one-dimensional array to two dimensions
    sc_x = StandardScaler()
    sc_y = StandardScaler()
    sc_x.fit(X)
    sc_y.fit(y2d)  # StandardScaler needs the 2-D view of y
    x_std = sc_x.transform(X)
    y_std = sc_y.transform(y2d).flatten()
    X_train, X_test, y_train, y_test = train_test_split(x_std, y_std,
                                                        test_size=0.3,
                                                        random_state=0)

    if regression_name == "Linear":
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        print('Linear Regression')
        print('Slope : %.3f ' % model.coef_[0])
        print('Intercept : %.3f' % model.intercept_)
        print('MSE train: %.3f, test: %.3f' % (
            mean_squared_error(y_train, y_train_pred),
            mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.3f, test: %.3f' % (
            r2_score(y_train, y_train_pred),
            r2_score(y_test, y_test_pred)))
    elif regression_name == "RANSAC":
        ransac = RANSACRegressor(LinearRegression(),
                                 max_trials=100,
                                 min_samples=50,
                                 loss='absolute_loss',
                                 residual_threshold=5.0,
                                 random_state=1)
        ransac.fit(X_train, y_train)
        print('RANSAC Regressor')
        print('Slope : %.3f ' % ransac.estimator_.coef_[0])
        print('Intercept : %.3f' % ransac.estimator_.intercept_)
        # print('Score of the prediction: %.3f' % ransac.score(X_test, y_test))
        y_train_pred = ransac.predict(X_train)
        y_test_pred = ransac.predict(X_test)
        print('MSE train: %.3f, test: %.3f' % (
            mean_squared_error(y_train, y_train_pred),
            mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.3f, test: %.3f' % (
            r2_score(y_train, y_train_pred),
            r2_score(y_test, y_test_pred)))
    elif regression_name == "Ridge":
        ridge = Ridge(alpha=1.0)
        ridge.fit(X_train, y_train)
        y_train_pred = ridge.predict(X_train)
        y_test_pred = ridge.predict(X_test)
        print('Ridge Regularization')
        print('Slope : %.3f' % ridge.coef_[0])
        print('Intercept : %.3f' % ridge.intercept_)
        print('MSE train: %.3f, test: %.3f' % (
            mean_squared_error(y_train, y_train_pred),
            mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.3f, test: %.3f' % (
            r2_score(y_train, y_train_pred),
            r2_score(y_test, y_test_pred)))
    elif regression_name == "Lasso":
        lasso = Lasso(alpha=1.0)
        lasso.fit(X_train, y_train)
        y_train_pred = lasso.predict(X_train)
        y_test_pred = lasso.predict(X_test)
        print('Lasso Regularization')
        print('Slope : %.3f ' % lasso.coef_[0])
        print('Intercept : %.3f' % lasso.intercept_)
        print('MSE train: %.3f, test: %.3f' % (
            mean_squared_error(y_train, y_train_pred),
            mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.3f, test: %.3f' % (
            r2_score(y_train, y_train_pred),
            r2_score(y_test, y_test_pred)))
    elif regression_name == "Nonlinear":
        tree = DecisionTreeRegressor(max_depth=3)
        tree.fit(X_train, y_train)
        y_test_pred = tree.predict(X_test)
        y_train_pred = tree.predict(X_train)
        print('Non linear Regression - Decision Tree Regressor')
        print('MSE train: %.3f, test: %.3f' % (
            mean_squared_error(y_train, y_train_pred),
            mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.3f, test: %.3f' % (
            r2_score(y_train, y_train_pred),
            r2_score(y_test, y_test_pred)))
    elif regression_name == "Normal":
        if datapath == 'housing.data.txt':
            onevec = np.ones((X_train.shape[0]))  # generates a 1-dimensional array
            onevec = onevec[:, np.newaxis]  # changes it to a 2-dimensional array
            Xb = np.hstack((onevec, X_train))  # Xb is a 2-dimensional array
            w = np.zeros(X_train.shape[1])
            z = np.linalg.inv(np.dot(Xb.T, Xb))
            w = np.dot(z, np.dot(Xb.T, y_train))
            print('Normal Equation Solution')
            print('Slope: %.3f' % w[1])
            print('Intercept : %.3f' % w[0])
            yhat = np.dot(Xb, w.T)
            print('MSE train: %.3f,' % mean_squared_error(y_train, yhat))
        else:
            print('Not Applicable')
    else:
        print("No regression available with the given name")

    print("--- Time taken is %s seconds ---" % (time.time() - start_time))
def fit_RANSAC(features_train, labels_train, features_pred):
    model = RANSACRegressor()
    model.fit(features_train, labels_train)
    labels_pred = model.predict(features_pred)
    print("RANSAC - coefficient of determination R^2 of the prediction: ",
          model.score(features_train, labels_train))
    return labels_pred
# from sklearn.linear_model import TheilSenRegressor
X = np.reshape(star.temp.array, (-1, 1))
y = star.light
reg = TheilSenRegressor(random_state=0).fit(X, y)

plt.scatter(star.temp, star.light)
plt.plot(xr, reg.intercept_ + reg.coef_ * xr, 'k-')

# from sklearn.linear_model import RANSACRegressor
reg = RANSACRegressor().fit(X, y)
i = reg.inlier_mask_

plt.scatter(star.temp[i], star.light[i])
plt.scatter(star.temp[~i], star.light[~i], marker='x')
plt.plot(xr, reg.estimator_.intercept_ + reg.estimator_.coef_ * xr, 'k-')

# ## Exercises

# ## Packages Used
import sys
import matplotlib
import statsmodels as sm
import seaborn as sns
def test_ransac_residual_metric():
    residual_metric1 = lambda dy: np.sum(np.abs(dy), axis=1)
    residual_metric2 = lambda dy: np.sum(dy ** 2, axis=1)
    yyy = np.column_stack([y, y, y])

    base_estimator = LinearRegression()
    ransac_estimator0 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        residual_metric=residual_metric1)
    ransac_estimator2 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        residual_metric=residual_metric2)

    # multi-dimensional
    ransac_estimator0.fit(X, yyy)
    ransac_estimator1.fit(X, yyy)
    ransac_estimator2.fit(X, yyy)
    assert_equal(ransac_estimator0.predict(X), ransac_estimator1.predict(X))
    assert_equal(ransac_estimator0.predict(X), ransac_estimator2.predict(X))

    # one-dimensional
    ransac_estimator0.fit(X, y)
    ransac_estimator2.fit(X, y)
    assert_equal(ransac_estimator0.predict(X), ransac_estimator2.predict(X))
def get_ransac(self, x, y):
    # RANSAC regressor
    ransac = RANSACRegressor(LinearRegression(), residual_threshold=5)
    ransac.fit(x, y)
    return ransac
ax.plot(xvals, xvals, 'k--')
ax.set_xlim([-125, 60])
ax.set_ylim([-125, 60])
ax.set_aspect('equal')
ax.text(0.5, 1.05,
        'y = {0:0.3f}x + {1:0.3f}'.format(perform_dstBiot['Slope'],
                                          perform_dstBiot['Intercept']),
        horizontalalignment='center', transform=ax.transAxes)
ax.set_xlabel('Sym-H (Observed) [nT]')
ax.set_ylabel('Sym-H (Modeled) [nT]')

if False:
    from sklearn.linear_model import TheilSenRegressor, RANSACRegressor
    #tsreg = TheilSenRegressor(random_state=77)
    reg = RANSACRegressor(random_state=77)
    reg.fit(kyotodata['sym-h'][:, np.newaxis], data['dstBiot'][:, np.newaxis])
    y_pred = reg.predict(xvals[:, np.newaxis])
    ax.plot(xvals, y_pred, 'm-', label='RANSAC')
    ax.legend()

plt.tight_layout()
plt.savefig('Jan2005_DstCompare_scatter_new.png')

# now write metrics to log
logging.info('=================')
logging.info('===PERFORMANCE===')
logging.info('=================')
logging.info('N_points: {}\n'.format(len(data['dstBiot'])))
if useBiot:
from sklearn import datasets
# fetch_california_housing is exposed at the top level of sklearn.datasets
california = datasets.fetch_california_housing()
X, y = california.data, california.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR

regressors = [
    LinearRegression(),
    RANSACRegressor(),
    KNeighborsRegressor(),
    KNeighborsRegressor(n_neighbors=9, metric='manhattan'),
    SVR(),
    LinearSVR(),
    SVR(kernel='linear'),  # Cf. LinearSVR: much slower, might be better or worse
    GaussianProcessRegressor(),
]

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from time import time

for model in regressors:
def main():
    parser = argparse.ArgumentParser(
        description='Large-scale Point Cloud Semantic Segmentation with Superpoint Graphs')
    parser.add_argument('--ROOT_PATH', default='datasets/airborne_lidar')
    parser.add_argument('--dataset', default='airborne_lidar')
    # parameters
    parser.add_argument('--compute_geof', default=1, type=int,
                        help='compute hand-crafted features of the local geometry')
    parser.add_argument('--k_nn_local', default=20, type=int,
                        help='number of neighbors to describe the local geometry')
    parser.add_argument('--k_nn_adj', default=5, type=int,
                        help='number of neighbors for the adjacency graph')
    parser.add_argument('--voxel_width', default=0.03, type=float,
                        help='voxel size when subsampling (in m)')
    parser.add_argument('--plane_model', default=1, type=int,
                        help='uses a simple plane model to derive elevation')
    parser.add_argument('--use_voronoi', default=0.0, type=float,
                        help='uses the Voronoi graph in combination to knn to build the adjacency graph, '
                             'useful for sparse acquisitions. If 0., do not use voronoi. '
                             'If >0, then is the upper length limit for an edge to be kept. ')
    parser.add_argument('--ver_batch', default=5000000, type=int,
                        help='batch size for reading large files')
    args = parser.parse_args()

    # path to data
    if args.ROOT_PATH[-1] == '/':
        root = args.ROOT_PATH
    else:
        root = args.ROOT_PATH + '/'

    if not os.path.exists(root + 'features_supervision'):
        os.makedirs(root + 'features_supervision')

    # list of subfolders to be processed
    if args.dataset == 'airborne_lidar':
        folders = ["trn/", "val/", "tst/"]
        n_labels = 4
    else:
        raise ValueError('%s is an unknown data set' % args.dataset)

    pruning = args.voxel_width > 0
    # --------------------------------------------------------------------------
    for folder in folders:
        print("=================\n   " + folder + "\n=================")
        data_folder = root + folder
        output_folder = root + "features_supervision/" + folder
        if not os.path.isdir(data_folder):
            raise ValueError(f"{folder} does not exist")
        if not os.path.isdir(output_folder):
            os.mkdir(output_folder)
        if args.dataset == 'airborne_lidar':
            files = glob.glob(data_folder + "*.las")
        if len(files) == 0:
            raise ValueError(f"{folder} is empty")
        n_files = len(files)
        i_file = 0
        for file in files:
            file_name = os.path.splitext(os.path.basename(file))[0]
            data_file = f"{data_folder}{file_name}.las"
            str_file = f"{output_folder}{file_name}.h5"
            i_file = i_file + 1
            print(f"{i_file} / {n_files}---> {file_name}")
            if os.path.isfile(str_file):
                print("    graph structure already computed - delete for update...")
            else:
                # --- build the geometric feature file h5 file ---
                print("    computing graph structure...")
                # --- read the data files and compute the labels ---
                if args.dataset == 'airborne_lidar':
                    xyz, nb_return, intensity, label = read_airborne_lidar_format(data_file)
                if args.dataset == 's3dis':
                    xyz, rgb, labels, objects = read_s3dis_format(data_file)
                    if pruning:
                        n_objects = int(objects.max() + 1)
                        xyz, rgb, labels, objects = libply_c.prune(
                            xyz, args.voxel_width, rgb, labels, objects,
                            n_labels, n_objects)
                        # hard_labels = labels.argmax(axis=1)
                        objects = objects[:, 1:].argmax(axis=1) + 1
                    else:
                        # hard_labels = labels
                        objects = objects
                elif args.dataset == 'sema3d':
                    has_labels = (os.path.isfile(label_file))
                    if has_labels:
                        xyz, rgb, labels = read_semantic3d_format(
                            data_file, n_labels, label_file,
                            args.voxel_width, args.ver_batch)
                    else:
                        xyz, rgb = read_semantic3d_format(
                            data_file, 0, '', args.voxel_width, args.ver_batch)
                        labels = np.array([0])
                        objects = np.array([0])
                        is_transition = np.array(False)
                elif args.dataset == 'vkitti':
                    xyz, rgb, labels = read_vkitti_format(data_file)
                    if pruning:
                        xyz, rgb, labels, o = libply_c.prune(
                            xyz.astype('f4'), args.voxel_width,
                            rgb.astype('uint8'), labels.astype('uint8'),
                            np.zeros(1, dtype='uint8'), n_labels, 0)

                # --- compute nn graph ---
                n_ver = xyz.shape[0]
                print("computing NN structure")
                graph_nn, local_neighbors = compute_graph_nn_2(
                    xyz, args.k_nn_adj, args.k_nn_local,
                    voronoi=args.use_voronoi)

                if args.dataset == 's3dis':
                    is_transition = objects[graph_nn["source"]] != objects[graph_nn["target"]]
                elif args.dataset == 'sema3d' and has_labels:
                    # sema has no objects; we make them ourselves with label inpainting
                    hard_labels = np.argmax(labels[:, 1:], 1) + 1
                    no_labels = (labels[:, 1:].sum(1) == 0).nonzero()
                    hard_labels[no_labels] = 0
                    is_transition = hard_labels[graph_nn["source"]] != hard_labels[graph_nn["target"]] \
                        * (hard_labels[graph_nn["source"]] != 0) \
                        * (hard_labels[graph_nn["target"]] != 0)

                    edg_source = graph_nn["source"][(is_transition == 0).nonzero()].astype('uint32')
                    edg_target = graph_nn["target"][(is_transition == 0).nonzero()].astype('uint32')
                    edge_weight = np.ones_like(edg_source).astype('f4')
                    node_weight = np.ones((n_ver,), dtype='f4')
                    node_weight[no_labels] = 0
                    print("Inpainting labels")
                    dump, objects = libcp.cutpursuit2(
                        np.array(hard_labels).reshape((n_ver, 1)).astype('f4'),
                        edg_source, edg_target, edge_weight, node_weight, 0.01)
                    is_transition = objects[graph_nn["source"]] != objects[graph_nn["target"]]
                elif args.dataset == 'vkitti':
                    # we define the objects as the constant connected components of the labels
                    hard_labels = np.argmax(labels, 1)
                    is_transition = hard_labels[graph_nn["source"]] != hard_labels[graph_nn["target"]]
                    dump, objects = libply_c.connected_comp(
                        n_ver, graph_nn["source"].astype('uint32'),
                        graph_nn["target"].astype('uint32'),
                        (is_transition == 0).astype('uint8'), 0)

                if args.compute_geof:
                    geof = libply_c.compute_geof(
                        xyz, local_neighbors, args.k_nn_local).astype('float32')
                    geof[:, 3] = 2. * geof[:, 3]
                else:
                    geof = 0

                if args.plane_model:
                    # use a simple plane model to compute the elevation
                    low_points = (xyz[:, 2] - xyz[:, 2].min() < 0.5).nonzero()[0]
                    reg = RANSACRegressor(random_state=0).fit(
                        xyz[low_points, :2], xyz[low_points, 2])
                    elevation = xyz[:, 2] - reg.predict(xyz[:, :2])
                else:
                    elevation = xyz[:, 2] - xyz[:, 2].min()

                # compute the xy normalized position
                ma = np.max(xyz[:, :2], axis=0, keepdims=True)
                mi = np.min(xyz[:, :2], axis=0, keepdims=True)
                xyn = (xyz[:, :2] - mi) / (ma - mi + 1e-8)  # global position

                write_structure(str_file, xyz, rgb, graph_nn,
                                local_neighbors.reshape([n_ver, args.k_nn_local]),
                                is_transition, labels, objects, geof,
                                elevation, xyn)
def fit(self, idle_engine_speed, on_engine, temperature_derivatives,
        temperatures, *args):
    """
    Calibrates an engine temperature regression model to predict engine
    temperatures.

    This model returns the delta temperature function of temperature
    (previous), acceleration, and power at the wheel.

    :param idle_engine_speed:
        Engine speed idle median and std [RPM].
    :type idle_engine_speed: (float, float)

    :param on_engine:
        If the engine is on [-].
    :type on_engine: numpy.array

    :param temperature_derivatives:
        Derivative temperature vector [°C].
    :type temperature_derivatives: numpy.array

    :param temperatures:
        Temperature vector [°C].
    :type temperatures: numpy.array

    :return:
        The calibrated engine temperature regression model.
    :rtype: ThermalModel
    """
    spl = _build_samples(temperature_derivatives, temperatures, *args)
    self.thermostat = self._identify_thermostat(spl, idle_engine_speed)
    spl = _filter_samples(spl, on_engine, self.thermostat)

    opt = {
        'random_state': 0,
        'max_depth': 2,
        'n_estimators': int(min(300, 0.25 * (len(spl) - 1))),
        'loss': 'huber',
        'alpha': 0.99
    }
    model = RANSACRegressor(base_estimator=self.base_model(**opt),
                            random_state=0,
                            min_samples=0.85,
                            max_trials=10)
    model = Pipeline([('feature_selection',
                       _SelectFromModel(model, '0.8*median', in_mask=(0, 2))),
                      ('classification', model)])
    model.fit(spl[:, :-1], spl[:, -1])

    self.model = model.steps[-1][-1]
    self.mask = np.where(model.steps[0][-1]._get_support_mask())[0]
    self.min_temp = spl[:, 0].min()

    spl = spl[:co2_utl.argmax(self.thermostat <= spl[:, 0])]
    if not spl.any():
        self.min_temp = -float('inf')
        return self
    spl = spl[:co2_utl.argmax(np.percentile(spl[:, 0], 60) <= spl[:, 0])]

    opt = {
        'random_state': 0,
        'max_depth': 2,
        'n_estimators': int(min(300, 0.25 * (len(spl) - 1))),
        'loss': 'huber',
        'alpha': 0.99
    }
    model = self.base_model(**opt)
    model = Pipeline([('feature_selection',
                       _SelectFromModel(model, '0.8*median', in_mask=(1,))),
                      ('classification', model)])
    model.fit(spl[:, 1:-1], spl[:, -1])

    self.cold = model.steps[-1][-1]
    self.mask_cold = np.where(model.steps[0][-1]._get_support_mask())[0] + 1

    return self
def test_ransac_min_n_samples():
    base_estimator = LinearRegression()
    ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator2 = RANSACRegressor(base_estimator,
                                        min_samples=2. / X.shape[0],
                                        residual_threshold=5, random_state=0)
    ransac_estimator3 = RANSACRegressor(base_estimator, min_samples=-1,
                                        residual_threshold=5, random_state=0)
    ransac_estimator4 = RANSACRegressor(base_estimator, min_samples=5.2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator5 = RANSACRegressor(base_estimator, min_samples=2.0,
                                        residual_threshold=5, random_state=0)
    ransac_estimator6 = RANSACRegressor(base_estimator,
                                        residual_threshold=5, random_state=0)
    ransac_estimator7 = RANSACRegressor(base_estimator,
                                        min_samples=X.shape[0] + 1,
                                        residual_threshold=5, random_state=0)

    ransac_estimator1.fit(X, y)
    ransac_estimator2.fit(X, y)
    ransac_estimator5.fit(X, y)
    ransac_estimator6.fit(X, y)

    assert_array_almost_equal(ransac_estimator1.predict(X),
                              ransac_estimator2.predict(X))
    assert_array_almost_equal(ransac_estimator1.predict(X),
                              ransac_estimator5.predict(X))
    assert_array_almost_equal(ransac_estimator1.predict(X),
                              ransac_estimator6.predict(X))

    assert_raises(ValueError, ransac_estimator3.fit, X, y)
    assert_raises(ValueError, ransac_estimator4.fit, X, y)
    assert_raises(ValueError, ransac_estimator7.fit, X, y)
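# Hedged note on what the test above exercises: RANSACRegressor treats an
# integer min_samples as an absolute sample count and a float in (0, 1) as a
# fraction of the data, so with 100 rows these two are equivalent:
#
#   RANSACRegressor(min_samples=2)          # exactly 2 samples per trial
#   RANSACRegressor(min_samples=2. / 100)   # 2% of 100 rows -> 2 samples
#
# Floats >= 1 must be integer-valued (2.0 is accepted, 5.2 raises ValueError),
# and negative values or counts larger than n_samples also raise ValueError,
# as the assertions above check.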
def main():
    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print('usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])
    print('train:\n{}\n'.format(sys.argv[1]))
    print('test:\n{}\n'.format(sys.argv[2]))

    if 'small' in sys.argv[1]:
        size = 'small'
    elif 'medium' in sys.argv[1]:
        size = 'medium'
    else:
        size = 'large'

    x_train = data_train.drop([data_train.columns[0], data_train.columns[1],
                               data_train.columns[-1]],
                              axis=1).apply(pd.to_numeric, errors='ignore')
    y_train = pd.Series(data_train.iloc[:, -1])
    x_test = data_test.drop([data_test.columns[0], data_test.columns[1],
                             data_test.columns[-1]],
                            axis=1).apply(pd.to_numeric, errors='ignore')
    y_test = pd.Series(data_test.iloc[:, -1])

    # type = int(input('type: [1: supervised, 2: semi-supervised, 3: unsupervised] '))
    # if type == 1:
    parameter = None
    # input() returns a string, so cast the menu choices to int before comparing
    method = int(input('select a method: {}: '.format(methods)))
    if method == 1:
        classifier = int(input('select a classifier: {}: '.format(classifiers)))
        if classifier == 1:
            parameter = int(input('criterion: [1: gini, 2: entropy] '))
            if parameter == 1:
                model = DecisionTreeClassifier(criterion='gini')
                parameter = 'gini'
            elif parameter == 2:
                model = DecisionTreeClassifier(criterion='entropy')
                parameter = 'entropy'
            else:
                print('no criterion chosen')
                sys.exit()
        elif classifier == 2:
            model = ExtraTreeClassifier()
        elif classifier == 3:
            model = ExtraTreesClassifier()
        elif classifier == 4:
            parameter = int(input('n: [1: 1, 2: 3, 3: 5] '))
            if parameter == 1:
                model = KNeighborsClassifier(n_neighbors=1)
                parameter = '1'
            elif parameter == 2:
                model = KNeighborsClassifier(n_neighbors=3)
                parameter = '3'
            elif parameter == 3:
                model = KNeighborsClassifier(n_neighbors=5)
                parameter = '5'
            else:
                print('no n chosen')
                sys.exit()
        elif classifier == 5:
            parameter = int(input('version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '))
            if parameter == 1:
                model = GaussianNB()
                parameter = 'gaussian'
            elif parameter == 2:
                model = BernoulliNB()
                parameter = 'bernoulli'
            elif parameter == 3:
                model = MultinomialNB()
                parameter = 'multinomial'
            elif parameter == 4:
                model = ComplementNB()
                parameter = 'complement'
            else:
                print('no version chosen')
                sys.exit()
        elif classifier == 6:
            model = RadiusNeighborsClassifier(radius=1.0)
        elif classifier == 7:
            model = RandomForestClassifier(n_estimators=50, random_state=1)
        elif classifier == 8:
            model = LinearSVC(multi_class='crammer_singer')  # multi_class='ovr'
        elif classifier == 9:
            model = GradientBoostingClassifier()
        elif classifier == 10:
            model = GaussianProcessClassifier(multi_class='one_vs_one')
        elif classifier == 11:
            model = SGDClassifier()
        elif classifier == 12:
            model = PassiveAggressiveClassifier()
        elif classifier == 13:
            model = NearestCentroid()
        elif classifier == 14:
            model = Perceptron(tol=1e-3, random_state=0)
        elif classifier == 15:
            model = MLPClassifier()
        elif classifier == 16:
            model = AdaBoostClassifier(n_estimators=50)
        elif classifier == 17:
            parameter = int(input('strategy: [1: stratified, 2: most frequent, 3: prior, 4: uniform, 5: constant] '))
            if parameter == 1:
                model = DummyClassifier(strategy='stratified')
                parameter = 'stratified'
            elif parameter == 2:
                model = DummyClassifier(strategy='most_frequent')
                parameter = 'most frequent'
            elif parameter == 3:
                model = DummyClassifier(strategy='prior')
                parameter = 'prior'
            elif parameter == 4:
                model = DummyClassifier(strategy='uniform')
                parameter = 'uniform'
            elif parameter == 5:
                model = DummyClassifier(strategy='constant')
                parameter = 'constant'
            else:
                print('no strategy selected')
                sys.exit()
        else:
            print('no classifier chosen')
            sys.exit()

        import time
        # Starts timer (time.clock() was removed in Python 3.8)
        start = time.perf_counter()

        # train the model using the training sets and check score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        report = classification_report(y_test, predictions,
                                       target_names=['RightTroll', 'LeftTroll', 'Other'])
        confusion = confusion_matrix(y_test, predictions,
                                     labels=["RightTroll", "LeftTroll", "Other"])
        if parameter is not None:
            filename = '{},{},{},{}.txt'.format(size, methods[method],
                                                classifiers[classifier], parameter)
        else:
            filename = '{},{},{}.txt'.format(size, methods[method],
                                             classifiers[classifier])

        # Prints the time taken
        end = time.perf_counter()
        elapsed = str(end - start)

        with open(filename, 'w') as output:
            output.write('method:\n{}\n\n'.format(methods[method]))
            output.write('classifier:\n{}\n\n'.format(classifiers[classifier]))
            output.write('accuracy:\n{:.2f}%\n\n'.format(
                100 * accuracy_score(y_test, predictions)))
            output.write('report:\n{}\n\n'.format(report))
            output.write('confusion:\n{}\n\n'.format(confusion))
            output.write('time:\n{}s\n\n'.format(elapsed))
            output.write('data:\n{:10}\t{:10}\t{:10}\n'.format(
                'actual', 'predict', 'match?'))
            for i in range(len(predictions)):
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    y_train[i], predictions[i], y_test[i] == predictions[i]))

        print('\nmethod:\n{}\n'.format(methods[method]))
        print('classifier:\n{}\n'.format(classifiers[classifier]))
        print('accuracy:\n{:.2f}%\n'.format(100 * accuracy_score(y_test, predictions)))
        print('report:\n{}\n'.format(report))
        print('confusion:\n{}\n'.format(confusion))
        print('time: {}s\n'.format(elapsed))
    elif method == 2:
        # transform into binary classification problem
        # y_train = y_train.apply(lambda x: 0 if x == 'Other' else 1)
        # y_test = y_test.apply(lambda x: 0 if x == 'Other' else 1)

        # transform string labels into integers
        le = LabelEncoder()
        le.fit(y_train)
        # print(le.transform(['LeftTroll', 'Other', 'Other', 'RightTroll']))
        # print(le.inverse_transform([0, 1, 2, 1]))
        print(le.classes_)
        y_train = le.transform(y_train)
        y_test = le.transform(y_test)

        regressor = int(input('select a regressor: {}: '.format(regressors)))
        if regressor == 1:
            print(method, regressor)
            model = LinearDiscriminantAnalysis()
        elif regressor == 2:
            print(method, regressor)
            model = LogisticRegression(solver='lbfgs', multi_class='multinomial')  # 'newton-cg'
        elif regressor == 3:
            print(method, regressor)
            model = RidgeClassifier()
        elif regressor == 4:
            print(method, regressor)
            model = QuadraticDiscriminantAnalysis()
        elif regressor == 5:
            model = OneVsRestClassifier(LinearRegression())
        elif regressor == 6:
            model = OneVsRestClassifier(DecisionTreeRegressor())
        elif regressor == 7:
            print(method, regressor)
            model = OneVsRestClassifier(Lasso(alpha=0.1))
        elif regressor == 8:
            print(method, regressor)
            model = OneVsRestClassifier(MultiTaskLasso(alpha=0.1))
        elif regressor == 9:
            print(method, regressor)
            model = OneVsRestClassifier(ElasticNet(random_state=0))
        elif regressor == 10:
            print(method, regressor)
            model = OneVsRestClassifier(MultiTaskElasticNet(random_state=0))
        elif regressor == 11:
            print(method, regressor)
            model = OneVsRestClassifier(Lars(n_nonzero_coefs=1))
        elif regressor == 12:
            print(method, regressor)
            model = OneVsRestClassifier(LassoLars(alpha=.1))
        elif regressor == 13:
            print(method, regressor)
            model = OneVsRestClassifier(OrthogonalMatchingPursuit())
        elif regressor == 14:
            print(method, regressor)
            model = OneVsRestClassifier(BayesianRidge())
        elif regressor == 15:
            print(method, regressor)
            model = OneVsRestClassifier(ARDRegression())
        elif regressor == 16:
            print(method, regressor)
            model = OneVsRestClassifier(TheilSenRegressor(random_state=0))
        elif regressor == 17:
            print(method, regressor)
            model = OneVsRestClassifier(HuberRegressor())
        elif regressor == 18:
            print(method, regressor)
            model = OneVsRestClassifier(RANSACRegressor(random_state=0))
        else:
            print('no regressor chosen')
            sys.exit()

        import time
        # Starts timer
        start = time.perf_counter()

        # train the model using the training sets and check score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)
        # y_train = le.inverse_transform(y_train)
        # y_test = le.inverse_transform(y_test)
        # print('coefficient:', model.coef_)
        # print('intercept:', model.intercept_)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        if parameter is not None:
            filename = '{},{},{},{}.txt'.format(size, methods[method],
                                                regressors[regressor], parameter)
        else:
            filename = '{},{},{}.txt'.format(size, methods[method],
                                             regressors[regressor])

        # Prints the time taken
        end = time.perf_counter()
        elapsed = str(end - start)

        with open(filename, 'w') as output:
            output.write('method:\n{}\n\n'.format(methods[method]))
            output.write('regressor:\n{}\n\n'.format(regressors[regressor]))
            output.write('accuracy:\n{:.2f}%\n\n'.format(
                100 * accuracy_score(y_test, predictions)))
            output.write('time:\n{}s\n\n'.format(elapsed))
            output.write('data:\n{:10}\t{:10}\t{:10}\n'.format(
                'actual', 'predict', 'match?'))
            for i in range(len(predictions)):
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    y_train[i], predictions[i], y_test[i] == predictions[i]))

        print('\nmethod:\n{}\n'.format(methods[method]))
        print('regressor:\n{}\n'.format(regressors[regressor]))
        print('accuracy:\n{:.2f}%\n'.format(100 * accuracy_score(y_test, predictions)))
        print('time: {}s\n'.format(elapsed))
    else:
        print('no method chosen')
        sys.exit()
# No. of samples: 506
# No. of explanatory variables: 13
df = pd.read_csv('housing.data.txt', header=None, sep=r'\s+')
df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
              'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# print(df.head())
X = df[['RM']].values
y = df['MEDV'].values

ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,
                         min_samples=50,
                         loss='absolute_loss',  # renamed 'absolute_error' in scikit-learn >= 1.1
                         residual_threshold=5.0,
                         random_state=0)
ransac.fit(X, y)

# plotting inliers and outliers together with the linear fit
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])
plt.scatter(X[inlier_mask], y[inlier_mask],
            c='steelblue', edgecolor='white', marker='o', label='Inliers')
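The snippet above stops mid-plot. A sketch of the remaining steps (outlier scatter, fit line, and the slope/intercept recovered from the consensus set), reusing the ransac, X, and y defined above:

plt.scatter(X[outlier_mask], y[outlier_mask],
            c='limegreen', edgecolor='white', marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='black', lw=2)
plt.xlabel('Average number of rooms [RM]')
plt.ylabel("Price in $1000's [MEDV]")
plt.legend(loc='upper left')
plt.show()

# the fitted coefficients live on the wrapped estimator, not on RANSACRegressor itself
print('Slope: %.3f' % ransac.estimator_.coef_[0])
print('Intercept: %.3f' % ransac.estimator_.intercept_)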
def run(self, trainingDataset, plotting):
    dataset = trainingDataset
    accuracy = 0
    y = dataset['int_rate']
    X = dataset.drop(columns=['int_rate'])
    if plotting:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=1)
        clf = RANSACRegressor(random_state=42)
        # clf = self.gridSearch(clf, X_train, y_train)
        clf.fit(X_train, y_train)
        print("###################################RANSACRegressor#############################")
        accuracy = clf.score(X_test, y_test)
        # pred = clf.predict(X_test)
        # accuracy = np.sqrt(metrics.mean_squared_error(y_test, pred))
        print("score:" + str(accuracy))
    else:
        clf = RANSACRegressor(random_state=42)
        # clf = self.gridSearch(clf, X, y)
        clf.fit(X, y)
        testData = pd.read_csv(
            "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/CleanedData/SiameseTrainingData.csv")
        predictions = clf.predict(testData)
        np.savetxt(
            "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/OutputFiles/RANSACRegressorPredictions.csv",
            predictions, delimiter=",")
        testData = pd.read_csv(
            "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/CleanedData/OverallTestingData.csv")
        predictions = clf.predict(testData)
        np.savetxt(
            "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/OutputFiles/RANSACRegressorPredictionsTestData.csv",
            predictions, delimiter=",")
    return accuracy
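Note that clf.score here reports the R-squared of the fitted base estimator, while the commented-out lines would report RMSE instead. A sketch of the RMSE variant, assuming sklearn.metrics is imported as metrics in this module:

from sklearn import metrics

pred = clf.predict(X_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))  # lower is better, unlike R^2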
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix

# loading the dataset
train = pd.read_csv("C:/Users/HP/Desktop/train (1).csv")
test = pd.read_csv("C:/Users/HP/Desktop/test (2).csv")
train = train.dropna()
test = test.dropna()
train.head()

# assumes a two-column file: feature column(s) first, target second
X_train = np.array(train.iloc[:, :-1].values)
y_train = np.array(train.iloc[:, 1].values)
X_test = np.array(test.iloc[:, :-1].values)
y_test = np.array(test.iloc[:, 1].values)

# RANSAC Regressor
from sklearn.linear_model import RANSACRegressor

model = RANSACRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)
plt.plot(X_train, model.predict(X_train), color='r')
plt.show()
print(accuracy)
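Since mean_squared_error and r2_score are already imported, the fit can be quantified as well as plotted. A sketch using the arrays defined above:

print('RMSE: %.3f' % np.sqrt(mean_squared_error(y_test, y_pred)))
print('R^2 : %.3f' % r2_score(y_test, y_pred))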
def get_base_model():
    return {'ransac_regressor': RANSACRegressor()}
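A usage sketch for this one-entry factory (X_train, y_train, X_test, y_test are assumed to exist):

models = get_base_model()
ransac = models['ransac_regressor']
ransac.fit(X_train, y_train)
print(ransac.score(X_test, y_test))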
import cv2
import numpy as np
from sklearn.linear_model import (
    LinearRegression, TheilSenRegressor, RANSACRegressor, HuberRegressor)
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

np.random.seed(42)

poly = PolynomialFeatures(2)
# LinearRegression takes no estimator argument; the polynomial expansion
# belongs in a pipeline, not in the regressor's constructor
reg = LinearRegression()
ransac_estimator = RANSACRegressor(reg, min_samples=3, random_state=42)
regressor = HuberRegressor()
model = make_pipeline(poly, regressor)

x = np.linspace(0, 5, 200)
y = x * x
model.fit(x.reshape(-1, 1), y)
ransac_estimator.fit(x.reshape(-1, 1), y)
# regressor.warm_start = True
# regressor.fit(x.reshape(-1, 1), y)
# print('oi')

x0 = np.zeros(3)

def fun(x, t, y):
    # residuals of the quadratic model x[0]*t^2 + x[1]*t + x[2]
    return x[2] + t * x[1] + t * t * x[0] - y
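To see what each of the two fits learned on the clean quadratic, a quick probe can help (x_new is a made-up evaluation grid; mean_squared_error is imported above):

x_new = np.linspace(0, 5, 50).reshape(-1, 1)
y_true = x_new.ravel() ** 2
# the polynomial Huber pipeline should be near-exact on noiseless x^2,
# while the plain-linear RANSAC model cannot capture the curvature
print('pipeline MSE: %.4f' % mean_squared_error(y_true, model.predict(x_new)))
print('RANSAC   MSE: %.4f' % mean_squared_error(y_true, ransac_estimator.predict(x_new)))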
    # Kernel ridge.
    ('KernelRidge', lambda: KernelRidge()),
    # Linear.
    # Way too slow.
    #('ARDRegression', lambda: ARDRegression()),
    ('HuberRegressor', lambda: HuberRegressor()),
    ('LinearRegression', lambda: LinearRegression()),
    # ValueError: Unknown label type: 'continuous'
    #('LogisticRegression', lambda: LogisticRegression()),
    # ValueError: Unknown label type: 'continuous'
    #('LogisticRegressionCV', lambda: LogisticRegressionCV()),
    ('PassiveAggressiveRegressor', lambda: PassiveAggressiveRegressor()),
    # ValueError: Unknown label type: 'continuous'
    #('RandomizedLogisticRegression', lambda: RandomizedLogisticRegression()),
    ('RANSACRegressor', lambda: RANSACRegressor()),
    ('SGDRegressor', lambda: SGDRegressor()),
    # Way too slow.
    #('TheilSenRegressor', lambda: TheilSenRegressor()),
    # Neighbors.
    ('KNeighborsRegressor', lambda: KNeighborsRegressor()),
    # Predicts NaN, infinity, or too large a value.
    #('RadiusNeighborsRegressor', lambda: RadiusNeighborsRegressor()),
    # Neural network.
    # Increase max_iter to avoid a warning about non-convergence within max_iter.
    ('MLPRegressor', lambda: MLPRegressor(max_iter=1000)),
    # Support vector machine.
    ('SVR', lambda: SVR()),
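This fragment reads as a (name, constructor) registry. A sketch of the loop such a registry is presumably fed into (MODELS, X_train, y_train, X_test, y_test are hypothetical names here, not from the original):

for name, make_model in MODELS:
    model = make_model()          # fresh estimator per run
    model.fit(X_train, y_train)
    print(name, model.score(X_test, y_test))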
def get_model_obj(modelType, n_clusters=None, **kwargs):
    if modelType == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        # 6 seems to give the best trade-off between accuracy and precision
        knn = KNeighborsClassifier(n_neighbors=6, **kwargs)
        return knn
    elif modelType == 'gaussianNB':
        from sklearn.naive_bayes import GaussianNB
        gnb = GaussianNB(**kwargs)
        return gnb
    elif modelType == 'multinomialNB':
        from sklearn.naive_bayes import MultinomialNB
        # TODO: figure out how to configure binomial distribution
        mnb = MultinomialNB(**kwargs)
        return mnb
    elif modelType == 'bernoulliNB':
        from sklearn.naive_bayes import BernoulliNB
        bnb = BernoulliNB(**kwargs)
        return bnb
    elif modelType == 'randomForest':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier(random_state=234, **kwargs)
        return rfc
    elif modelType == 'svm':
        from sklearn.svm import SVC
        svc = SVC(random_state=0, probability=True, **kwargs)
        return svc
    elif modelType == 'LinearRegression':
        #assert column, "Column name required for building a linear model"
        #assert dataframe[column].shape == target.shape
        from sklearn import linear_model
        l_reg = linear_model.LinearRegression(**kwargs)
        return l_reg
    elif modelType == 'RidgeRegression':
        from sklearn.linear_model import Ridge
        if not kwargs:
            kwargs = {'alpha': 0.5}
        ridge_reg = Ridge(**kwargs)
        return ridge_reg
    elif modelType == 'RidgeRegressionCV':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alphas': [0.1, 1.0, 10.0]}
        ridge_cv_reg = linear_model.RidgeCV(**kwargs)
        return ridge_cv_reg
    elif modelType == 'LassoRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1}
        lasso_reg = linear_model.Lasso(**kwargs)
        return lasso_reg
    elif modelType == 'ElasticNetRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1, 'l1_ratio': 0.7}
        enet_reg = linear_model.ElasticNet(**kwargs)
        return enet_reg
    elif modelType == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        log_reg = LogisticRegression(random_state=123, **kwargs)
        return log_reg
    elif modelType == 'RANSACRegression':
        from sklearn.linear_model import LinearRegression, RANSACRegressor
        ransac_model = RANSACRegressor(LinearRegression())
        return ransac_model
    elif modelType == 'kde':
        # sklearn.neighbors.kde is a deprecated import path
        from sklearn.neighbors import KernelDensity
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2, **kwargs)
        return kde
    elif modelType == 'AR':
        import statsmodels.api as sm
        # fit an AR model and forecast
        ar_fitted = sm.tsa.AR(dataframe).fit(maxlag=9, method='mle',
                                             disp=-1, **kwargs)
        #ts_forecast = ar_fitted.predict(start='2008', end='2050')
        return ar_fitted
    elif modelType == 'SARIMAX':
        mod = sm.tsa.statespace.SARIMAX(df.riders, trend='n', order=(0, 1, 0),
                                        seasonal_order=(1, 1, 1, 12), **kwargs)
        return mod
    elif modelType == 'sgd':
        # Online classifiers: http://scikit-learn.org/stable/auto_examples/linear_model/plot_sgd_comparison.html
        from sklearn.linear_model import SGDClassifier
        sgd = SGDClassifier(**kwargs)
        return sgd
    elif modelType == 'perceptron':
        from sklearn.linear_model import Perceptron
        perceptron = Perceptron(**kwargs)
        return perceptron
    elif modelType == 'xgboost':
        import xgboost as xgb
        xgbm = xgb.XGBClassifier(**kwargs)
        return xgbm
    elif modelType == 'baseNN':
        from keras.models import Sequential
        from keras.layers import Dense
        # create model; the layer sizes come in through kwargs,
        # not through a stray global called args
        assert kwargs.get('inputParams', None)
        assert kwargs.get('outputParams', None)
        model = Sequential()
        model.add(Dense(kwargs['inputParams']))
        model.add(Dense(kwargs['outputParams']))
        if kwargs.get('compileParams'):
            # Compile model
            # e.g. loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']
            model.compile(**kwargs['compileParams'])
        return model
    elif modelType == 'lightGBMRegression':
        from pylightgbm.models import GBMRegressor
        lgbm_lreg = GBMRegressor(num_iterations=100, early_stopping_round=10,
                                 num_leaves=10, min_data_in_leaf=10)
        return lgbm_lreg
    elif modelType == 'lightGBMBinaryClass':
        from pylightgbm.models import GBMClassifier
        lgbm_bc = GBMClassifier(metric='binary_error', min_data_in_leaf=1)
        return lgbm_bc
    # Clustering models
    elif modelType == 'KMeans':
        assert n_clusters, "Number of clusters argument mandatory"
        cluster_callable = KMeans
        # seed of 10 for reproducibility.
        clusterer = cluster_callable(n_clusters=n_clusters, random_state=10)
        return clusterer
    elif modelType == 'dbscan':
        if not n_clusters:
            logging.warning("Number of clusters irrelevant for cluster type : %s"
                            % (modelType))
        cluster_callable = DBSCAN
        clusterer = cluster_callable(eps=0.5)
        return clusterer
    elif modelType == 'affinity_prop':
        if not n_clusters:
            logging.warning("Number of clusters irrelevant for cluster type : %s"
                            % (modelType))
        clusterer = AffinityPropagation(damping=.9, preference=-200)
        return clusterer
    elif modelType == 'spectral':
        assert n_clusters, "Number of clusters argument mandatory"
        clusterer = SpectralClustering(n_clusters=n_clusters,
                                       eigen_solver='arpack',
                                       affinity="nearest_neighbors")
        return clusterer
    elif modelType == 'birch':
        if not n_clusters:
            logging.warning("Number of clusters irrelevant for cluster type : %s"
                            % (modelType))
        clusterer = Birch(n_clusters=2)
        return clusterer
    elif modelType == 'agglomerativeCluster':
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(dataframe, n_neighbors=10,
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            linkage='ward',
                                            connectivity=connectivity)
        return clusterer
    elif modelType == 'meanShift':
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(dataframe, quantile=0.3)
        clusterer = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        return clusterer
    elif modelType == 'gmm':
        from sklearn import mixture
        gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
        return gmm
    elif modelType == 'dgmm':
        from sklearn import mixture
        dgmm = mixture.BayesianGaussianMixture(n_components=5,
                                               covariance_type='full')
        return dgmm
    else:
        # raising a bare string is a syntax error in Python 3
        raise ValueError('Unknown model type: see utils.py for available')
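A minimal usage sketch for the factory above (assuming the module-level imports and globals it relies on are in place; X_train and y_train are hypothetical):

ransac = get_model_obj('RANSACRegression')
ransac.fit(X_train, y_train)
km = get_model_obj('KMeans', n_clusters=5)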
#enr_sts_scores = enr.predict(xt[:, np.newaxis])
enr.fit(x, y)
enr_sts_scores = enr.predict(xt)

# Passive Aggressive Regression
print('passive aggressive')
par = PassiveAggressiveRegressor()
par.fit(x, y)
par_sts_scores = par.predict(xt)
#par.fit(x[:, np.newaxis], y)
#par_sts_scores = par.predict(xt[:, np.newaxis])

# RANSAC Regression
print('ransac')
ransac = RANSACRegressor()
#ransac.fit(x[:, np.newaxis], y)
#ransac_sts_scores = ransac.predict(xt[:, np.newaxis])
ransac.fit(x, y)
ransac_sts_scores = ransac.predict(xt)

# Logistic Regression
print('logistic')
lgr = LogisticRegression()
#lgr.fit(x[:, np.newaxis], y)
#lgr_sts_scores = lgr.predict(xt[:, np.newaxis])
lgr.fit(x, y)
lgr_sts_scores = lgr.predict(xt)
def test_ransac_min_n_samples():
    base_estimator = LinearRegression()
    ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator2 = RANSACRegressor(base_estimator,
                                        min_samples=2. / X.shape[0],
                                        residual_threshold=5, random_state=0)
    ransac_estimator3 = RANSACRegressor(base_estimator, min_samples=-1,
                                        residual_threshold=5, random_state=0)
    ransac_estimator4 = RANSACRegressor(base_estimator, min_samples=5.2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator5 = RANSACRegressor(base_estimator, min_samples=2.0,
                                        residual_threshold=5, random_state=0)
    ransac_estimator6 = RANSACRegressor(base_estimator,
                                        residual_threshold=5, random_state=0)
    ransac_estimator7 = RANSACRegressor(base_estimator,
                                        min_samples=X.shape[0] + 1,
                                        residual_threshold=5, random_state=0)

    ransac_estimator1.fit(X, y)
    ransac_estimator2.fit(X, y)
    ransac_estimator5.fit(X, y)
    ransac_estimator6.fit(X, y)

    assert_equal(ransac_estimator1.predict(X), ransac_estimator2.predict(X))
    assert_equal(ransac_estimator1.predict(X), ransac_estimator5.predict(X))
    assert_equal(ransac_estimator1.predict(X), ransac_estimator6.predict(X))
    assert_raises(ValueError, ransac_estimator3.fit, X, y)
    assert_raises(ValueError, ransac_estimator4.fit, X, y)
    assert_raises(ValueError, ransac_estimator7.fit, X, y)
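Read together, the assertions pin down the min_samples contract. Informally, as inferred from this test rather than from any one documentation version:

# min_samples semantics exercised above:
#   * a float strictly between 0 and 1 is a fraction of the data
#     (2.0 / X.shape[0] selects the same 2 samples as min_samples=2)
#   * a value >= 1 is an absolute sample count and must be integral
#     (2.0 behaves like 2, while 5.2 raises ValueError)
#   * non-positive values and counts larger than len(X) raise ValueError
#   * omitting min_samples defaults to X.shape[1] + 1, which for this
#     one-feature X is again 2
ransac = RANSACRegressor(LinearRegression(), min_samples=0.5, random_state=0)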
m = 'meteor'
if m == 'asiya':
    x = np.loadtxt('x.asiya.train')
    y = np.loadtxt('y.asiya.train')
elif m == 'meteor':
    x = np.loadtxt('x.meteor.train')[:, np.newaxis]
    y = np.loadtxt('y.meteor.train')
    x_test = np.loadtxt('x.meteor.test')[:, np.newaxis]

regressors = {'lr': LinearRegression(),
              'br': BayesianRidge(compute_score=True),
              'enr': ElasticNet(),
              'par': PassiveAggressiveRegressor(),
              'ransac': RANSACRegressor(),
              'lgr': LogisticRegression(),
              'svr_rbf': SVR(kernel='rbf', C=1e3, gamma=0.1)}
              #'svr_lin': SVR(kernel='linear', C=1e3)}
              #'svr_poly': SVR(kernel='poly', C=1e3, degree=2)}

def build_regressors(num):
    rgs = regressors[num]
    rgs.fit(x, y)
    with open(num + '.' + m + '.pk', 'wb') as fid:
        pickle.dump(rgs, fid)

'''
x = x_test
lr = pickle.load(open("lr." + m + '.pk', 'rb'))
br = pickle.load(open("br." + m + '.pk', 'rb'))
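The commented-out tail hints at how the pickles are read back. The same pattern for the RANSAC model, as a sketch matching the '<name>.<m>.pk' naming used by build_regressors:

ransac = pickle.load(open('ransac.' + m + '.pk', 'rb'))
ransac_sts_scores = ransac.predict(x_test)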
class RansacClass:
    """
    Name : RANSACRegressor
    Attribute : None
    Method : predict, predict_by_cv, save_model
    """

    def __init__(self):
        # algorithm name
        self._name = 'ransac'
        # base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))
        # suppress warning messages
        warnings.filterwarnings('ignore')
        # load the raw data
        data = pd.read_csv(self._f_path +
                           "/regression/resource/regression_sample.csv",
                           sep=",", encoding="utf-8")
        # split into training and test partitions by year
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)
        # training data
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # test data
        self._x_test, self._y_test = self.preprocessing(data[self._y])
        # declare the model
        self._model = RANSACRegressor()
        # fit the model
        self._model.fit(self._x_train, self._y_train)

    # data preprocessing
    def preprocessing(self, data):
        # features
        x = []
        # labels
        y = []
        # window size (7 days)
        base_interval = 7
        # temperatures
        temps = list(data["temperature"])
        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])
            xa = []
            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # plain prediction
    def predict(self, save_img=False, show_chart=False):
        # predict
        y_pred = self._model.predict(self._x_test)
        # score
        score = r2_score(self._y_test, y_pred)
        # report; RANSACRegressor itself exposes no coef_/intercept_
        # (they live on estimator_), so this guard skips the printout
        if hasattr(self._model, 'coef_') and hasattr(self._model, 'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')
        print(f'Score = {score}')
        # optionally save the chart image
        if save_img:
            self.save_chart_image(y_pred, show_chart)
        # predictions & score
        return [list(y_pred), score]

    # CV prediction (Cross Validation)
    def predict_by_cv(self):
        # for regression algorithms, implement cross validation to suit
        # the actual project
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # save or refresh the model
    def save_model(self, renew=False):
        # save the model
        if not renew:
            # first save
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}_rg.pkl',
                    self._f_path +
                    f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')

    # save the regression chart
    def save_chart_image(self, data, show_chart):
        # figure size
        plt.figure(figsize=(15, 10), dpi=100)
        # true labels
        plt.plot(self._y_test, c='r')
        # predictions
        plt.plot(data, c='b')
        # save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')
        # show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del (self._x_train, self._x_test, self._y_train, self._y_test,
             self._x, self._y, self._model)
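A minimal driving sketch for the class above (hypothetical; the CSV path and layout must match what __init__ expects):

model = RansacClass()
y_pred, score = model.predict(save_img=False, show_chart=False)
print(score)
model.save_model()              # first save
model.save_model(renew=True)    # archive the old .pkl and write a fresh one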
def test_ransac_residual_metric():
    residual_metric1 = lambda dy: np.sum(np.abs(dy), axis=1)
    residual_metric2 = lambda dy: np.sum(dy ** 2, axis=1)
    yyy = np.column_stack([y, y, y])

    base_estimator = LinearRegression()
    ransac_estimator0 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        residual_metric=residual_metric1)
    ransac_estimator2 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        residual_metric=residual_metric2)

    # multi-dimensional
    ransac_estimator0.fit(X, yyy)
    ransac_estimator1.fit(X, yyy)
    ransac_estimator2.fit(X, yyy)
    assert_equal(ransac_estimator0.predict(X), ransac_estimator1.predict(X))
    assert_equal(ransac_estimator0.predict(X), ransac_estimator2.predict(X))

    # one-dimensional
    ransac_estimator0.fit(X, y)
    ransac_estimator2.fit(X, y)
    assert_equal(ransac_estimator0.predict(X), ransac_estimator2.predict(X))
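The residual_metric keyword exercised here belongs to older scikit-learn releases and was later removed; its closest modern replacement, to the best of my knowledge, is the loss parameter. A rough present-day equivalent of residual_metric1:

# sketch for newer scikit-learn; parameter names per current releases,
# not the version this test targets
ransac = RANSACRegressor(LinearRegression(), min_samples=2,
                         residual_threshold=5, random_state=0,
                         loss='absolute_error')  # 'absolute_loss' before 1.1
ransac.fit(X, y)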
z = np.linalg.inv(np.dot(Xb.T, Xb))
w = np.dot(z, np.dot(Xb.T, y))
print('Slope: %.3f' % w[1])
print('Intercept: %.3f' % w[0])

# Fitting a robust regression model using RANSAC
ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,
                         min_samples=50,
                         loss='absolute_loss',  # 'absolute_error' in scikit-learn >= 1.1
                         residual_threshold=5.0,
                         random_state=0)
ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])
plt.scatter(X[inlier_mask], y[inlier_mask],
            c='steelblue', edgecolor='white', marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
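The closed-form normal-equation fit prints its slope and intercept; for a like-for-like comparison, the robust line's parameters live on the estimator RANSAC fitted to the consensus set:

print('RANSAC slope: %.3f' % ransac.estimator_.coef_[0])
print('RANSAC intercept: %.3f' % ransac.estimator_.intercept_)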
def get_model_from_name(model_name, training_params=None):
    # For Keras
    epochs = 250
    if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
        print('Heard that this is the test suite. Limiting number of epochs, '
              'which will increase training speed dramatically at the expense '
              'of model accuracy')
        epochs = 30

    all_model_params = {
        'LogisticRegression': {'n_jobs': -2},
        'RandomForestClassifier': {'n_jobs': -2},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {'n_estimators': 10},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'presort': False},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {'n_estimators': 10},
        'XGBRegressor': {'nthread': -1, 'n_estimators': 200},
        'XGBClassifier': {'nthread': -1, 'n_estimators': 200},
        'LGBMRegressor': {},
        'LGBMClassifier': {},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2}
    }

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in
        # (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the '
              'final params that will be used to initialize the model:')
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'SGDClassifier': SGDClassifier(),
        'Perceptron': Perceptron(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),
        'SGDRegressor': SGDRegressor(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans()
    }

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()
    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()
    if keras_installed:
        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print('It appears you are trying to use a library that is not '
              'available when we try to import it, or using a value for '
              'model_names that we do not recognize')
        raise (e)

    model_with_params = model_without_params.set_params(**model_params)
    return model_with_params
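A usage sketch for the factory above (model names as keyed in model_map; the override dict is illustrative):

model = get_model_from_name('RANSACRegressor')
model = get_model_from_name('RandomForestRegressor',
                            training_params={'n_estimators': 500})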