def MODWT_MARS_TRAIN(series, regressors=4, delay=1, N=2000):
    """Fit an AdaBoost-boosted Earth model on lagged windows of ``series``.

    The last ``N`` observations are kept and the first 500 of those are
    discarded.  Every remaining position is turned into a row of
    ``regressors`` lagged values spaced ``delay`` steps apart; the label is
    the value ``delay`` steps after the newest lag.  The first 70% of the
    rows (by position) form the training set.

    Parameters
    ----------
    series : array-like
        Observations; flattened to one dimension before use.
    regressors : int
        Number of lagged values per row.
    delay : int
        Spacing between lags, also the prediction horizon.
    N : int
        Number of trailing observations taken from ``series``.

    Returns
    -------
    numpy.ndarray
        Predictions of the boosted model on its own training rows.

    Raises
    ------
    ValueError
        If ``series`` holds fewer than ``N`` values, or the window is too
        short to build a single row.
    """
    burn_in = 500  # leading samples of the N-sample window that are dropped
    D = regressors
    T = delay

    values = np.asarray(series, dtype=float).reshape(-1)
    if values.size < N:
        # A negative slice start would silently select the wrong samples.
        raise ValueError(
            "series has %d values but N=%d were requested" % (values.size, N))
    values = values[values.size - N:][burn_in:]

    n_rows = N - burn_in - D * T
    if n_rows <= 0:
        raise ValueError("N is too small for the requested regressors/delay")

    # Column k holds the value k*T steps after the oldest lag of each row,
    # so the window width follows `regressors` instead of being fixed at 4.
    data = np.column_stack(
        [values[k * T:k * T + n_rows] for k in range(D)])
    lbls = values[D * T:D * T + n_rows]

    n_train = lbls.size - int(round(lbls.size * 0.3))
    trnData = data[:n_train, :]
    trnLbls = lbls[:n_train]

    mars = Earth()
    mars.fit(trnData, trnLbls)
    boosted_mars = AdaBoostRegressor(base_estimator=mars,
                                     n_estimators=25,
                                     learning_rate=0.01,
                                     loss='exponential')
    boosted_mars.fit(trnData, trnLbls)
    preds = boosted_mars.predict(trnData)
    return preds
def test_pathological_cases():
    """Compare Earth summaries for the stored pathological data sets.

    For each case, ``<case>.csv`` in the ``pathological_data`` directory next
    to this file is fitted and ``model.summary()`` is compared with the text
    stored in ``<case>.txt``.
    """
    import pandas
    directory = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'pathological_data')
    cases = {'issue_44': {},
             'issue_50': {'penalty': 0.5,
                          'minspan': 1,
                          'allow_linear': False,
                          'endspan': 1,
                          'check_every': 1,
                          'sample_weight': 'issue_50_weight.csv'}}
    # dict.items() works on Python 2 and 3; dict.iteritems() is Python 2 only.
    for case, settings in cases.items():
        data = pandas.read_csv(os.path.join(directory, case + '.csv'))
        y = data['y']
        del data['y']
        X = data
        # 'sample_weight' names a weight file, so it is removed from the
        # settings before they are passed to the Earth constructor.
        weight_file = settings.pop('sample_weight', None)
        if weight_file is None:
            sample_weight = None
        else:
            filename = os.path.join(directory, weight_file)
            sample_weight = pandas.read_csv(filename)['sample_weight']
        model = Earth(**settings)
        model.fit(X, y, sample_weight=sample_weight)
        with open(os.path.join(directory, case + '.txt'), 'r') as infile:
            correct = infile.read()
        assert_equal(model.summary(), correct)
def fit_mars(self, X_test):
    """Fit an Earth model on ``self.X``/``self.y`` and save predictions for ``X_test``.

    The predictions are written to ``results/results_mars.csv`` in a single
    ``SalePrice`` column indexed by ``X_test.index``.  Nothing is returned.
    """
    train_features = self.X.copy().values
    train_target = self.y.copy().values.flatten()

    regressor = Earth(max_terms=1000, max_degree=1, penalty=3)
    regressor.fit(train_features, train_target)

    predictions = pd.DataFrame(
        data=regressor.predict(X_test.copy().values),
        index=X_test.index,
        columns=['SalePrice'],
    )
    predictions.to_csv('results/results_mars.csv', sep=',')
def test_pathological_cases():
    """Regression test: Earth summaries for known problematic data sets.

    Each case reads ``<case>.csv`` from ``pathological_data`` (next to this
    file), fits an Earth model with the case settings and checks that
    ``model.summary()`` equals the contents of ``<case>.txt``.
    """
    import pandas
    directory = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'pathological_data')
    cases = {
        'issue_44': {},
        'issue_50': {
            'penalty': 0.5,
            'minspan': 1,
            'allow_linear': False,
            'endspan': 1,
            'check_every': 1,
            'sample_weight': 'issue_50_weight.csv',
        },
    }
    # Use items(): iteritems() does not exist on Python 3 dictionaries.
    for case, settings in cases.items():
        data = pandas.read_csv(os.path.join(directory, case + '.csv'))
        y = data['y']
        del data['y']
        X = data

        sample_weight = None
        if 'sample_weight' in settings:
            # The entry is a file name, not an Earth constructor argument.
            filename = os.path.join(directory, settings.pop('sample_weight'))
            sample_weight = pandas.read_csv(filename)['sample_weight']

        model = Earth(**settings)
        model.fit(X, y, sample_weight=sample_weight)
        with open(os.path.join(directory, case + '.txt'), 'r') as infile:
            correct = infile.read()
        assert_equal(model.summary(), correct)
class MARS:
    """Bundle an Earth model with fixed training and test splits."""

    def __init__(self, x_train, y_train, x_test, y_test):
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        self.classifier = None

    def fit(self):
        """Create a default Earth model and fit it on the training split."""
        self.classifier = Earth()
        self.classifier.fit(self.x_train, self.y_train)

    def predict(self):
        """Return the model's predictions for the stored test inputs."""
        return self.classifier.predict(self.x_test)

    def dichotomize(self, predictions):
        """Map each prediction to 1 (at or above the median) or -1 (below it)."""
        threshold = np.median(predictions)
        signs = []
        for value in predictions:
            signs.append(1 if value >= threshold else -1)
        return np.array(signs)

    def evaluate(self):
        """Return the fraction of test labels that differ from the dichotomized predictions."""
        labels = self.dichotomize(self.predict())
        mistakes = sum(
            (1 for guess, truth in zip(labels, self.y_test) if guess != truth),
            0.0,
        )
        return mistakes / len(self.y_test)
def marsAccuracy(self):
    """Fit Earth on the first 200 rows of ``self.df`` and return the validation RMSE.

    Side effects: ``self.df`` is re-indexed by its ``Date`` column, and
    ``self.train``, ``self.valid`` and ``self.preds`` are (re)assigned.
    """
    # Index the frame by its date values
    self.df.index = self.df['Date']

    split_at = 200
    self.train = self.df[:split_at]
    self.valid = self.df[split_at:]

    # Separate the target column from the predictors
    target = 'Close'
    x_train = self.train.drop(target, axis=1)
    y_train = self.train[target]
    x_valid = self.valid.drop(target, axis=1)
    y_valid = self.valid[target]

    x_train = timeToFloat(x_train)
    x_valid = timeToFloat(x_valid)

    # Fit on the training rows, predict the validation rows
    model = Earth()
    model.fit(x_train, y_train)
    self.preds = model.predict(x_valid)

    # Root mean squared error on the validation rows
    return np.sqrt(mean_squared_error(y_valid, self.preds))
def estimate_reward(self, z_train, y_train, z):
    """Fit a degree-2 Earth model on ``(z_train, y_train)`` and predict for ``z``.

    ``z`` is wrapped in a list, so it is handed to ``predict`` as a single
    row.  The return value is whatever ``predict`` returns for that row.
    """
    mars_model = Earth(max_degree=2)
    mars_model.fit(z_train, y_train)
    return mars_model.predict([z])
def model_based_divergence(X, y, model_2):
    """Return the cosine between two feature-importance vectors.

    An Earth model with GCV-based importances is fitted on ``(X, y)``; its
    ``feature_importances_`` are compared with ``model_2.feature_importances_``
    via dot product divided by the product of the two Euclidean norms.
    """
    reference = Earth(feature_importance_type='gcv')
    reference.fit(X, y)

    own_importance = reference.feature_importances_
    other_importance = model_2.feature_importances_

    own_norm = np.linalg.norm(own_importance)
    other_norm = np.linalg.norm(other_importance)
    return np.dot(own_importance, other_importance) / (own_norm * other_norm)
def marsFit(x, y):
    """Fit a degree-1 Earth model.

    Returns a 4-tuple: the predictions for ``x``, the fitted model,
    ``range(len(x))`` and a function that forwards its argument to
    ``model.predict``.
    """
    model = Earth(max_degree=1)
    model.fit(x, y)

    def f(x):
        return model.predict(x)

    fitted = model.predict(x)
    positions = range(len(x))
    return fitted, model, positions, f
def test_fit():
    """The trace and summary of the default fit must equal the stored reference text."""
    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_regress.txt')
    model = Earth(**default_params)
    model.fit(X, y)
    actual = str(model.trace()) + '\n' + model.summary()
    with open(reference_path, 'r') as handle:
        expected = handle.read()
    assert_equal(actual, expected)
def test_smooth():
    """The trace and summary of a smooth fit must equal the stored reference text."""
    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_regress_smooth.txt')
    smooth_model = Earth(penalty=1, smooth=True)
    smooth_model.fit(X, y)
    actual = str(smooth_model.trace()) + '\n' + smooth_model.summary()
    with open(reference_path, 'r') as handle:
        expected = handle.read()
    assert_equal(actual, expected)
def run_pyearth(X, y, **kwargs):
    '''Fit pyearth's Earth on (X, y).

    Returns the predictions for X, the wall-clock duration of ``fit`` in
    seconds, and the number of forward pass iterations (length of the
    forward trace minus one).'''
    model = Earth(**kwargs)

    started = time.time()
    model.fit(X, y)
    elapsed = time.time() - started

    predictions = model.predict(X)
    n_forward = len(model.forward_trace()) - 1
    return predictions, elapsed, n_forward
def HHT_MARS_TEST(series, regressors=4, delay=1, N=2000):
    """Forecast with boosted and bagged Earth models stacked under XGBoost.

    The last ``N`` observations of ``series`` are kept and the first 500 of
    those are discarded.  Rows of ``regressors`` lagged values (spaced
    ``delay`` apart) are built with the value ``delay`` steps ahead as label.
    The first 70% of the rows train an AdaBoost and a Bagging ensemble of
    Earth models; their predictions on the remaining rows, plus one extra
    row built from the newest lags, are stacked and fed to an XGBoost
    regressor.

    Parameters
    ----------
    series : array-like
        Observations; flattened to one dimension before use.
    regressors : int
        Number of lagged values per row.
    delay : int
        Spacing between lags, also the prediction horizon.
    N : int
        Number of trailing observations taken from ``series``.

    Returns
    -------
    numpy.ndarray
        XGBoost predictions for every hold-out row followed by the
        prediction for the extra newest-lags row.

    Raises
    ------
    ValueError
        If ``series`` holds fewer than ``N`` values, or the window is too
        short to build a single row.
    """
    burn_in = 500  # leading samples of the N-sample window that are dropped
    D = regressors
    T = delay

    values = np.asarray(series, dtype=float).reshape(-1)
    if values.size < N:
        # A negative slice start would silently select the wrong samples.
        raise ValueError(
            "series has %d values but N=%d were requested" % (values.size, N))
    # Honour N here; a fixed 2000 disagrees with the array sizes below
    # whenever N differs from 2000.
    values = values[values.size - N:][burn_in:]

    n_rows = N - burn_in - D * T
    if n_rows <= 0:
        raise ValueError("N is too small for the requested regressors/delay")

    # Column k holds the value k*T steps after the oldest lag of each row.
    data = np.column_stack(
        [values[k * T:k * T + n_rows] for k in range(D)])
    lbls = values[D * T:D * T + n_rows]

    n_train = lbls.size - int(round(lbls.size * 0.3))
    trnData, trnLbls = data[:n_train, :], lbls[:n_train]
    chkData, chkLbls = data[n_train:, :], lbls[n_train:]

    # Extra feature row made of the newest lags, i.e. the row that follows
    # the last labelled one.  With regressors=4 and delay=1 this equals the
    # last four hold-out labels.
    next_row = values[n_rows + np.arange(D) * T].reshape(1, -1)
    chkData = np.append(chkData, next_row, axis=0)

    mars = Earth()
    mars.fit(trnData, trnLbls)
    boosted_mars = AdaBoostRegressor(base_estimator=mars,
                                     n_estimators=25,
                                     learning_rate=0.1,
                                     loss='exponential')
    bag = BaggingRegressor(base_estimator=mars, n_estimators=25)
    bag.fit(trnData, trnLbls)
    boosted_mars.fit(trnData, trnLbls)
    pred2 = bag.predict(chkData)
    oos_preds = boosted_mars.predict(chkData)

    # One column per ensemble; the last row belongs to the unlabelled extra
    # row and is therefore excluded from the XGBoost fit.
    stack_predict = np.vstack([oos_preds, pred2]).T
    params_xgd = {
        'max_depth': 7,
        'objective': 'reg:linear',
        'learning_rate': 0.05,
        'n_estimators': 10000
    }
    clf = xgb.XGBRegressor(**params_xgd)
    clf.fit(stack_predict[:-1, :],
            chkLbls,
            eval_set=[(stack_predict[:-1, :], chkLbls)],
            eval_metric='rmse',
            early_stopping_rounds=20,
            verbose=False)
    xgb_pred = clf.predict(stack_predict)
    return xgb_pred
def test_exhaustive_search():
    """With pruning off, basis size must agree with coefficient and transform widths."""
    settings = dict(max_terms=13,
                    enable_pruning=False,
                    check_every=1,
                    thresh=0,
                    minspan=1,
                    endspan=1)
    model = Earth(**settings)
    model.fit(X, y)

    n_coefficients = model.coef_.shape[1]
    assert_equal(model.basis_.plen(), n_coefficients)

    n_transformed_columns = model.transform(X).shape[1]
    assert_equal(n_transformed_columns, len(model.basis_))
def test_xlabels():
    """Two labels for five columns must raise; three labels for three columns must fit."""
    mismatched = Earth(**default_params)
    assert_raises(ValueError, mismatched.fit, X[:, 0:5], y,
                  xlabels=['var1', 'var2'])

    # The matching case is exercised twice, each time on a fresh model.
    for _ in range(2):
        model = Earth(**default_params)
        model.fit(X[:, 0:3], y, xlabels=['var1', 'var2', 'var3'])
def test_nb_terms():
    """The basis may exceed ``max_terms`` by at most two; coefficients are bounded by the basis."""
    for max_terms in (1, 3, 12, 13):
        model = Earth(max_terms=max_terms)
        model.fit(X, y)

        n_basis = len(model.basis_)
        assert_true(n_basis <= max_terms + 2)

        n_coef = len(model.coef_)
        assert_true(n_coef <= n_basis)
        assert_true(n_coef >= 1)

        if max_terms == 1:
            # A single term must predict the mean of y everywhere.
            assert_list_almost_equal_value(model.predict(X), y.mean())
def test_fit():
    """R-squared of the default fit must stay within 0.01 of the stored value."""
    model = Earth(**default_params)
    model.fit(X, y)
    current = str(model.rsq_)

    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_regress.txt')
    # To regenerate the reference, write `current` to `reference_path`.
    with open(reference_path, 'r') as handle:
        stored = handle.read()

    difference = abs(float(current) - float(stored))
    assert_true(difference < .01)
def test_nb_terms():
    """The basis never exceeds ``max_terms``; coefficients are bounded by the basis."""
    for limit in (1, 3, 12, 13):
        fitted = Earth(max_terms=limit)
        fitted.fit(X, y)

        basis_size = len(fitted.basis_)
        coef_size = len(fitted.coef_)
        assert_true(basis_size <= limit)
        assert_true(coef_size <= basis_size)
        assert_true(coef_size >= 1)

        if limit == 1:
            # A single term must predict the mean of y everywhere.
            assert_list_almost_equal_value(fitted.predict(X), y.mean())
def test_smooth():
    """R-squared of the smooth fit must stay within 0.01 of the stored value."""
    smooth_model = Earth(penalty=1, smooth=True)
    smooth_model.fit(X, y)
    current = str(smooth_model.rsq_)

    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_regress_smooth.txt')
    # To regenerate the reference, write `current` to `reference_path`.
    with open(reference_path, 'r') as handle:
        stored = handle.read()

    difference = abs(float(current) - float(stored))
    assert_true(difference < .01)
def test_smooth():
    """R-squared of the smooth fit must stay within 0.05 of the stored value."""
    here = os.path.dirname(__file__)
    reference_path = os.path.join(here, 'earth_regress_smooth.txt')

    smooth_model = Earth(penalty=1, smooth=True)
    smooth_model.fit(X, y)
    current = str(smooth_model.rsq_)

    # To regenerate the reference, write `current` to `reference_path`.
    with open(reference_path, 'r') as handle:
        stored = handle.read()
    assert_true(abs(float(current) - float(stored)) < .05)
def test_fit():
    """R-squared of the default fit must stay within 0.05 of the stored value."""
    here = os.path.dirname(__file__)
    reference_path = os.path.join(here, 'earth_regress.txt')

    model = Earth(**default_params)
    model.fit(X, y)
    current = str(model.rsq_)

    # To regenerate the reference, write `current` to `reference_path`.
    with open(reference_path, 'r') as handle:
        stored = handle.read()
    assert_true(abs(float(current) - float(stored)) < .05)
def MARS(self, X=None, Y=None):
    """Fit a Multivariate Adaptive Regression Splines model and pickle it.

    When both ``X`` and ``Y`` are given they replace ``self.sampled_X`` and
    ``self.sampled_Y``; otherwise the values already stored on the instance
    are used.  The fitted model is written to
    ``./Model/ModelTransfer/MARS_<self.mode>.sav``.  Nothing is returned.
    """
    from pyearth import Earth

    if X is not None and Y is not None:
        self.sampled_X, self.sampled_Y = X, Y

    # Fit once, after any caller-supplied data has replaced the stored samples.
    rgr = Earth()
    rgr.fit(self.sampled_X, self.sampled_Y)

    filename = './Model/ModelTransfer/MARS_' + self.mode + '.sav'
    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as handle:
        pickle.dump(rgr, handle)
def test_fit():
    """R-squared of the default fit must stay within 0.05 of the stored value.

    When ``regenerate_target_files`` is truthy the reference file is
    rewritten with the current value before it is read back.
    """
    numpy.random.seed(0)
    model = Earth(**default_params)
    model.fit(X, y)
    current = str(model.rsq_)

    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_regress.txt')
    if regenerate_target_files:
        with open(reference_path, 'w') as handle:
            handle.write(current)
    with open(reference_path, 'r') as handle:
        stored = handle.read()

    assert_true(abs(float(current) - float(stored)) < .05)
def test_linvars():
    """Trace and summary of a fit with variables 0-9 as ``linvars`` must equal the stored text."""
    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_linvars_regress.txt')
    model = Earth(**default_params)
    model.fit(X, y, linvars=list(range(10)))
    actual = str(model.trace()) + '\n' + model.summary()

    # To regenerate the reference, write `actual` to `reference_path`.
    with open(reference_path, 'r') as handle:
        expected = handle.read()
    assert_equal(actual, expected)
def runModel(i, featureCombo):
    """Cross-validate a default Earth model on ``featureCombo`` and log the mean MAE.

    For every ``(train, test)`` index pair in ``kf`` a model is fitted on the
    training rows of ``trainCleaned`` (target column ``Expected``) and scored
    with ``getMAE`` on the test rows.  ``i`` only appears in the log messages.
    """
    logging.warning('try alpha = %s' % i)

    fold_errors = []
    for ktrain, ktest in kf:
        train_rows = trainCleaned.iloc[ktrain, ]
        test_rows = trainCleaned.iloc[ktest, ]

        model = Earth()
        model.fit(train_rows[featureCombo], train_rows['Expected'])
        pred = model.predict(test_rows[featureCombo])
        fold_errors.append(getMAE(pred, test_rows['Expected']))

    mae = np.append(np.array([]), fold_errors)
    logging.warning('average 10-fold MAE for alpha %s feature %s' % (i, featureCombo))
    logging.warning(mae.mean())
def test_fast():
    """The fast forward pass (``fast_K=10``, ``fast_h=1``) must give the same summary as the normal one."""
    shared = dict(max_terms=10, max_degree=5, **default_params)

    regular = Earth(**shared)
    regular.fit(X, y)
    normal_summary = regular.summary()

    accelerated = Earth(use_fast=True, fast_K=10, fast_h=1, **shared)
    accelerated.fit(X, y)
    fast_summary = accelerated.summary()

    assert_equal(normal_summary, fast_summary)
def test_smooth():
    """R-squared of the smooth fit must stay within 0.05 of the stored value.

    When ``regenerate_target_files`` is truthy the reference file is
    rewritten with the current value before it is read back.
    """
    numpy.random.seed(0)
    smooth_model = Earth(penalty=1, smooth=True)
    smooth_model.fit(X, y)
    current = str(smooth_model.rsq_)

    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_regress_smooth.txt')
    if regenerate_target_files:
        with open(reference_path, 'w') as handle:
            handle.write(current)
    with open(reference_path, 'r') as handle:
        stored = handle.read()

    assert_true(abs(float(current) - float(stored)) < .05)
def test_nb_degrees():
    """Every basis function's degree must lie between 0 and ``max_degree``."""
    fixed_settings = dict(max_terms=10,
                          enable_pruning=False,
                          check_every=1,
                          thresh=0,
                          minspan=1,
                          endspan=1)
    for max_degree in (1, 2, 12, 13):
        model = Earth(max_degree=max_degree, **fixed_settings)
        model.fit(X, y)
        for basis_function in model.basis_:
            assert_true(basis_function.degree() >= 0)
            assert_true(basis_function.degree() <= max_degree)
def test_linvars():
    """``str(rsq_)`` of a fit with variables 0-9 as ``linvars`` must equal the stored text.

    When ``regenerate_target_files`` is truthy the reference file is
    rewritten with the current value before it is read back.
    """
    model = Earth(**default_params)
    model.fit(X, y, linvars=list(range(10)))
    current = str(model.rsq_)

    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_linvars_regress.txt')
    if regenerate_target_files:
        with open(reference_path, 'w') as handle:
            handle.write(current)
    with open(reference_path, 'r') as handle:
        stored = handle.read()

    assert_equal(current, stored)
def mars(p, xLabels, yLabel):
    """Fit an Earth model on a shuffled 80/20 split and plot its diagnostics.

    Rows of ``p`` are shuffled, the last 20% (rounded) become the test set,
    and an Earth model reporting the ``rss``, ``gcv`` and ``nb_subsets``
    feature importances is fitted on the rest.  Residuals are plotted via
    ``plotResiduals``, the model trace and summary are printed, and one bar
    chart per importance criterion is saved to
    ``image_path.format(image_num)``; the global ``image_num`` is
    incremented after every saved chart.

    Parameters
    ----------
    p : pandas.DataFrame
        Data containing the predictor columns and the response column.
    xLabels : list
        Names of the predictor columns; each must be a key of
        ``featureToLabel``.
    yLabel : str
        Name of the response column.

    Returns
    -------
    tuple
        ``(r2, mse)`` computed on the test rows.
    """
    global image_num
    criteria = ('rss', 'gcv', 'nb_subsets')

    # Randomly shuffle rows
    p = p.sample(frac=1).reset_index(drop=True)

    # Split train and test.  A non-negative split index keeps the training
    # set intact when the rounded test size is 0 (a slice end of -0 would
    # empty it).
    n_rows = p.shape[0]
    split = n_rows - int(round(n_rows * 0.2))
    n = len(xLabels)
    xCol = p[xLabels].values.reshape(-1, n)
    X_train = xCol[:split]
    X_test = xCol[split:]
    y_train = p[yLabel][:split].values.reshape(-1, 1)
    y_test = p[yLabel][split:].values.reshape(-1, 1)

    # Fit MARS model
    model = Earth(feature_importance_type=criteria)
    model.fit(X_train, y_train)

    # Make predictions
    predicted = model.predict(X_test)
    r2 = r2_score(y_test, predicted)
    mse = mean_squared_error(y_test, predicted)
    predicted = predicted.reshape(-1, 1)

    # Plot residuals
    plotResiduals(y_test, predicted)

    # Print summary
    print(model.trace())
    print(model.summary())

    # Plot feature importances, most important first
    importances = model.feature_importances_
    x = list(range(len(xLabels)))
    for crit in criteria:
        ranked = sorted(zip(importances[crit], xLabels), reverse=True)
        coeff = [abs(importance) for importance, _ in ranked]
        feature = [featureToLabel[name] for _, name in ranked]

        plt.clf()
        plt.xticks(x, feature, rotation='vertical')
        plt.bar(x, coeff, align='center', alpha=0.5)
        plt.xlabel('Features')
        plt.ylabel("Importance (" + crit + ")")
        plt.tight_layout()
        # Save before showing: once show() returns, the figure may already
        # have been released, so a later savefig() can write an empty image.
        plt.savefig(image_path.format(image_num), bbox_inches='tight')
        plt.show()
        image_num += 1
    return r2, mse
def test_missing_data():
    """With about 5% of entries blanked, the score must stay within 0.03 of the stored value."""
    model = Earth(allow_missing=True, **default_params)

    # Blank out entries chosen by independent Bernoulli(0.05) draws.
    missing_mask = numpy.random.binomial(1, .05, X.shape).astype(bool)
    X_with_gaps = X.copy()
    X_with_gaps[missing_mask] = None

    model.fit(X_with_gaps, y)
    current = str(model.score(X_with_gaps, y))

    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_regress_missing_data.txt')
    # To regenerate the reference, write `current` to `reference_path`.
    with open(reference_path, 'r') as handle:
        stored = handle.read()
    assert_true(abs(float(current) - float(stored)) < .03)
def test_linear_fit():
    """``linear_fit`` coefficients must match statsmodels OLS (unweighted) and GLS (weighted)."""
    from statsmodels.regression.linear_model import GLS, OLS

    def mean_squared_gap(coefficients, reference):
        return numpy.mean((coefficients - reference) ** 2)

    earth = Earth(**default_params)

    # Unweighted case against ordinary least squares
    earth.fit(X, y)
    earth.linear_fit(X, y)
    ols_params = OLS(y, earth.transform(X)).fit().params
    assert_almost_equal(mean_squared_gap(earth.coef_, ols_params), 0.0)

    # Weighted case against generalised least squares
    sample_weight = 1.0 / (numpy.random.normal(size=y.shape) ** 2)
    earth.fit(X, y)
    earth.linear_fit(X, y, sample_weight)
    gls_params = GLS(y, earth.transform(X), 1.0 / sample_weight).fit().params
    assert_almost_equal(mean_squared_gap(earth.coef_, gls_params), 0.0)
class Diagnostics:
    """Fit an ``EarthModel`` on selected columns of ``env.X`` and expose diagnostics.

    ``features`` is converted with ``astype(bool)`` and used as a column
    mask on ``env.X``; extra positional and keyword arguments are forwarded
    to ``EarthModel``.  The model is fitted during construction.
    """

    def __init__(self, env, features, *args, **kwargs):
        self.env = env
        self.solution = features
        column_mask = features.astype(bool)
        self.data = env.X.loc[:, column_mask].copy()
        self.y = self.env.y
        self.model = EarthModel(*args, **kwargs)
        self.y_pred = None
        self.error = None
        self._fit()

    def _fit(self):
        """Fit the model and store in-sample predictions and residuals."""
        self.model.fit(self.data, self.y)
        self.y_pred = self.model.predict(self.data)
        predicted = self.y_pred.flatten()
        observed = self.env.y.flatten()
        self.error = (predicted - observed)

    def summary(self):
        """Return ``model_summary`` for the fitted model, sorted by ``feature``."""
        table = model_summary(self.model, self.data.columns)
        return table.sort_values("feature")

    def plot_thresholds(self):
        """Delegate to the module-level ``plot_thresholds`` with the summary and the data."""
        return plot_thresholds(self.summary(), self.data)

    def plot_autocorrelations(self):
        """Draw partial and full autocorrelation plots of the residuals."""
        from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
        plot_pacf(self.error)
        plot_acf(self.error)

    def plot_qq(self):
        """Draw a probability plot of the residuals and print the squared fit correlation."""
        _, ax = plt.subplots()
        _, (_, _, r_norm) = scipy.stats.probplot(self.error, plot=ax, fit=True)
        print("R squared {:.4f}".format(r_norm**2))

    def plot_pred(self):
        """Plot predicted against true values with hvplot, titled with ``env.target``."""
        columns = {"predicted": self.y_pred, "True value": self.y}
        frame = pd.DataFrame(columns, index=self.data.index)
        title = "Model prediction for {}".format(self.env.target)
        return frame.hvplot().opts(title=title)

    def score(self):
        """Return a one-line string with the model's MSE, GCV, RSQ and GRSQ."""
        stats = (self.model.mse_, self.model.gcv_,
                 self.model.rsq_, self.model.grsq_)
        return "MSE: {:.4f}, GCV: {:.4f}, RSQ:{:.4f}, GRSQ: {:.4f}".format(*stats)
def marsmodelorr(self, use_smY=True, slope_trunc=0.00001, savgol_window=151, savgol_order=3, ex_order=51):
    """Fit a MARS model to ``(self.X_, self.Y_)`` and derive figures of merit.

    The results are collected in a dict that is stored as ``self.fom``;
    nothing is returned.

    Parameters
    ----------
    use_smY : bool
        Fit on the ``savgol``-smoothed ``Y`` when true, on the raw ``Y``
        otherwise.
    slope_trunc : float
        Minimum change between consecutive normalised slopes for a point to
        be treated as a segment boundary.
    savgol_window, savgol_order
        Passed as second and third argument to every ``savgol`` call
        (presumably window length and polynomial order -- confirm against
        the ``savgol`` helper).
    ex_order : int
        ``order`` argument for ``argrelextrema``.
    """
    Xf, Yf = self.Xf_, self.Yf_
    X, Y = self.X_, self.Y_
    fom = {}
    # smooth the data
    smY = savgol(Y, savgol_window, savgol_order)
    # perform mars
    model = MARS()
    if use_smY:
        model.fit(X, smY)
    else:
        model.fit(X, Y)
    Y_h = model.predict(X)
    ''' calculate dydx based on mars model to get knots and intercepts as this is complicated to extract from hinge functions '''
    diff1 = np.diff(Y_h) / np.diff(X)
    # rescale the slopes to [0, 1] so that slope_trunc is a relative threshold
    tdiff1 = diff1 - np.nanmin(diff1)
    tdiff1 = tdiff1 / np.nanmax(tdiff1)
    # calculate slopes of linear segments: ID holds the indices where the
    # normalised slope changes by more than slope_trunc, framed by index 0
    # and the index of the largest X
    ID = [i for i in range(1, len(tdiff1)) if np.abs(tdiff1[i] - tdiff1[i - 1]) > slope_trunc]
    ID.insert(0, 0)
    ID.append(np.argmax(X)) # this might cause an error
    # mean slope between consecutive boundaries; the range stops at
    # len(ID) - 1, so the span ending at the last boundary is not used
    slopes = [np.nanmean(diff1[ID[i - 1]:ID[i]]) for i in range(1, len(ID) - 1)]
    # y-axis intercept of each segment line, taken at the segment's first point
    a = [Y_h[ID[i]] - slopes[i] * X[ID[i]] for i in range(len(ID) - 2)]
    # IDM: steepest segment, IDm: segment whose slope is closest to zero
    IDM, IDm = np.argmax(slopes), np.argmin(np.abs(slopes))
    # intercept of highest slope and zero as well as highest slope and lowest slope
    fom['zinter'] = -a[IDM] / slopes[IDM]
    fom['lminter'] = (a[IDM] - a[IDm]) / (slopes[IDm] - slopes[IDM])
    fom['max_slope'] = slopes[IDM]
    fom['curr_lminter_model'] = fom['lminter'] * slopes[IDM] + a[IDM]
    # mean of the raw Y over all points whose X lies within 0.5 of 'lminter'
    fom['curr_lminter_data'] = np.mean(Y[np.where(np.abs(X - fom['lminter']) < 0.5)[0]])
    # calculate how the CV curves might look like without the 'ORR part'
    srYs = smY - model.predict(X)
    srYf = savgol(Yf - model.predict(Xf), savgol_window, savgol_order)
    # calculate their derivative
    # NOTE(review): dsrYf is not read anywhere below in this function
    dsrYf = savgol(np.diff(srYf) / np.diff(Xf), savgol_window, savgol_order)
    # find the extrema for extraction of redox pots
    # NOTE(review): the extrema are searched in srYf itself, not in dsrYf -- confirm intent
    redID_f = argrelextrema(srYf, np.less, order=ex_order)
    oxID_f = argrelextrema(srYf, np.greater, order=ex_order)
    # calc some more foms like position of redox waves
    # (the '*_var' entries hold np.nanstd, i.e. a standard deviation)
    fom['redpot_f'], fom['redpot_f_var'] = np.nanmean(Xf[redID_f]), np.nanstd(Xf[redID_f])
    fom['oxpot_f'], fom['oxpot_f_var'] = np.nanmean(Xf[oxID_f]), np.nanstd(Xf[oxID_f])
    # keep the raw and derived curves alongside the scalar figures of merit
    fom['X'], fom['Xf'] = X, Xf
    fom['srYs'], fom['srYf'], fom['smY'] = srYs, srYf, smY
    fom['Y'], fom['Yf'], fom['Y_h'] = Y, Yf, Y_h
    # sum of squared differences between the MARS prediction and the raw Y
    fom['noise_lvl'] = np.sum((Y_h - Y) ** 2, axis=0)
    self.fom = fom
def test_linear_fit():
    """The private linear fit must match statsmodels OLS (unweighted) and GLS (weighted)."""
    from statsmodels.regression.linear_model import GLS, OLS

    def mean_squared_gap(coefficients, reference):
        return numpy.mean((coefficients - reference) ** 2)

    earth = Earth(**default_params)

    # Unweighted case against ordinary least squares
    earth.fit(X, y)
    earth._Earth__linear_fit(X, y)
    ols_params = OLS(y, earth.transform(X)).fit().params
    assert_almost_equal(mean_squared_gap(earth.coef_, ols_params), 0.0)

    # Weighted case against generalised least squares
    sample_weight = 1.0 / (numpy.random.normal(size=y.shape) ** 2)
    earth.fit(X, y)
    earth._Earth__linear_fit(X, y, sample_weight)
    gls_params = GLS(y, earth.transform(X), 1.0 / sample_weight).fit().params
    assert_almost_equal(mean_squared_gap(earth.coef_, gls_params), 0.0)
def calculate_earth_error(X, y, *args, **kwargs):
    """Fit an ``EarthModel`` and return ``(residuals, model, features)``.

    Extra arguments are forwarded to the ``EarthModel`` constructor.  The
    value returned by ``fit`` is used as the model.  Residuals are the
    flattened predictions minus the flattened ``y``; ``features`` comes from
    ``get_signifficant_features(model)``.
    """
    model = EarthModel(*args, **kwargs).fit(X, y)
    residuals = model.predict(X).flatten() - y.flatten()
    significant = get_signifficant_features(model)
    return residuals, model, significant
def test_sparse():
    """Sparse matrices must be rejected with ``TypeError`` wherever data is accepted."""
    X_sparse = csr_matrix(X)

    # Sparse X at fit time
    unfitted = Earth(**default_params)
    assert_raises(TypeError, unfitted.fit, X_sparse, y)

    # Sparse X on every method of a fitted model
    fitted = Earth(**default_params)
    fitted.fit(X, y)
    for method in (fitted.predict, fitted.predict_deriv,
                   fitted.transform, fitted.score):
        assert_raises(TypeError, method, X_sparse)

    # Sparse sample weights at fit time
    weighted = Earth(**default_params)
    sparse_weight = csr_matrix([1.] * X.shape[0])
    assert_raises(TypeError, weighted.fit, X, y, sparse_weight)
def test_deriv():
    """``predict_deriv`` must return ``(n_samples, n_selected_variables, 1)`` arrays."""
    model = Earth(**default_params)
    model.fit(X, y)
    n_samples = X.shape[0]

    # No selection: one derivative per input column
    assert_equal(X.shape + (1,), model.predict_deriv(X).shape)

    # (variables argument, expected size of the variable axis)
    selections = [
        (0, 1),
        ('x0', 1),
        ([1, 5, 7], 3),
        ([], 0),
        (['x2', 'x7', 'x0', 'x1'], 4),
        (['x0'], 1),
        ([0], 1),
    ]
    for variables, width in selections:
        derivative = model.predict_deriv(X, variables=variables)
        assert_equal((n_samples, width, 1), derivative.shape)
def test_pickle_compatibility():
    """A pickled-and-restored model must equal the original and predict identically."""
    fitted = Earth(**default_params).fit(X, y)
    restored = pickle.loads(pickle.dumps(fitted))

    assert_true(restored == fitted)
    assert_true(numpy.all(fitted.predict(X) == restored.predict(X)))

    # The second basis function's root must be the very first basis object,
    # both before and after the round trip.
    for candidate in (fitted, restored):
        assert_true(candidate.basis_[0] is candidate.basis_[1]._get_root())
def getTrain(trainData, testData):
    """Fit Earth on ``trainData`` and count sign agreements on ``testData``.

    Every row holds the features followed by the target in its last
    position; the row width is taken from ``testData[0]``.  A test row
    counts when the product of its target and the prediction is
    non-negative, so a zero on either side also counts.

    Returns the number of counted test rows.
    """
    n_train = len(trainData)
    n_test = len(testData)
    width = len(testData[0])
    target_col = width - 1

    # Training matrix and single-column target
    X = numpy.zeros((n_train, target_col))
    Y = numpy.zeros((n_train, 1))
    for row_idx, row in enumerate(trainData):
        for col in range(target_col):
            X[row_idx][col] = row[col]
        Y[row_idx][0] = float(row[target_col])

    # Test matrix (features only)
    dX = numpy.zeros((n_test, target_col))
    for row_idx, row in enumerate(testData):
        for col in range(target_col):
            dX[row_idx][col] = row[col]

    model = Earth()
    model.fit(X, Y)
    y_hat = model.predict(dX)

    agreements = 0
    for i in range(n_test):
        if testData[i][target_col] * y_hat[i] >= 0:
            agreements += 1
    return agreements
def test_pandas_compatibility():
    """Fitting on DataFrames must carry the column names into the forward trace's ``xlabels``."""
    import pandas

    colnames = ['xx' + str(i) for i in range(X.shape[1])]
    X_df = pandas.DataFrame(X)
    X_df.columns = colnames
    y_df = pandas.DataFrame(y)

    model = Earth(**default_params).fit(X_df, y_df)
    recorded = model.forward_trace()._getstate()['xlabels']
    assert_list_equal(colnames, recorded)
def test_feature_importance():
    """Check the shape of ``feature_importances_`` and the rejection of invalid settings."""
    criteria = ('rss', 'gcv', 'nb_subsets')
    n_features = X.shape[1]

    # One criterion as a string: one importance value per feature
    for criterion in criteria:
        single = Earth(feature_importance_type=criterion, **default_params)
        single.fit(X, y)
        assert len(single.feature_importances_) == n_features

    # Several criteria: a plain dict keyed by criterion
    combined = Earth(feature_importance_type=criteria, **default_params)
    combined.fit(X, y)
    assert type(combined.feature_importances_) is dict
    assert set(combined.feature_importances_.keys()) == set(criteria)
    for values in combined.feature_importances_.values():
        assert len(values) == n_features

    # An unknown criterion name is rejected at fit time
    unknown = Earth(feature_importance_type='bad_name', **default_params)
    assert_raises(ValueError, unknown.fit, X, y)

    # A one-element tuple behaves like the single-criterion case
    one_tuple = Earth(feature_importance_type=('rss',), **default_params)
    one_tuple.fit(X, y)
    assert len(one_tuple.feature_importances_) == n_features

    # Requesting importances with pruning disabled is rejected at fit time
    unpruned = Earth(feature_importance_type='rss', enable_pruning=False,
                     **default_params)
    assert_raises(ValueError, unpruned.fit, X, y)
def test_shape():
    """Inputs whose shape disagrees with the fit data must raise ``ValueError``."""
    # Too few columns on every method of a fitted model
    model = Earth(**default_params)
    model.fit(X, y)
    X_reduced = X[:, 0:5]
    assert_raises(ValueError, model.predict, X_reduced)
    assert_raises(ValueError, model.predict_deriv, X_reduced)
    assert_raises(ValueError, model.transform, X_reduced)
    assert_raises(ValueError, model.score, X_reduced)

    # Fewer rows in X than in y
    model = Earth(**default_params)
    X_subsampled = X[0:10]
    assert_raises(ValueError, model.fit, X_subsampled, y)

    # Fewer rows in y than in X.  The slice is taken from y, so this case
    # tests a short response rather than a slice of the predictors.
    model = Earth(**default_params)
    y_subsampled = y[0:10]
    assert_raises(ValueError, model.fit, X, y_subsampled)

    # Fewer sample weights than rows
    model = Earth(**default_params)
    sample_weights = numpy.array([1.] * len(X))
    sample_weights_subsampled = sample_weights[0:10]
    assert_raises(ValueError, model.fit, X, y, sample_weights_subsampled)
"""
====================================================
Exporting a fitted Earth model as a sympy expression
====================================================

A simple example that fits Earth to a noisy sine function and prints the
fit as a sympy expression.
"""
import numpy
from pyearth import Earth
from pyearth import export

# Synthetic data: the response depends on column 6 only, plus Gaussian noise
numpy.random.seed(2)
m, n = 1000, 10
X = 10 * numpy.random.uniform(size=(m, n)) - 40
signal = 100 * (numpy.sin((X[:, 6])) - 4.0)
y = signal + 10 * numpy.random.normal(size=m)

# Fit an Earth model that allows pairwise interactions
model = Earth(max_degree=2, minspan_alpha=0.5, verbose=False)
model.fit(X, y)

print(model.summary())

# Convert the fitted model into a sympy expression and show it
print("Resulting sympy expression:")
sympy_expression = export.export_sympy(model)
print(sympy_expression)
def test_score():
    """``score`` on the training data must equal the R-squared of the selected pruning step."""
    model = Earth(**default_params).fit(X, y)
    record = model.pruning_trace()
    selected = record.get_selected()
    assert_almost_equal(record.rsq(selected), model.score(X, y))
class MARSInterpolant(Surrogate):
    r"""Compute and evaluate a MARS interpolant

    MARS builds a model of the form

    .. math::

        \hat{f}(x) = \sum_{i=1}^{k} c_i B_i(x).

    The model is a weighted sum of basis functions :math:`B_i(x)`. Each
    basis function :math:`B_i(x)` takes one of the following three forms:

    1. a constant 1.
    2. a hinge function of the form :math:`\max(0, x - const)` or
       :math:`\max(0, const - x)`. MARS automatically selects variables
       and values of those variables for knots of the hinge functions.
    3. a product of two or more hinge functions. These basis functions
       can model interaction between two or more variables.

    :param dim: Number of dimensions
    :type dim: int

    :ivar dim: Number of dimensions
    :ivar num_pts: Number of points in surrogate model
    :ivar X: Point incorporated in surrogate model (num_pts x dim)
    :ivar fX: Function values in surrogate model (num_pts x 1)
    :ivar updated: True if model is up-to-date (no refit needed)
    :ivar model: Earth object
    """

    def __init__(self, dim):
        self.dim = dim
        self.num_pts = 0
        self.X = np.empty([0, dim])
        self.fX = np.empty([0, 1])
        self.updated = False
        try:
            from pyearth import Earth
            self.model = Earth()
        except ImportError:
            print("Failed to import pyearth")
            raise

    def _fit(self):
        """Refit the Earth model on the stored points unless it is up to date."""
        with warnings.catch_warnings():
            # Suppress deprecation warnings
            warnings.simplefilter("ignore")
            if self.updated is not False:
                return
            self.model.fit(self.X, self.fX)
            self.updated = True

    def predict(self, xx):
        """Evaluate the MARS interpolant at the points xx

        :param xx: Prediction points, must be of size num_pts x dim or (dim, )
        :type xx: numpy.ndarray

        :return: Prediction of size num_pts x 1
        :rtype: numpy.ndarray
        """
        self._fit()
        points = np.atleast_2d(xx)
        values = self.model.predict(points)
        return np.expand_dims(values, axis=1)

    def predict_deriv(self, xx):
        """Evaluate the derivative of the MARS interpolant at xx

        A leading axis is added to ``xx`` before it is passed to the model's
        ``predict_deriv``, and only the first entry of the result is
        returned.
        NOTE(review): this looks suited to a single point of shape (dim, )
        only -- confirm before passing num_pts x dim input.

        :param xx: Prediction point
        :type xx: numpy.array

        :return: Derivative of the MARS interpolant at xx
        :rtype: numpy.array
        """
        self._fit()
        batched = np.expand_dims(xx, axis=0)
        derivatives = self.model.predict_deriv(batched, variables=None)
        return derivatives[0]
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from pyearth import Earth
from matplotlib import pyplot

# Load the purchase data.  `sheet_name` is the keyword current pandas
# accepts; the older `sheetname` spelling has been removed.
df = pd.read_excel('relay-foods.xlsx', sheet_name='Purchase Data - Full Study')

# Treat the identifier columns as categorical
df['OrderId'] = df['OrderId'].astype('category')
df['CommonId'] = df['CommonId'].astype('category')

# Drop the date columns
col_names = ['OrderDate', 'PickupDate']
df = df.drop(col_names, axis=1)

y = df['TotalCharges']
df_2 = df[['OrderId', 'UserId', 'PupId']]

# One {column: value} dict per row, the input format DictVectorizer expects.
# Series.items() replaces Series.iteritems(), which newer pandas removed.
X = [dict(r.items()) for _, r in df_2.iterrows()]

# Earth raises TypeError on scipy sparse input, so request a dense array.
# NOTE(review): confirm the dense matrix fits in memory for the full study.
train_fea = DictVectorizer(sparse=False).fit_transform(X)

# Fit an Earth model
model = Earth()
model.fit(train_fea, y)

# Print the model
print(model.trace())
print(model.summary())

# Predict on the vectorised features the model was fitted on; the raw list
# of dicts in X is not a valid input for predict().
y_hat = model.predict(train_fea)
# Stack the two responses column-wise so Earth fits both outputs at once
y_mix = np.concatenate((y1[:, np.newaxis], y2[:, np.newaxis]), axis=1)

# Weight given to the first output; the second output gets 1 - alpha
alphas = [1., 0.8, 0.6, 0.4, 0.2, 0.]
n_plots = len(alphas)
k = 1  # index of the left-hand subplot of the current row
fig = plt.figure()
for i, alpha in enumerate(alphas):
    # Fit an Earth model
    model = Earth(max_degree=5,
                  minspan_alpha=.05,
                  endspan_alpha=.05,
                  max_terms=10,
                  check_every=1,
                  thresh=0.)
    output_weight = np.array([alpha, 1 - alpha])
    model.fit(X, y_mix, output_weight=output_weight)
    print(model.summary())

    # Predict once and reuse the result for the error and both plots
    y_hat = model.predict(X)
    mse = ((y_hat - y_mix) ** 2).mean(axis=0)

    # Left column: first output against X[:, 6]
    ax = plt.subplot(n_plots, 2, k)
    ax.set_ylabel("Run {0}".format(i + 1), rotation=0, labelpad=20)
    plt.plot(X[:, 6], y_mix[:, 0], 'r.')
    plt.plot(X[:, 6], y_hat[:, 0], 'b.')
    plt.title("MSE: {0:.3f}, Weight : {1:.1f}".format(mse[0], alpha))

    # Right column: second output against X[:, 5]
    plt.subplot(n_plots, 2, k + 1)
    plt.plot(X[:, 5], y_mix[:, 1], 'r.')
    plt.plot(X[:, 5], y_hat[:, 1], 'b.')
    plt.title("MSE: {0:.3f}, Weight : {1:.1f}".format(mse[1], 1 - alpha))

    # Move to the next subplot row; without this every run would draw on
    # the first two axes.
    k += 2
@author: jasonrudy ''' import numpy from pyearth import Earth from matplotlib import pyplot m = 1000 x = 20*(numpy.random.uniform(size=(m,1)) - .5) y = x[:,0]*(x[:,0]<0) + x[:,0]*(x[:,0]>0) + 1*numpy.random.normal(size=m) print y.shape print y.dtype print y print x.shape print x.dtype model = Earth() model.fit(x,y) y_hat = model.predict(x) print model.trace() print model pyplot.figure(figsize=(10,5)) pyplot.plot(x[:,0],y,'r.') pyplot.plot(x[:,0],y_hat,'b.') ax = pyplot.gca() pyplot.setp(ax, frame_on=False) pyplot.savefig('demo.pdf',transparent=True)
def mars_regr(x, y):
    """Fit a default Earth model on ``x`` and ``y`` after converting both with ``np.asarray``.

    Returns whatever ``Earth.fit`` returns.
    """
    model = Earth()
    features = np.asarray(x)
    target = np.asarray(y)
    return model.fit(features, target)