def transform(self, X_dict):
    X = []
    for x in X_dict:
        real_period = x['period'] / x['div_period']
        x_new = [x['magnitude_b'], x['magnitude_r'], real_period,
                 x['asym_b'], x['asym_r'], x['log_p_not_variable'],
                 x['sigma_flux_b'], x['sigma_flux_r'], x['quality'],
                 x['div_period']]
        for color in ['r', 'b']:
            unfold_sample(x, color=color)
            x_train = x['phase_' + color]
            y_train = x['light_points_' + color]
            y_sigma = x['error_points_' + color]
            num_bins = 64
            bins = np.linspace(0, 1, num_bins + 1)
            model = Earth(penalty=0.3, max_terms=10, thresh=0, smooth=True,
                          check_every=5, max_degree=10)
            x_train, y_train = binify(bins, x_train, y_train)
            # Tile one period on each side so the fitted spline behaves
            # periodically at the boundaries of the folded light curve.
            time_points_ = np.concatenate((x_train - 1., x_train, x_train + 1.),
                                          axis=0)
            light_points_ = np.concatenate((y_train, y_train, y_train), axis=0)
            model.fit(time_points_[:, np.newaxis], light_points_)
            t = np.arange(-1., 2., 0.01)
            y = model.predict(t[:, np.newaxis])
            # Rotate the curve so it starts at its maximum, then record the
            # phase of the maximum and the amplitude used for normalization.
            i_max = y.argmax()
            t_ = t
            y_ = np.concatenate((y[i_max:], y[0:i_max]), axis=0)
            x_new.append(t[i_max])
            amplitude = max(y_) - min(y_)
            x_new.append(amplitude)
            y_ /= amplitude
            # plt.plot(time_points_, light_points_, c='red')
            # plt.plot(t_, y_, c='green')
            # plt.show()
            x_new.extend(y_)
        X.append(x_new)
    return np.array(X)
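# The transform above relies on two helpers, unfold_sample and binify, that
# are defined elsewhere.  The following is a hypothetical sketch of binify
# (an assumption, not the original helper): it averages the light-curve
# points falling into each phase bin and returns the centers of the
# non-empty bins together with the binned means.
import numpy as np

def binify(bins, x, y):
    # Assign each phase to a bin; clip so x == bins[-1] lands in the last bin.
    idx = np.clip(np.digitize(x, bins) - 1, 0, len(bins) - 2)
    centers = (bins[:-1] + bins[1:]) / 2.
    x_binned, y_binned = [], []
    for b in range(len(bins) - 1):
        mask = idx == b
        if mask.any():
            x_binned.append(centers[b])
            y_binned.append(y[mask].mean())
    return np.array(x_binned), np.array(y_binned)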
def test_gradient_boosting_estimator_with_smooth_quantile_loss():
    np.random.seed(0)
    m = 15000
    n = 10
    p = .8
    X = np.random.normal(size=(m, n))
    beta = np.random.normal(size=n)
    mu = np.dot(X, beta)
    y = np.random.lognormal(mu)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.33333333333333)
    loss_function = SmoothQuantileLossFunction(1, p, .0001)
    q_loss = QuantileLossFunction(1, p)
    model = Booster(BaggingRegressor(Earth(max_degree=2, verbose=False,
                                           use_fast=True, max_terms=10)),
                    loss_function, n_estimators=150,
                    stopper=stop_after_n_iterations_without_percent_improvement_over_threshold(3, .01),
                    verbose=True)

    # Predicting before fitting should raise.
    assert_raises(NotFittedError, lambda: model.predict(X_train))

    model.fit(X_train, y_train)
    prediction = model.predict(X_test)

    # The smooth quantile loss should beat sklearn's (non-smooth) quantile
    # loss on both quantile loss and R^2.
    model2 = GradientBoostingRegressor(loss='quantile', alpha=p)
    model2.fit(X_train, y_train)
    prediction2 = model2.predict(X_test)
    assert_less(q_loss(y_test, prediction), q_loss(y_test, prediction2))
    assert_greater(r2_score(y_test, prediction), r2_score(y_test, prediction2))

    # Empirical coverage should be close to the requested quantile p.
    q = np.mean(y_test <= prediction)
    assert_less(np.abs(q - p), .05)
    assert_greater(model.score_, 0.)
    assert_approx_equal(model.score(X_train, y_train), model.score_)
def mars_tune(max_degree, penalty):
    # Combine Earth with LogisticRegression in a pipeline to do
    # classification, then score the pipeline on the held-out set.
    clf = Pipeline([('earth', Earth(max_degree=int(max_degree), penalty=penalty)),
                    ('logistic', LogisticRegression())])
    clf.fit(x0, y0)
    score = auc(y1, clf.predict_proba(x1)[:, 1])
    return score
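# mars_tune has the signature of a black-box objective over two
# hyperparameters, so one plausible way to drive it is with the bayes_opt
# package (an assumption; the bounds below are illustrative, and x0, y0,
# x1, y1 and auc must already be defined in the enclosing scope).
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    mars_tune,
    {'max_degree': (1, 5),     # searched as floats; mars_tune casts to int
     'penalty': (0.1, 10.0)})
optimizer.maximize(init_points=5, n_iter=20)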
def test_gradient_boosting_estimator_with_binomial_deviance_loss():
    np.random.seed(0)
    X, y = make_classification(n_classes=2)
    loss_function = BinomialDeviance(2)
    model = Booster(Earth(max_degree=2, use_fast=True, max_terms=10),
                    loss_function)
    model.fit(X, y)
    # At least 90% training accuracy, and probabilities within [0, 1].
    assert_greater(np.sum(model.predict(X) == y) / float(y.shape[0]), .90)
    assert_true(np.all(0 <= model.predict_proba(X)))
    assert_true(np.all(1 >= model.predict_proba(X)))
def test_sklearn2code_export():
    np.random.seed(0)
    X, y = make_classification(n_classes=2)
    X = DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])
    loss_function = BinomialDeviance(2)
    model = Booster(Earth(max_degree=2, use_fast=True, max_terms=10),
                    loss_function)
    model.fit(X, y)
    # Export the fitted booster to plain numpy code and check that the
    # generated module reproduces the model's methods.
    code = sklearn2code(model, ['predict', 'predict_proba', 'transform'],
                        numpy_flat)
    module = exec_module('test_module', code)
    assert_correct_exported_module(model, module,
                                   ['predict', 'predict_proba', 'transform'],
                                   dict(X=X), X)
def test_with_response_transformation():
    X, y = load_boston(return_X_y=True)
    log_y = np.log(y)
    X = pandas.DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])
    y = pandas.DataFrame(y, columns=['y'])
    # The estimator fits on log(y), so its predictions should track log_y
    # closely and raw y poorly.
    transformer = VariableTransformer(dict(y=Log(Identity('y'))))
    model = ResponseTransformingEstimator(Earth(), transformer)
    model.fit(X, y)
    log_y_pred = model.predict(X)
    assert r2_score(log_y, log_y_pred) > .8
    assert r2_score(y, log_y_pred) < .1
class SmoothIso(BaseEstimator, RegressorMixin):
    def __init__(self, y_min=None, y_max=None, **kwargs):
        self.y_min = y_min
        self.y_max = y_max
        self.kwargs = kwargs

    def fit(self, X, y):
        # Fit an isotonic regression, then collapse each flat segment of the
        # resulting step function into a single weighted point: (mean X of
        # the segment, segment level, segment size).
        self.iso_ = IsotonicRegression(y_min=self.y_min,
                                       y_max=self.y_max).fit(X, y)
        n = self.iso_.X_.shape[0]
        last = self.iso_.y_[0]
        current_sum = 0.0
        current_count = 0
        X_ = []
        y_ = []
        w_ = []
        for i in range(n):
            current = self.iso_.y_[i]
            if current != last:
                X_.append(current_sum / float(current_count))
                y_.append(last)
                w_.append(float(current_count))
                current_sum = 0.0
                current_count = 0
                last = current
            current_sum += self.iso_.X_[i]
            current_count += 1
        # Emit the trailing segment so the last flat level is not lost.
        if current_count > 0:
            X_.append(current_sum / float(current_count))
            y_.append(last)
            w_.append(float(current_count))
        self.X_ = numpy.array(X_)
        self.y_ = numpy.array(y_)
        self.w_ = numpy.array(w_)
        # Smooth the collapsed step function with a weighted Earth fit.
        self.spline_ = Earth(**self.kwargs).fit(self.X_, self.y_,
                                                sample_weight=self.w_)
        return self

    def predict(self, X):
        return self.spline_.predict(X)

    def transform(self, X):
        return self.predict(X)
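# Minimal usage sketch for SmoothIso (illustrative data, not from the
# original source).  Assumes a 1-d predictor, since IsotonicRegression is
# univariate; Earth keywords such as max_degree pass through **kwargs.
import numpy as np

np.random.seed(0)
x = np.random.uniform(0, 10, size=500)
y = np.log1p(x) + np.random.normal(scale=.1, size=500)
smoother = SmoothIso(y_min=0., max_degree=1).fit(x, y)
y_smooth = smoother.predict(np.linspace(0, 10, 50))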
class SmoothMovingAverage(BaseEstimator, RegressorMixin):
    def __init__(self, window_size=None, **kwargs):
        self.window_size = window_size
        self.kwargs = kwargs

    def fit(self, X, y):
        if self.window_size is None:
            window_size = len(X) // 100
        else:
            window_size = self.window_size
        # Smooth y with a moving average along sorted X, then trim X to the
        # shorter output of the moving average and fit an Earth spline to
        # the smoothed curve.
        order = numpy.argsort(X)
        y_ = moving_average(y[order], window_size)
        half = int(window_size) // 2
        x_ = X[order][half - 1:-half]
        self.spline_ = Earth(**self.kwargs).fit(x_, y_)
        return self

    def predict(self, X):
        return self.spline_.predict(X)

    def transform(self, X):
        return self.predict(X)
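# SmoothMovingAverage depends on a moving_average helper defined elsewhere.
# A hypothetical stand-in (an assumption, not the original): a uniform
# window in 'valid' mode, whose output length n - window_size + 1 matches
# the [half - 1:-half] trim in fit for even window sizes.
import numpy as np

def moving_average(y, window_size):
    window = np.ones(int(window_size)) / float(window_size)
    return np.convolve(y, window, mode='valid')

# Example usage on a noisy sine curve.
np.random.seed(0)
x = np.random.uniform(0, 2 * np.pi, size=1000)
y = np.sin(x) + np.random.normal(scale=.3, size=1000)
model = SmoothMovingAverage(window_size=10).fit(x, y)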
train.drop('activity_id', axis=1, inplace=True)
train.drop('outcome', axis=1, inplace=True)

test = pd.read_csv(projPath + 'input/xtest_ds_' + dataset_version + '.csv')
id_test = test.activity_id
test.drop('activity_id', axis=1, inplace=True)

# folds
xfolds = pd.read_csv(projPath + 'input/5-fold.csv')

## model
# Set up the model instances: two Earth + LogisticRegression pipelines
# (combining Earth with LogisticRegression turns it into a classifier).
earth_classifier1 = Pipeline([('earth', Earth(max_degree=1, penalty=.005)),
                              ('logistic', LogisticRegression())])
earth_classifier2 = Pipeline([('earth', Earth(max_degree=4, penalty=7)),
                              ('logistic', LogisticRegression())])

# Stack the two pipelines and keep their out-of-fold predictions as
# meta-features.
stacker = BinaryStackingClassifier([earth_classifier1, earth_classifier2],
                                   xfolds=xfolds, evaluation=auc)
stacker.fit(train, y_train)
meta = stacker.meta_train
meta['activity_id'] = id_train
meta['outcome'] = y_train
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from pyearth.earth import Earth

print(__doc__)

h = .02  # step size in the mesh

np.random.seed(1)

# Combine Earth with LogisticRegression in a pipeline to do classification.
earth_classifier = Pipeline([('earth', Earth(max_degree=3, penalty=1.5)),
                             ('logistic', LogisticRegression())])

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "Naive Bayes", "LDA", "QDA", "Earth"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, probability=True),
    SVC(gamma=2, C=1, probability=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    GaussianNB(),
    LDA(),
    QDA(),
    earth_classifier,
]
    # Randomly mask about 10% of the entries as missing (boolean mask).
    missing = np.random.binomial(1, .1, size=X.shape).astype(bool)
    X[missing] = np.nan
    X = DataFrame(X, columns=['x%d' % i for i in range(n)])
    return (dict(X=X, y=y), dict(X=X), dict(X=X))


def create_boston_housing():
    X, y = load_boston(return_X_y=True)
    X = DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])
    return (dict(X=X, y=y), dict(X=X), dict(X=X))


# Each case: (estimator, exported methods, problem data).
test_cases = [
    (VotingClassifier([('logistic', LogisticRegression()),
                       ('earth', Pipeline([('earth', Earth()),
                                           ('logistic', LogisticRegression())]))],
                      voting='hard', weights=[1.01, 1.01]),
     ['predict'], create_weird_classification_problem_1()),
    (GradientBoostingClassifier(max_depth=10, n_estimators=10),
     ['predict_proba', 'predict'], create_weird_classification_problem_1()),
    (LogisticRegression(), ['predict_proba', 'predict'],
     create_weird_classification_problem_1()),
    (IsotonicRegression(out_of_bounds='clip'), ['predict'],
     create_isotonic_regression_problem_1()),
    (Earth(), ['predict', 'transform'], create_regression_problem_1()),
    (Earth(allow_missing=True), ['predict', 'transform'],
     create_regression_problem_with_missingness_1()),
    (ElasticNet(), ['predict'], create_regression_problem_1()),
from sklearn.datasets import load_boston
from pyearth.earth import Earth
from pandas import DataFrame
from sklearn2code.sklearn2code import sklearn2code
from sklearn2code.languages import numpy_flat
from sklearn2code.utility import exec_module
from numpy.testing import assert_array_almost_equal
from yapf.yapflib.yapf_api import FormatCode

# Load a data set.
boston = load_boston()
X = DataFrame(boston['data'], columns=boston['feature_names'])
y = boston['target']

# Fit a py-earth model.
model = Earth(max_degree=2).fit(X, y)

# Generate code from the py-earth model.
code = sklearn2code(model, ['predict'], numpy_flat)

# Execute the generated code in its own module.
boston_housing_module = exec_module('boston_housing_module', code)

# Confirm that the generated module produces output identical
# to the fitted model's predict method.
assert_array_almost_equal(model.predict(X),
                          boston_housing_module.predict(**X))

# Print the generated code (using yapf for formatting).
print(FormatCode(code, style_config='pep8')[0])