def test_export_python_string():
    for smooth in (True, False):
        model = Earth(penalty=1, smooth=smooth, max_degree=2).fit(X, y)
        export_model = export_python_string(model, 'my_test_model')
        six.exec_(export_model, globals())
        for exp_pred, model_pred in zip(model.predict(X), my_test_model(X)):
            assert_almost_equal(exp_pred, model_pred)
def test_pathological_cases():
    import pandas
    directory = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'pathological_data')
    cases = {'issue_44': {},
             'issue_50': {'penalty': 0.5,
                          'minspan': 1,
                          'allow_linear': False,
                          'endspan': 1,
                          'check_every': 1,
                          'sample_weight': 'issue_50_weight.csv'}}
    for case, settings in cases.items():  # iteritems() is Python 2 only
        data = pandas.read_csv(os.path.join(directory, case + '.csv'))
        y = data['y']
        del data['y']
        X = data
        if 'sample_weight' in settings:
            filename = os.path.join(directory, settings['sample_weight'])
            sample_weight = pandas.read_csv(filename)['sample_weight']
            del settings['sample_weight']
        else:
            sample_weight = None
        model = Earth(**settings)
        model.fit(X, y, sample_weight=sample_weight)
        with open(os.path.join(directory, case + '.txt'), 'r') as infile:
            correct = infile.read()
        assert_equal(model.summary(), correct)
def test_copy_compatibility():
    model = Earth(**default_params).fit(X, y)
    model_copy = copy.copy(model)
    assert_true(model_copy == model)
    assert_true(
        numpy.all(model.predict(X) == model_copy.predict(X)))
    assert_true(model.basis_[0] is model.basis_[1]._get_root())
    assert_true(model_copy.basis_[0] is model_copy.basis_[1]._get_root())
def test_smooth():
    model = Earth(penalty=1, smooth=True)
    model.fit(X, y)
    res = str(model.trace()) + '\n' + model.summary()
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_regress_smooth.txt')
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_equal(res, prev)
def test_pickle_compatibility():
    earth = Earth(**default_params)
    model = earth.fit(X, y)
    model_copy = pickle.loads(pickle.dumps(model))
    assert_true(model_copy == model)
    assert_true(
        numpy.all(model.predict(X) == model_copy.predict(X)))
    assert_true(model.basis_[0] is model.basis_[1]._get_root())
    assert_true(model_copy.basis_[0] is model_copy.basis_[1]._get_root())
def test_fit():
    earth = Earth(**default_params)
    earth.fit(X, y)
    res = str(earth.trace()) + '\n' + earth.summary()
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_regress.txt')
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_equal(res, prev)
def run_pyearth(X, y, **kwargs):
    '''Run with pyearth.  Return prediction value, training time, and number
    of forward pass iterations.'''
    model = Earth(**kwargs)
    t0 = time.time()
    model.fit(X, y)
    t1 = time.time()
    y_pred = model.predict(X)
    forward_iterations = len(model.forward_trace()) - 1
    return y_pred, t1 - t0, forward_iterations
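# A minimal usage sketch of run_pyearth, assuming X and y are the numpy
# arrays already in scope in this benchmark script; the keyword arguments
# are forwarded straight to the Earth constructor, so any Earth parameter
# (max_degree, penalty, ...) can be passed through.
y_pred, fit_seconds, n_forward = run_pyearth(X, y, max_degree=2)
print('fit took %.3fs with %d forward-pass iterations' % (fit_seconds, n_forward))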
def test_exhaustive_search():
    model = Earth(max_terms=13,
                  enable_pruning=False,
                  check_every=1,
                  thresh=0,
                  minspan=1,
                  endspan=1)
    model.fit(X, y)
    assert_equal(model.basis_.plen(), model.coef_.shape[1])
    assert_equal(model.transform(X).shape[1], len(model.basis_))
def test_nb_terms():
    for max_terms in (1, 3, 12, 13):
        model = Earth(max_terms=max_terms)
        model.fit(X, y)
        assert_true(len(model.basis_) <= max_terms)
        assert_true(len(model.coef_) <= len(model.basis_))
        assert_true(len(model.coef_) >= 1)
        if max_terms == 1:
            assert_list_almost_equal_value(model.predict(X), y.mean())
def test_feature_importance():
    criteria = ('rss', 'gcv', 'nb_subsets')
    for imp in criteria:
        earth = Earth(feature_importance_type=imp, **default_params)
        earth.fit(X, y)
        assert len(earth.feature_importances_) == X.shape[1]
    earth = Earth(feature_importance_type=criteria, **default_params)
    earth.fit(X, y)
    assert type(earth.feature_importances_) == dict
    assert set(earth.feature_importances_.keys()) == set(criteria)
    for crit, val in earth.feature_importances_.items():
        assert len(val) == X.shape[1]
    assert_raises(
        ValueError,
        Earth(feature_importance_type='bad_name', **default_params).fit,
        X, y)
    earth = Earth(feature_importance_type=('rss',), **default_params)
    earth.fit(X, y)
    assert len(earth.feature_importances_) == X.shape[1]
    assert_raises(
        ValueError,
        Earth(feature_importance_type='rss', enable_pruning=False,
              **default_params).fit,
        X, y)
def test_fit():
    earth = Earth(**default_params)
    earth.fit(X, y)
    res = str(earth.rsq_)
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_regress.txt')
    # with open(filename, 'w') as fl:
    #     fl.write(res)
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_true(abs(float(res) - float(prev)) < .05)
def test_pandas_compatibility():
    import pandas
    X_df = pandas.DataFrame(X)
    y_df = pandas.DataFrame(y)
    colnames = ['xx' + str(i) for i in range(X.shape[1])]
    X_df.columns = colnames
    earth = Earth(**default_params)
    model = earth.fit(X_df, y_df)
    assert_list_equal(
        colnames, model.forward_trace()._getstate()['xlabels'])
def test_smooth():
    model = Earth(penalty=1, smooth=True)
    model.fit(X, y)
    res = str(model.rsq_)
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_regress_smooth.txt')
    # with open(filename, 'w') as fl:
    #     fl.write(res)
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_true(abs(float(res) - float(prev)) < .05)
def test_linvars():
    earth = Earth(**default_params)
    earth.fit(X, y, linvars=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    res = str(earth.trace()) + '\n' + earth.summary()
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_linvars_regress.txt')
    # with open(filename, 'w') as fl:
    #     fl.write(res)
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_equal(res, prev)
def test_untrained():
    model = Earth(**default_params)
    assert_raises(NotFittedError, model.predict, X)
    assert_raises(NotFittedError, model.transform, X)
    assert_raises(NotFittedError, model.predict_deriv, X)
    assert_raises(NotFittedError, model.score, X)

    # the following should be changed to raise NotFittedError
    assert_equal(model.forward_trace(), None)
    assert_equal(model.pruning_trace(), None)
    assert_equal(model.summary(), "Untrained Earth Model")
def runModel(i, featureCombo):
    mae = np.array([])
    logging.warning('try alpha = %s' % i)
    for ktrain, ktest in kf:
        x = trainCleaned.iloc[ktrain, ]
        y = trainCleaned.iloc[ktest, ]
        model = Earth()
        model.fit(x[featureCombo], x['Expected'])
        pred = model.predict(y[featureCombo])
        mae = np.append(mae, getMAE(pred, y['Expected']))
    logging.warning('average 10-fold MAE for alpha %s feature %s' % (i, featureCombo))
    logging.warning(mae.mean())
def test_nb_degrees():
    for max_degree in (1, 2, 12, 13):
        model = Earth(max_terms=10,
                      max_degree=max_degree,
                      enable_pruning=False,
                      check_every=1,
                      thresh=0,
                      minspan=1,
                      endspan=1)
        model.fit(X, y)
        for basis in model.basis_:
            assert_true(basis.degree() >= 0)
            assert_true(basis.degree() <= max_degree)
def test_missing_data():
    earth = Earth(allow_missing=True, **default_params)
    missing_ = numpy.random.binomial(1, .05, X.shape).astype(bool)
    X_ = X.copy()
    X_[missing_] = None
    earth.fit(X_, y)
    res = str(earth.score(X_, y))
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_regress_missing_data.txt')
    # with open(filename, 'w') as fl:
    #     fl.write(res)
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_true(abs(float(res) - float(prev)) < .03)
def test_eq():
    model1 = Earth(**default_params)
    model2 = Earth(**default_params)
    assert_equal(model1, model2)
    assert_not_equal(model1, 5)

    params = {}
    params.update(default_params)
    params["penalty"] = 15
    model2 = Earth(**params)
    assert_not_equal(model1, model2)

    model3 = Earth(**default_params)
    model3.unknown_parameter = 5
    assert_not_equal(model1, model3)
def test_output_weight():
    x = numpy.random.uniform(-1, 1, size=(1000, 1))
    y = (numpy.dot(x, numpy.random.normal(0, 1, size=(1, 10)))) ** 5 + 1
    y = (y - y.mean(axis=0)) / y.std(axis=0)
    group = numpy.array([1] * 5 + [0] * 5)
    output_weight = numpy.array([1] * 5 + [2] * 5, dtype=float)
    model = Earth().fit(x, y, output_weight=output_weight)

    # Check that the model fits the more heavily weighted group
    # at least as well as the other group
    mse = ((model.predict(x) - y) ** 2).mean(axis=0)
    group1_mean = mse[group].mean()
    group2_mean = mse[numpy.logical_not(group)].mean()
    assert_true(group1_mean > group2_mean or
                round(abs(group1_mean - group2_mean), 7) == 0)
def test_patsy_compatibility():
    import pandas
    import patsy
    X_df = pandas.DataFrame(X)
    y_df = pandas.DataFrame(y)
    colnames = ['xx' + str(i) for i in range(X.shape[1])]
    X_df.columns = colnames
    X_df['y'] = y
    y_df, X_df = patsy.dmatrices(
        'y ~ xx0 + xx1 + xx2 + xx3 + xx4 + xx5 + xx6 + xx7 + xx8 + xx9 - 1',
        data=X_df)
    model = Earth(**default_params).fit(X_df, y_df)
    assert_list_equal(
        colnames, model.forward_trace()._getstate()['xlabels'])
def test_sparse():
    X_sparse = csr_matrix(X)

    model = Earth(**default_params)
    assert_raises(TypeError, model.fit, X_sparse, y)

    model = Earth(**default_params)
    model.fit(X, y)
    assert_raises(TypeError, model.predict, X_sparse)
    assert_raises(TypeError, model.predict_deriv, X_sparse)
    assert_raises(TypeError, model.transform, X_sparse)
    assert_raises(TypeError, model.score, X_sparse)

    model = Earth(**default_params)
    sample_weight = csr_matrix([1.] * X.shape[0])
    assert_raises(TypeError, model.fit, X, y, sample_weight)
def __init__(self, maxp=100):
    self.nump = 0
    self.maxp = maxp
    self.x = None  # pylint: disable=invalid-name
    self.fx = None
    self.dim = None
    self.model = Earth()
    self.updated = False
def getTrain(trainData, testData):
    size_s = len(trainData)
    size_t = len(testData)
    lenY = len(testData[0])
    X = numpy.zeros((size_s, lenY - 1))
    Y = numpy.zeros((size_s, 1))
    z = 0
    for d in trainData:
        for j in range(lenY - 1):
            X[z][j] = d[j]
        Y[z][0] = float(d[lenY - 1])
        z += 1
    z = 0
    dX = numpy.zeros((size_t, lenY - 1))
    for d in testData:
        for j in range(lenY - 1):
            dX[z][j] = d[j]
        z += 1
    model = Earth()
    model.fit(X, Y)
    y_hat = model.predict(dX)
    correct = 0  # count predictions that have the correct sign
    for i in range(size_t):
        x1 = testData[i][lenY - 1]
        x2 = y_hat[i]
        if x1 * x2 >= 0:
            correct += 1
    return correct
def test_export_sympy():
    import pandas as pd
    from sympy.utilities.lambdify import lambdify
    from sympy.printing.lambdarepr import NumPyPrinter

    class PyEarthNumpyPrinter(NumPyPrinter):
        def _print_Max(self, expr):
            return 'maximum(' + ','.join(self._print(i) for i in expr.args) + ')'

        def _print_NaNProtect(self, expr):
            return 'where(isnan(' + ','.join(self._print(a) for a in expr.args) + '), 0, ' \
                + ','.join(self._print(a) for a in expr.args) + ')'

        def _print_Missing(self, expr):
            return 'isnan(' + ','.join(self._print(a) for a in expr.args) + ').astype(float)'

    for smooth, n_cols, allow_missing in product((True, False), (1, 2), (True, False)):
        X_df = pd.DataFrame(X.copy(), columns=['x_%d' % i for i in range(X.shape[1])])
        y_df = pd.DataFrame(Y[:, :n_cols])
        if allow_missing:
            # Randomly remove some values so that the fitted model contains
            # MissingnessBasisFunctions
            X_df['x_1'][numpy.random.binomial(
                n=1, p=.1, size=X_df.shape[0]).astype(bool)] = numpy.nan

        model = Earth(allow_missing=allow_missing, smooth=smooth,
                      max_degree=2).fit(X_df, y_df)
        expressions = export_sympy(model) if n_cols > 1 else [export_sympy(model)]

        module_dict = {'select': numpy.select,
                       'less_equal': numpy.less_equal,
                       'isnan': numpy.isnan,
                       'greater_equal': numpy.greater_equal,
                       'logical_and': numpy.logical_and,
                       'less': numpy.less,
                       'logical_not': numpy.logical_not,
                       'greater': numpy.greater,
                       'maximum': numpy.maximum,
                       'Missing': lambda x: numpy.isnan(x).astype(float),
                       'NaNProtect': lambda x: numpy.where(numpy.isnan(x), 0, x),
                       'nan': numpy.nan,
                       'float': float,
                       'where': numpy.where}

        for i, expression in enumerate(expressions):
            # The lambdified functions for smoothed basis functions only work
            # with modules='numpy' and for regular basis functions with
            # modules={'Max': numpy.maximum}.  This is a confusing situation.
            func = lambdify(X_df.columns, expression,
                            printer=PyEarthNumpyPrinter, modules=module_dict)
            y_pred_sympy = func(*[X_df.loc[:, var] for var in X_df.columns])

            y_pred = model.predict(X_df)[:, i] if n_cols > 1 else model.predict(X_df)
            assert_array_almost_equal(y_pred, y_pred_sympy)
def test_untrained():
    # NotFittedError moved from utils.validation to exceptions
    # some time after 0.17.1
    try:
        from sklearn.exceptions import NotFittedError
    except ImportError:
        from sklearn.utils.validation import NotFittedError

    # Make sure calling methods that require a fitted Earth object
    # raises the appropriate exception when using a not yet fitted
    # Earth object
    model = Earth(**default_params)
    assert_raises(NotFittedError, model.predict, X)
    assert_raises(NotFittedError, model.transform, X)
    assert_raises(NotFittedError, model.predict_deriv, X)
    assert_raises(NotFittedError, model.score, X)

    # the following should be changed to raise NotFittedError
    assert_equal(model.forward_trace(), None)
    assert_equal(model.pruning_trace(), None)
    assert_equal(model.summary(), "Untrained Earth Model")
def test_xlabels():
    model = Earth(**default_params)
    assert_raises(ValueError, model.fit, X[:, 0:5], y,
                  xlabels=['var1', 'var2'])

    model = Earth(**default_params)
    model.fit(X[:, 0:3], y, xlabels=['var1', 'var2', 'var3'])

    model = Earth(**default_params)
    model.fit(X[:, 0:3], y, xlabels=['var1', 'var2', 'var3'])
def marsmodelorr(self, use_smY=True, slope_trunc=0.00001,
                 savgol_window=151, savgol_order=3, ex_order=51):
    Xf, Yf = self.Xf_, self.Yf_
    X, Y = self.X_, self.Y_
    fom = {}
    # smooth the data
    smY = savgol(Y, savgol_window, savgol_order)
    # perform mars
    model = MARS()
    if use_smY:
        model.fit(X, smY)
    else:
        model.fit(X, Y)
    Y_h = model.predict(X)
    # calculate dydx based on the mars model to get knots and intercepts,
    # as these are complicated to extract from the hinge functions
    diff1 = np.diff(Y_h) / np.diff(X)
    tdiff1 = diff1 - np.nanmin(diff1)
    tdiff1 = tdiff1 / np.nanmax(tdiff1)
    # calculate slopes of linear segments
    ID = [i for i in range(1, len(tdiff1))
          if np.abs(tdiff1[i] - tdiff1[i - 1]) > slope_trunc]
    ID.insert(0, 0)
    ID.append(np.argmax(X))  # this might cause an error
    slopes = [np.nanmean(diff1[ID[i - 1]:ID[i]]) for i in range(1, len(ID) - 1)]
    a = [Y_h[ID[i]] - slopes[i] * X[ID[i]] for i in range(len(ID) - 2)]
    IDM, IDm = np.argmax(slopes), np.argmin(np.abs(slopes))
    # intercept of highest slope and zero as well as highest slope and lowest slope
    fom['zinter'] = -a[IDM] / slopes[IDM]
    fom['lminter'] = (a[IDM] - a[IDm]) / (slopes[IDm] - slopes[IDM])
    fom['max_slope'] = slopes[IDM]
    fom['curr_lminter_model'] = fom['lminter'] * slopes[IDM] + a[IDM]
    fom['curr_lminter_data'] = np.mean(Y[np.where(np.abs(X - fom['lminter']) < 0.5)[0]])
    # calculate how the CV curves might look without the 'ORR part'
    srYs = smY - model.predict(X)
    srYf = savgol(Yf - model.predict(Xf), savgol_window, savgol_order)
    # calculate their derivative
    dsrYf = savgol(np.diff(srYf) / np.diff(Xf), savgol_window, savgol_order)
    # find the extrema in the derivatives for extraction of redox pots
    redID_f = argrelextrema(srYf, np.less, order=ex_order)
    oxID_f = argrelextrema(srYf, np.greater, order=ex_order)
    # calc some more foms like position of redox waves
    fom['redpot_f'], fom['redpot_f_var'] = np.nanmean(Xf[redID_f]), np.nanstd(Xf[redID_f])
    fom['oxpot_f'], fom['oxpot_f_var'] = np.nanmean(Xf[oxID_f]), np.nanstd(Xf[oxID_f])
    fom['X'], fom['Xf'] = X, Xf
    fom['srYs'], fom['srYf'], fom['smY'] = srYs, srYf, smY
    fom['Y'], fom['Yf'], fom['Y_h'] = Y, Yf, Y_h
    fom['noise_lvl'] = np.sum((Y_h - Y) ** 2, axis=0)
    self.fom = fom
def __init__(self, dim):
    self.num_pts = 0
    self.X = np.empty([0, dim])
    self.fX = np.empty([0, 1])
    self.dim = dim
    self.updated = False

    try:
        from pyearth import Earth
        self.model = Earth()
    except ImportError as err:
        print("Failed to import pyearth")
        raise err
def test_fast():
    earth = Earth(max_terms=10,
                  max_degree=5,
                  **default_params)
    earth.fit(X, y)
    normal_summary = earth.summary()

    earth = Earth(use_fast=True,
                  max_terms=10,
                  max_degree=5,
                  fast_K=10,
                  fast_h=1,
                  **default_params)
    earth.fit(X, y)
    fast_summary = earth.summary()
    assert_equal(normal_summary, fast_summary)
def test_deriv():
    model = Earth(**default_params)
    model.fit(X, y)
    assert_equal(X.shape + (1,), model.predict_deriv(X).shape)
    assert_equal((X.shape[0], 1, 1),
                 model.predict_deriv(X, variables=0).shape)
    assert_equal((X.shape[0], 1, 1),
                 model.predict_deriv(X, variables='x0').shape)
    assert_equal((X.shape[0], 3, 1),
                 model.predict_deriv(X, variables=[1, 5, 7]).shape)
    assert_equal((X.shape[0], 0, 1),
                 model.predict_deriv(X, variables=[]).shape)
    res_deriv = model.predict_deriv(X, variables=['x2', 'x7', 'x0', 'x1'])
    assert_equal((X.shape[0], 4, 1), res_deriv.shape)
    res_deriv = model.predict_deriv(X, variables=['x0'])
    assert_equal((X.shape[0], 1, 1), res_deriv.shape)
    assert_equal((X.shape[0], 1, 1),
                 model.predict_deriv(X, variables=[0]).shape)
model = Earth(max_terms=50, max_degree=3)
model.fit(X, y)
# Print the model
# print(model.trace())
print(model.summary())

print("MARS degree 5")
model = Earth(max_terms=20, max_degree=5)
model.fit(X, y)
# Print the model
# print(model.trace())
print(model.summary())

"""
print "====================================="
print "MARS degree 1"
model = Earth(max_terms=70, max_degree=1)
print "Score: {}".format ( crossValidation ( model, X, y ) )

print "MARS degree 3"
model = Earth(max_terms=50, max_degree=3)
crossValidation ( model, X, y )
print "Score: {}".format ( crossValidation ( model, X, y ) )
X = np.array(X)
y = np.sin(X) + np.random.normal(size=X.shape[0]) / 10.0

# Defining different knot counts which will be used as a parameter for the MARS model
knots = [2, 4, 5, 10]
# Helpful in creating the graph axes
axis = [[0, 0], [0, 1], [1, 0], [1, 1]]

# Trying different max_degree parameters for the MARS model
for degree in range(1, 5):
    fig, ax = plt.subplots(2, 2, figsize=(10, 10))
    for num_knot in range(4):
        # Defining the MARS model with max_terms and max_degree parameters
        model = Earth(max_terms=knots[num_knot], max_degree=degree, verbose=0)
        # Fitting the model on the dataset
        model.fit(X, y)
        # Predicting the model output
        y_hat = model.predict(X)
        # Plotting graphs
        ax[axis[num_knot][0], axis[num_knot][1]].title.set_text(
            f"degree = {degree}, knots = {knots[num_knot]}")
        ax[axis[num_knot][0], axis[num_knot][1]].plot(X, y, 'r.')
        ax[axis[num_knot][0], axis[num_knot][1]].plot(X, y_hat, 'b.')
    plt.show()

# Plotting dataset distribution
plt.figure()
def translation_correction(cell_mesh, cell_mesh_2, buffer_cell,
                           x_pos, y_pos, z_pos,
                           x_pos_new, y_pos_new, z_pos_new,
                           closest_no_conflict, directory):
    x_min = np.min([np.min(cell_mesh[:, 0]), np.min(cell_mesh_2[:, 0])]) - buffer_cell
    x_max = np.max([np.max(cell_mesh[:, 0]), np.max(cell_mesh_2[:, 0])]) + buffer_cell
    y_min = np.min([np.min(cell_mesh[:, 1]), np.min(cell_mesh_2[:, 1])]) - buffer_cell
    y_max = np.max([np.max(cell_mesh[:, 1]), np.max(cell_mesh_2[:, 1])]) + buffer_cell
    z_min = np.min([np.min(cell_mesh[:, 2]), np.min(cell_mesh_2[:, 2])]) - buffer_cell
    z_max = np.max([np.max(cell_mesh[:, 2]), np.max(cell_mesh_2[:, 2])]) + buffer_cell

    num_pts = len(x_pos)
    X = []; Y = []; Z = []; U = []; V = []; W = []
    for kk in range(0, num_pts):
        idx = closest_no_conflict[kk]
        if idx < len(closest_no_conflict):
            U.append(x_pos_new[idx] - x_pos[kk])
            V.append(y_pos_new[idx] - y_pos[kk])
            W.append(z_pos_new[idx] - z_pos[kk])
            X.append(x_pos_new[idx]); Y.append(y_pos_new[idx]); Z.append(z_pos_new[idx])

    # --> limit to points that aren't too close to the cell
    X_safe = []; Y_safe = []; Z_safe = []; U_safe = []; V_safe = []; W_safe = []
    num_pts = len(U)
    for kk in range(0, num_pts):
        x_out = X[kk] < x_min or X[kk] > x_max
        y_out = Y[kk] < y_min or Y[kk] > y_max
        z_out = Z[kk] < z_min or Z[kk] > z_max
        if x_out or y_out or z_out:
            X_safe.append(X[kk]); Y_safe.append(Y[kk]); Z_safe.append(Z[kk])
            U_safe.append(U[kk]); V_safe.append(V[kk]); W_safe.append(W[kk])
    X_safe = np.asarray(X_safe); Y_safe = np.asarray(Y_safe); Z_safe = np.asarray(Z_safe)
    U_safe = np.asarray(U_safe); V_safe = np.asarray(V_safe); W_safe = np.asarray(W_safe)

    # --> fit MARS models
    model_U = Earth(max_degree=2, max_terms=10)
    model_U.fit(Z_safe, U_safe)
    model_V = Earth(max_degree=2, max_terms=10)
    model_V.fit(Z_safe, V_safe)
    model_W = Earth(max_degree=2, max_terms=10)
    model_W.fit(Z_safe, W_safe)

    # --> re-define Z
    pred_U = model_U.predict(z_pos_new)
    pred_V = model_V.predict(z_pos_new)
    pred_W = model_W.predict(z_pos_new)

    # --> correct new bead positions
    for kk in range(0, len(x_pos_new)):
        x_pos_new[kk] = x_pos_new[kk] - pred_U[kk]
        y_pos_new[kk] = y_pos_new[kk] - pred_V[kk]
        z_pos_new[kk] = z_pos_new[kk] - pred_W[kk]

    # --> correct new cell position
    pred_cell_0 = model_U.predict(cell_mesh_2[:, 0])
    pred_cell_1 = model_V.predict(cell_mesh_2[:, 1])
    pred_cell_2 = model_W.predict(cell_mesh_2[:, 2])
    cell_mesh_2_new = np.zeros(cell_mesh_2.shape)
    cell_mesh_2_new[:, 0] = cell_mesh_2[:, 0] - pred_cell_0
    cell_mesh_2_new[:, 1] = cell_mesh_2[:, 1] - pred_cell_1
    cell_mesh_2_new[:, 2] = cell_mesh_2[:, 2] - pred_cell_2

    # --> plot MARS models
    Z_line = np.linspace(np.min(Z), np.max(Z), 100)
    pred_line_U = model_U.predict(Z_line)
    pred_line_V = model_V.predict(Z_line)
    pred_line_W = model_W.predict(Z_line)

    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.plot(Z, U, 'b.', label='x raw')
    plt.plot(Z_line, pred_line_U, 'k--', label='fit')
    plt.xlabel('z position'); plt.ylabel('displacement')
    plt.tight_layout(); plt.legend(); plt.title('x displacements')
    plt.subplot(1, 3, 2)
    plt.plot(Z, V, 'r.', label='y raw')
    plt.plot(Z_line, pred_line_V, 'k--', label='fit')
    plt.xlabel('z position'); plt.ylabel('displacement')
    plt.tight_layout(); plt.legend(); plt.title('y displacements')
    plt.subplot(1, 3, 3)
    plt.plot(Z, W, 'g.', label='z raw')
    plt.plot(Z_line, pred_line_W, 'k--', label='fit')
    plt.xlabel('z position'); plt.ylabel('displacement')
    plt.tight_layout(); plt.legend(); plt.title('z displacements')
    plt.savefig(directory + '/translation_correction.png')

    return x_pos_new, y_pos_new, z_pos_new, cell_mesh_2_new
test = pd.read_csv('../data/modeltest.csv', index_col=0)
label = train['Response'].values

featextra = pd.read_csv('../feat/improve.csv', index_col=0)
train = pd.concat([train, featextra.loc[train.index]], axis=1)
test = pd.concat([test, featextra.loc[test.index]], axis=1)

featextra = pd.read_csv('../feat/duplicate.csv', index_col=0)
train = pd.concat([train, featextra.loc[train.index]], axis=1)
test = pd.concat([test, featextra.loc[test.index]], axis=1)

feat = train.columns.drop('Response')

# Build an Earth model with a LogisticRegression pipeline
earth_pipe = Pipeline([('earth', Earth(use_fast=True, allow_missing=True,
                                       penalty=0.5, max_degree=3)),
                       ('log', LogisticRegression())])
earth_pipe.fit(train[feat], label)

# Parameter tuning
#param_grid = {'earth__penalty': np.arange(1,11,2), 'earth__max_degree': range(1,4)}
#
#gs1 = GridSearchCV(earth_pipe, param_grid, n_jobs=1, pre_dispatch=1,
#                   cv=StratifiedKFold(label, n_folds=5, shuffle=True),
#                   scoring='log_loss', verbose=2)
#
#gs1.fit(train[feat], label)
#
#print gs1.best_params_
#print gs1.best_score_
#
##----------------------------------------------------------
def fit_with_subdata(scaledown, offset):
    # retrieve training data and official reco hadronic energy for comparison
    X = root2array('../training_data.root', branches='calehad',
                   selection='mustopz<1275&&isnumucc==1',
                   step=scaledown, start=offset).reshape(-1, 1)
    recoemu_official = root2array('../training_data.root', branches='recoemu',
                                  selection='mustopz<1275&&isnumucc==1',
                                  step=scaledown, start=offset)
    trueenu = root2array('../training_data.root', branches='trueenu',
                         selection='mustopz<1275&&isnumucc==1',
                         step=scaledown, start=offset)
    y = trueenu - recoemu_official
    yoff = root2array('../training_data.root', branches='recoehad',
                      selection='mustopz<1275&&isnumucc==1',
                      step=scaledown, start=offset)

    # train MARS on the regressors
    mars = Earth()
    mars.fit(X, y)

    # save the model
    os.system('mkdir -p models/1d')
    modelpn = 'models/1d/hadronic_1d_energy_estimator_step{}offset{}.pkl'.format(scaledown, offset)
    joblib.dump(mars, modelpn)

    # estimate reco value
    yest = mars.predict(X)
    rest = (yest - y) / y
    roff = (yoff - y) / y

    # save root file
    os.system('mkdir -p output_root_files/1d')
    toutf = TFile('output_root_files/1d/resolution_1d_step{}offset{}.root'.format(scaledown, offset),
                  'recreate')
    tr = TTree('tr', 'resolution tree')
    r1 = array('f', [0.])
    r2 = array('f', [0.])
    marsehad = array('f', [0.])
    offehad = array('f', [0.])
    trueehad = array('f', [0.])
    tr.Branch('rest', r1, 'rest/F')
    tr.Branch('roff', r2, 'roff/F')
    tr.Branch('marsehad', marsehad, 'marsehad/F')
    tr.Branch('offehad', offehad, 'offehad/F')
    tr.Branch('trueehad', trueehad, 'trueehad/F')
    for i in range(len(rest)):
        r1[0] = rest[i]
        r2[0] = roff[i]
        marsehad[0] = yest[i]
        offehad[0] = yoff[i]
        trueehad[0] = y[i]
        tr.Fill()
    tr.Write()
    toutf.Close()

    # print out the statistics
    os.system('mkdir -p performance_figures/1d')
    with open('performance_figures/1d/1d_step{}offset{}.txt'.format(scaledown, offset), 'w') as outf:
        outf.write(str(np.mean(rest)) + '\n')
        outf.write(str(tstd(rest)) + '\n')
        outf.write(str(skew(rest)) + '\n')
        outf.write(str(kurtosis(rest)) + '\n')
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outElasticNet.csv')

regr_2.fit(X_train, y_train)
y_eval = regr_2.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outAdaBoostRegressor.csv')

clf = linear_model.Lars(n_nonzero_coefs=1)
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outLARS.csv')

"""
clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                        n_estimators=300, random_state=rng)
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outAdaBoostRegressor.csv')
"""

from pyearth import Earth
clf = Earth()
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outMARS.csv')
class MARSInterpolant(Surrogate):
    """Compute and evaluate a MARS interpolant

    MARS builds a model of the form

    .. math::
        \\hat{f}(x) = \\sum_{i=1}^{k} c_i B_i(x).

    The model is a weighted sum of basis functions :math:`B_i(x)`.
    Each basis function :math:`B_i(x)` takes one of the following three forms:

    1. a constant 1.
    2. a hinge function of the form :math:`\\max(0, x - const)` or
       :math:`\\max(0, const - x)`. MARS automatically selects variables
       and values of those variables for knots of the hinge functions.
    3. a product of two or more hinge functions. These basis functions can
       model interaction between two or more variables.

    :param dim: Number of dimensions
    :type dim: int
    :param lb: Lower variable bounds
    :type lb: numpy.array
    :param ub: Upper variable bounds
    :type ub: numpy.array
    :param output_transformation: Transformation applied to values before fitting
    :type output_transformation: Callable

    :ivar dim: Number of dimensions
    :ivar lb: Lower variable bounds
    :ivar ub: Upper variable bounds
    :ivar output_transformation: Transformation to apply to function values before fitting
    :ivar num_pts: Number of points in surrogate model
    :ivar X: Point incorporated in surrogate model (num_pts x dim)
    :ivar fX: Function values in surrogate model (num_pts x 1)
    :ivar updated: True if model is up-to-date (no refit needed)
    :ivar model: Earth object
    """

    def __init__(self, dim, lb, ub, output_transformation=None):
        super().__init__(dim=dim, lb=lb, ub=ub,
                         output_transformation=output_transformation)

        try:
            from pyearth import Earth
            self.model = Earth()
        except ImportError as err:
            print("Failed to import pyearth")
            raise err

    def _fit(self):
        """Compute new coefficients if the MARS interpolant is not updated."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # Suppress deprecation warnings
            if self.updated is False:
                fX = self.output_transformation(self.fX.copy())
                self.model.fit(self._X, fX)
                self.updated = True

    def predict(self, xx):
        """Evaluate the MARS interpolant at the points xx

        :param xx: Prediction points, must be of size num_pts x dim or (dim, )
        :type xx: numpy.ndarray

        :return: Prediction of size num_pts x 1
        :rtype: numpy.ndarray
        """
        self._fit()
        xx = to_unit_box(np.atleast_2d(xx), self.lb, self.ub)
        return np.expand_dims(self.model.predict(xx), axis=1)

    def predict_deriv(self, xx):
        """Evaluate the derivative of the MARS interpolant at points xx

        :param xx: Prediction points, must be of size num_pts x dim or (dim, )
        :type xx: numpy.array

        :return: Derivative of the RBF interpolant at xx
        :rtype: numpy.array
        """
        self._fit()
        xx = to_unit_box(np.atleast_2d(xx), self.lb, self.ub)
        dfx = self.model.predict_deriv(xx, variables=None)
        return dfx[0] / (self.ub - self.lb)
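# Illustrative sketch (not part of pySOT): the docstring above describes the
# MARS model form f_hat(x) = sum_i c_i * B_i(x) built from hinge functions.
# This toy example, assuming a single hypothetical knot t and hand-picked
# coefficients c, evaluates such a sum directly with numpy to show the kind
# of function Earth fits internally.
import numpy as np

def hinge_sum_example(x, t=0.5, c=(1.0, 2.0, -3.0)):
    # B_0 = 1 (constant), B_1 = max(0, x - t), B_2 = max(0, t - x)
    basis = [np.ones_like(x), np.maximum(0, x - t), np.maximum(0, t - x)]
    return sum(ci * Bi for ci, Bi in zip(c, basis))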
def fit(self):
    self.classifier = Earth()
    self.classifier.fit(self.x_train, self.y_train)
olsMSE = kFoldValidation(5, ols, array_x_train, array_y_train)
olsMSE
# array([2.7911842 , 2.76834881, 2.84893447, 2.78335565, 2.73966849])

# OLS using only the 14 top correlated variables
sub_x_train = x_train[top[1:]]
array_x_sub = np.array(sub_x_train)

olsMSE = kFoldValidation(5, ols, array_x_sub, array_y_train)
olsMSE
# array([2.82931072, 2.80517518, 2.88589756, 2.82362708, 2.78061219])

# It seems that the model using all variables performs better

# 2. Spline
# Since it is too slow to run k-fold cross validation for the spline,
# just use the validation set to test the performance.
spline = Earth()
sub_cols = list(top[1:])

# Uses highly-correlated variables to build the model
sub_x_train = x_train[sub_cols]
array_sub_x = np.array(sub_x_train)
spline.fit(sub_x_train, y_train)
preds_val = spline.predict(x_val[sub_cols])
splineMSE = np.mean((preds_val - array_y_val.ravel()) ** 2)  # mean squared error
splineMSE
# 2.457458862928802

# 3. Random Forest
# Since it is too slow to run k-fold cross validation for the random forest,
# just use the validation set to test the performance.
# Builds the model with 50 trees
rf = RandomForestRegressor(max_depth=20, random_state=42, n_estimators=50)
xArray = []
yArray = []

# separating X and Y from the dataset
for eachXYPair in dataset:
    x = eachXYPair[0]
    y = eachXYPair[1]
    xArray.append(x)
    yArray.append(y)

# print len(xArray)
xArray = numpy.asarray(xArray, "float32")  # converting to numpy array
# print len(yArray)
yArray = numpy.asarray(yArray, "float32")  # converting to numpy array

# Fit an Earth model
model = Earth(max_degree=1, verbose=True)  # initializing the py-earth model
# fitting the model to the data
model.fit(xArray, yArray)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(xArray)
# print y_hat
plt.figure()
plt.plot(xArray, yArray, 'r.')
plt.plot(xArray, y_hat, 'b.')
plt.show()
import numpy
from pyearth import Earth
from matplotlib import pyplot

# Create some fake data
numpy.random.seed(0)
m = 1000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
y = numpy.abs(X[:, 6] - 4.0) + 1 * numpy.random.normal(size=m)

# Fit an Earth model
model = Earth()
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(X)
pyplot.figure()
pyplot.plot(X[:, 6], y, 'r.')
pyplot.plot(X[:, 6], y_hat, 'b.')
pyplot.xlabel('x_6')
pyplot.ylabel('y')
pyplot.title('Simple Earth Example')
pyplot.show()
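# Optional follow-up sketch: after fitting, the selected terms can also be
# inspected directly through model.basis_ and model.coef_, the attributes the
# test suite above exercises.  This assumes the coefficient columns follow the
# order of the unpruned basis functions, which is the invariant that
# test_exhaustive_search checks (basis_.plen() == coef_.shape[1]).
coef_iter = iter(model.coef_[0])
for bf in model.basis_:
    if not bf.is_pruned():
        print('%s: %g' % (bf, next(coef_iter)))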
def test_linear_fit():
    from statsmodels.regression.linear_model import GLS, OLS

    earth = Earth(**default_params)
    earth.fit(X, y)
    earth.linear_fit(X, y)
    soln = OLS(y, earth.transform(X)).fit().params
    assert_almost_equal(numpy.mean((earth.coef_ - soln) ** 2), 0.0)

    sample_weight = 1.0 / (numpy.random.normal(size=y.shape) ** 2)
    earth.fit(X, y)
    earth.linear_fit(X, y, sample_weight)
    soln = GLS(y, earth.transform(X),
               1.0 / sample_weight).fit().params
    assert_almost_equal(numpy.mean((earth.coef_ - soln) ** 2), 0.0)
# Create some fake data
numpy.random.seed(2)
m = 10000
n = 10
X = numpy.random.uniform(size=(m, n))
y = (10 * numpy.sin(numpy.pi * X[:, 0] * X[:, 1]) +
     20 * (X[:, 2] - 0.5) ** 2 +
     10 * X[:, 3] +
     5 * X[:, 4] + numpy.random.uniform(size=m))

# Fit an Earth model
criteria = ('rss', 'gcv', 'nb_subsets')
model = Earth(max_degree=3,
              max_terms=10,
              minspan_alpha=.5,
              feature_importance_type=criteria,
              verbose=True)
model.fit(X, y)
rf = RandomForestRegressor()
rf.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())
print(model.summary_feature_importances(sort_by='gcv'))

# Plot the feature importances
importances = model.feature_importances_
importances['random_forest'] = rf.feature_importances_
criteria = criteria + ('random_forest',)
idx = 1
def test_export_python_function():
    for smooth in (True, False):
        model = Earth(penalty=1, smooth=smooth, max_degree=2).fit(X, y)
        export_model = export_python_function(model)
        for exp_pred, model_pred in zip(model.predict(X), export_model(X)):
            assert_almost_equal(exp_pred, model_pred)
# train = pd.read_csv('boston_data.csv')
# X = np.array(train.iloc[:, 0:13])
# y = np.array(train.iloc[:, 13])
#
# test = pd.read_csv('boston_test_data.csv')
# X_test = np.array(test.iloc[:, 0:13])
# X_test_id = test.iloc[:, 0]

np.random.seed(0)
m = 1000
n = 10
X = 80 * np.random.uniform(size=(m, n)) - 40
y = np.abs(X[:, 6] - 4.0) + 1 * np.random.normal(size=m)

# Fit an Earth model
model = Earth()
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

X, y = load_boston(return_X_y=True)
model_rsq_dic = {}

# % lower status of the population
lstat_x = np.array([row[12] for row in X]).reshape(-1, 1)
def csc(df, hamming_string_dict, outdir, filename):
    """CRISPR Specificity Correction

    :param df: pandas dataframe with first column as gRNA and second column as logFC/metric
    :param hamming_string_dict: CSC onboard dictionary object with key as gRNA and value as Hamming metrics
    :param outdir: absolute filepath to output directory
    :param filename: name of input file to be used as part of output filename
    :return: CSC adjustment
    """
    # MARS compatible file
    df_mars_lst = []
    df_v = np.asarray(df)
    for i in range(len(df_v)):
        row_lst = []
        grna, metric = df_v[i][0], df_v[i][1]
        try:
            metric = float(metric)
        except ValueError:
            sys.stdout.write(
                'WARNING: encountered %s which is not float compatible, skipping\n' % metric)
            continue
        row_lst.append(grna)
        try:
            for jj in hamming_string_dict[grna]:
                row_lst.append(jj)
            row_lst.append(metric)
            df_mars_lst.append(row_lst)
        except KeyError:
            sys.stdout.write('\n%s not found in selected library: passing\n' % grna)
            continue

    df = pd.DataFrame(df_mars_lst,
                      columns=['gRNA', 'specificity', 'h0', 'h1', 'h2', 'h3',
                               'original_value'])

    # exclude infinite-specificity non-target gRNAs
    df = df[df['h0'] != 0]

    # isolate pertinent confounder variables
    df_confounders = df[['specificity', 'h0', 'h1', 'h2', 'h3']]

    # knots
    knots = df['original_value'].quantile([0.25, 0.5, 0.75, 1])

    # training and testing data
    train_x, test_x, train_y, test_y = train_test_split(df_confounders,
                                                        df['original_value'],
                                                        test_size=0.10,
                                                        random_state=1)

    # Fit an Earth model
    model = Earth(feature_importance_type='gcv')
    try:
        model.fit(train_x, train_y)
    except ValueError:
        sys.stdout.write(
            '\nValue Error encountered. Model unable to be trained. Exiting CSC Novo\n')
        model_processed = 'F'
        sys.stdout.write('training input x data\n %s\ntraining input y data\n %s\n'
                         % (train_x, train_y))
        return model_processed

    # Print the model
    print(model.trace())
    print(model.summary())
    print(model.summary_feature_importances())

    # Plot the model
    y_hat = model.predict(test_x)

    # calculating RMSE values
    rms1 = sqrt(mean_squared_error(test_y, y_hat))
    print('\n\nRMSE on Predictions\n\n')
    print(rms1)

    # calculating R^2 for training
    print('\n\nR^2 on Training Data\n\n')
    print(model.score(train_x, train_y))

    # calculating R^2 for testing
    print('\n\nR^2 on Testing Data\n\n')
    print(model.score(test_x, test_y))

    # write out model metrics
    with open('%s/csc_model_metrics_%s.txt' % (outdir, filename), 'w') as outfile:
        outfile.write('%s\n%s\n%s\nRMSE on Predictions\n%s'
                      % (model.trace(), model.summary(),
                         model.summary_feature_importances(), rms1))

    if rms1 <= 1.0:
        # model processed
        model_processed = 'T'

        # full data prediction
        df['earth_adjustment'] = model.predict(df_confounders)

        # CSC correction
        df['earth_corrected'] = df['original_value'] - df['earth_adjustment']

        # main write out
        df.to_csv('%s/csc_output_%s_earth_patched.csv' % (outdir, filename))

        # pickle write out
        model_file = open('%s/csc_output_%s_earth_model.pl' % (outdir, filename), 'wb')
        pl.dump(model, model_file)
        model_file.close()

        sys.stdout.write('\nCSC adjustment complete\n')
        sys.stdout.write('\nCSC output files written to %s\n' % outdir)
        return model_processed
    else:
        sys.stdout.write(
            '\nCSC adjustment not computed as model residual mean squared error exceeds 1.0\n')
        model_processed = 'F'
        return model_processed
)

# Select target and feature dataset(s) --> [target, feature1, feature2, ...]
datasets = [
    Dataset('runoff', database),
    Dataset('runoff', database).normalized(),
    Dataset('temp', database).normalized(),
    Dataset('precip', database).normalized(),
    Dataset('season', database).normalized()
]

# Select leadtimes for target and features. negative: past / positive: future
leadtimes = [[1, 3], [-4, -1], [-4, -1], [-4, -1], [1, 1]]

# Select model
model_type = Earth(max_degree=10, smooth=True)
#model_type = Lasso(alpha=0.05, normalize=True, max_iter=3000)
#model_type = Regressor(
#    layers=[
#        Layer("Sigmoid", units=5),
#        Layer("Linear", units=1)],
#    learning_rate=0.1,
#    n_iter=1000)

# Set training interval
startyear = DateFormat(1900, 1)
endyear = DateFormat(2005, 36)
training_daterange = DateFormat.decadal_daterange(startyear, endyear)

# Set testing interval
startyear = DateFormat(2006, 1)
import numpy
from pyearth import Earth
from matplotlib import pyplot

# Create some fake data
numpy.random.seed(2)
m = 10000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
y = 100 * \
    numpy.abs(numpy.sin((X[:, 6]) / 10) - 4.0) + \
    10 * numpy.random.normal(size=m)

# Fit an Earth model
model = Earth(max_degree=3, minspan_alpha=.5)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(X)
pyplot.figure()
pyplot.plot(X[:, 6], y, 'r.')
pyplot.plot(X[:, 6], y_hat, 'b.')
pyplot.show()
df = pd.read_csv(dataset, sep='\t')
df = pd.read_table(dataset)

gt_mapping = {'0/0': 0, '0/1': 1, '1/1': 2}
df['GT_GATK'] = df['GT_GATK'].map(gt_mapping)
df['GT_Varscan'] = df['GT_Varscan'].map(gt_mapping)
df['GT_Freebayes'] = df['GT_Freebayes'].map(gt_mapping)

X = df.values[:100, 5:]
X = set_missing_values(X)
#print df.columns[12]
y = np.random.randint(2, size=(int(np.shape(X)[0]), ))
#print X
#print y

earth_classifier = Pipeline([('earth', Earth(allow_missing=True)),
                             ('logistic', LogisticRegression())])
#earth_classifier = Pipeline([('earth', Earth(allow_missing=True)),
#                             ('logistic', RandomForestClassifier())])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                    random_state=0)
ec = earth_classifier.fit(X_train, y_train)
y_hat = earth_classifier.predict(X_test)
A simple example plotting a fit of the sine function.
"""
import numpy
import matplotlib.pyplot as plt

from pyearth import Earth

# Create some fake data
numpy.random.seed(2)
m = 10000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
y = 100 * \
    (numpy.sin((X[:, 6])) - 4.0) + \
    10 * numpy.random.normal(size=m)

# Fit an Earth model
model = Earth(max_degree=3, minspan_alpha=.5, verbose=True)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(X)
plt.plot(X[:, 6], y, 'r.')
plt.plot(X[:, 6], y_hat, 'b.')
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyearth import Earth

## Load Data
df = pd.read_csv('hw2_data_2.txt', sep='\t')
X_train, y_train = np.array(df.iloc[:700, :-1]), np.array(df.iloc[:700, -1])
X_test, y_test = np.array(df.iloc[700:, :-1]), np.array(df.iloc[700:, -1])

## Using the py-earth package; please install it following the instructions in the README file
clf = Earth()
clf.fit(X_train, y_train)

## Predict the values and calculate the testing error rate
pred_vals = clf.predict(X_test)

## Dichotomize the predicted outcome at the median
median = np.median(pred_vals)
res = np.where(pred_vals >= median, 1, -1)
# print(res)
error_rate = 1 - sum(res == y_test) / y_test.shape[0]
print("The testing error rate for MARS classifier is: %.4f" % error_rate)
import numpy
import matplotlib.pyplot as plt

from pyearth import Earth

# Create some fake data
numpy.random.seed(2)
m = 10000
n = 10
X = 20 * numpy.random.uniform(size=(m, n)) - 10
y = 10 * numpy.sin(X[:, 6]) + 0.25 * numpy.random.normal(size=m)

# Compute the known true derivative with respect to the predictive variable
y_prime = 10 * numpy.cos(X[:, 6])

# Fit an Earth model
model = Earth(max_degree=2, minspan_alpha=.5, smooth=True)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Get the predicted values and derivatives
y_hat = model.predict(X)
y_prime_hat = model.predict_deriv(X, 'x6')

# Plot true and predicted function values and derivatives
# for the predictive variable
plt.subplot(211)
plt.plot(X[:, 6], y, 'r.')
plt.plot(X[:, 6], y_hat, 'b.')
st = 'CPY012'
target, start_p, stop_p, host_path = station_sel(st, mode)
if mode == 'hour':
    n_past, n_future = 24 * 7, 72
elif mode == 'day':
    n_past, n_future = 60, 30

data = df[start_p:stop_p]
data['Day'] = data.index.dayofyear  # add day
# interpolate from neighbors first; any remaining NA is filled with the mean()
data = data.interpolate(limit=300000000, limit_direction='both').astype('float32')

conclude_df = pd.DataFrame()
for n_out in range(1, n_future + 1):
    X, y, xlabels = to_supervise(data, target, n_out)
    criteria = ('rss', 'gcv', 'nb_subsets')
    model = Earth(enable_pruning=True,
                  # max_degree=3,
                  # max_terms=20,
                  minspan_alpha=.5,
                  feature_importance_type=criteria,
                  verbose=True)
    model.fit(X, y, xlabels=xlabels)
    nbsub = model.summary_feature_importances(sort_by='nb_subsets')[:2000].split()[3:83]
    gcv = model.summary_feature_importances(sort_by='gcv')[:2000].split()[3:83]
    rss = model.summary_feature_importances(sort_by='rss')[:2000].split()[3:83]
    rss, gcv, nbsub = toDF(rss), toDF(gcv), toDF(nbsub)
    top20 = pd.concat([rss, gcv, nbsub], ignore_index=True).drop_duplicates('feature')
    top20['timestep'] = n_out  # ADDED
    # combine all results
    conclude_df = pd.concat([conclude_df, top20], ignore_index=True)

if mode == 'day':
'''
Created on Feb 15, 2016

@author: jason
'''
from .sklearntools import MultipleResponseEstimator, BackwardEliminationEstimatorCV, \
    QuantileRegressor, ResponseTransformingEstimator
from pyearth import Earth
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV

outcomes = ['admission_rate', 'prescription_cost_rate', '']

[('earth', Earth(max_degree=2)), ('elim', BackwardEliminationEstimatorCV())]
normalized_X = preprocessing.normalize(X)
# standardize the data attributes
standardized_X = preprocessing.scale(X)

# feature selection
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)
# display the relative importance of each attribute
print(model.feature_importances_)

# MARS (Earth) implementation
from matplotlib import pyplot
# create model
from pyearth import Earth
model = Earth()
# fit the earth model
model.fit(X, y)
print(" Model:")
print(model)

# make predictions
expected = y
predicted = model.predict(X)
# since the quality can only be a whole number, round all the outputs off
for i in range(len(predicted)):
    predicted[i] = int(round(predicted[i]))

# check how far the predictions are from actual values
test = context.catalog.load('test_maskedv2')
variable_descriptions = context.catalog.load('variable_descriptions_v2')
sample_submission = context.catalog.load('samplesubmissionv2')

#%%
train.target_pct_vunerable.hvplot.kde()

# %%
transformer = Pipeline([('poly', PolynomialFeatures()),
                        ('scale', StandardScaler()),
                        ('pca', PCA(15)),
                        ('rescale', StandardScaler())])

glm = TweedieGLM(power=0, max_iter=1000)
mars = Earth()

model = Pipeline([('transformer', transformer), ('model', mars)])

offset = 1e-9

def add(y):
    return (y / 100 + offset)

def subtract(y):
    return ((y) - offset) * 100

link = Pipeline([('function', FunctionTransformer(add, subtract, validate=True))])

scorer = get_scorer('neg_root_mean_squared_error')

pipeline = TransformedTargetRegressor(regressor=model, transformer=link)
def test_score():
    earth = Earth(**default_params)
    model = earth.fit(X, y)
    record = model.pruning_trace()
    rsq = record.rsq(record.get_selected())
    assert_almost_equal(rsq, model.score(X, y))
total_data = pd.concat([
    total_category_data,
    total_numeric_data.clip(total_numeric_data.quantile(0.01).to_dict(),
                            total_numeric_data.quantile(0.99).to_dict(),
                            axis=1)
], axis=1)
print(total_data.shape)

total_data = total_data.fillna(total_data.mean())
print(total_data.head(5))

train_data = total_data[total_data.index < 1460]
test_data = total_data[total_data.index >= 1460]

rfe = RFE(Earth(), step=15, verbose=2).fit(train_data, train_Y)
validKeys = list(train_data.columns[rfe.support_])
train_data = train_data[validKeys]
test_data = test_data[validKeys]

model = Earth().fit(train_data, train_Y)
predict = model.predict(test_data)
predict = np.exp(predict)

submission = pd.DataFrame()
submission['Id'] = test_index
submission['SalePrice'] = predict
submission.to_csv("C:\\Users\\hongj\\Desktop\\kaggle\\house_price\\submission.csv",
                  index=False)