def test_fit():
    """Fit Earth with the default parameters and compare its report to the stored one.

    The report is the forward/pruning trace followed by the model summary; it
    must match the contents of ``earth_regress.txt`` next to this file exactly.
    """
    model = Earth(**default_params)
    model.fit(X, y)
    actual = '\n'.join([str(model.trace()), model.summary()])

    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_regress.txt')
    with open(reference_path, 'r') as handle:
        expected = handle.read()

    assert_equal(actual, expected)
def test_smooth():
    """Fit a smoothed Earth model and compare its report to the stored one.

    The model is built with ``penalty=1, smooth=True``; the trace plus summary
    must match ``earth_regress_smooth.txt`` next to this file exactly.
    """
    smoothed = Earth(penalty=1, smooth=True)
    smoothed.fit(X, y)
    actual = '\n'.join([str(smoothed.trace()), smoothed.summary()])

    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_regress_smooth.txt')
    with open(reference_path, 'r') as handle:
        expected = handle.read()

    assert_equal(actual, expected)
def test_linvars():
    """Fit Earth with variables 0-9 passed as ``linvars`` and check the report.

    The trace plus summary must match ``earth_linvars_regress.txt`` next to
    this file exactly.
    """
    model = Earth(**default_params)
    model.fit(X, y, linvars=list(range(10)))
    actual = '\n'.join([str(model.trace()), model.summary()])

    reference_path = os.path.join(os.path.dirname(__file__),
                                  'earth_linvars_regress.txt')
    # To regenerate the reference file, temporarily open `reference_path` in
    # 'w' mode and write `actual` to it.
    with open(reference_path, 'r') as handle:
        expected = handle.read()

    assert_equal(actual, expected)
def mars(p, xLabels, yLabel):
    """Fit an Earth (MARS) model on a shuffled 80/20 split and plot diagnostics.

    The rows of ``p`` are shuffled, the last ~20% are held out as a test set,
    and an Earth model is fitted on the rest.  Residuals are plotted through
    the module-level ``plotResiduals`` helper, the model trace and summary are
    printed, and one bar chart of feature importances per criterion
    ('rss', 'gcv', 'nb_subsets') is saved and shown.

    Relies on these module-level names: ``image_num`` (incremented once per
    saved chart), ``image_path`` (format string taking the image number),
    ``featureToLabel`` (maps a column name to its display label) and
    ``plotResiduals``.

    :param p: DataFrame holding the feature columns and the target column
    :param xLabels: list of feature column names
    :param yLabel: name of the target column
    :return: tuple ``(r2, mse)`` computed on the held-out rows
    """
    global image_num
    criteria = ('rss', 'gcv', 'nb_subsets')

    # Randomly shuffle rows
    p = p.sample(frac=1).reset_index(drop=True)

    # Split train and test.  A positive split index is used instead of a
    # negative slice so that a test share rounding to zero cannot turn
    # `[:-0]` into an empty training set.
    n_rows = p.shape[0]
    n_test = int(round(n_rows * 0.2))
    split_at = n_rows - n_test
    features = p[xLabels].values.reshape(-1, len(xLabels))
    target = p[yLabel].values.reshape(-1, 1)
    X_train, X_test = features[:split_at], features[split_at:]
    y_train, y_test = target[:split_at], target[split_at:]

    # Fit MARS model
    model = Earth(feature_importance_type=criteria)
    model.fit(X_train, y_train)

    # Make predictions
    predicted = model.predict(X_test)
    r2 = r2_score(y_test, predicted)
    mse = mean_squared_error(y_test, predicted)
    predicted = predicted.reshape(-1, 1)

    # Plot residuals
    plotResiduals(y_test, predicted)

    # Print summary
    print(model.trace())
    print(model.summary())

    # Plot feature importances, one chart per criterion
    importances = model.feature_importances_
    positions = list(range(len(xLabels)))
    for crit in criteria:
        # Highest importance first
        ranked = sorted(zip(importances[crit], xLabels), reverse=True)
        heights = [abs(score) for score, _ in ranked]
        names = [featureToLabel[column] for _, column in ranked]

        plt.clf()
        plt.xticks(positions, names, rotation='vertical')
        plt.bar(positions, heights, align='center', alpha=0.5)
        plt.xlabel('Features')
        plt.ylabel("Importance (" + crit + ")")
        plt.tight_layout()
        # Save BEFORE showing: once show() returns the figure may already
        # have been closed, and savefig() would then write an empty image.
        plt.savefig(image_path.format(image_num), bbox_inches='tight')
        plt.show()
        image_num += 1

    return r2, mse
# NOTE: `x` is a second name for `train`, not a copy, so the deletions below
# remove these columns from `train` as well.
x = train
for dropped_column in ('<HIGH>', '<LOW>', '<CLOSE>', '<VOL>'):
    del x[dropped_column]

numeric_feats = x.dtypes[x.dtypes != "object"].index
# NOTE(review): the skew values are computed and then discarded -- only the
# index is kept, so every numeric column is log-transformed.  Confirm whether
# a skewness threshold was intended here.
skewed_feats = train[numeric_feats].apply(lambda g: skew(g.dropna()))
skewed_feats = skewed_feats.index
x[skewed_feats] = np.log1p(x[skewed_feats])

# Fit MARS
mars = Earth(allow_missing=True)
mars.fit(x, y)
print(mars.trace())
print(mars.summary())


def inverse(x):
    """Undo the ``np.log1p`` transform, i.e. return ``exp(x) - 1``.

    ``np.expm1`` is used rather than ``np.exp(x) - 1`` because it stays
    accurate for values of ``x`` close to zero.
    """
    return np.expm1(x)


def graph(x, y, y2, a, b, Title):
    """Plot actual against predicted values over the slice ``[a:b]``.

    :param x: values for the horizontal axis
    :param y: actual values (drawn in red, labelled 'Actual')
    :param y2: predicted values (drawn in blue, labelled 'Predicted')
    :param a: start index of the slice to plot
    :param b: end index (exclusive) of the slice to plot
    :param Title: title of the figure
    """
    plt.figure()
    plt.plot(x[a:b], y[a:b], 'r', label='Actual')
    plt.plot(x[a:b], y2[a:b], 'b', label='Predicted')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title(Title)
    plt.legend(loc='upper left')
    plt.show()
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from pyearth import Earth
from matplotlib import pyplot

# `sheet_name` is the current keyword; the old `sheetname` spelling was
# removed from pandas.
df = pd.read_excel('relay-foods.xlsx', sheet_name='Purchase Data - Full Study')
df['OrderId'] = df['OrderId'].astype('category')
df['CommonId'] = df['CommonId'].astype('category')

# The date columns are not used as features.
col_names = ['OrderDate', 'PickupDate']
df = df.drop(col_names, axis=1)

y = df['TotalCharges']
df_2 = df[['OrderId', 'UserId', 'PupId']]

# One {column: value} dict per row, then vectorise the dicts.
# `Series.items()` replaces `iteritems()`, which pandas 2.0 removed.
X = [dict(r.items()) for _, r in df_2.iterrows()]
train_fea = DictVectorizer().fit_transform(X)

# Fit an Earth model
model = Earth()
model.fit(train_fea, y)

# Print the model
print(model.trace())
print(model.summary())

# Predict on the same vectorised features the model was fitted on; the raw
# list of dicts in `X` is not a valid input to the model.
y_hat = model.predict(train_fea)
#=========================================================================
# V-Function Example
#=========================================================================
# Synthetic data: 1000 samples of 10 features drawn uniformly from
# [-40, 40); the response is a V shape in feature 6 (kink at 4.0) plus
# standard normal noise.
numpy.random.seed(0)
m = 1000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
noise = numpy.random.normal(size=m)
x6 = X[:, 6]
y = numpy.abs(x6 - 4.0) + noise

# Fit an Earth model
model = Earth()
model.fit(X, y)

# Report the fitting trace and the final model
print(model.trace())
print(model.summary())

# Scatter the data (red) and the model predictions (blue) against x_6
y_hat = model.predict(X)
pyplot.figure()
pyplot.plot(x6, y, 'r.')
pyplot.plot(x6, y_hat, 'b.')
pyplot.xlabel('x_6')
pyplot.ylabel('y')
pyplot.title('Simple Earth Example')
pyplot.savefig('simple_earth_example.png')

#=========================================================================
# Hinge plot
#=========================================================================
import numpy
from pyearth import Earth
from matplotlib import pyplot

# Create some fake data: a V-shaped response in feature 6 plus normal noise
numpy.random.seed(2)
m = 1000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
y = numpy.abs(X[:, 6] - 4.0) + 1 * numpy.random.normal(size=m)

# Fit an Earth model (additive only: no interaction terms)
model = Earth(max_degree=1)
model.fit(X, y)

# Print the model.  print() is called as a function so the script runs on
# Python 3 as well as Python 2.
print(model.trace())
print(model.summary())

# Plot the model: data in red, predictions in blue, against feature 6
y_hat = model.predict(X)
pyplot.figure()
pyplot.plot(X[:, 6], y, 'r.')
pyplot.plot(X[:, 6], y_hat, 'b.')
pyplot.show()
def csc(df, hamming_string_dict, outdir, filename):
    """CRISPR Specificity Correction

    Builds a table of per-gRNA Hamming metrics, fits an Earth model that
    predicts the original metric from those confounders, and, when the model
    is accurate enough on a 10% hold-out, writes the corrected values, the
    model metrics and the pickled model to ``outdir``.

    :param df: pandas dataframe with first column as gRNA and second column as logFC/metric
    :param hamming_string_dict: CSC onboard dictionary object with key as gRNA and value as Hamming metrics
    :param outdir: absolute filepath to output directory
    :param filename: name of input file to be used as part of output filename
    :return: 'T' when the model was trained with hold-out RMSE <= 1.0 and the
        corrected output was written; 'F' when training raised ValueError or
        the RMSE exceeded 1.0

    """
    # MARS compatible file
    df_mars_lst = []
    for row in np.asarray(df):
        grna, metric = row[0], row[1]
        try:
            metric = float(metric)
        except (TypeError, ValueError):
            # TypeError covers values such as None, which float() rejects
            # with a different exception than unparsable strings.
            sys.stdout.write(
                'WARNING: encountered %s which is not float compatible, skipping\n'
                % metric)
            continue
        try:
            hamming_metrics = hamming_string_dict[grna]
        except KeyError:
            sys.stdout.write('\n%s not found in selected library: passing\n' %
                             grna)
            continue
        df_mars_lst.append([grna] + list(hamming_metrics) + [metric])

    df = pd.DataFrame(df_mars_lst,
                      columns=[
                          'gRNA', 'specificity', 'h0', 'h1', 'h2', 'h3',
                          'original_value'
                      ])

    # exclude infinite specificity non-target gRNAs; copy so the columns added
    # further down are written to an independent frame, not a filtered view
    df = df[df['h0'] != 0].copy()

    # isolate pertinent confounder variables
    df_confounders = df[['specificity', 'h0', 'h1', 'h2', 'h3']]

    # training and testing data
    train_x, test_x, train_y, test_y = train_test_split(df_confounders,
                                                        df['original_value'],
                                                        test_size=0.10,
                                                        random_state=1)

    # Fit an Earth model
    model = Earth(feature_importance_type='gcv')
    try:
        model.fit(train_x, train_y)
    except ValueError:
        sys.stdout.write(
            '\nValue Error encountered. Model unable to be trained. Exiting CSC Novo\n'
        )
        sys.stdout.write(
            'training input x data\n %s\ntraining input y data\n %s\n' %
            (train_x, train_y))
        return 'F'

    # Print the model
    print(model.trace())
    print(model.summary())
    print(model.summary_feature_importances())

    # Predict the held-out data
    y_hat = model.predict(test_x)

    # calculating RMSE values
    rms1 = sqrt(mean_squared_error(test_y, y_hat))
    print('\n\nRMSE on Predictions\n\n')
    print(rms1)

    # calculating R^2 for training
    print('\n\nR^2 on Training Data\n\n')
    print(model.score(train_x, train_y))

    # calculating R^2 for testing
    print('\n\nR^2 on Testing Data\n\n')
    print(model.score(test_x, test_y))

    # write out model metrics
    with open('%s/csc_model_metrics_%s.txt' % (outdir, filename),
              'w') as outfile:
        outfile.write('%s\n%s\n%s\nRMSE on Predictions\n%s' %
                      (model.trace(), model.summary(),
                       model.summary_feature_importances(), rms1))

    if rms1 > 1.0:
        sys.stdout.write(
            '\nCSC adjustment not computed as model residual mean squared error exceeds 1.0\n'
        )
        return 'F'

    # full data prediction
    df['earth_adjustment'] = model.predict(df_confounders)

    # CSC correction
    df['earth_corrected'] = df['original_value'] - df['earth_adjustment']

    # main write out
    df.to_csv('%s/csc_output_%s_earth_patched.csv' % (outdir, filename))

    # pickle write out; the context manager closes the file even if the dump
    # raises
    with open('%s/csc_output_%s_earth_model.pl' % (outdir, filename),
              'wb') as model_file:
        pl.dump(model, model_file)

    sys.stdout.write('\nCSC adjustment complete\n')
    sys.stdout.write('\nCSC output files written to %s\n' % outdir)
    return 'T'
# test = pd.read_csv('boston_test_data.csv')
# X_test = np.array(test.iloc[:, 0:13])
# X_test_id = test.iloc[:, 0]

# Synthetic sanity check: V-shaped response in feature 6 plus normal noise
np.random.seed(0)
m = 1000
n = 10
X = 80 * np.random.uniform(size=(m, n)) - 40
y = np.abs(X[:, 6] - 4.0) + 1 * np.random.normal(size=m)

# Fit an Earth model
model = Earth()
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# From here on X and y are rebound to the Boston housing data.
X, y = load_boston(return_X_y=True)
model_rsq_dic = {}

# % lower status of the population (column 12), kept as an (n_samples, 1)
# column.  Indexing with a list yields a fresh 2-D array directly, instead of
# appending to a list inside a comprehension and reshaping afterwards.
lstat_x = X[:, [12]]
# lstat_x = X
print(lstat_x.shape)
y = y.reshape(-1, 1)
print(y.shape)
class TestEarth(object):
    """Regression and compatibility tests for the ``Earth`` estimator.

    ``__init__`` seeds numpy's RNG and builds a random data set whose response
    is a linear combination of a small hand-built basis plus normal noise;
    the individual tests fit ``Earth`` models to that data.
    """

    def __init__(self):
        numpy.random.seed(0)

        # Hand-built basis: constant, a hinge pair on one variable, and a
        # linear term whose parent is the first hinge.
        self.basis = Basis(10)
        constant = ConstantBasisFunction()
        self.basis.append(constant)
        bf1 = HingeBasisFunction(constant, 0.1, 10, 1, False, 'x1')
        bf2 = HingeBasisFunction(constant, 0.1, 10, 1, True, 'x1')
        bf3 = LinearBasisFunction(bf1, 2, 'x2')
        self.basis.append(bf1)
        self.basis.append(bf2)
        self.basis.append(bf3)

        # Random inputs, their basis expansion, and a noisy linear response.
        self.X = numpy.random.normal(size=(100, 10))
        self.B = numpy.empty(shape=(100, 4), dtype=numpy.float64)
        self.basis.transform(self.X, self.B)
        self.beta = numpy.random.normal(size=4)
        self.y = numpy.empty(shape=100, dtype=numpy.float64)
        self.y[:] = numpy.dot(
            self.B, self.beta) + numpy.random.normal(size=100)
        self.earth = Earth(penalty=1)

    def test_get_params(self):
        """get_params reports None for unset parameters and echoes set ones."""
        assert_equal(
            Earth().get_params(), {'penalty': None, 'min_search_points': None,
                                   'endspan_alpha': None, 'check_every': None,
                                   'max_terms': None, 'max_degree': None,
                                   'minspan_alpha': None, 'thresh': None,
                                   'minspan': None, 'endspan': None,
                                   'allow_linear': None, 'smooth': None})
        assert_equal(
            Earth(
                max_degree=3).get_params(), {'penalty': None,
                                             'min_search_points': None,
                                             'endspan_alpha': None,
                                             'check_every': None,
                                             'max_terms': None,
                                             'max_degree': 3,
                                             'minspan_alpha': None,
                                             'thresh': None, 'minspan': None,
                                             'endspan': None,
                                             'allow_linear': None,
                                             'smooth': None})

    @if_statsmodels
    def test_linear_fit(self):
        """The private linear fit agrees with statsmodels OLS and GLS."""
        from statsmodels.regression.linear_model import GLS, OLS

        # Unweighted: compare against ordinary least squares
        self.earth.fit(self.X, self.y)
        self.earth._Earth__linear_fit(self.X, self.y)
        soln = OLS(self.y, self.earth.transform(self.X)).fit().params
        assert_almost_equal(numpy.mean((self.earth.coef_ - soln) ** 2), 0.0)

        # Weighted: compare against generalised least squares
        sample_weight = 1.0 / (numpy.random.normal(size=self.y.shape) ** 2)
        self.earth.fit(self.X, self.y)
        self.earth._Earth__linear_fit(self.X, self.y, sample_weight)
        soln = GLS(self.y, self.earth.transform(
            self.X), 1.0 / sample_weight).fit().params
        assert_almost_equal(numpy.mean((self.earth.coef_ - soln) ** 2), 0.0)

    def test_sample_weight(self):
        """Sample weights steer the fit and are honoured by score()."""
        group = numpy.random.binomial(1, .5, size=1000) == 1
        sample_weight = 1 / (group * 100 + 1.0)
        x = numpy.random.uniform(-10, 10, size=1000)
        y = numpy.abs(x)
        y[group] = numpy.abs(x[group] - 5)
        y += numpy.random.normal(0, 1, size=1000)
        model = Earth().fit(x, y, sample_weight=sample_weight)

        # Check that the model fits better for the more heavily weighted group
        assert_true(model.score(x[group], y[group]) < model.score(
            x[numpy.logical_not(group)], y[numpy.logical_not(group)]))

        # Make sure that the score function gives the same answer as the trace
        pruning_trace = model.pruning_trace()
        rsq_trace = pruning_trace.rsq(pruning_trace.get_selected())
        assert_almost_equal(model.score(x, y, sample_weight=sample_weight),
                            rsq_trace)

        # Uncomment below to see what this test situation looks like
        # from matplotlib import pyplot
        # print(model.summary())
        # print(model.score(x, y, sample_weight=sample_weight))
        # pyplot.figure()
        # pyplot.plot(x, y, 'b.')
        # pyplot.plot(x, model.predict(x), 'r.')
        # pyplot.show()

    def test_fit(self):
        """The trace and summary of a plain fit match earth_regress.txt."""
        self.earth.fit(self.X, self.y)
        res = str(self.earth.trace()) + '\n' + self.earth.summary()
        filename = os.path.join(os.path.dirname(__file__),
                                'earth_regress.txt')
        with open(filename, 'r') as fl:
            prev = fl.read()
        assert_equal(res, prev)

    def test_smooth(self):
        """The report of a smoothed fit matches earth_regress_smooth.txt."""
        model = Earth(penalty=1, smooth=True)
        model.fit(self.X, self.y)
        res = str(model.trace()) + '\n' + model.summary()
        filename = os.path.join(os.path.dirname(__file__),
                                'earth_regress_smooth.txt')
        with open(filename, 'r') as fl:
            prev = fl.read()
        assert_equal(res, prev)

    def test_linvars(self):
        """The report of a fit with linvars 0-9 matches the stored file."""
        self.earth.fit(self.X, self.y, linvars=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        res = str(self.earth.trace()) + '\n' + self.earth.summary()
        filename = os.path.join(os.path.dirname(__file__),
                                'earth_linvars_regress.txt')
        with open(filename, 'r') as fl:
            prev = fl.read()
        assert_equal(res, prev)

    def test_score(self):
        """score() equals the R^2 of the selected model in the pruning trace."""
        model = self.earth.fit(self.X, self.y)
        record = model.pruning_trace()
        rsq = record.rsq(record.get_selected())
        assert_almost_equal(rsq, model.score(self.X, self.y))

    @if_pandas
    @if_environ_has('test_pathological_cases')
    def test_pathological_cases(self):
        """Fit the stored pathological data sets and compare the summaries."""
        import pandas
        directory = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'pathological_data')
        cases = {'issue_44': {},
                 'issue_50': {'penalty': 0.5,
                              'minspan': 1,
                              'allow_linear': False,
                              'endspan': 1,
                              'check_every': 1,
                              'sample_weight': 'issue_50_weight.csv'}}
        # .items() works on both Python 2 and 3 (iteritems() is Python 2 only)
        for case, settings in cases.items():
            data = pandas.read_csv(os.path.join(directory, case + '.csv'))
            y = data['y']
            del data['y']
            X = data

            # 'sample_weight' names a CSV file rather than an Earth
            # parameter, so it is taken out of the settings before they are
            # passed to the constructor.
            weight_file = settings.pop('sample_weight', None)
            if weight_file is not None:
                filename = os.path.join(directory, weight_file)
                sample_weight = pandas.read_csv(filename)['sample_weight']
            else:
                sample_weight = None

            model = Earth(**settings)
            model.fit(X, y, sample_weight=sample_weight)
            with open(os.path.join(directory, case + '.txt'), 'r') as infile:
                correct = infile.read()
            assert_equal(model.summary(), correct)

    @if_pandas
    def test_pandas_compatibility(self):
        """DataFrame column names are carried through as the model's xlabels."""
        import pandas
        X = pandas.DataFrame(self.X)
        y = pandas.DataFrame(self.y)
        colnames = ['xx' + str(i) for i in range(X.shape[1])]
        X.columns = colnames
        model = self.earth.fit(X, y)
        assert_list_equal(
            colnames, model.forward_trace()._getstate()['xlabels'])

    @if_patsy
    @if_pandas
    def test_patsy_compatibility(self):
        """patsy design-matrix column names are carried through as xlabels."""
        import pandas
        import patsy
        X = pandas.DataFrame(self.X)
        y = pandas.DataFrame(self.y)
        colnames = ['xx' + str(i) for i in range(X.shape[1])]
        X.columns = colnames
        X['y'] = y
        y, X = patsy.dmatrices(
            'y ~ xx0 + xx1 + xx2 + xx3 + xx4 + xx5 + xx6 + xx7 + xx8 + xx9 - 1',
            data=X)
        model = self.earth.fit(X, y)
        assert_list_equal(
            colnames, model.forward_trace()._getstate()['xlabels'])

    def test_pickle_compatibility(self):
        """A pickled-and-restored model equals the original and predicts alike."""
        model = self.earth.fit(self.X, self.y)
        model_copy = pickle.loads(pickle.dumps(model))
        assert_true(model_copy == model)
        assert_true(
            numpy.all(model.predict(self.X) == model_copy.predict(self.X)))
        # The root of basis function 1 must still be the very same object as
        # basis function 0, before and after the round trip.
        assert_true(model.basis_[0] is model.basis_[1]._get_root())
        assert_true(model_copy.basis_[0] is model_copy.basis_[1]._get_root())

    def test_copy_compatibility(self):
        """A copy.copy() of the model equals the original and predicts alike."""
        model = self.earth.fit(self.X, self.y)
        model_copy = copy.copy(model)
        assert_true(model_copy == model)
        assert_true(
            numpy.all(model.predict(self.X) == model_copy.predict(self.X)))
        # Same identity check on the basis root as in the pickle test.
        assert_true(model.basis_[0] is model.basis_[1]._get_root())
        assert_true(model_copy.basis_[0] is model_copy.basis_[1]._get_root())