Example #1
def test_export_python_string():
    for smooth in (True, False):
        model = Earth(penalty=1, smooth=smooth, max_degree=2).fit(X, y)
        export_model = export_python_string(model, 'my_test_model')
        six.exec_(export_model, globals())
        for exp_pred, model_pred in zip(model.predict(X), my_test_model(X)):
            assert_almost_equal(exp_pred, model_pred)
Example #2
def test_pathological_cases():
    import pandas
    directory = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'pathological_data')
    cases = {'issue_44': {},
             'issue_50': {'penalty': 0.5,
                          'minspan': 1,
                          'allow_linear': False,
                          'endspan': 1,
                          'check_every': 1,
                          'sample_weight': 'issue_50_weight.csv'}}
    for case, settings in cases.items():
        data = pandas.read_csv(os.path.join(directory, case + '.csv'))
        y = data['y']
        del data['y']
        X = data
        if 'sample_weight' in settings:
            filename = os.path.join(directory, settings['sample_weight'])
            sample_weight = pandas.read_csv(filename)['sample_weight']
            del settings['sample_weight']
        else:
            sample_weight = None
        model = Earth(**settings)
        model.fit(X, y, sample_weight=sample_weight)
        with open(os.path.join(directory, case + '.txt'), 'r') as infile:
            correct = infile.read()
        assert_equal(model.summary(), correct)
Example #3
def test_copy_compatibility():
    model = Earth(**default_params).fit(X, y)
    model_copy = copy.copy(model)
    assert_true(model_copy == model)
    assert_true(
        numpy.all(model.predict(X) == model_copy.predict(X)))
    assert_true(model.basis_[0] is model.basis_[1]._get_root())
    assert_true(model_copy.basis_[0] is model_copy.basis_[1]._get_root())
Example #4
def test_smooth():
    model = Earth(penalty=1, smooth=True)
    model.fit(X, y)
    res = str(model.trace()) + '\n' + model.summary()
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_regress_smooth.txt')
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_equal(res, prev)
Example #5
def test_pickle_compatibility():
    earth = Earth(**default_params)
    model = earth.fit(X, y)
    model_copy = pickle.loads(pickle.dumps(model))
    assert_true(model_copy == model)
    assert_true(
        numpy.all(model.predict(X) == model_copy.predict(X)))
    assert_true(model.basis_[0] is model.basis_[1]._get_root())
    assert_true(model_copy.basis_[0] is model_copy.basis_[1]._get_root())
Example #6
def test_fit():
    earth = Earth(**default_params)
    earth.fit(X, y)
    res = str(earth.trace()) + '\n' + earth.summary()
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_regress.txt')
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_equal(res, prev)
Example #7
def run_pyearth(X, y, **kwargs):
    '''Run with pyearth.  Return prediction value, training time, and number of forward pass iterations.'''
    model = Earth(**kwargs)
    t0 = time.time()
    model.fit(X, y)
    t1 = time.time()
    y_pred = model.predict(X)
    forward_iterations = len(model.forward_trace()) - 1
    return y_pred, t1 - t0, forward_iterations
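
For context, a minimal sketch of calling run_pyearth on synthetic data might look like the following; the data shapes and the max_degree keyword are illustrative assumptions, not part of the original snippet, and numpy, time, and Earth are assumed to be imported as above.

import numpy

# Hypothetical usage of run_pyearth from the example above
numpy.random.seed(0)
X = numpy.random.uniform(size=(500, 5))
y = numpy.abs(X[:, 2] - 0.5) + 0.1 * numpy.random.normal(size=500)
y_pred, seconds, forward_iterations = run_pyearth(X, y, max_degree=2)
print('fit took %.3fs with %d forward-pass iterations' % (seconds, forward_iterations))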
Example #8
def test_exhaustive_search():
    model = Earth(max_terms=13,
                  enable_pruning=False,
                  check_every=1,
                  thresh=0,
                  minspan=1,
                  endspan=1)
    model.fit(X, y)
    assert_equal(model.basis_.plen(), model.coef_.shape[1])
    assert_equal(model.transform(X).shape[1], len(model.basis_))
Example #9
def test_nb_terms():

    for max_terms in (1, 3, 12, 13):
        model = Earth(max_terms=max_terms)
        model.fit(X, y)
        assert_true(len(model.basis_) <= max_terms)
        assert_true(len(model.coef_) <= len(model.basis_))
        assert_true(len(model.coef_) >= 1)
        if max_terms == 1:
            assert_list_almost_equal_value(model.predict(X), y.mean())
Example #10
def test_feature_importance():
    criteria = ('rss', 'gcv', 'nb_subsets')
    for imp in criteria:
        earth = Earth(feature_importance_type=imp, **default_params)
        earth.fit(X, y)
        assert len(earth.feature_importances_) == X.shape[1]
    earth = Earth(feature_importance_type=criteria, **default_params)
    earth.fit(X, y)
    assert type(earth.feature_importances_) == dict
    assert set(earth.feature_importances_.keys()) == set(criteria)
    for crit, val in earth.feature_importances_.items():
        assert len(val) == X.shape[1]

    assert_raises(
            ValueError,
            Earth(feature_importance_type='bad_name', **default_params).fit,
            X, y)

    earth = Earth(feature_importance_type=('rss',), **default_params)
    earth.fit(X, y)
    assert len(earth.feature_importances_) == X.shape[1]

    assert_raises(
            ValueError,
            Earth(feature_importance_type='rss', enable_pruning=False, **default_params).fit,
            X, y)
Example #11
def test_fit():
    earth = Earth(**default_params)
    earth.fit(X, y)
    res = str(earth.rsq_)
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_regress.txt')
#     with open(filename, 'w') as fl:
#         fl.write(res)
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_true(abs(float(res) - float(prev)) < .05)
Example #12
def test_pandas_compatibility():
    import pandas
    X_df = pandas.DataFrame(X)
    y_df = pandas.DataFrame(y)
    colnames = ['xx' + str(i) for i in range(X.shape[1])]
    X_df.columns = colnames

    earth = Earth(**default_params)
    model = earth.fit(X_df, y_df)
    assert_list_equal(
        colnames, model.forward_trace()._getstate()['xlabels'])
Example #13
def test_smooth():
    model = Earth(penalty=1, smooth=True)
    model.fit(X, y)
    res = str(model.rsq_)
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_regress_smooth.txt')
#     with open(filename, 'w') as fl:
#         fl.write(res)
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_true(abs(float(res) - float(prev)) < .05)
Example #14
def test_linvars():
    earth = Earth(**default_params)
    earth.fit(X, y, linvars=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    res = str(earth.trace()) + '\n' + earth.summary()
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_linvars_regress.txt')
#     with open(filename, 'w') as fl:
#         fl.write(res)
    with open(filename, 'r') as fl:
        prev = fl.read()

    assert_equal(res, prev)
Example #15
def test_untrained():

    model = Earth(**default_params)
    assert_raises(NotFittedError, model.predict, X)
    assert_raises(NotFittedError, model.transform, X)
    assert_raises(NotFittedError, model.predict_deriv, X)
    assert_raises(NotFittedError, model.score, X)

    # the following should be changed to raise NotFittedError
    assert_equal(model.forward_trace(), None)
    assert_equal(model.pruning_trace(), None)
    assert_equal(model.summary(), "Untrained Earth Model")
Example #16
def runModel(i, featureCombo):
    mae = np.array([])
    logging.warning('try alpha = %s' % i)
    for ktrain, ktest in kf:
        x = trainCleaned.iloc[ktrain]
        y = trainCleaned.iloc[ktest]
        model = Earth()
        model.fit(x[featureCombo], x['Expected'])
        pred = model.predict(y[featureCombo])
        mae = np.append(mae, getMAE(pred, y['Expected']))
    logging.warning('average 10-fold MAE for alpha %s feature %s' % (i, featureCombo))
    logging.warning(mae.mean())
Example #17
def test_nb_degrees():
    for max_degree in (1, 2, 12, 13):
        model = Earth(max_terms=10,
                      max_degree=max_degree,
                      enable_pruning=False,
                      check_every=1,
                      thresh=0,
                      minspan=1,
                      endspan=1)
        model.fit(X, y)
        for basis in model.basis_:
            assert_true(basis.degree() >= 0)
            assert_true(basis.degree() <= max_degree)
Example #18
def test_missing_data():
    earth = Earth(allow_missing=True, **default_params)
    missing_ = numpy.random.binomial(1, .05, X.shape).astype(bool)
    X_ = X.copy()
    X_[missing_] = None
    earth.fit(X_, y)
    res = str(earth.score(X_, y))
    filename = os.path.join(os.path.dirname(__file__),
                            'earth_regress_missing_data.txt')
#     with open(filename, 'w') as fl:
#         fl.write(res)
    with open(filename, 'r') as fl:
        prev = fl.read()
    assert_true(abs(float(res) - float(prev)) < .03)
Example #19
def test_eq():
    model1 = Earth(**default_params)
    model2 = Earth(**default_params)
    assert_equal(model1, model2)
    assert_not_equal(model1, 5)

    params = {}
    params.update(default_params)
    params["penalty"] = 15
    model2 = Earth(**params)
    assert_not_equal(model1, model2)

    model3 = Earth(**default_params)
    model3.unknown_parameter = 5
    assert_not_equal(model1, model3)
Example #20
def test_output_weight():
    x = numpy.random.uniform(-1, 1, size=(1000, 1))
    y = (numpy.dot(x, numpy.random.normal(0, 1, size=(1, 10)))) ** 5 + 1
    y = (y - y.mean(axis=0)) / y.std(axis=0)
    group = numpy.array([1] * 5 + [0] * 5).astype(bool)
    output_weight = numpy.array([1] * 5 + [2] * 5, dtype=float)
    model = Earth().fit(x, y, output_weight=output_weight)

    # Check that the more heavily weighted group of outputs
    # is fit at least as well (its MSE is no larger)
    mse = ((model.predict(x) - y)**2).mean(axis=0)
    group1_mean = mse[group].mean()
    group2_mean = mse[numpy.logical_not(group)].mean()
    assert_true(group1_mean > group2_mean or
                round(abs(group1_mean - group2_mean), 7) == 0)
Example #21
def test_patsy_compatibility():
    import pandas
    import patsy
    X_df = pandas.DataFrame(X)
    y_df = pandas.DataFrame(y)
    colnames = ['xx' + str(i) for i in range(X.shape[1])]
    X_df.columns = colnames
    X_df['y'] = y
    y_df, X_df = patsy.dmatrices(
        'y ~ xx0 + xx1 + xx2 + xx3 + xx4 + xx5 + xx6 + xx7 + xx8 + xx9 - 1',
        data=X_df)

    model = Earth(**default_params).fit(X_df, y_df)
    assert_list_equal(
        colnames, model.forward_trace()._getstate()['xlabels'])
Example #22
def test_sparse():
    X_sparse = csr_matrix(X)

    model = Earth(**default_params)
    assert_raises(TypeError, model.fit, X_sparse, y)

    model = Earth(**default_params)
    model.fit(X, y)
    assert_raises(TypeError, model.predict, X_sparse)
    assert_raises(TypeError, model.predict_deriv, X_sparse)
    assert_raises(TypeError, model.transform, X_sparse)
    assert_raises(TypeError, model.score, X_sparse)

    model = Earth(**default_params)
    sample_weight = csr_matrix([1.] * X.shape[0])
    assert_raises(TypeError, model.fit, X, y, sample_weight)
Example #23
 def __init__(self, maxp=100):
     self.nump = 0
     self.maxp = maxp
     self.x = None     # pylint: disable=invalid-name
     self.fx = None
     self.dim = None
     self.model = Earth()
     self.updated = False
Example #24
def getTrain(trainData, testData):

    size_s = len(trainData)
    size_t = len(testData)
    lenY = len(testData[0])



    X = numpy.zeros((size_s,lenY-1))
    Y = numpy.zeros((size_s,1))

    z = 0

    for d in trainData:
        for j in range(lenY-1):
            X[z][j] = d[j]
        Y[z][0] = float(d[lenY-1])
        z += 1

    z = 0
    dX = numpy.zeros((size_t,lenY-1))

    for d in testData:
        for j in range(lenY-1):
            dX[z][j] = d[j]
        z += 1

    model = Earth()
    model.fit(X,Y)


    y_hat = model.predict(dX)

    correct = 0

    for i in range(size_t):
        x1 = testData[i][lenY-1]
        x2 = y_hat[i]

        if x1 * x2 >= 0:
            correct += 1
    return correct
Example #25
def test_export_sympy():
    import pandas as pd
    from sympy.utilities.lambdify import lambdify
    from sympy.printing.lambdarepr import NumPyPrinter

    class PyEarthNumpyPrinter(NumPyPrinter):
        def _print_Max(self, expr):
            return 'maximum(' + ','.join(self._print(i) for i in expr.args) + ')'

        def _print_NaNProtect(self, expr):
            return 'where(isnan(' + ','.join(self._print(a) for a in expr.args) + '), 0, ' \
                + ','.join(self._print(a) for a in expr.args) + ')'

        def _print_Missing(self, expr):
            return 'isnan(' + ','.join(self._print(a) for a in expr.args) + ').astype(float)'

    for smooth, n_cols, allow_missing in product((True, False), (1, 2), (True, False)):
        X_df = pd.DataFrame(X.copy(), columns=['x_%d' % i for i in range(X.shape[1])])
        y_df = pd.DataFrame(Y[:, :n_cols])
        if allow_missing:
            # Randomly remove some values so that the fitted model contains
            # MissingnessBasisFunctions; .loc avoids chained assignment
            X_df.loc[numpy.random.binomial(n=1, p=.1, size=X_df.shape[0]).astype(bool), 'x_1'] = numpy.nan

        model = Earth(allow_missing=allow_missing, smooth=smooth, max_degree=2).fit(X_df, y_df)
        expressions = export_sympy(model) if n_cols > 1 else [export_sympy(model)]
        module_dict = {'select': numpy.select, 'less_equal': numpy.less_equal, 'isnan': numpy.isnan,
                       'greater_equal':numpy.greater_equal, 'logical_and': numpy.logical_and, 'less': numpy.less,
                       'logical_not':numpy.logical_not, "greater": numpy.greater, 'maximum':numpy.maximum,
                       'Missing': lambda x: numpy.isnan(x).astype(float),
                       'NaNProtect': lambda x: numpy.where(numpy.isnan(x), 0, x), 'nan': numpy.nan,
                       'float': float, 'where': numpy.where
                       }

        for i, expression in enumerate(expressions):
            # The lambdified functions for smoothed basis functions only work with modules='numpy' and
            # for regular basis functions with modules={'Max':numpy.maximum}.  This is a confusing situation
            func = lambdify(X_df.columns, expression, printer=PyEarthNumpyPrinter, modules=module_dict)
            y_pred_sympy = func(*[X_df.loc[:,var] for var in X_df.columns])

            y_pred = model.predict(X_df)[:,i] if n_cols > 1 else model.predict(X_df)
            assert_array_almost_equal(y_pred, y_pred_sympy)
Example #26
def test_untrained():
    # NotFittedError moved from utils.validation to exceptions
    # some time after 0.17.1
    try:
        from sklearn.exceptions import NotFittedError
    except ImportError:
        from sklearn.utils.validation import NotFittedError
    
    # Make sure calling methods that require a fitted Earth object
    # raises the appropriate exception when using a not yet fitted 
    # Earth object
    model = Earth(**default_params)
    assert_raises(NotFittedError, model.predict, X)
    assert_raises(NotFittedError, model.transform, X)
    assert_raises(NotFittedError, model.predict_deriv, X)
    assert_raises(NotFittedError, model.score, X)

    # the following should be changed to raise NotFittedError
    assert_equal(model.forward_trace(), None)
    assert_equal(model.pruning_trace(), None)
    assert_equal(model.summary(), "Untrained Earth Model")
Example #27
def test_xlabels():

    model = Earth(**default_params)
    assert_raises(ValueError, model.fit, X[:, 0:5], y, xlabels=['var1', 'var2'])

    model = Earth(**default_params)
    model.fit(X[:, 0:3], y, xlabels=['var1', 'var2', 'var3'])

    model = Earth(**default_params)
    model.fit(X[:, 0:3], y, xlabels=['var1', 'var2', 'var3'])
Example #28
 def marsmodelorr(self, use_smY=True, slope_trunc=0.00001, savgol_window=151, savgol_order=3, ex_order=51):
     Xf, Yf = self.Xf_, self.Yf_
     X, Y = self.X_, self.Y_
     fom = {}
     # smooth the data
     smY = savgol(Y, savgol_window, savgol_order)
     # perform mars
     model = MARS()
     if use_smY:
         model.fit(X, smY)
     else:
         model.fit(X, Y)
     Y_h = model.predict(X)
     '''
     calculate dydx based on mars model to get knots and intercepts as this is 
     complicated to extract from hinge functions
     '''
     diff1 = np.diff(Y_h) / np.diff(X)
     tdiff1 = diff1 - np.nanmin(diff1)
     tdiff1 = tdiff1 / np.nanmax(tdiff1)
     #calculate slopes of linear segments
     ID = [i for i in range(1, len(tdiff1)) if np.abs(tdiff1[i] - tdiff1[i - 1]) > slope_trunc]
     ID.insert(0, 0)
     ID.append(np.argmax(X))  # this might cause an error
     slopes = [np.nanmean(diff1[ID[i - 1]:ID[i]]) for i in range(1, len(ID) - 1)]
     a = [Y_h[ID[i]] - slopes[i] * X[ID[i]] for i in range(len(ID) - 2)]
     IDM, IDm = np.argmax(slopes), np.argmin(np.abs(slopes))
     # intercept of highest slope and zero as well as highest slope and lowest slope
     fom['zinter'] = -a[IDM] / slopes[IDM]
     fom['lminter'] = (a[IDM] - a[IDm]) / (slopes[IDm] - slopes[IDM])
     fom['max_slope'] = slopes[IDM]
     fom['curr_lminter_model'] = fom['lminter'] * slopes[IDM] + a[IDM]
     fom['curr_lminter_data'] = np.mean(Y[np.where(np.abs(X - fom['lminter']) < 0.5)[0]])
     # calculate how the CV curves might look without the 'ORR part'
     srYs = smY - model.predict(X)
     srYf = savgol(Yf - model.predict(Xf), savgol_window, savgol_order)
     # calculate their derivative
     dsrYf = savgol(np.diff(srYf) / np.diff(Xf), savgol_window, savgol_order)
     # find the extrema in the derivatives for extraction of redox pots
     redID_f = argrelextrema(srYf, np.less, order=ex_order)
     oxID_f = argrelextrema(srYf, np.greater, order=ex_order)
     # calc some more foms like position of redox waves
     fom['redpot_f'], fom['redpot_f_var'] = np.nanmean(Xf[redID_f]), np.nanstd(Xf[redID_f])
     fom['oxpot_f'], fom['oxpot_f_var'] = np.nanmean(Xf[oxID_f]), np.nanstd(Xf[oxID_f])
     fom['X'], fom['Xf'] = X, Xf
     fom['srYs'], fom['srYf'], fom['smY'] = srYs, srYf, smY
     fom['Y'], fom['Yf'], fom['Y_h'] = Y, Yf, Y_h
     fom['noise_lvl'] = np.sum((Y_h - Y) ** 2, axis=0)
     self.fom = fom
Example #29
    def __init__(self, dim):
        self.num_pts = 0
        self.X = np.empty([0, dim])
        self.fX = np.empty([0, 1])
        self.dim = dim
        self.updated = False

        try:
            from pyearth import Earth
            self.model = Earth()
        except ImportError as err:
            print("Failed to import pyearth")
            raise err
Example #30
def test_fast():
    earth = Earth(max_terms=10,
                  max_degree=5,
                  **default_params)
    earth.fit(X, y)
    normal_summary = earth.summary()
    earth = Earth(use_fast=True,
                  max_terms=10,
                  max_degree=5,
                  fast_K=10,
                  fast_h=1,
                  **default_params)
    earth.fit(X, y)
    fast_summary = earth.summary()
    assert_equal(normal_summary, fast_summary)
Example #31
def test_deriv():

    model = Earth(**default_params)
    model.fit(X, y)
    assert_equal(X.shape + (1, ), model.predict_deriv(X).shape)
    assert_equal((X.shape[0], 1, 1), model.predict_deriv(X, variables=0).shape)
    assert_equal((X.shape[0], 1, 1),
                 model.predict_deriv(X, variables='x0').shape)
    assert_equal((X.shape[0], 3, 1),
                 model.predict_deriv(X, variables=[1, 5, 7]).shape)
    assert_equal((X.shape[0], 0, 1),
                 model.predict_deriv(X, variables=[]).shape)

    res_deriv = model.predict_deriv(X, variables=['x2', 'x7', 'x0', 'x1'])
    assert_equal((X.shape[0], 4, 1), res_deriv.shape)

    res_deriv = model.predict_deriv(X, variables=['x0'])
    assert_equal((X.shape[0], 1, 1), res_deriv.shape)

    assert_equal((X.shape[0], 1, 1),
                 model.predict_deriv(X, variables=[0]).shape)
Example #32
    model = Earth(max_terms=50, max_degree=3)
    model.fit(X,y)

    #Print the model
    #print(model.trace())
    print(model.summary())


    print "MARS  degree 5"

    model = Earth(max_terms=20, max_degree=5)
    model.fit(X,y)

    #Print the model
    #print(model.trace())
    print(model.summary())
   
    """

    print "====================================="

    print "MARS  degree 1"
    model = Earth(max_terms=70, max_degree=1)
    print "Score: {}".format ( crossValidation ( model, X, y ) )

    print "MARS  degree 3"
    model = Earth(max_terms=50, max_degree=3)
    crossValidation ( model, X, y )
    print "Score: {}".format ( crossValidation ( model, X, y ) )
Example #33
X = np.array(X)
y = np.sin(X) + np.random.normal(size=X.shape[0])/10.0  

#Defining different knots which will be used as a parameter for MARS model
knots = [2,4,5,10]

#Helpful in creating graph
axis = [[0,0],[0,1],[1,0],[1,1]]

#Defining different max_degree parameter for MARS model parameter
for degree in range(1,5):
  fig,ax = plt.subplots(2,2,figsize=(10, 10))

  for num_knot in range(4):
    # Defining MARS model with max_term and max_degree parameter
    model = Earth(max_terms=knots[num_knot],max_degree=degree,verbose=0)
    
    #Fitting the model on the dataset
    model.fit(X, y)

    #Prediction model output
    y_hat = model.predict(X)

    #Plotting graphs
    ax[axis[num_knot][0],axis[num_knot][1]].title.set_text(f"degree = {degree}, knots = {knots[num_knot]}")
    ax[axis[num_knot][0],axis[num_knot][1]].plot(X,y,'r.')
    ax[axis[num_knot][0],axis[num_knot][1]].plot(X,y_hat,'b.')
  plt.show()

# Plotting dataset distribution
plt.figure()
Example #34
def translation_correction(cell_mesh, cell_mesh_2, buffer_cell,\
	x_pos, y_pos, z_pos, x_pos_new, y_pos_new, z_pos_new, closest_no_conflict, directory ):
	
	x_min = np.min([np.min(cell_mesh[:,0]),np.min(cell_mesh_2[:,0])]) - buffer_cell 
	x_max = np.max([np.max(cell_mesh[:,0]),np.max(cell_mesh_2[:,0])]) + buffer_cell
	y_min = np.min([np.min(cell_mesh[:,1]),np.min(cell_mesh_2[:,1])]) - buffer_cell
	y_max = np.max([np.max(cell_mesh[:,1]),np.max(cell_mesh_2[:,1])]) + buffer_cell
	z_min = np.min([np.min(cell_mesh[:,2]),np.min(cell_mesh_2[:,2])]) - buffer_cell
	z_max = np.max([np.max(cell_mesh[:,2]),np.max(cell_mesh_2[:,2])]) + buffer_cell
	
	num_pts = len(x_pos)
	X = []; Y = []; Z = []; U = []; V = []; W = [] 
	for kk in range(0,num_pts):
		idx = closest_no_conflict[kk]
		if idx < len(closest_no_conflict):
			U.append(x_pos_new[idx] - x_pos[kk])
			V.append(y_pos_new[idx] - y_pos[kk])
			W.append(z_pos_new[idx] - z_pos[kk])
			X.append(x_pos_new[idx]); Y.append(y_pos_new[idx]); Z.append(z_pos_new[idx])
	
	# --> limit to points that aren't too close to the cell 
	X_safe = []; Y_safe = []; Z_safe = []; U_safe = []; V_safe = []; W_safe = [] 
	num_pts = len(U)
	for kk in range(0,num_pts):
		x_out = X[kk] < x_min or X[kk] > x_max
		y_out = Y[kk] < y_min or Y[kk] > y_max
		z_out = Z[kk] < z_min or Z[kk] > z_max
		if x_out or y_out or z_out:
			X_safe.append(X[kk])
			Y_safe.append(Y[kk])
			Z_safe.append(Z[kk])
			U_safe.append(U[kk])
			V_safe.append(V[kk])
			W_safe.append(W[kk])

	X_safe = np.asarray(X_safe); Y_safe = np.asarray(Y_safe); Z_safe = np.asarray(Z_safe)
	U_safe = np.asarray(U_safe); V_safe = np.asarray(V_safe); W_safe = np.asarray(W_safe)
	
	# --> fit MARS models 
	model_U = Earth(max_degree=2,max_terms=10)
	model_U.fit(Z_safe,U_safe)
	model_V = Earth(max_degree=2,max_terms=10)
	model_V.fit(Z_safe,V_safe)
	model_W = Earth(max_degree=2,max_terms=10)
	model_W.fit(Z_safe,W_safe)
		
	# --> re-define Z 
	pred_U = model_U.predict(z_pos_new)
	pred_V = model_V.predict(z_pos_new)
	pred_W = model_W.predict(z_pos_new)
	
	# --> correct new bead positions 
	for kk in range(0,len(x_pos_new)):
		x_pos_new[kk] = x_pos_new[kk] - pred_U[kk] 
		y_pos_new[kk] = y_pos_new[kk] - pred_V[kk]
		z_pos_new[kk] = z_pos_new[kk] - pred_W[kk] 
	
	# --> correct new cell position 
	pred_cell_0 = model_U.predict(cell_mesh_2[:,0])
	pred_cell_1 = model_V.predict(cell_mesh_2[:,1])
	pred_cell_2 = model_W.predict(cell_mesh_2[:,2])
	
	cell_mesh_2_new = np.zeros(cell_mesh_2.shape)
	cell_mesh_2_new[:,0] = cell_mesh_2[:,0] - pred_cell_0
	cell_mesh_2_new[:,1] = cell_mesh_2[:,1] - pred_cell_1
	cell_mesh_2_new[:,2] = cell_mesh_2[:,2] - pred_cell_2
	
	# --> plot MARS models 
	Z_line = np.linspace(np.min(Z),np.max(Z),100)
	pred_line_U = model_U.predict(Z_line)
	pred_line_V = model_V.predict(Z_line)
	pred_line_W = model_W.predict(Z_line)
	
	plt.figure(figsize=(15,5))
	plt.subplot(1,3,1)
	plt.plot(Z,U,'b.',label='x raw')
	plt.plot(Z_line,pred_line_U,'k--',label='fit')
	plt.xlabel('z position'); plt.ylabel('displacement')
	plt.tight_layout(); plt.legend(); plt.title('x displacements')
	plt.subplot(1,3,2)
	plt.plot(Z,V,'r.',label='y raw')
	plt.plot(Z_line,pred_line_V,'k--',label='fit')
	plt.xlabel('z position'); plt.ylabel('displacement')
	plt.tight_layout(); plt.legend(); plt.title('y displacements')
	plt.subplot(1,3,3)
	plt.plot(Z,W,'g.',label='z raw')
	plt.plot(Z_line,pred_line_W,'k--',label='fit')
	plt.xlabel('z position'); plt.ylabel('displacement')
	plt.tight_layout(); plt.legend(); plt.title('z displacements')
	plt.savefig(directory + '/translation_correction.png')
	
	return x_pos_new, y_pos_new, z_pos_new, cell_mesh_2_new 
Example #35
test = pd.read_csv('../data/modeltest.csv',index_col=0)
label = train['Response'].values


featextra= pd.read_csv('../feat/improve.csv',index_col=0)
train = pd.concat([train,featextra.loc[train.index]],axis=1)
test = pd.concat([test,featextra.loc[test.index]],axis=1)

featextra= pd.read_csv('../feat/duplicate.csv',index_col=0)
train = pd.concat([train,featextra.loc[train.index]],axis=1)
test = pd.concat([test,featextra.loc[test.index]],axis=1)


feat = train.columns.drop('Response')
#Build an Earth model with a logisticregression pipeline
earth_pipe = Pipeline([('earth',Earth(use_fast=True,allow_missing=True,penalty=0.5,max_degree=3)),('log',LogisticRegression())])
earth_pipe.fit(train[feat],label)

#Parameter tuning

#param_grid = {'earth__penalty': np.arange(1,11,2),'earth__max_degree': range(1,4)}
#
#gs1 = GridSearchCV(earth_pipe,param_grid,n_jobs=1,pre_dispatch=1,cv=StratifiedKFold(label, n_folds=5, shuffle=True),scoring='log_loss',verbose=2)
#
#
#gs1.fit(train[feat],label)
#
#print gs1.best_params_
#print gs1.best_score_
#
##----------------------------------------------------------
Example #36
def fit_with_subdata(scaledown, offset):
  # retrieve training data and official reco hadronic energy for comparison
  X = root2array('../training_data.root',
                 branches='calehad',
                 selection='mustopz<1275&&isnumucc==1',
                 step=scaledown, start=offset).reshape(-1,1)
  recoemu_official = root2array('../training_data.root', branches='recoemu',
                                selection='mustopz<1275&&isnumucc==1',
                                step=scaledown, start=offset)
  trueenu = root2array('../training_data.root', branches='trueenu',
                       selection='mustopz<1275&&isnumucc==1',
                       step=scaledown, start=offset)
  y = trueenu - recoemu_official
  yoff = root2array('../training_data.root', branches='recoehad',
                    selection='mustopz<1275&&isnumucc==1',
                    step=scaledown, start=offset)
  
  # train a MARS model on the regressors
  mars = Earth()
  mars.fit(X, y)
  
  # save the model
  os.system('mkdir -p models/1d')
  modelpn = 'models/1d/hadronic_1d_energy_estimator_step{}offset{}.pkl'.format(scaledown, offset)
  joblib.dump(mars, modelpn)
  
  # estimate reco value
  yest = mars.predict(X)
  rest = (yest-y)/y
  roff = (yoff-y)/y
  
  # save root file
  os.system('mkdir -p output_root_files/1d')
  toutf = TFile('output_root_files/1d/resolution_1d_step{}offset{}.root'.format(scaledown, offset), 'recreate')
  tr = TTree( 'tr', 'resolution tree' )
  r1 = array( 'f', [ 0. ] )
  r2 = array( 'f', [ 0. ] )
  marsehad = array( 'f', [ 0. ] )
  offehad = array( 'f', [ 0. ] )
  trueehad = array( 'f', [ 0. ] )
  tr.Branch( 'rest', r1, 'rest/F' )
  tr.Branch( 'roff', r2, 'roff/F' )
  tr.Branch('marsehad', marsehad, 'marsehad/F')
  tr.Branch('offehad', offehad, 'offehad/F')
  tr.Branch('trueehad', trueehad, 'trueehad/F')
  for i in range(len(rest)):
    r1[0] = rest[i]
    r2[0] = roff[i]
    marsehad[0] = yest[i]
    offehad[0] = yoff[i]
    trueehad[0] = y[i]
    tr.Fill()
  tr.Write()
  toutf.Close()
  
  # print out the statistics
  os.system('mkdir -p performance_figures/1d')
  with open('performance_figures/1d/1d_step{}offset{}.txt'.format(scaledown, offset), 'w') as outf:
    outf.write(str(np.mean(rest))+'\n')
    outf.write(str(tstd(rest))+'\n')
    outf.write(str(skew(rest))+'\n')
    outf.write(str(kurtosis(rest))+'\n')
Example #37
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval,
                          columns=['predictions']).to_csv('outElasticNet.csv')

regr_2.fit(X_train, y_train)
y_eval = regr_2.predict(X)
prediction = pd.DataFrame(y_eval,
                          columns=['predictions'
                                   ]).to_csv('outAdaBoostRegressor.csv')

clf = linear_model.Lars(n_nonzero_coefs=1)
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval,
                          columns=['predictions']).to_csv('outLARS.csv')
"""
clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outAdaBoostRegressor.csv')
"""

from pyearth import Earth

clf = Earth()
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval,
                          columns=['predictions']).to_csv('outMARS.csv')
Example #38
class MARSInterpolant(Surrogate):
    """Compute and evaluate a MARS interpolant

    MARS builds a model of the form

    .. math::

        \\hat{f}(x) = \\sum_{i=1}^{k} c_i B_i(x).

    The model is a weighted sum of basis functions :math:`B_i(x)`. Each basis
    function :math:`B_i(x)` takes one of the following three forms:

    1. a constant 1.
    2. a hinge function of the form :math:`\\max(0, x - const)` or \
       :math:`\\max(0, const - x)`. MARS automatically selects variables \
       and values of those variables for knots of the hinge functions.
    3. a product of two or more hinge functions. These basis functions can \
       model interactions between two or more variables.

    :param dim: Number of dimensions
    :type dim: int
    :param lb: Lower variable bounds
    :type lb: numpy.array
    :param ub: Upper variable bounds
    :type ub: numpy.array
    :param output_transformation: Transformation applied to values before fitting
    :type output_transformation: Callable

    :ivar dim: Number of dimensions
    :ivar lb: Lower variable bounds
    :ivar ub: Upper variable bounds
    :ivar output_transformation: Transformation to apply to function values before fitting
    :ivar num_pts: Number of points in surrogate model
    :ivar X: Points incorporated in surrogate model (num_pts x dim)
    :ivar fX: Function values in surrogate model (num_pts x 1)
    :ivar updated: True if model is up-to-date (no refit needed)
    :ivar model: Earth object
    """

    def __init__(self, dim, lb, ub, output_transformation=None):
        super().__init__(dim=dim, lb=lb, ub=ub, output_transformation=output_transformation)

        try:
            from pyearth import Earth

            self.model = Earth()
        except ImportError as err:
            print("Failed to import pyearth")
            raise err

    def _fit(self):
        """Compute new coefficients if the MARS interpolant is not updated."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # Suppress deprecation warnings
            if self.updated is False:
                fX = self.output_transformation(self.fX.copy())
                self.model.fit(self._X, fX)
                self.updated = True

    def predict(self, xx):
        """Evaluate the MARS interpolant at the points xx

        :param xx: Prediction points, must be of size num_pts x dim or (dim, )
        :type xx: numpy.ndarray

        :return: Prediction of size num_pts x 1
        :rtype: numpy.ndarray
        """
        self._fit()
        xx = to_unit_box(np.atleast_2d(xx), self.lb, self.ub)
        return np.expand_dims(self.model.predict(xx), axis=1)

    def predict_deriv(self, xx):
        """Evaluate the derivative of the MARS interpolant at points xx

        :param xx: Prediction points, must be of size num_pts x dim or (dim, )
        :type xx: numpy.array

        :return: Derivative of the RBF interpolant at xx
        :rtype: numpy.array
        """
        self._fit()
        xx = to_unit_box(np.atleast_2d(xx), self.lb, self.ub)
        dfx = self.model.predict_deriv(xx, variables=None)
        return dfx[0] / (self.ub - self.lb)
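
As a rough usage sketch only: the class above delegates data management to its Surrogate base class (pySOT-style), so real code would add points through that API. The snippet below instead fills in the internal attributes that _fit() reads directly, purely for illustration; it assumes pyearth is installed, that to_unit_box is importable in this module, and that output_transformation is stored by the base-class constructor.

import numpy as np

dim = 3
lb, ub = np.zeros(dim), np.ones(dim)
surrogate = MARSInterpolant(dim=dim, lb=lb, ub=ub,
                            output_transformation=lambda fx: fx)  # identity transform
# Normally points are added through the Surrogate base-class API; here the
# internal attributes used by _fit() are set directly for illustration.
surrogate._X = np.random.uniform(lb, ub, size=(30, dim))
surrogate.fX = np.sum(surrogate._X ** 2, axis=1, keepdims=True)
surrogate.updated = False
print(surrogate.predict(surrogate._X[:5]).shape)  # expected: (5, 1)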
Example #39
 def fit(self):
     self.classifier = Earth()
     self.classifier.fit(self.x_train, self.y_train)
Example #40
olsMSE = kFoldValidation(5, ols, array_x_train, array_y_train)
olsMSE
# array([2.7911842 , 2.76834881, 2.84893447, 2.78335565, 2.73966849])

# OLS using only the 14 top correlated variables
sub_x_train = x_train[top[1:]]
array_x_sub = np.array(sub_x_train)
olsMSE = kFoldValidation(5, ols, array_x_sub, array_y_train)
olsMSE
# array([2.82931072, 2.80517518, 2.88589756, 2.82362708, 2.78061219])
# It seems that the model using all variables performs better

# 2. Spline
# Since it is too slow to do the k cross validation for spline,
# just use validation set to test the performance.
spline = Earth()
sub_cols = list(top[1:])  # Uses highly-correlated variables to build the model
sub_x_train = x_train[sub_cols]
array_sub_x = np.array(sub_x_train)
spline.fit(sub_x_train, y_train)
preds_val = spline.predict(x_val[sub_cols])
splineMSE = np.mean(
    (preds_val - array_y_val.ravel())**2)  # Calculates the mean squared error
splineMSE
# 2.457458862928802

# 3. Random Forest
# Since it is too slow to do the k cross validation for random forest,
# just use validation set to test the performance.
# Builds the model with 50 trees
rf = RandomForestRegressor(max_depth=20, random_state=42, n_estimators=50)
Example #41
xArray = []
yArray = []

# separating X and Y from the dataset
for eachXYPAir in dataset:
    x = eachXYPAir[0]
    y = eachXYPAir[1]
    xArray.append(x)
    yArray.append(y)

# print len(xArray)
xArray = numpy.asarray(xArray, "float32")  # converting to numpy array
# print len(yArray)
yArray = numpy.asarray(yArray, "float32")  # converting to numpy array
# Fit an Earth model
model = Earth(max_degree=1, verbose=True)  # initializing py-earth model

# making model for the data
model.fit(xArray, yArray)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(xArray)
# print y_hat
plt.figure()
plt.plot(xArray, yArray, 'r.')
plt.plot(xArray, y_hat, 'b.')
plt.show()
Example #42
import numpy
from pyearth import Earth
from matplotlib import pyplot

#Create some fake data
numpy.random.seed(0)
m = 1000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
y = numpy.abs(X[:, 6] - 4.0) + 1 * numpy.random.normal(size=m)

#Fit an Earth model
model = Earth()
model.fit(X, y)

#Print the model
print(model.trace())
print(model.summary())

#Plot the model
y_hat = model.predict(X)
pyplot.figure()
pyplot.plot(X[:, 6], y, 'r.')
pyplot.plot(X[:, 6], y_hat, 'b.')
pyplot.xlabel('x_6')
pyplot.ylabel('y')
pyplot.title('Simple Earth Example')
pyplot.show()
Example #43
def test_linear_fit():
    from statsmodels.regression.linear_model import GLS, OLS

    earth = Earth(**default_params)
    earth.fit(X, y)
    earth.linear_fit(X, y)
    soln = OLS(y, earth.transform(X)).fit().params
    assert_almost_equal(numpy.mean((earth.coef_ - soln)**2), 0.0)

    sample_weight = 1.0 / (numpy.random.normal(size=y.shape)**2)
    earth.fit(X, y)
    earth.linear_fit(X, y, sample_weight)
    soln = GLS(y, earth.transform(X), 1.0 / sample_weight).fit().params
    assert_almost_equal(numpy.mean((earth.coef_ - soln)**2), 0.0)
Example #44
def test_feature_importance():
    criteria = ('rss', 'gcv', 'nb_subsets')
    for imp in criteria:
        earth = Earth(feature_importance_type=imp, **default_params)
        earth.fit(X, y)
        assert len(earth.feature_importances_) == X.shape[1]
    earth = Earth(feature_importance_type=criteria, **default_params)
    earth.fit(X, y)
    assert type(earth.feature_importances_) == dict
    assert set(earth.feature_importances_.keys()) == set(criteria)
    for crit, val in earth.feature_importances_.items():
        assert len(val) == X.shape[1]

    assert_raises(
        ValueError,
        Earth(feature_importance_type='bad_name', **default_params).fit, X, y)

    earth = Earth(feature_importance_type=('rss', ), **default_params)
    earth.fit(X, y)
    assert len(earth.feature_importances_) == X.shape[1]

    assert_raises(
        ValueError,
        Earth(feature_importance_type='rss',
              enable_pruning=False,
              **default_params).fit, X, y)
Example #45
# Create some fake data
numpy.random.seed(2)
m = 10000
n = 10

X = numpy.random.uniform(size=(m, n))
y = (10 * numpy.sin(numpy.pi * X[:, 0] * X[:, 1]) +
     20 * (X[:, 2] - 0.5) ** 2 +
     10 * X[:, 3] +
     5 * X[:, 4] + numpy.random.uniform(size=m))
# Fit an Earth model
criteria = ('rss', 'gcv', 'nb_subsets')
model = Earth(max_degree=3,
              max_terms=10,
              minspan_alpha=.5,
              feature_importance_type=criteria,
              verbose=True)
model.fit(X, y)
rf = RandomForestRegressor()
rf.fit(X, y)
# Print the model
print(model.trace())
print(model.summary())
print(model.summary_feature_importances(sort_by='gcv'))

# Plot the feature importances
importances = model.feature_importances_
importances['random_forest'] = rf.feature_importances_
criteria = criteria + ('random_forest',)
idx = 1
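
The snippet is truncated at idx = 1. A plausible continuation, assuming the original went on to draw one bar chart of importances per criterion (the pyplot import and the layout below are guesses, not recovered code):

from matplotlib import pyplot

# One bar chart per importance criterion, including the random-forest baseline
pyplot.figure(figsize=(12, 4))
for crit in criteria:
    pyplot.subplot(1, len(criteria), idx)
    pyplot.bar(range(len(importances[crit])), importances[crit])
    pyplot.title(crit)
    idx += 1
pyplot.show()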
Example #46
def test_export_python_function():
    for smooth in (True, False):
        model = Earth(penalty=1, smooth=smooth, max_degree=2).fit(X, y)
        export_model = export_python_function(model)
        for exp_pred, model_pred in zip(model.predict(X), export_model(X)):
            assert_almost_equal(exp_pred, model_pred)
Example #47
# train = pd.read_csv('boston_data.csv')
# X = np.array(train.iloc[:, 0:13])
# y = np.array(train.iloc[:, 13])
#
# test = pd.read_csv('boston_test_data.csv')
# X_test = np.array(test.iloc[:, 0:13])
# X_test_id = test.iloc[:, 0]

np.random.seed(0)
m = 1000
n = 10
X = 80 * np.random.uniform(size=(m, n)) - 40
y = np.abs(X[:, 6] - 4.0) + 1 * np.random.normal(size=m)

#Fit an Earth model
model = Earth()
model.fit(X, y)

#Print the model
print(model.trace())
print(model.summary())

X, y = load_boston(return_X_y=True)
model_rsq_dic = {}

# % lower status of the population
lstat_x = [row[12] for row in X]

lstat_x = np.array(lstat_x).reshape(-1, 1)
Example #48
def csc(df, hamming_string_dict, outdir, filename):
    """CRISPR Specificity Correction

    :param df: pandas dataframe with first column as gRNA and second column as logFC/metric
    :param hamming_string_dict: CSC onboard dictionary object with key as gRNA and value as Hamming metrics
    :param outdir: absolute filepath to output directory
    :param filename: name of input file to be used as part of output filename
    :return: CSC adjustment

    """
    # MARS compatible file
    df_mars_lst = []
    df_v = np.asarray(df)
    for i in range(len(df_v)):
        row_lst = []
        grna, metric = df_v[i][0], df_v[i][1]
        try:
            metric = float(metric)
        except ValueError:
            sys.stdout.write(
                'WARNING: encountered %s which is not float compatible, skipping\n'
                % metric)
            continue
        row_lst.append(grna)
        try:
            for jj in hamming_string_dict[grna]:
                row_lst.append(jj)
            row_lst.append(metric)
            df_mars_lst.append(row_lst)
        except KeyError:
            sys.stdout.write('\n%s not found in selected library: passing\n' %
                             grna)
            continue

    df = pd.DataFrame(df_mars_lst,
                      columns=[
                          'gRNA', 'specificity', 'h0', 'h1', 'h2', 'h3',
                          'original_value'
                      ])

    # exclude infinite-specificity non-target gRNAs
    df = df[df['h0'] != 0]

    # isolate pertinent confounder variables
    df_confounders = df[['specificity', 'h0', 'h1', 'h2', 'h3']]

    # knots
    knots = df['original_value'].quantile([0.25, 0.5, 0.75, 1])

    # training and testing data
    train_x, test_x, train_y, test_y = train_test_split(df_confounders,
                                                        df['original_value'],
                                                        test_size=0.10,
                                                        random_state=1)

    # Fit an Earth model
    model = Earth(feature_importance_type='gcv')
    try:
        model.fit(train_x, train_y)
    except ValueError:
        sys.stdout.write(
            '\nValue Error encountered. Model unable to be trained. Exiting CSC Novo\n'
        )
        model_processed = 'F'
        sys.stdout.write(
            'training input x data\n %s\ntraining input y data\n %s\n' %
            (train_x, train_y))
        return model_processed

    # Print the model
    print(model.trace())
    print(model.summary())
    print(model.summary_feature_importances())

    # Plot the model
    y_hat = model.predict(test_x)

    # calculating RMSE values
    rms1 = sqrt(mean_squared_error(test_y, y_hat))
    print('\n\nRMSE on Predictions\n\n')
    print(rms1)

    # calculating R^2 for training
    print('\n\nR^2 on Training Data\n\n')
    print(model.score(train_x, train_y))

    # calculating R^2 for testing
    print('\n\nR^2 on Testing Data\n\n')
    print(model.score(test_x, test_y))

    # write out model metrics
    with open('%s/csc_model_metrics_%s.txt' % (outdir, filename),
              'w') as outfile:
        outfile.write('%s\n%s\n%s\nRMSE on Predictions\n%s' %
                      (model.trace(), model.summary(),
                       model.summary_feature_importances(), rms1))

    if rms1 <= 1.0:

        #model processed
        model_processed = 'T'

        # full data prediction
        df['earth_adjustment'] = model.predict(df_confounders)

        # CSC correction
        df['earth_corrected'] = df['original_value'] - df['earth_adjustment']

        # main write out
        df.to_csv('%s/csc_output_%s_earth_patched.csv' % (outdir, filename))

        # pickle write out
        model_file = open(
            '%s/csc_output_%s_earth_model.pl' % (outdir, filename), 'wb')
        pl.dump(model, model_file)
        model_file.close()

        sys.stdout.write('\nCSC adjustment complete\n')
        sys.stdout.write('\nCSC output files written to %s\n' % outdir)
        return model_processed

    else:
        sys.stdout.write(
            '\nCSC adjustment not computed as model residual mean squared error exceeds 1.0\n'
        )
        model_processed = 'F'
        return model_processed
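
For orientation only, a hypothetical call to csc might look like the sketch below. The two-column input layout follows the docstring, and the dictionary values (five Hamming metrics per gRNA, matching the 'specificity' and 'h0'..'h3' columns built above) are inferred assumptions; the gRNA names and numbers are made up.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
grnas = ['G%02d' % i for i in range(40)]  # hypothetical gRNA identifiers
df = pd.DataFrame({'gRNA': grnas, 'logFC': rng.normal(size=40)})
hamming_string_dict = {g: [rng.uniform(0.5, 1.0),        # specificity (assumed meaning)
                           int(rng.integers(1, 5)),      # h0 (nonzero; zero rows are dropped)
                           int(rng.integers(5, 20)),     # h1
                           int(rng.integers(20, 60)),    # h2
                           int(rng.integers(60, 200))]   # h3
                       for g in grnas}
status = csc(df, hamming_string_dict, outdir='.', filename='demo')  # returns 'T' or 'F'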
Example #49

    # Select target and feature dataset(s) --> [target, feature1, feature2, ... ]
    datasets = [
        Dataset('runoff', database),
        Dataset('runoff', database).normalized(),
        Dataset('temp', database).normalized(),
        Dataset('precip', database).normalized(),
        Dataset('season', database).normalized()
    ]

    # Select leadtimes for target and feature. negative:past/positive:future
    leadtimes = [[1, 3], [-4, -1], [-4, -1], [-4, -1], [1, 1]]

    # Select Model
    model_type = Earth(max_degree=10, smooth=True)
    #model_type= Lasso(alpha=0.05,normalize=True, max_iter=3000)
    #model_type = Regressor(
    #    layers=[
    #        Layer("Sigmoid",units=5),
    #        Layer("Linear", units=1)],
    #    learning_rate=0.1,
    #    n_iter=1000)

    # Set training interval
    startyear = DateFormat(1900, 1)
    endyear = DateFormat(2005, 36)
    training_daterange = DateFormat.decadal_daterange(startyear, endyear)

    # Set testing interval
    startyear = DateFormat(2006, 1)
Example #50
import numpy
from pyearth import Earth
from matplotlib import pyplot

# Create some fake data
numpy.random.seed(2)
m = 10000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
y = 100 * \
    numpy.abs(numpy.sin((X[:, 6]) / 10) - 4.0) + \
    10 * numpy.random.normal(size=m)

# Fit an Earth model
model = Earth(max_degree=3, minspan_alpha=.5)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(X)
pyplot.figure()
pyplot.plot(X[:, 6], y, 'r.')
pyplot.plot(X[:, 6], y_hat, 'b.')
pyplot.show()
Example #51
df = pd.read_csv(dataset, sep='\t')
df = pd.read_table(dataset)

gt_mapping = {'0/0': 0, '0/1': 1, '1/1': 2}

df['GT_GATK'] = df['GT_GATK'].map(gt_mapping)
df['GT_Varscan'] = df['GT_Varscan'].map(gt_mapping)
df['GT_Freebayes'] = df['GT_Freebayes'].map(gt_mapping)

X = df.values[:100, 5:]
X = set_missing_values(X)
#print df.columns[12]
y = np.random.randint(2, size=(int(np.shape(X)[0]), ))
#print X
#print y
earth_classifier = Pipeline([('earth', Earth(allow_missing=True)),
                             ('logistic', LogisticRegression())])

#earth_classifier = Pipeline([('earth', Earth(allow_missing=True)),
#                             ('logistic', RandomForestClassifier())])

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=0)

ec = earth_classifier.fit(X_train, y_train)

y_hat = earth_classifier.predict(X_test)
Example #52
"""
A simple example plotting a fit of the sine function.
"""
import numpy
import matplotlib.pyplot as plt

from pyearth import Earth

# Create some fake data
numpy.random.seed(2)
m = 10000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
y = 100 * \
    (numpy.sin((X[:, 6])) - 4.0) + \
    10 * numpy.random.normal(size=m)

# Fit an Earth model
model = Earth(max_degree=3, minspan_alpha=.5, verbose=True)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(X)
plt.plot(X[:, 6], y, 'r.')
plt.plot(X[:, 6], y_hat, 'b.')
plt.show()
Example #53
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyearth import Earth

## Load Data
df = pd.read_csv('hw2_data_2.txt', sep='\t')
X_train, y_train = np.array(df.iloc[:700, :-1]), np.array(df.iloc[:700, -1])
X_test, y_test = np.array(df.iloc[700:, :-1]), np.array(df.iloc[700:, -1])

## Using the py-earth package; please install it following the instructions in the README file
clf = Earth()
clf.fit(X_train, y_train)

## Predict the value and calculate the testing error rate
pred_vals = clf.predict(X_test)

## Dichotomize the predicted outcome at the median
median = np.median(pred_vals)
res = np.where(pred_vals >= median, 1, -1)
# print(res)

error_rate = 1 - sum(res == y_test) / y_test.shape[0]
print("The testing error rate for MARS classifier is: %.4f" % error_rate)
Example #54
import numpy
import matplotlib.pyplot as plt

from pyearth import Earth

# Create some fake data
numpy.random.seed(2)
m = 10000
n = 10
X = 20 * numpy.random.uniform(size=(m, n)) - 10
y = 10 * numpy.sin(X[:, 6]) + 0.25 * numpy.random.normal(size=m)

# Compute the known true derivative with respect to the predictive variable
y_prime = 10 * numpy.cos(X[:, 6])

# Fit an Earth model
model = Earth(max_degree=2, minspan_alpha=.5, smooth=True)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Get the predicted values and derivatives
y_hat = model.predict(X)
y_prime_hat = model.predict_deriv(X, 'x6')

# Plot true and predicted function values and derivatives
# for the predictive variable
plt.subplot(211)
plt.plot(X[:, 6], y, 'r.')
plt.plot(X[:, 6], y_hat, 'b.')
Example #55
st = 'CPY012'
target,start_p,stop_p,host_path=station_sel(st,mode)
if mode == 'hour': n_past, n_future = 24*7, 72
elif mode == 'day': n_past, n_future = 60, 30

data = df[start_p:stop_p]
data['Day'] = data.index.dayofyear #add day
data = data.interpolate(limit=300000000, limit_direction='both').astype('float32')  # interpolate from neighboring values first; any remaining NAs are filled with the mean

conclude_df=pd.DataFrame()
for n_out in range(1,n_future+1):
    X,y,xlabels = to_supervise(data,target,n_out)
    criteria = ('rss', 'gcv', 'nb_subsets')
    model = Earth(enable_pruning = True,
                #   max_degree=3,
                #  max_terms=20,
                minspan_alpha=.5,
                feature_importance_type=criteria,
                verbose=True)
    model.fit(X,y,xlabels=xlabels)
    nbsub = model.summary_feature_importances(sort_by='nb_subsets')[:2000].split()[3:83]
    gcv = model.summary_feature_importances(sort_by='gcv')[:2000].split()[3:83]
    rss = model.summary_feature_importances(sort_by='rss')[:2000].split()[3:83]
    
    rss,gcv,nbsub = toDF(rss),toDF(gcv),toDF(nbsub)
    top20 = pd.concat([rss, gcv, nbsub], ignore_index=True).drop_duplicates('feature')
    top20['timestep'] = n_out

    #ADDED combine all result
    conclude_df = pd.concat([conclude_df,top20],ignore_index=True)
    if mode=='day':
Example #56
'''
Created on Feb 15, 2016

@author: jason
'''

from .sklearntools import MultipleResponseEstimator, BackwardEliminationEstimatorCV, \
    QuantileRegressor, ResponseTransformingEstimator
from pyearth import Earth
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV

outcomes = ['admission_rate', 'prescription_cost_rate', '']

[('earth', Earth(max_degree=2)), ('elim', BackwardEliminationEstimatorCV())]


Example #57
normalized_X = preprocessing.normalize(X)
# standardize the data attributes
standardized_X = preprocessing.scale(X)

# feature selection
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)
# display the relative importance of each attribute
print(model.feature_importances_)

# MARS implementation
from matplotlib import pyplot
# create model
from pyearth import Earth
model = Earth()

# fit the earth model
model.fit(X, y)
print(" Model:")
print(model)

# make predictions
expected = y
predicted = model.predict(X)

# since the quality can only be a number, round all the outputs off
for i in range(len(predicted)):
    predicted[i] = int(round(predicted[i]))

# check how far the predictions are from actual values
Example #58
test = context.catalog.load('test_maskedv2')
variable_descriptions = context.catalog.load('variable_descriptions_v2')  # assumed name; the original left-hand side was lost in extraction
sample_submission = context.catalog.load('samplesubmissionv2')

#%%
train.target_pct_vunerable.hvplot.kde()


# %%
transformer = Pipeline([('poly', PolynomialFeatures()),
                        ('scale', StandardScaler()),
                        ('pca', PCA(15)),
                        ('rescale', StandardScaler())])

glm = TweedieGLM(power=0, max_iter=1000)
mars = Earth()      
model = Pipeline([('transformer', transformer),
                  ('model', mars)])

offset = 1e-9
def add(y):
    return (y/100 + offset)

def subtract(y):
    return ((y) - offset)*100


link = Pipeline([('function', FunctionTransformer(add, subtract, validate=True))])
scorer = get_scorer('neg_root_mean_squared_error')

pipeline = TransformedTargetRegressor(regressor=model, transformer=link)
Example #59
def test_score():
    earth = Earth(**default_params)
    model = earth.fit(X, y)
    record = model.pruning_trace()
    rsq = record.rsq(record.get_selected())
    assert_almost_equal(rsq, model.score(X, y))
Example #60
total_data = pd.concat([
    total_category_data,
    total_numeric_data.clip(total_numeric_data.quantile(0.01).to_dict(),
                            total_numeric_data.quantile(0.99).to_dict(),
                            axis=1)
],
                       axis=1)
print(total_data.shape)
total_data = total_data.fillna(total_data.mean())

print(total_data.head(5))

train_data = total_data[total_data.index < 1460]
test_data = total_data[total_data.index >= 1460]

rfe = RFE(Earth(), step=15, verbose=2).fit(train_data, train_Y)
validKeys = list(train_data.columns[rfe.support_])

train_data = train_data[validKeys]
test_data = test_data[validKeys]

model = Earth().fit(train_data, train_Y)
predict = model.predict(test_data)
predict = np.exp(predict)

submission = pd.DataFrame()
submission['Id'] = test_index
submission['SalePrice'] = predict
submission.to_csv(
    "C:\\Users\\hongj\\Desktop\\kaggle\\house_price\\submission.csv",
    index=False)