Example #1
0
def test_fit():
    """Regression-check a default-parameter Earth fit against stored output.

    Fits ``Earth(**default_params)`` on the module-level ``X`` and ``y``,
    joins the model trace and summary with a newline and asserts that the
    text equals the contents of ``earth_regress.txt`` next to this file.
    """
    model = Earth(**default_params)
    model.fit(X, y)
    trace_text = str(model.trace())
    actual = trace_text + '\n' + model.summary()

    here = os.path.dirname(__file__)
    expected_path = os.path.join(here, 'earth_regress.txt')
    with open(expected_path, 'r') as expected_file:
        expected = expected_file.read()

    assert_equal(actual, expected)
Example #2
0
def test_smooth():
    """Regression-check a smoothed Earth fit against stored output.

    Fits ``Earth(penalty=1, smooth=True)`` on the module-level ``X`` and
    ``y`` and asserts that trace + newline + summary equals the contents of
    ``earth_regress_smooth.txt`` next to this file.
    """
    smooth_model = Earth(penalty=1, smooth=True)
    smooth_model.fit(X, y)
    trace_text = str(smooth_model.trace())
    actual = trace_text + '\n' + smooth_model.summary()

    here = os.path.dirname(__file__)
    expected_path = os.path.join(here, 'earth_regress_smooth.txt')
    with open(expected_path, 'r') as expected_file:
        expected = expected_file.read()

    assert_equal(actual, expected)
Example #3
0
def test_fit():
    """Compare the text report of a default Earth fit with the reference file.

    The report is ``str(trace) + '\\n' + summary`` of an
    ``Earth(**default_params)`` model fitted on the module-level ``X``/``y``;
    the reference is ``earth_regress.txt`` in this file's directory.
    """
    fitted = Earth(**default_params)
    fitted.fit(X, y)
    report = str(fitted.trace())
    report = report + '\n' + fitted.summary()

    reference_path = os.path.join(
        os.path.dirname(__file__), 'earth_regress.txt')
    with open(reference_path, 'r') as reference_file:
        reference = reference_file.read()

    assert_equal(report, reference)
Example #4
0
def test_smooth():
    """Compare the text report of a smoothed Earth fit with the reference file.

    The report is ``str(trace) + '\\n' + summary`` of an
    ``Earth(penalty=1, smooth=True)`` model fitted on the module-level
    ``X``/``y``; the reference is ``earth_regress_smooth.txt`` in this
    file's directory.
    """
    fitted = Earth(penalty=1, smooth=True)
    fitted.fit(X, y)
    report = str(fitted.trace())
    report = report + '\n' + fitted.summary()

    reference_path = os.path.join(
        os.path.dirname(__file__), 'earth_regress_smooth.txt')
    with open(reference_path, 'r') as reference_file:
        reference = reference_file.read()

    assert_equal(report, reference)
Example #5
0
def test_linvars():
    """Regression-check an Earth fit where variables 0-9 are passed as ``linvars``.

    Fits ``Earth(**default_params)`` on the module-level ``X`` and ``y`` with
    ``linvars`` set to the indices 0 through 9 and asserts that
    trace + newline + summary equals the contents of
    ``earth_linvars_regress.txt`` next to this file.
    """
    model = Earth(**default_params)
    model.fit(X, y, linvars=list(range(10)))
    trace_text = str(model.trace())
    actual = trace_text + '\n' + model.summary()

    here = os.path.dirname(__file__)
    expected_path = os.path.join(here, 'earth_linvars_regress.txt')
    # To regenerate the reference file, temporarily write ``actual`` to
    # ``expected_path`` (opened in 'w' mode) before the read below.
    with open(expected_path, 'r') as expected_file:
        expected = expected_file.read()

    assert_equal(actual, expected)
Example #6
0
def test_linvars():
    """Compare the report of a ``linvars`` fit with the reference file.

    An ``Earth(**default_params)`` model is fitted on the module-level
    ``X``/``y`` with ``linvars`` covering indices 0-9; its
    ``str(trace) + '\\n' + summary`` text must equal
    ``earth_linvars_regress.txt`` in this file's directory.
    """
    linear_indices = list(range(10))
    fitted = Earth(**default_params)
    fitted.fit(X, y, linvars=linear_indices)
    report = str(fitted.trace())
    report = report + '\n' + fitted.summary()

    reference_path = os.path.join(
        os.path.dirname(__file__), 'earth_linvars_regress.txt')
    # Reference regeneration hint: write ``report`` to ``reference_path``
    # (mode 'w') once, then restore this read-only comparison.
    with open(reference_path, 'r') as reference_file:
        reference = reference_file.read()

    assert_equal(report, reference)
Example #7
0
def mars(p, xLabels, yLabel):
    """Fit a MARS model on a shuffled train/test split and plot importances.

    The rows of ``p`` are shuffled, roughly the last 20% are held out for
    testing and an ``Earth`` model is fitted on the rest. Residuals are
    plotted through ``plotResiduals`` and one bar chart of feature
    importances is produced per criterion ('rss', 'gcv', 'nb_subsets').

    :param p: pandas DataFrame holding the feature columns and the target
    :param xLabels: sequence of feature column names in ``p``; each name must
        also be a key of the module-level ``featureToLabel`` mapping
    :param yLabel: name of the target column in ``p``
    :return: tuple ``(r2, mse)`` computed on the held-out rows

    Side effects: prints the model trace and summary, saves one image per
    criterion to ``image_path.format(image_num)`` and increments the global
    ``image_num`` after each image.
    """
    global image_num
    criteria = ('rss', 'gcv', 'nb_subsets')
    # Randomly shuffle rows
    p = p.sample(frac=1).reset_index(drop=True)
    # Split train and test using a non-negative split index. A negative
    # offset would collapse to 0 for very small frames and ``a[:-0]`` is
    # ``a[:0]``, i.e. an empty training set. ``int(...)`` keeps the index
    # usable as a slice bound even where ``round`` returns a float.
    n_rows = p.shape[0]
    split_at = n_rows - int(round(n_rows * 0.2))
    n = len(xLabels)
    xCol = p[xLabels].values.reshape(-1, n)
    yCol = p[yLabel].values.reshape(-1, 1)
    X_train, X_test = xCol[:split_at], xCol[split_at:]
    y_train, y_test = yCol[:split_at], yCol[split_at:]
    # Fit MARS model
    model = Earth(feature_importance_type=criteria)
    model.fit(X_train, y_train)
    # Make predictions
    predicted = model.predict(X_test)
    r2 = r2_score(y_test, predicted)
    mse = mean_squared_error(y_test, predicted)
    predicted = predicted.reshape(-1, 1)
    # Plot residuals
    plotResiduals(y_test, predicted)
    # Print summary
    print(model.trace())
    print(model.summary())
    # Plot feature importances, one chart per criterion
    importances = model.feature_importances_
    positions = list(range(len(xLabels)))
    for crit in criteria:
        # Rank features from most to least important for this criterion.
        ranked = sorted(zip(importances[crit], xLabels), reverse=True)
        coeff = [abs(score) for score, _ in ranked]
        feature = [featureToLabel[name] for _, name in ranked]
        plt.clf()
        plt.xticks(positions, feature, rotation='vertical')
        plt.bar(positions, coeff, align='center', alpha=0.5)
        plt.xlabel('Features')
        plt.ylabel("Importance (" + crit + ")")
        plt.tight_layout()
        # Save before showing: once an interactive window opened by
        # ``show()`` is closed the figure may be gone, which would make a
        # later ``savefig`` write an empty image.
        plt.savefig(image_path.format(image_num), bbox_inches='tight')
        plt.show()
        image_num += 1
    return r2, mse
Example #8
0
# ``x`` is an alias of ``train`` (not a copy), so the column removals below
# also modify ``train``.
x = train
for _dropped in ('<HIGH>', '<LOW>', '<CLOSE>', '<VOL>'):
    del x[_dropped]

# Every column whose dtype is not "object" is treated as numeric.
_non_object = x.dtypes != "object"
numeric_feats = x.dtypes[_non_object].index
# The skew values are computed but only the resulting index is kept, so all
# numeric columns are log-transformed below.
# NOTE(review): a skew threshold filter may have been intended here - confirm.
skewed_feats = train[numeric_feats].apply(
    lambda col: skew(col.dropna())).index

x[skewed_feats] = np.log1p(x[skewed_feats])

# Fit MARS; ``y`` is expected to be defined earlier in the script.
mars = Earth(allow_missing=True)
mars.fit(x, y)
print(mars.trace())
print(mars.summary())

def inverse(x):
    """Undo a ``log1p`` transform, i.e. return ``exp(x) - 1``.

    ``np.expm1`` is used instead of ``np.exp(x) - 1`` because it stays
    accurate for values of ``x`` close to zero, where the explicit
    subtraction cancels to 0.

    :param x: scalar or array-like of log1p-transformed values
    :return: the values mapped back to the original scale
    """
    return np.expm1(x)

def graph(x, y, y2, a, b, Title):
    """Plot actual versus predicted values over the slice ``[a:b]``.

    :param x: values for the horizontal axis
    :param y: actual values, drawn in red and labelled 'Actual'
    :param y2: predicted values, drawn in blue and labelled 'Predicted'
    :param a: start index of the plotted window
    :param b: end index (exclusive) of the plotted window
    :param Title: title of the figure

    A new figure is created and shown; nothing is returned.
    """
    window = slice(a, b)
    plt.figure()
    for series, colour, name in ((y, 'r', 'Actual'), (y2, 'b', 'Predicted')):
        plt.plot(x[window], series[window], colour, label=name)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title(Title)
    plt.legend(loc='upper left')
    plt.show()
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from pyearth import Earth
from matplotlib import pyplot

# NOTE(review): recent pandas releases spell this keyword ``sheet_name``;
# it is left unchanged here because the installed pandas version is unknown.
df = pd.read_excel('relay-foods.xlsx', sheetname='Purchase Data - Full Study')
# Treat the identifier columns as categorical values.
df['OrderId'] = df['OrderId'].astype('category')
df['CommonId'] = df['CommonId'].astype('category')

# The date columns are dropped before modelling.
col_names = ['OrderDate', 'PickupDate']
df = df.drop(col_names, axis=1)
y = df['TotalCharges']
df_2 = df[['OrderId', 'UserId', 'PupId']]
# One dict per row, as expected by DictVectorizer. ``Series.items`` is used
# because ``Series.iteritems`` no longer exists in current pandas.
X = [dict(r.items()) for _, r in df_2.iterrows()]
train_fea = DictVectorizer().fit_transform(X)

#Fit an Earth model
model = Earth()
model.fit(train_fea, y)

#Print the model
print(model.trace())
print(model.summary())

#Plot the model
# Predict from the vectorized features the model was fitted on; the raw
# list of dicts in ``X`` is not a valid model input.
y_hat = model.predict(train_fea)
#=========================================================================
# V-Function Example
#=========================================================================
# Synthetic data: ``m`` rows of ``n`` uniform features scaled to [-40, 40).
# The response is a V-shaped function of column 6 plus unit normal noise.
numpy.random.seed(0)
m, n = 1000, 10
X = numpy.random.uniform(size=(m, n)) * 80 - 40
y = numpy.abs(X[:, 6] - 4.0) + numpy.random.normal(size=m)

# Fit an Earth model with default settings
model = Earth()
model.fit(X, y)

# Print the forward/pruning trace followed by the model summary
for _report in (model.trace, model.summary):
    print(_report())

# Plot the data (red) and the fitted values (blue) against column 6
y_hat = model.predict(X)
pyplot.figure()
for _values, _style in ((y, 'r.'), (y_hat, 'b.')):
    pyplot.plot(X[:, 6], _values, _style)
pyplot.xlabel('x_6')
pyplot.ylabel('y')
pyplot.title('Simple Earth Example')
pyplot.savefig('simple_earth_example.png')

#=========================================================================
# Hinge plot
#=========================================================================
Example #11
0
import numpy
from pyearth import Earth
from matplotlib import pyplot

# Create some fake data: m rows of n uniform features on [-40, 40); the
# response is a V-shaped function of column 6 plus unit normal noise.
numpy.random.seed(2)
m = 1000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
y = numpy.abs(X[:, 6] - 4.0) + 1 * numpy.random.normal(size=m)

# Fit an Earth model restricted to additive terms (max_degree=1)
model = Earth(max_degree=1)
model.fit(X, y)

# Print the model. The call form of ``print`` is used because the
# statement form is a syntax error on Python 3; with a single argument the
# call form prints the same text on Python 2.
print(model.trace())
print(model.summary())

# Plot the model: data in red, fitted values in blue, against column 6
y_hat = model.predict(X)
pyplot.figure()
pyplot.plot(X[:, 6], y, 'r.')
pyplot.plot(X[:, 6], y_hat, 'b.')
pyplot.show()
Example #12
0
def csc(df, hamming_string_dict, outdir, filename):
    """CRISPR Specificity Correction

    Builds a table of per-gRNA Hamming metrics, fits an ``Earth`` (MARS)
    model that predicts the input metric from those confounders and, when the
    hold-out RMSE is at most 1.0, writes the corrected values and the pickled
    model to ``outdir``.

    :param df: pandas dataframe with first column as gRNA and second column as logFC/metric
    :param hamming_string_dict: CSC onboard dictionary object with key as gRNA and value as Hamming metrics
    :param outdir: absolute filepath to output directory
    :param filename: name of input file to be used as part of output filename
    :return: 'T' when the model was trained and the corrected output was
        written, 'F' when training failed or the hold-out RMSE exceeded 1.0

    """
    # MARS compatible file: one row per usable gRNA holding the gRNA, its
    # Hamming metrics and the original value.
    df_mars_lst = []
    for row in np.asarray(df):
        grna, metric = row[0], row[1]
        try:
            metric = float(metric)
        except (TypeError, ValueError):
            # TypeError covers values such as None that float() rejects
            # without raising ValueError.
            sys.stdout.write(
                'WARNING: encountered %s which is not float compatible, skipping\n'
                % metric)
            continue
        try:
            hamming_metrics = hamming_string_dict[grna]
        except KeyError:
            sys.stdout.write('\n%s not found in selected library: passing\n' %
                             grna)
            continue
        df_mars_lst.append([grna] + list(hamming_metrics) + [metric])

    df = pd.DataFrame(df_mars_lst,
                      columns=[
                          'gRNA', 'specificity', 'h0', 'h1', 'h2', 'h3',
                          'original_value'
                      ])

    # exclude infinite specificity non-target gRNAs; copy so that the
    # result columns added further down are written to an independent frame
    # rather than to a slice of the unfiltered one
    df = df[df['h0'] != 0].copy()

    # isolate pertinent confounder variables
    df_confounders = df[['specificity', 'h0', 'h1', 'h2', 'h3']]

    # training and testing data (10% held out, fixed seed)
    train_x, test_x, train_y, test_y = train_test_split(df_confounders,
                                                        df['original_value'],
                                                        test_size=0.10,
                                                        random_state=1)

    # Fit an Earth model
    model = Earth(feature_importance_type='gcv')
    try:
        model.fit(train_x, train_y)
    except ValueError:
        sys.stdout.write(
            '\nValue Error encountered. Model unable to be trained. Exiting CSC Novo\n'
        )
        sys.stdout.write(
            'training input x data\n %s\ntraining input y data\n %s\n' %
            (train_x, train_y))
        return 'F'

    # Print the model
    print(model.trace())
    print(model.summary())
    print(model.summary_feature_importances())

    # Predict the held-out rows
    y_hat = model.predict(test_x)

    # calculating RMSE values
    rms1 = sqrt(mean_squared_error(test_y, y_hat))
    print('\n\nRMSE on Predictions\n\n')
    print(rms1)

    # calculating R^2 for training
    print('\n\nR^2 on Training Data\n\n')
    print(model.score(train_x, train_y))

    # calculating R^2 for testing
    print('\n\nR^2 on Testing Data\n\n')
    print(model.score(test_x, test_y))

    # write out model metrics
    with open('%s/csc_model_metrics_%s.txt' % (outdir, filename),
              'w') as outfile:
        outfile.write('%s\n%s\n%s\nRMSE on Predictions\n%s' %
                      (model.trace(), model.summary(),
                       model.summary_feature_importances(), rms1))

    if rms1 > 1.0:
        sys.stdout.write(
            '\nCSC adjustment not computed as model residual mean squared error exceeds 1.0\n'
        )
        return 'F'

    # full data prediction
    df['earth_adjustment'] = model.predict(df_confounders)

    # CSC correction
    df['earth_corrected'] = df['original_value'] - df['earth_adjustment']

    # main write out
    df.to_csv('%s/csc_output_%s_earth_patched.csv' % (outdir, filename))

    # pickle write out; the context manager closes the file even if the
    # dump raises
    with open('%s/csc_output_%s_earth_model.pl' % (outdir, filename),
              'wb') as model_file:
        pl.dump(model, model_file)

    sys.stdout.write('\nCSC adjustment complete\n')
    sys.stdout.write('\nCSC output files written to %s\n' % outdir)
    return 'T'
Example #13
0
# test = pd.read_csv('boston_test_data.csv')
# X_test = np.array(test.iloc[:, 0:13])
# X_test_id = test.iloc[:, 0]

# Synthetic warm-up data: m rows of n uniform features on [-40, 40); the
# response is a V-shaped function of column 6 plus unit normal noise.
np.random.seed(0)
m = 1000
n = 10
X = 80 * np.random.uniform(size=(m, n)) - 40
y = np.abs(X[:, 6] - 4.0) + 1 * np.random.normal(size=m)

#Fit an Earth model
model = Earth()
model.fit(X, y)

#Print the model
print(model.trace())
print(model.summary())

# From here on X and y refer to the Boston housing data.
# NOTE(review): ``load_boston`` is not available in recent scikit-learn
# releases - confirm the pinned version.
X, y = load_boston(return_X_y=True)
model_rsq_dic = {}

# % lower status of the population: column 12 of every row, as one column.
# Built with a plain comprehension instead of one used only for its
# ``append`` side effect.
lstat_x = np.array([row[12] for row in X]).reshape(-1, 1)

#lstat_x = X
print(lstat_x.shape)
y = y.reshape(-1, 1)
print(y.shape)
Example #14
0
class TestEarth(object):
    """Tests for the ``Earth`` estimator on a small seeded synthetic data set.

    ``__init__`` builds the shared fixture: ``self.X`` (100 x 10 design
    matrix), ``self.y`` (response generated from a known basis plus noise)
    and ``self.earth`` (an ``Earth(penalty=1)`` instance).
    """

    def __init__(self):
        """Build the deterministic fixture used by the test methods."""
        numpy.random.seed(0)
        # A basis with a constant term, a mirrored hinge pair and a linear
        # term that has the first hinge as its parent.
        self.basis = Basis(10)
        constant = ConstantBasisFunction()
        self.basis.append(constant)
        bf1 = HingeBasisFunction(constant, 0.1, 10, 1, False, 'x1')
        bf2 = HingeBasisFunction(constant, 0.1, 10, 1, True, 'x1')
        bf3 = LinearBasisFunction(bf1, 2, 'x2')
        self.basis.append(bf1)
        self.basis.append(bf2)
        self.basis.append(bf3)
        self.X = numpy.random.normal(size=(100, 10))
        # B receives the basis expansion of X (one column per basis function).
        self.B = numpy.empty(shape=(100, 4), dtype=numpy.float64)
        self.basis.transform(self.X, self.B)
        self.beta = numpy.random.normal(size=4)
        self.y = numpy.empty(shape=100, dtype=numpy.float64)
        self.y[:] = numpy.dot(
            self.B, self.beta) + numpy.random.normal(size=100)
        self.earth = Earth(penalty=1)

    def test_get_params(self):
        """``get_params`` reports None for unset options and echoes set ones."""
        assert_equal(
            Earth().get_params(), {'penalty': None, 'min_search_points': None,
                                   'endspan_alpha': None, 'check_every': None,
                                   'max_terms': None, 'max_degree': None,
                                   'minspan_alpha': None, 'thresh': None,
                                   'minspan': None, 'endspan': None,
                                   'allow_linear': None, 'smooth': None})
        assert_equal(
            Earth(
                max_degree=3).get_params(), {'penalty': None,
                                             'min_search_points': None,
                                             'endspan_alpha': None,
                                             'check_every': None,
                                             'max_terms': None, 'max_degree': 3,
                                             'minspan_alpha': None,
                                             'thresh': None, 'minspan': None,
                                             'endspan': None,
                                             'allow_linear': None,
                                             'smooth': None})

    @if_statsmodels
    def test_linear_fit(self):
        """Coefficients from the private linear fit match statsmodels OLS/GLS."""
        from statsmodels.regression.linear_model import GLS, OLS
        # Unweighted case: compare against ordinary least squares.
        self.earth.fit(self.X, self.y)
        self.earth._Earth__linear_fit(self.X, self.y)
        soln = OLS(self.y, self.earth.transform(self.X)).fit().params
        assert_almost_equal(numpy.mean((self.earth.coef_ - soln) ** 2), 0.0)

        # Weighted case: compare against GLS given the inverse weights.
        sample_weight = 1.0 / (numpy.random.normal(size=self.y.shape) ** 2)
        self.earth.fit(self.X, self.y)
        self.earth._Earth__linear_fit(self.X, self.y, sample_weight)
        soln = GLS(self.y, self.earth.transform(
            self.X), 1.0 / sample_weight).fit().params
        assert_almost_equal(numpy.mean((self.earth.coef_ - soln) ** 2), 0.0)

    def test_sample_weight(self):
        """Sample weights steer the fit and are honoured by ``score``."""
        # Rows in ``group`` get weight 1/101, all other rows get weight 1.
        group = numpy.random.binomial(1, .5, size=1000) == 1
        sample_weight = 1 / (group * 100 + 1.0)
        x = numpy.random.uniform(-10, 10, size=1000)
        y = numpy.abs(x)
        y[group] = numpy.abs(x[group] - 5)
        y += numpy.random.normal(0, 1, size=1000)
        model = Earth().fit(x, y, sample_weight=sample_weight)

        # Check that the model fits better for the more heavily weighted group
        assert_true(model.score(x[group], y[group]) < model.score(
            x[numpy.logical_not(group)], y[numpy.logical_not(group)]))

        # Make sure that the score function gives the same answer as the trace
        pruning_trace = model.pruning_trace()
        rsq_trace = pruning_trace.rsq(model.pruning_trace().get_selected())
        assert_almost_equal(model.score(x, y, sample_weight=sample_weight),
                            rsq_trace)

        # Uncomment below to see what this test situation looks like
#        from matplotlib import pyplot
#        print model.summary()
#        print model.score(x,y,sample_weight = sample_weight)
#        pyplot.figure()
#        pyplot.plot(x,y,'b.')
#        pyplot.plot(x,model.predict(x),'r.')
#        pyplot.show()

    def test_fit(self):
        """Trace + summary of the fixture fit equals ``earth_regress.txt``."""
        self.earth.fit(self.X, self.y)
        res = str(self.earth.trace()) + '\n' + self.earth.summary()
        filename = os.path.join(os.path.dirname(__file__),
                                'earth_regress.txt')
        with open(filename, 'r') as fl:
            prev = fl.read()
        assert_equal(res, prev)

    def test_smooth(self):
        """Trace + summary of a smoothed fit equals ``earth_regress_smooth.txt``."""
        model = Earth(penalty=1, smooth=True)
        model.fit(self.X, self.y)
        res = str(model.trace()) + '\n' + model.summary()
        filename = os.path.join(os.path.dirname(__file__),
                                'earth_regress_smooth.txt')
        with open(filename, 'r') as fl:
            prev = fl.read()
        assert_equal(res, prev)

    def test_linvars(self):
        """Trace + summary of a ``linvars`` fit equals ``earth_linvars_regress.txt``."""
        self.earth.fit(self.X, self.y, linvars=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        res = str(self.earth.trace()) + '\n' + self.earth.summary()
        filename = os.path.join(os.path.dirname(__file__),
                                'earth_linvars_regress.txt')
        with open(filename, 'r') as fl:
            prev = fl.read()
        assert_equal(res, prev)

    def test_score(self):
        """``score`` on the training data equals the R^2 of the selected subset."""
        model = self.earth.fit(self.X, self.y)
        record = model.pruning_trace()
        rsq = record.rsq(record.get_selected())
        assert_almost_equal(rsq, model.score(self.X, self.y))

    @if_pandas
    @if_environ_has('test_pathological_cases')
    def test_pathological_cases(self):
        """Summaries for the stored pathological data sets match their references.

        Each case ``<name>`` reads ``pathological_data/<name>.csv`` and is
        compared with ``pathological_data/<name>.txt``; the settings dict is
        passed to ``Earth`` after the optional ``sample_weight`` file name
        has been removed from it.
        """
        import pandas
        directory = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'pathological_data')
        cases = {'issue_44': {},
                 'issue_50': {'penalty': 0.5,
                              'minspan': 1,
                              'allow_linear': False,
                              'endspan': 1,
                              'check_every': 1,
                              'sample_weight': 'issue_50_weight.csv'}}
        # ``items()`` works on both Python 2 and 3; ``iteritems()`` exists
        # only on Python 2.
        for case, settings in cases.items():
            data = pandas.read_csv(os.path.join(directory, case + '.csv'))
            y = data['y']
            del data['y']
            X = data
            if 'sample_weight' in settings:
                filename = os.path.join(directory, settings['sample_weight'])
                sample_weight = pandas.read_csv(filename)['sample_weight']
                # Not an Earth constructor argument, so drop it before use.
                del settings['sample_weight']
            else:
                sample_weight = None
            model = Earth(**settings)
            model.fit(X, y, sample_weight=sample_weight)
            with open(os.path.join(directory, case + '.txt'), 'r') as infile:
                correct = infile.read()
            assert_equal(model.summary(), correct)

    @if_pandas
    def test_pandas_compatibility(self):
        """DataFrame column names end up as the ``xlabels`` of the forward trace."""
        import pandas
        X = pandas.DataFrame(self.X)
        y = pandas.DataFrame(self.y)
        colnames = ['xx' + str(i) for i in range(X.shape[1])]
        X.columns = colnames
        model = self.earth.fit(X, y)
        assert_list_equal(
            colnames, model.forward_trace()._getstate()['xlabels'])

    @if_patsy
    @if_pandas
    def test_patsy_compatibility(self):
        """Design matrices built by patsy keep their column names as ``xlabels``."""
        import pandas
        import patsy
        X = pandas.DataFrame(self.X)
        y = pandas.DataFrame(self.y)
        colnames = ['xx' + str(i) for i in range(X.shape[1])]
        X.columns = colnames
        X['y'] = y
        y, X = patsy.dmatrices(
            'y ~ xx0 + xx1 + xx2 + xx3 + xx4 + xx5 + xx6 + xx7 + xx8 + xx9 - 1',
            data=X)
        model = self.earth.fit(X, y)
        assert_list_equal(
            colnames, model.forward_trace()._getstate()['xlabels'])

    def test_pickle_compatibility(self):
        """A pickle round trip yields an equal model with an intact basis tree."""
        model = self.earth.fit(self.X, self.y)
        model_copy = pickle.loads(pickle.dumps(model))
        assert_true(model_copy == model)
        assert_true(
            numpy.all(model.predict(self.X) == model_copy.predict(self.X)))
        # The root of the second basis function must be the very same
        # object as the first basis function, before and after the round trip.
        assert_true(model.basis_[0] is model.basis_[1]._get_root())
        assert_true(model_copy.basis_[0] is model_copy.basis_[1]._get_root())

    def test_copy_compatibility(self):
        """``copy.copy`` yields an equal model with an intact basis tree."""
        model = self.earth.fit(self.X, self.y)
        model_copy = copy.copy(model)
        assert_true(model_copy == model)
        assert_true(
            numpy.all(model.predict(self.X) == model_copy.predict(self.X)))
        # Same identity check as for the pickle round trip.
        assert_true(model.basis_[0] is model.basis_[1]._get_root())
        assert_true(model_copy.basis_[0] is model_copy.basis_[1]._get_root())