train_df = pd.read_csv('/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(train_datasets[p]), index_col=0)
        sub = np.load('/home/fotis/dev_projects/explanation_framework/input/Subqueries/{0}'.format(sub_datasets[p]))

        logger.info('Finished loading\nCommencing Evaluation')
        aggregates = ['count','sum_','avg']
        agg_map = {'count' :4, 'sum_':5, 'avg':6}
        for agg in aggregates:
            logger.info("Evaluating Aggregates : {0}".format(agg))
            X_train = train_df[['x','y','x_range','y_range']].values
            y_train = train_df[agg].values
            sc = StandardScaler()
            sc.fit(X_train)
            X_train = sc.transform(X_train)
            #Training Models
            logger.info("Model Training Initiation\n=====================")
            mars_ = Earth(feature_importance_type='gcv',)
            vigilance_x = np.linspace(0.01, 3, Config.vigilance_x_frequency)
            for sens_x in vigilance_x:
                lsnr = PR(mars_,vigil_x=sens_x)
                lsnr.fit(X_train,y_train)



                logger.info("Accuracy Evaluation on Test set with vigil_x={0}\n=====================".format(sens_x))
                for i in range(1000):
                    #Obtain query from test-set
                    dataset = p
                    printProgressBar(i, 1000,prefix = 'Progress:', suffix = 'Complete', length = 50)

                    q = test_df.iloc[i].values[:4].reshape(1,-1)
                    q = sc.transform(q)
# Regression target and feature matrix: the visitor id and the target column
# itself are removed from the features.
y = train['transactionRevenue']
X =train.drop(["fullVisitorId","transactionRevenue"],axis=1)  


from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor 
from pyearth import Earth



# Candidate regressors that are fitted one after another in the loop below.
regression_OLS=LinearRegression()
regression_Lasso=Lasso(precompute=True,max_iter=10000,alpha=3.0)
# NOTE(review): max_features=100 presumably requires X to have at least 100
# columns -- confirm against the data.
regression_RF=RandomForestRegressor(max_leaf_nodes=100,max_features=100)
#regression_SVR=SVR(kernel='rbf', C=1e3, gamma=0.1)
regression_spline = Earth()

# Hold out 20% of the rows for testing (no random_state, so the split differs
# between runs).
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

# X_*.shape is a (rows, columns) tuple, which fills both %d placeholders.
print('training data has %d observation with %d features'% X_train.shape)
print('test data has %d observation with %d features'% X_test.shape)

from sklearn.metrics import mean_squared_error,explained_variance_score 

# model_names[i] is the display name of model_list[i].
model_names = ['Linear Regression OLS','Linear Regression Lasso','Random Forest','Spline_regression']
model_list = [regression_OLS,regression_Lasso, regression_RF,regression_spline]
count = 0
# Fit every candidate on the training split and predict the held-out rows.
for regression in model_list:
    model = regression.fit(X_train,y_train)
    y_preds = model.predict(X_test)
Example #3
0
# Load the tab-separated table once. The original read the same source a
# second time with pd.read_table, which uses the same tab separator by default.
df = pd.read_csv(dataset, sep='\t')

# Encode the genotype calls of each caller as integers; values that are not a
# key of gt_mapping become NaN (handled by set_missing_values / allow_missing).
gt_mapping = {'0/0': 0, '0/1': 1, '1/1': 2}
for gt_column in ('GT_GATK', 'GT_Varscan', 'GT_Freebayes'):
    df[gt_column] = df[gt_column].map(gt_mapping)

# Features: first 100 rows, columns from index 5 onwards.
X = df.values[:100, 5:]
X = set_missing_values(X)

# Random binary labels, one per row of X.
y = np.random.randint(2, size=(int(np.shape(X)[0]), ))

# MARS basis expansion (tolerating missing values) followed by a logistic
# regression on the resulting features.
earth_classifier = Pipeline([('earth', Earth(allow_missing=True)),
                             ('logistic', LogisticRegression())])

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=0)

ec = earth_classifier.fit(X_train, y_train)

y_hat = earth_classifier.predict(X_test)
Example #4
0
# Each section below fits one regressor on the training split, predicts on X
# and writes the predictions to a CSV file named after the model.
# NOTE(review): DataFrame.to_csv returns None when given a path, so
# `prediction` does not hold the data frame after these statements.
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval,
                          columns=['predictions']).to_csv('outElasticNet.csv')

# regr_2 is defined outside this snippet; the output file name suggests it is
# the AdaBoost regressor -- confirm.
regr_2.fit(X_train, y_train)
y_eval = regr_2.predict(X)
prediction = pd.DataFrame(y_eval,
                          columns=['predictions'
                                   ]).to_csv('outAdaBoostRegressor.csv')

# Least-angle regression limited to a single non-zero coefficient.
clf = linear_model.Lars(n_nonzero_coefs=1)
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval,
                          columns=['predictions']).to_csv('outLARS.csv')
# The string literal below is disabled code (an earlier AdaBoost variant); it
# is evaluated as a no-op expression statement.
"""
clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outAdaBoostRegressor.csv')
"""

from pyearth import Earth

# MARS model with default settings.
clf = Earth()
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval,
                          columns=['predictions']).to_csv('outMARS.csv')
Example #5
0
test = pd.read_csv('../data/modeltest.csv',index_col=0)
label = train['Response'].values

# Append the extra feature tables to both splits, aligned on the row index.
for feat_path in ('../feat/improve.csv', '../feat/duplicate.csv'):
    featextra = pd.read_csv(feat_path, index_col=0)
    train = pd.concat([train, featextra.loc[train.index]], axis=1)
    test = pd.concat([test, featextra.loc[test.index]], axis=1)

# Every column except the target. Index.drop has no `axis` argument: the `1`
# the original passed landed in `errors`, so it is dropped here.
feat = train.columns.drop('Response')
#Build an Earth model with a logisticregression pipeline
earth_pipe = Pipeline([('earth',Earth(use_fast=True,allow_missing=True,penalty=0.5,max_degree=3)),('log',LogisticRegression())])
earth_pipe.fit(train[feat],label)

#Parameter tuning

#param_grid = {'earth__penalty': np.arange(1,11,2),'earth__max_degree': range(1,4)}
#
#gs1 = GridSearchCV(earth_pipe,param_grid,n_jobs=1,pre_dispatch=1,cv=StratifiedKFold(label, n_folds=5, shuffle=True),scoring='log_loss',verbose=2)
#
#
#gs1.fit(train[feat],label)
#
#print gs1.best_params_
#print gs1.best_score_
#
##----------------------------------------------------------
Example #6
0
def translation_correction(cell_mesh, cell_mesh_2, buffer_cell,
                           x_pos, y_pos, z_pos, x_pos_new, y_pos_new, z_pos_new,
                           closest_no_conflict, directory):
    """Remove a z-dependent translation from the second set of positions.

    Matched point pairs that lie outside the (buffered) bounding box of both
    cell meshes are used to fit three MARS models mapping z position to the
    x, y and z displacement. The fitted displacement is then subtracted from
    the new point positions and from the second cell mesh, and a diagnostic
    plot is written to ``directory + '/translation_correction.png'``.

    Parameters
    ----------
    cell_mesh, cell_mesh_2 : array
        Mesh vertices; columns 0, 1, 2 are read as x, y, z.
    buffer_cell : float
        Margin added around the joint bounding box of both meshes.
    x_pos, y_pos, z_pos : sequence
        Point coordinates in the first configuration.
    x_pos_new, y_pos_new, z_pos_new : sequence
        Point coordinates in the second configuration. Modified in place.
    closest_no_conflict : sequence of int
        For point ``kk`` the index of its match in the ``*_pos_new`` arrays.
    directory : str
        Folder that receives the diagnostic figure.

    Returns
    -------
    tuple
        ``(x_pos_new, y_pos_new, z_pos_new, cell_mesh_2_new)`` where the first
        three are the corrected inputs and the last is a new array.
    """
    # Joint bounding box of both meshes, grown by the buffer on every side.
    x_min = min(np.min(cell_mesh[:, 0]), np.min(cell_mesh_2[:, 0])) - buffer_cell
    x_max = max(np.max(cell_mesh[:, 0]), np.max(cell_mesh_2[:, 0])) + buffer_cell
    y_min = min(np.min(cell_mesh[:, 1]), np.min(cell_mesh_2[:, 1])) - buffer_cell
    y_max = max(np.max(cell_mesh[:, 1]), np.max(cell_mesh_2[:, 1])) + buffer_cell
    z_min = min(np.min(cell_mesh[:, 2]), np.min(cell_mesh_2[:, 2])) - buffer_cell
    z_max = max(np.max(cell_mesh[:, 2]), np.max(cell_mesh_2[:, 2])) + buffer_cell

    # Collect displacement (U, V, W) and new position (X, Y, Z) of every pair.
    # NOTE(review): indices >= len(closest_no_conflict) are skipped --
    # presumably a "no match" sentinel set by the caller; confirm.
    n_candidates = len(closest_no_conflict)
    X, Y, Z, U, V, W = [], [], [], [], [], []
    for kk in range(len(x_pos)):
        idx = closest_no_conflict[kk]
        if idx < n_candidates:
            U.append(x_pos_new[idx] - x_pos[kk])
            V.append(y_pos_new[idx] - y_pos[kk])
            W.append(z_pos_new[idx] - z_pos[kk])
            X.append(x_pos_new[idx])
            Y.append(y_pos_new[idx])
            Z.append(z_pos_new[idx])

    X = np.asarray(X); Y = np.asarray(Y); Z = np.asarray(Z)
    U = np.asarray(U); V = np.asarray(V); W = np.asarray(W)

    # --> limit to points that aren't too close to the cell
    outside_box = ((X < x_min) | (X > x_max)
                   | (Y < y_min) | (Y > y_max)
                   | (Z < z_min) | (Z > z_max))
    Z_safe = Z[outside_box]
    U_safe = U[outside_box]
    V_safe = V[outside_box]
    W_safe = W[outside_box]

    # --> fit MARS models: displacement as a function of z position
    model_U = Earth(max_degree=2, max_terms=10)
    model_U.fit(Z_safe, U_safe)
    model_V = Earth(max_degree=2, max_terms=10)
    model_V.fit(Z_safe, V_safe)
    model_W = Earth(max_degree=2, max_terms=10)
    model_W.fit(Z_safe, W_safe)

    # --> predicted translation at every new point (evaluated before z changes)
    pred_U = model_U.predict(z_pos_new)
    pred_V = model_V.predict(z_pos_new)
    pred_W = model_W.predict(z_pos_new)

    # --> correct new bead positions (element-wise so lists are updated in place)
    for kk in range(len(x_pos_new)):
        x_pos_new[kk] = x_pos_new[kk] - pred_U[kk]
        y_pos_new[kk] = y_pos_new[kk] - pred_V[kk]
        z_pos_new[kk] = z_pos_new[kk] - pred_W[kk]

    # --> correct new cell position
    # All three models take z as their input, so they must be evaluated at the
    # z coordinate of each vertex (the original fed x and y into model_U and
    # model_V, which were never fitted on those coordinates).
    cell_z = cell_mesh_2[:, 2]
    pred_cell_0 = model_U.predict(cell_z)
    pred_cell_1 = model_V.predict(cell_z)
    pred_cell_2 = model_W.predict(cell_z)

    cell_mesh_2_new = np.zeros(cell_mesh_2.shape)
    cell_mesh_2_new[:, 0] = cell_mesh_2[:, 0] - pred_cell_0
    cell_mesh_2_new[:, 1] = cell_mesh_2[:, 1] - pred_cell_1
    cell_mesh_2_new[:, 2] = cell_mesh_2[:, 2] - pred_cell_2

    # --> plot MARS models over the z range of all matched points
    Z_line = np.linspace(np.min(Z), np.max(Z), 100)
    panels = (
        (U, model_U.predict(Z_line), 'b.', 'x raw', 'x displacements'),
        (V, model_V.predict(Z_line), 'r.', 'y raw', 'y displacements'),
        (W, model_W.predict(Z_line), 'g.', 'z raw', 'z displacements'),
    )

    plt.figure(figsize=(15, 5))
    for panel_no, (raw, fit_line, marker, raw_label, title) in enumerate(panels, 1):
        plt.subplot(1, 3, panel_no)
        plt.plot(Z, raw, marker, label=raw_label)
        plt.plot(Z_line, fit_line, 'k--', label='fit')
        plt.xlabel('z position'); plt.ylabel('displacement')
        plt.tight_layout(); plt.legend(); plt.title(title)
    plt.savefig(directory + '/translation_correction.png')

    return x_pos_new, y_pos_new, z_pos_new, cell_mesh_2_new
Example #7
0
def fit_and_predict(X_training,
                    Y_training,
                    X_validation,
                    hprm,
                    assignments=None):
    """Fit the benchmark model selected in ``hprm`` and predict both data sets.

    The inputs are standardised with the mean and standard deviation of the
    training features before fitting.

    Parameters
    ----------
    X_training, X_validation : pandas.DataFrame
        Feature tables (``.values`` is read from them).
    Y_training : pandas.Series or pandas.DataFrame
        Targets. A single-column frame is reduced to that column.
    hprm : dict
        Hyper-parameters. ``hprm['learning.model']`` selects the model
        (``'mars'``, ``'random_forests'``, ``'regression_tree'``, ``'svr'`` or
        ``'xgboost'``); model-specific keys such as ``'mars.thresh'`` or
        ``'svr.C'`` are read for the chosen model only.
    assignments : dict, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(Y_hat_training, Y_hat_validation, model)``.

    Raises
    ------
    ValueError
        If ``hprm['learning.model']`` is not one of the supported names.
    """
    X_mean = X_training.mean(axis=0)
    X_std = X_training.std(axis=0)
    # A constant column has zero spread; dividing by it would turn the whole
    # column into NaN/inf, so leave such columns merely centred.
    X_std = X_std.replace(0, 1.0)
    X_training = (X_training - X_mean) / X_std
    X_validation = (X_validation - X_mean) / X_std

    method = hprm['learning.model']

    # Single-column frame -> Series. Positional access needs .iloc; plain
    # ``Y_training[:, 0]`` is not valid on a DataFrame.
    if Y_training.ndim == 2 and Y_training.shape[1] == 1:
        Y_training = Y_training.iloc[:, 0]

    # These models are only used with a single target here.
    if method in {'xgboost', 'svr', 'mars'}:
        assert Y_training.ndim == 1, \
            '{0} expects a single target column'.format(method)

    if method == 'mars':
        model = Earth(
            verbose=hprm['mars.verbose'],
            thresh=hprm['mars.thresh'],
        )
    elif method == 'random_forests':
        model = RandomForestRegressor(
            n_estimators=hprm['random_forests.n_estimators'])
    elif method == 'regression_tree':
        model = DecisionTreeRegressor()
    elif method == 'svr':
        model = SVR(
            C=hprm['svr.C'],
            epsilon=hprm['svr.epsilon'],
        )
    elif method == 'xgboost':
        model = XGBRegressor()
    else:
        raise ValueError(
            "Unknown 'learning.model': {0!r}".format(method))

    model.fit(
        X_training.values,
        Y_training.values,
    )
    Y_hat_training = model.predict(X_training.values)
    Y_hat_validation = model.predict(X_validation.values)
    return Y_hat_training, Y_hat_validation, model
Example #8
0
    model = Earth(max_terms=50, max_degree=3)
    model.fit(X,y)

    #Print the model
    #print(model.trace())
    print(model.summary())


    print "MARS  degree 5"

    model = Earth(max_terms=20, max_degree=5)
    model.fit(X,y)

    #Print the model
    #print(model.trace())
    print(model.summary())
   
    """

    print "====================================="

    print "MARS  degree 1"
    model = Earth(max_terms=70, max_degree=1)
    print "Score: {}".format ( crossValidation ( model, X, y ) )

    print "MARS  degree 3"
    model = Earth(max_terms=50, max_degree=3)
    crossValidation ( model, X, y )
    print "Score: {}".format ( crossValidation ( model, X, y ) )
Example #9
0
def accuracy_on_higgs():
    """Compare the PR/MARS learner with linear and polynomial regression.

    Loads the Higgs sample, min-max scales the two features, generates random
    range queries around a 7x7 grid of centres and labels each query with the
    mean ``label`` of the rows it selects. The learners are then trained on
    60% of the queries and scored (R2, KL divergence, NRMSE) on the rest; the
    scores are logged and pickled to
    ``output/Accuracy/multiple_methods_higgs.pkl``.
    """
    logger.info("Starting Accuracy Tests on Higgs")
    logger.info("================================")
    df = pd.read_csv('input/sample_higgs_0.01.csv', index_col=0)
    # Drop incomplete rows from the frame itself so that features and labels
    # stay aligned (dropping them from the features only breaks column_stack).
    feature_cols = ['m_bb', 'm_wwbb']
    df = df.dropna(subset=feature_cols)
    X = df[feature_cols].values
    y = df['label']
    min_ = np.min(X, axis=0)
    max_ = np.max(X, axis=0)
    X = (X - min_) / (max_ - min_)
    data = np.column_stack((X, y))
    x = np.linspace(0.1, 0.9, 7)
    xx, yy = np.meshgrid(x, x)
    DIMS = X.shape[1]
    cov = np.identity(DIMS) * 0.001
    cluster_centers = np.column_stack((xx.ravel(), yy.ravel()))
    query_centers = []
    # Generate queries over cluster centers
    for c in cluster_centers:
        queries = np.random.multivariate_normal(np.array(c), cov, size=40)
        query_centers.append(queries)
    query_centers = np.array(query_centers).reshape(-1, DIMS)

    # 1.0/3 keeps the cube root a float division on every Python version.
    ranges = np.random.uniform(low=0.005 ** (1.0 / 3), high=0.25 ** (1.0 / 3),
                               size=(query_centers.shape[0], DIMS))
    queries = []
    empty = 0
    for q, r in zip(query_centers, ranges):
        b = generate_boolean_vector(data, q, r, 2)
        res = data[b]
        if res.shape[0] == 0:
            empty += 1

        # An empty selection is answered with 0.
        ans = float(np.mean(res[:, -1])) if res.shape[0] != 0 else 0
        qt = q.tolist()
        qt += r.tolist()
        qt.append(ans)
        queries.append(qt)
    qs = np.array(queries).reshape(-1, 2 * DIMS + 1)
    X_train, X_test, y_train, y_test = train_test_split(
        qs[:, :qs.shape[1] - 1], qs[:, -1], test_size=0.4, random_state=0)

    def _scores(y_pred):
        """Return (r2, kl, nrmse) of y_pred against the held-out answers."""
        r2_ = metrics.r2_score(y_test, y_pred)
        kl_ = kl_divergence_error(y_test, y_pred)
        nrmse_ = np.sqrt(metrics.mean_squared_error(y_test, y_pred)) / np.mean(y_test)
        return r2_, kl_, nrmse_

    report = "R2 Score : {}\nNRMSE : {}\nKL-Divergence : {}"

    earth = Earth()
    lsnr = PR(earth)
    lsnr.fit(X_train, y_train)
    # The learner selects a local model per query, hence one predict per row.
    y_hat = np.array([float(lsnr.get_model(x.reshape(1, -1)).predict(x.reshape(1, -1)))
                      for x in X_test])
    r2, kl, nrmse = _scores(y_hat)
    logger.info(report.format(r2, nrmse, kl))

    # Linear Regression comparison
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_hat_lr = lr.predict(X_test)
    r2_lr, kl_lr, nrmse_lr = _scores(y_hat_lr)
    # Argument order must follow the labels in the format string
    # (the original logged KL under "NRMSE" and vice versa).
    logger.info(report.format(r2_lr, nrmse_lr, kl_lr))

    dic = {}
    dic['LPM'] = [('r2', r2), ('kl', kl), ('nrmse', nrmse)]
    dic['LR'] = [('r2', r2_lr), ('kl', kl_lr), ('nrmse', nrmse_lr)]

    # Polynomial regression comparison
    for degree in np.arange(3, 10, 2):
        model = make_pipeline(PolynomialFeatures(degree), Ridge())
        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)
        r2_p, kl_p, nrmse_p = _scores(y_hat)
        dic["LR ({})".format(degree)] = [('r2', r2_p), ('kl', kl_p), ('nrmse', nrmse_p)]
        print("R2 for degree {} : {}".format(degree, r2_p))
    logger.info("==============================================")
    with open('output/Accuracy/multiple_methods_higgs.pkl', 'wb') as handle:
        pickle.dump(dic, handle)
    10 * numpy.random.normal(size=m)
y2 = 100 * \
    (numpy.cos((X[:, 5] + X[:, 6]) / 20) - 4.0) + \
    10 * numpy.random.normal(size=m)
y = numpy.concatenate([y1[:, None], y2[:, None]], axis=1)
missing = numpy.random.binomial(1, .2, (m, n)).astype(bool)
X_full = X.copy()
X[missing] = None
idx5 = (1 - missing[:, 5]).astype(bool)
idx6 = (1 - missing[:, 6]).astype(bool)

# Fit an Earth model
model = Earth(max_degree=5,
              minspan_alpha=.5,
              allow_missing=True,
              enable_pruning=True,
              thresh=.001,
              smooth=True,
              verbose=True)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(X)
fig = plt.figure()

for j in [0, 1]:
    ax1 = fig.add_subplot(3, 4, 1 + 2 * j)
Example #11
0
def generate_data(n_samples=50):
    """Return noisy samples of a tent-shaped piecewise-linear function.

    The first segment rises with slope +1 over x in [1, 50]; the second falls
    with slope -1 over x in [50, 100]. Gaussian noise with standard deviation
    2 is added to every sample.

    Parameters
    ----------
    n_samples : int, optional
        Number of points per segment (default 50, as before).

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)``, each of length ``2 * n_samples``.
    """
    breakpoint_x = 50

    # Rising segment.
    alpha1 = 0
    beta1 = 1
    noise1 = 2 * np.random.randn(n_samples)
    x1 = np.linspace(1, breakpoint_x, n_samples)
    y1 = alpha1 + beta1 * x1 + noise1

    # Falling segment; the intercept makes both lines meet at the breakpoint.
    alpha2 = (alpha1 + beta1 * breakpoint_x) * 2
    beta2 = -1
    noise2 = 2 * np.random.randn(n_samples)
    x2 = np.linspace(breakpoint_x, 100, n_samples)
    y2 = alpha2 + beta2 * x2 + noise2

    x = np.concatenate((x1, x2), axis=None)
    y = np.concatenate((y1, y2), axis=None)

    return x, y


# Fit a MARS model limited to two terms on the generated data and plot the
# fitted values over the raw samples.
x, y = generate_data()
mars = Earth(max_terms=2)
mars.fit(x, y)
Y_hat = mars.predict(x)
plt.scatter(x, y)  # raw samples
plt.scatter(x, Y_hat)  # model predictions at the same x
plt.show()
Example #12
0
def getModel(config, modelname):
    """Build the model registered under ``modelname`` for the configured problem.

    The estimator is chosen by ``modelname`` within the problem type given by
    ``config['problem']``, wrapped via ``classifier(...)`` together with the
    parameters from ``getModelData(config, modelname)``, and the result of the
    wrapper's ``get()`` is returned.

    Raises
    ------
    ValueError
        If no model was created for ``modelname`` (unknown name, name not
        available for the problem type, or the regression ``"perceptron"``).
    """
    info("Getting {0} Model".format(modelname), ind=0)

    problemType = config['problem']
    params = getModelData(config, modelname).get('params')
    wrapped = None

    # The tables hold zero-argument factories so that an estimator class is
    # only looked up and instantiated when its name is actually requested.

    #######################################################################
    # Classification
    #######################################################################
    if isClassification(problemType):
        classification_factories = {
            "logistic": lambda: LogisticRegression(),
            "sgd": lambda: SGDClassifier(),
            "passagg": lambda: PassiveAggressiveClassifier(),
            "mlp": lambda: MLPClassifier(),
            "xgboost": lambda: XGBClassifier(),
            "gaussproc": lambda: GaussianProcessClassifier(),
            "lda": lambda: LinearDiscriminantAnalysis(),
            "qda": lambda: QuadraticDiscriminantAnalysis(),
            "nb": lambda: GaussianNB(),
            "nbbern": lambda: BernoulliNB(),
            "nbmulti": lambda: MultinomialNB(),
            "dtree": lambda: DecisionTreeClassifier(),
            "kneighbors": lambda: KNeighborsClassifier(),
            "rneighbors": lambda: RadiusNeighborsClassifier(),
            "svmlin": lambda: LinearSVC(),
            "svmnupoly": lambda: NuSVC(),
            "svmnulinear": lambda: NuSVC(),
            "svmnusigmoid": lambda: NuSVC(),
            "svmnurbf": lambda: NuSVC(),
            "svmepspoly": lambda: SVC(),
            "svmepslinear": lambda: SVC(),
            "svmepssigmoid": lambda: SVC(),
            "svmepsrbf": lambda: SVC(),
            "rf": lambda: RandomForestClassifier(),
            "extratrees": lambda: ExtraTreesClassifier(),
            "adaboost": lambda: AdaBoostClassifier(),
            "gbm": lambda: GradientBoostingClassifier(),
            "tpot": lambda: TPOTClassifier(),
        }
        make_estimator = classification_factories.get(modelname)
        if make_estimator is not None:
            wrapped = classifier(modelname, make_estimator(), params)
        elif modelname == "lightning":
            # Built by the external helper instead of the generic wrapper.
            wrapped = external.extlightning.createLightningClassifier(params)

    #######################################################################
    # Regression
    #######################################################################
    if isRegression(problemType):
        regression_factories = {
            "linear": lambda: LinearRegression(),
            "ridge": lambda: Ridge(),
            "lasso": lambda: Lasso(),
            "elasticnet": lambda: ElasticNet(),
            "omp": lambda: OrthogonalMatchingPursuit(),
            "bayesridge": lambda: BayesianRidge(),
            "ard": lambda: ARDRegression(),
            "sgd": lambda: SGDRegressor(),
            "passagg": lambda: PassiveAggressiveRegressor(),
            "huber": lambda: HuberRegressor(),
            "theilsen": lambda: TheilSenRegressor(),
            "ransac": lambda: RANSACRegressor(),
            "mlp": lambda: MLPRegressor(),
            "xgboost": lambda: XGBRegressor(),
            "gaussproc": lambda: GaussianProcessRegressor(),
            "dtree": lambda: DecisionTreeRegressor(),
            "kneighbors": lambda: KNeighborsRegressor(),
            "rneighbors": lambda: RadiusNeighborsRegressor(),
            "svmlin": lambda: LinearSVR(),
            "svmnupoly": lambda: NuSVR(),
            "svmnulinear": lambda: NuSVR(),
            "svmnusigmoid": lambda: NuSVR(),
            "svmnurbf": lambda: NuSVR(),
            "svmepspoly": lambda: SVR(),
            "svmepslinear": lambda: SVR(),
            "svmepssigmoid": lambda: SVR(),
            "svmepsrbf": lambda: SVR(),
            "rf": lambda: RandomForestRegressor(),
            "extratrees": lambda: ExtraTreesRegressor(),
            "adaboost": lambda: AdaBoostRegressor(),
            "gbm": lambda: GradientBoostingRegressor(),
            "isotonic": lambda: IsotonicRegression(),
            "earth": lambda: Earth(),
            "symbolic": lambda: SymbolicRegressor(),
            "tpot": lambda: TPOTRegressor(),
        }
        if modelname == "perceptron":
            # Explicitly has no regression model.
            wrapped = None
        else:
            make_estimator = regression_factories.get(modelname)
            if make_estimator is not None:
                wrapped = classifier(modelname, make_estimator(), params)

    if wrapped is None:
        raise ValueError(
            "No model with name {0} was created".format(modelname))

    return wrapped.get()
from pyearth import Earth
import joblib
# First command-line argument, kept as a string.
y_s = sys.argv[1]
n = y_s
# Decimal value of the argument: every numeric character contributes its digit
# at the power of ten given by its position; other characters are skipped.
ans = 0
for i, ch in enumerate(n):
    if ch.isnumeric():
        ans = ans + int(ch) * pow(10, (len(n) - 1) - i)
gdpdata = pd.read_csv(
    "/home/cheeryluck/PycharmProjects/djangoProject2/data1/IndiaGDP.csv",
    header=None)
labels = ['Year', 'GDP']
gdpdata.columns = labels

train, test = train_test_split(gdpdata)

# Column 0 (Year) is the feature, column 1 (GDP) the target.
model6 = Earth().fit(train.iloc[:, :1], train.iloc[:, 1:])
ycap6 = model6.predict(test.iloc[:, :1])

# Score the predictions against the GDP target; the original compared them
# with the Year feature column, which is not an error measure of the model.
error = mean_squared_error(test.iloc[:, 1:], ycap6)

model6.predict([[2019]])
# Persist the fitted model and reload it to check the round trip.
joblib.dump(
    model6,
    '/home/cheeryluck/PycharmProjects/djangoProject2/data1/GDP_Model.sav')
impmodel = joblib.load(
    '/home/cheeryluck/PycharmProjects/djangoProject2/data1/GDP_Model.sav')

print(impmodel.predict([[2019]]))
Example #14
0
# Combine the categorical features with the numeric features, the latter
# clipped column-wise to their 1st and 99th percentiles.
total_data = pd.concat([
    total_category_data,
    total_numeric_data.clip(total_numeric_data.quantile(0.01).to_dict(),
                            total_numeric_data.quantile(0.99).to_dict(),
                            axis=1)
],
                       axis=1)
print(total_data.shape)
# Fill remaining gaps with the column mean.
total_data = total_data.fillna(total_data.mean())

print(total_data.head(5))

# Rows with index below 1460 form the training set, the rest the test set.
train_data = total_data[total_data.index < 1460]
test_data = total_data[total_data.index >= 1460]

# Recursive feature elimination with a MARS estimator, removing 15 features
# per iteration; keep only the columns it selects.
rfe = RFE(Earth(), step=15, verbose=2).fit(train_data, train_Y)
validKeys = list(train_data.columns[rfe.support_])

train_data = train_data[validKeys]
test_data = test_data[validKeys]

model = Earth().fit(train_data, train_Y)
predict = model.predict(test_data)
# NOTE(review): exp() presumably undoes a log transform applied to train_Y
# outside this snippet -- confirm.
predict = np.exp(predict)

# Write the submission file (Id, SalePrice).
submission = pd.DataFrame()
submission['Id'] = test_index
submission['SalePrice'] = predict
submission.to_csv(
    "C:\\Users\\hongj\\Desktop\\kaggle\\house_price\\submission.csv",
    index=False)
import matplotlib.pyplot as plt
from pyearth import Earth

# Reproducible synthetic data: m samples, n normally distributed features.
np.random.seed(1)
m = 1000
n = 5

X = np.random.normal(size=(m, n))

# Make X[:,1] binary
X[:, 1] = np.random.binomial(1, .5, size=m)

# The response is a linear function of the inputs
y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.normal(size=m)

# Fit the earth model
model = Earth().fit(X, y)

# Print the model summary, showing linear terms
# (function-call form: the print statement is a SyntaxError on Python 3).
print(model.summary())

# Plot for both values of X[:,1]
y_hat = model.predict(X)
plt.figure()
plt.plot(X[:, 0], y, 'k.')
plt.plot(X[X[:, 1] == 0, 0], y_hat[X[:, 1] == 0], 'r.', label='$x_1 = 0$')
plt.plot(X[X[:, 1] == 1, 0], y_hat[X[:, 1] == 1], 'b.', label='$x_1 = 1$')
plt.legend(loc='best')
plt.xlabel('$x_0$')
plt.show()
Example #16
0
# Target: sine of the inputs plus Gaussian noise scaled by 1/10.
X = np.array(X)
y = np.sin(X) + np.random.normal(size=X.shape[0])/10.0  

# max_terms values passed to the MARS model, one per subplot
knots = [2,4,5,10]

# (row, column) position of each subplot in the 2x2 grid
axis = [[0,0],[0,1],[1,0],[1,1]]

# One figure per max_degree value (1 to 4)
for degree in range(1,5):
  fig,ax = plt.subplots(2,2,figsize=(10, 10))

  for num_knot in range(4):
    # Defining MARS model with max_terms and max_degree parameters
    model = Earth(max_terms=knots[num_knot],max_degree=degree,verbose=0)
    
    # Fitting the model on the dataset
    model.fit(X, y)

    # Model predictions on the training inputs
    y_hat = model.predict(X)

    # Plotting graphs: raw data in red, fitted values in blue
    ax[axis[num_knot][0],axis[num_knot][1]].title.set_text(f"degree = {degree}, knots = {knots[num_knot]}")
    ax[axis[num_knot][0],axis[num_knot][1]].plot(X,y,'r.')
    ax[axis[num_knot][0],axis[num_knot][1]].plot(X,y_hat,'b.')
  plt.show()

# Plotting dataset distribution
plt.figure()
Example #17
0
A simple example plotting a fit of the absolute value function.
"""

import numpy
import matplotlib.pyplot as plt

from pyearth import Earth

# Synthetic data: ten uniform features on [-40, 40); the response is the
# absolute value of (feature 6 - 4) plus standard-normal noise.
numpy.random.seed(2)
m, n = 1000, 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
noise = 1 * numpy.random.normal(size=m)
y = numpy.abs(X[:, 6] - 4.0) + noise

# Fit an additive (max_degree=1) Earth model
model = Earth(max_degree=1).fit(X, y)

# Report the forward/pruning trace and the final basis functions
print(model.trace())
print(model.summary())

# Plot observations (red) and fitted values (blue) against feature 6
y_hat = model.predict(X)
informative_feature = X[:, 6]
plt.figure()
plt.plot(informative_feature, y, 'r.')
plt.plot(informative_feature, y_hat, 'b.')
plt.show()
Example #18
0
# Rank input features for one station at every forecast horizon by fitting a
# MARS model per horizon and collecting its feature-importance tables.
st = 'CPY012'
target,start_p,stop_p,host_path=station_sel(st,mode)
if mode =='hour': n_past,n_future = 24*7,72
elif mode =='day': n_past,n_future = 60,30

# Work on an explicit copy so adding the 'Day' column neither touches `df`
# nor triggers pandas' SettingWithCopyWarning.
data = df[start_p:stop_p].copy()
data['Day'] = data.index.dayofyear  # day of year taken from the index
# Fill gaps by interpolation in both directions (no mean-fill is performed
# here), then downcast to float32.
data = data.interpolate(limit=300000000,limit_direction='both').astype('float32')

# Importance criteria requested from py-earth; identical for every horizon,
# so it is built once outside the loop.
criteria = ('rss', 'gcv', 'nb_subsets')

conclude_df=pd.DataFrame()
for n_out in range(1,n_future+1):
    X,y,xlabels = to_supervise(data,target,n_out)
    model = Earth(enable_pruning = True,
                #   max_degree=3,
                #  max_terms=20,
                minspan_alpha=.5,
                feature_importance_type=criteria,
                verbose=True)
    model.fit(X,y,xlabels=xlabels)

    # The importance summary is a text table: keep the first 2000 characters,
    # split on whitespace and take tokens 3..82 for each sort order.
    # NOTE(review): the slice bounds presumably skip the header and keep the
    # top rows - confirm against the summary layout and toDF().
    nbsub = model.summary_feature_importances(sort_by='nb_subsets')[:2000].split()[3:83]
    gcv = model.summary_feature_importances(sort_by='gcv')[:2000].split()[3:83]
    rss = model.summary_feature_importances(sort_by='rss')[:2000].split()[3:83]

    rss,gcv,nbsub = toDF(rss),toDF(gcv),toDF(nbsub)
    # Merge the three rankings, keeping the first occurrence of each feature.
    top20 = pd.concat([rss,gcv,nbsub],ignore_index=True).drop_duplicates('feature')
    top20['timestep'] = n_out

    # Accumulate the per-horizon result into the overall table.
    conclude_df = pd.concat([conclude_df,top20],ignore_index=True)
    if mode=='day':
Example #19
0
A simple example plotting a fit of the sine function.
"""
import numpy
import matplotlib.pyplot as plt

from pyearth import Earth

# Synthetic data: ten uniform features on [-40, 40); the response depends
# only on feature 6 through a shifted, rectified sine plus Gaussian noise.
numpy.random.seed(2)
m, n = 10000, 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
wave = numpy.abs(numpy.sin((X[:, 6]) / 10) - 4.0)
noise = numpy.random.normal(size=m)
y = 100 * wave + 10 * noise

# Fit an Earth model allowing interactions up to degree 3
model = Earth(max_degree=3, minspan_alpha=.5)
model.fit(X, y)

# Report the forward/pruning trace and the final basis functions
print(model.trace())
print(model.summary())

# Plot observations (red) and fitted values (blue) against feature 6
y_hat = model.predict(X)
informative_feature = X[:, 6]
plt.plot(informative_feature, y, 'r.')
plt.plot(informative_feature, y_hat, 'b.')
plt.show()
Example #20
0
from matplotlib import pyplot
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from pyearth import Earth
from matplotlib import pyplot

# Load the purchase sheet. `sheet_name` is the supported keyword; the older
# `sheetname` spelling was deprecated and later removed from pandas.
df = pd.read_excel('relay-foods.xlsx', sheet_name='Purchase Data - Full Study')

# Treat the identifier columns as categorical (done once; the original
# repeated these two casts).
df['OrderId'] = df['OrderId'].astype('category')
df['CommonId'] = df['CommonId'].astype('category')

# Drop the date columns and separate the target from the features.
col_names = ['OrderDate', 'PickupDate']
df = df.drop(col_names, axis=1)
y = df['TotalCharges']
df_2 = df[['OrderId', 'UserId', 'PupId']]

# One {column: value} dict per row, vectorized into a feature matrix.
# Series.items() replaces the removed Series.iteritems().
X = [dict(r.items()) for _, r in df_2.iterrows()]
train_fea = DictVectorizer().fit_transform(X)
# NOTE(review): DictVectorizer returns a scipy sparse matrix by default;
# confirm that Earth accepts it, otherwise pass sparse=False or .toarray().

#Fit an Earth model
model = Earth()
model.fit(train_fea, y)

#Print the model
print(model.trace())
print(model.summary())

#Predict on the same vectorized features the model was fitted on; the raw
#list of dicts in X is not a valid input for predict().
y_hat = model.predict(train_fea)
def train(object_name,
          data_dir,
          output_dir,
          train_type,
          classifier_type,
          learned_model=None,
          debug=False):
    """Fit (or re-evaluate) models on recorded transition data of one object.

    The data is read with ``load_data_file(object_name, data_dir)`` and is
    indexed by action id; every record is a dict with keys such as
    ``'init_joint_values'``, ``'init_gripper'``, ``'init_object'``,
    ``'reward'`` and ``'index'``.

    Args:
        object_name: Passed to ``load_data_file`` to select the data set.
        data_dir: Directory passed to ``load_data_file``.
        output_dir: Directory that receives the ``.pkl`` model dumps, the
            ``.yaml`` parameter exports and rendered graphviz trees.
        train_type: ``'gripper_status'`` or ``'pick_success_probability'``
            trains one binary classifier. Any value containing
            ``'next_state'`` trains one model per (action, prediction index);
            ``'_'``-separated tokens containing ``'action'`` / ``'pred'``
            restrict the ids, e.g. ``'next_state_action-1-2_pred-0-3'``.
        classifier_type: For the classifier modes ``'DTC'`` selects a
            decision tree, anything else logistic regression. For the
            next-state mode one of ``'ridge'``, ``'SVR'``, ``'DTR'``,
            ``'DTRM'``, ``'DTC'``, ``'Earth'``, ``'AdaLinear'``; anything
            else selects ordinary linear regression.
        learned_model: Previously trained model(s) with the same structure as
            the return value. When given, nothing is fitted or dumped and the
            supplied models are only scored.
        debug: When true, print every training sample the model gets wrong.

    Returns:
        The classifier for the two classifier modes, a nested dict
        ``ans[action][prediction_index]`` for the next-state mode, or
        ``None`` when ``train_type`` matches none of them.
    """
    from sklearn import linear_model, tree
    from sklearn.svm import SVR
    from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostRegressor
    if classifier_type == 'Earth':
        from pyearth import Earth
    import numpy as np
    # graphviz is optional: tree rendering is skipped when it is missing.
    have_graphviz = True
    try:
        import graphviz
    except ImportError:
        have_graphviz = False
    ans = None
    saso_data = load_data_file(object_name, data_dir)
    if train_type == 'gripper_status':
        action_str = 'gs'
        actions = range(CLOSE_ACTION_ID + 1)
        x = []
        y = []
        x_index = []
        for action in actions:
            for sasor in saso_data[action]:
                #x_entry = sasor['touch_prev'] + sasor['init_joint_values']
                x_entry = sasor['next_joint_values']
                # List concatenation builds a new list, so the appends below
                # do not modify the record itself.
                x_entry = x_entry + sasor['next_gripper'] + sasor['next_object']
                x_entry.append(sasor['next_object'][0] -
                               sasor['next_gripper'][0])
                x_entry.append(sasor['next_object'][1] -
                               sasor['next_gripper'][1])
                x.append(x_entry)
                x_index.append(sasor['index'])
                if action == CLOSE_ACTION_ID:
                    y.append(1)
                else:
                    y.append(0)  #gripper open
    if train_type == 'pick_success_probability':
        action_str = repr(PICK_ACTION_ID)
        x = []
        y = []
        x_index = []
        for sasor in saso_data[PICK_ACTION_ID]:
            #x_entry = sasor['touch_prev'] + sasor['init_joint_values']
            x_entry = sasor['init_joint_values']
            x_entry = x_entry + sasor['init_gripper'][0:3] + sasor[
                'init_object'][0:3]
            x_entry.append(sasor['init_object'][0] - sasor['init_gripper'][0])
            x_entry.append(sasor['init_object'][1] - sasor['init_gripper'][1])
            x.append(x_entry)
            x_index.append(sasor['index'])
            # A positive reward is labelled as a successful pick.
            if sasor['reward'] > 0:
                y.append(1)
            else:
                y.append(0)
    if train_type in ['pick_success_probability', 'gripper_status']:
        if learned_model is not None:
            logistic = learned_model
        else:
            print(classifier_type)
            if classifier_type == 'DTC':
                logistic = DecisionTreeClassifier(criterion='entropy')
            else:
                logistic = linear_model.LogisticRegression(max_iter=400, C=1.0)
            logistic.fit(x, y)
            joblib.dump(
                logistic,
                output_dir + '/' + classifier_type + '-' + action_str + '.pkl')
        ans = logistic
        print(logistic.score(x, y))
        print(logistic.get_params())
        print(len(x))
        if classifier_type != 'DTC':
            print(logistic.coef_)
            print(logistic.intercept_)
            yaml_out = {}
            yaml_out['coef'] = logistic.coef_.tolist()[0]
            yaml_out['intercept'] = logistic.intercept_.tolist()[0]
            write_config_in_file(
                output_dir + '/' + classifier_type + '-' + action_str +
                ".yaml", yaml_out)
        else:
            print(logistic.feature_importances_)

            #feature_names=['t1','t2', 'j1', 'j2']
            feature_names = [
                'j1', 'j2'
            ]  #Touch not required when object coordinates are known
            feature_names = feature_names + [
                'gx', 'gy', 'gz', 'gxx', 'gyy', 'gzz', 'gw'
            ][0:3]
            feature_names = feature_names + [
                'ox', 'oy', 'oz', 'oxx', 'oyy', 'ozz', 'ow'
            ][0:3]
            feature_names = feature_names + ['xrel', 'yrel']
            if have_graphviz:
                dot_data = tree.export_graphviz(logistic,
                                                out_file=None,
                                                feature_names=feature_names,
                                                filled=True)
                graph = graphviz.Source(dot_data)
                graph.render(output_dir + '/' + classifier_type + '-' +
                             action_str)
            # Export the tree arrays as plain lists, matching the regressor
            # export further below, so the YAML holds no numpy objects.
            yaml_out = {}
            yaml_out["max_depth"] = logistic.tree_.max_depth
            yaml_out["values"] = logistic.tree_.value.tolist()
            yaml_out['n_nodes'] = logistic.tree_.node_count
            yaml_out['children_left'] = logistic.tree_.children_left.tolist()
            yaml_out['children_right'] = logistic.tree_.children_right.tolist()
            yaml_out['feature'] = logistic.tree_.feature.tolist()
            yaml_out['threshold'] = logistic.tree_.threshold.tolist()
            write_config_in_file(
                output_dir + '/' + classifier_type + '-' + action_str +
                ".yaml", yaml_out)
        if debug:
            for i in range(0, len(x)):
                y_bar = logistic.predict([x[i]])
                if y_bar != y[i]:
                    print(x_index[i])
                    print(x[i])
                    print(y[i])
                    print(logistic.predict_proba([x[i]]))
                    if classifier_type != 'DTC':
                        print(logistic.decision_function([x[i]]))
                        # Recompute the sigmoid by hand from the linear score
                        # to cross-check the printed probability.
                        prob = (np.dot(logistic.coef_[0], x[i]) +
                                logistic.intercept_[0])
                        print(prob)
                        prob *= -1
                        prob = np.exp(prob)
                        prob += 1
                        prob = np.reciprocal(prob)
                        print(prob)
    if 'next_state' in train_type:
        actions = range(10)

        #  predictions can be 18, 7 for gripper pose, 7 for objct pose
        # 2 for joint values
        # 2 for touch values
        predictions = range(NUM_PREDICTIONS)

        # Optional '_action-<id>-...' / '_pred-<idx>-...' tokens restrict the
        # actions and prediction indices that are trained.
        train_type_array = train_type.split('_')
        for s in train_type_array:
            if 'action' in s:
                actions = s.split('-')[1:]
            if 'pred' in s:
                predictions = s.split('-')[1:]
        ans = {}
        for action_ in actions:
            action = int(action_)
            x = []
            y = []
            y_c = []
            l_reg = []
            l_reg_c = []
            x_index = []
            # One target list (regression and classification) and one model
            # slot per prediction index.
            for i in range(0, NUM_PREDICTIONS):
                y.append([])
                y_c.append([])
                l_reg.append('')
                l_reg_c.append('')
            for sasor in saso_data[action]:
                if sasor['reward'] > -999:  #discard invalid states
                    x_entry = sasor['init_joint_values']
                    x_entry = x_entry + sasor['init_gripper'][0:3] + sasor[
                        'init_object'][0:3]
                    x_entry.append(sasor['init_object'][0] -
                                   sasor['init_gripper'][0])
                    x_entry.append(sasor['init_object'][1] -
                                   sasor['init_gripper'][1])
                    x.append(x_entry)
                    x_index.append(sasor['index'])
                    for p_ in predictions:
                        p = int(p_)
                        y[p].append(get_prediction_value(sasor, p))
                        y_default = get_default_value(sasor, p)
                        y_c[p].append(is_correct(p, y[p][-1], y_default))

            print(len(x))
            ans[action] = {}

            for p_ in predictions:
                p = int(p_)
                if learned_model is not None:
                    l_reg[p] = learned_model[action][p]
                else:
                    if classifier_type == 'ridge':
                        l_reg[p] = linear_model.Ridge(alpha=0.5,
                                                      normalize=True)
                    elif classifier_type == 'SVR':
                        l_reg[p] = SVR(epsilon=0.2)
                    elif classifier_type in ['DTR', 'DTRM']:
                        l_reg[p] = DecisionTreeRegressor()
                    elif classifier_type == 'DTC':
                        l_reg[p] = DecisionTreeClassifier()
                    elif classifier_type == 'Earth':
                        l_reg[p] = Earth()
                    elif classifier_type == 'AdaLinear':
                        l_reg[p] = AdaBoostRegressor(
                            linear_model.LinearRegression())
                    else:
                        l_reg[p] = linear_model.LinearRegression()
                    if classifier_type == 'DTRM':
                        # Multi-output tree: fit on all targets at once.
                        l_reg[p].fit(x, np.transpose(np.array(y)))
                    elif classifier_type == 'DTC':
                        l_reg[p].fit(x, y_c[p])
                    else:
                        l_reg[p].fit(x, y[p])
                    joblib.dump(
                        l_reg[p], output_dir + '/' + classifier_type + "-" +
                        repr(action) + "-" + repr(p) + '.pkl')
                ans[action][p] = l_reg[p]

                if classifier_type == 'DTRM':
                    print(repr(action) + " " + repr(p) + " " + repr(
                        l_reg[p].score(x, np.transpose(np.array(y)))))
                elif classifier_type == 'DTC':
                    print(repr(action) + " " + repr(p) + " " + repr(
                        l_reg[p].score(x, y_c[p])))
                else:
                    print(repr(action) + " " + repr(p) + " " + repr(
                        l_reg[p].score(x, y[p])))
                print(l_reg[p].get_params())
                if classifier_type not in [
                        'SVR', 'DTR', 'DTRM', 'AdaLinear', 'DTC'
                ]:
                    print(l_reg[p].coef_)
                if classifier_type not in [
                        'DTR', 'DTRM', 'AdaLinear', 'DTC', 'Earth'
                ]:
                    print(l_reg[p].intercept_)
                if classifier_type in ['Earth']:
                    for j in range(0, len(x)):
                        predict_earth(l_reg[p], x[j])
                    print(l_reg[p].summary())
                if learned_model is None:
                    if classifier_type in ['DTR', 'DTRM', 'AdaLinear', 'DTC']:

                        print(l_reg[p].feature_importances_)

                        feature_names = ['j1', 'j2']
                        feature_names = feature_names + [
                            'gx', 'gy', 'gz', 'gxx', 'gyy', 'gzz', 'gw'
                        ][0:3]
                        feature_names = feature_names + [
                            'ox', 'oy', 'oz', 'oxx', 'oyy', 'ozz', 'ow'
                        ][0:3]
                        feature_names = feature_names + ['xrel', 'yrel']
                        if have_graphviz:
                            dot_data = tree.export_graphviz(
                                l_reg[p],
                                out_file=None,
                                feature_names=feature_names,
                                filled=True)
                            graph = graphviz.Source(dot_data)
                            graph.render(output_dir + '/' + classifier_type +
                                         "-" + repr(action) + "-" + repr(p))
                        yaml_out = {}
                        yaml_out['max_depth'] = l_reg[p].tree_.max_depth
                        yaml_out["values"] = l_reg[p].tree_.value.tolist()
                        yaml_out['n_nodes'] = l_reg[p].tree_.node_count
                        yaml_out['children_left'] = l_reg[
                            p].tree_.children_left.tolist()
                        yaml_out['children_right'] = l_reg[
                            p].tree_.children_right.tolist()
                        yaml_out['feature'] = l_reg[p].tree_.feature.tolist()
                        yaml_out['threshold'] = l_reg[
                            p].tree_.threshold.tolist()
                        write_config_in_file(
                            output_dir + '/' + classifier_type + "-" +
                            repr(action) + "-" + repr(p) + ".yaml", yaml_out)
                    if classifier_type in ['Earth']:
                        yaml_out = get_yaml_earth(l_reg[p])
                        write_config_in_file(
                            output_dir + '/' + classifier_type + "-" +
                            repr(action) + "-" + repr(p) + ".yaml", yaml_out)

                if classifier_type == 'DTRM':
                    # The multi-output tree already covers every prediction
                    # index: show one sample and stop after the first index.
                    i = 0
                    y_bar = l_reg[p].predict([x[i]])
                    print(x_index[i])
                    print(x[i])
                    y_t = np.transpose(np.array(y))
                    print(repr(y_t[i]) + ' Prediction ' + repr(y_bar))
                    break
                if debug:
                    for i in range(0, len(x)):
                        y_bar = l_reg[p].predict([x[i]])
                        if classifier_type == 'DTC':
                            if y_bar != y_c[p][i]:
                                print(x_index[i])
                                print(x[i])
                                print(y_c[p][i])
                                print(y[p][i])
                                print(l_reg[p].predict_proba([x[i]]))
                        else:
                            if is_correct(p, y_bar, y[p][i]) == 0:
                                print(x_index[i])
                                print(x[i])
                                print(repr(
                                    y[p][i]) + ' Prediction ' + repr(y_bar))

    return ans
Example #22
0
def test_score():
    """Check that score() equals the R^2 of the selected pruning subset."""
    fitted = Earth(**default_params).fit(X, y)
    pruning_record = fitted.pruning_trace()
    selected_subset = pruning_record.get_selected()
    expected_rsq = pruning_record.rsq(selected_subset)
    assert_almost_equal(expected_rsq, fitted.score(X, y))
Example #23
0
# Target as a column vector.
y = pm.reshape(-1, 1)
# The call form of print works on both Python 2 and Python 3.
print(y.shape)
# Summary statistics of y
#statistics(y)
## Frequency histogram
#drawHist(y,'PM2.5','Frequency','the Frequency of PM2.5')
## Cumulative frequency plot
#drawCumulativeHist(y,'PM2.5','Frequency','Curve cumulative of PM2.5')
## Box plot
#drawBox(y,'PM2.5','BOX of PM2.5')
###########################################################

# MARS fit
# 1) Fit an Earth model
criteria = ('rss', 'gcv', 'nb_subsets')
model = Earth(max_degree=2, feature_importance_type=criteria)
model.fit(X, y)  # X is the standardized data at this point
# 2) Print the model results

print(model.trace())
print(model.summary())
print(model.summary_feature_importances(sort_by='gcv'))
# 3) Fitted values of y

y_hat = model.predict(X)

# Evaluation metrics
#R_2=R2((y_hat.reshape(-1,1)),(y.reshape(-1,1)))
# R^2 measures how close the fitted y_hat is to y.
R_square = metrics.r2_score((y.reshape(-1, 1)),
                            (y_hat.reshape(-1, 1)))
RMSE = sqrt(metrics.mean_squared_error(y.reshape(-1, 1), y_hat.reshape(-1, 1)))
Example #24
0
def test_export_python_function():
    """Check the exported Python function against Earth.predict.

    Runs once with smoothing enabled and once without, comparing the
    predictions element by element.
    """
    for use_smoothing in (True, False):
        fitted = Earth(penalty=1, smooth=use_smoothing, max_degree=2).fit(X, y)
        exported_fn = export_python_function(fitted)
        reference = fitted.predict(X)
        exported = exported_fn(X)
        for expected_value, exported_value in zip(reference, exported):
            assert_almost_equal(expected_value, exported_value)
# Add a total-square-footage feature (basement + first + second floor) to
# both feature frames.
for frame in (X_train, X_test):
    frame['TotalSF'] = (frame['TotalBsmtSF'] + frame['1stFlrSF'] +
                        frame['2ndFlrSF'])

#normality check for the target
ax = sns.distplot(y_train)
plt.show()

#log transform the dependent variable for normality
y_train = np.log(y_train)
ax = sns.distplot(y_train)
plt.show()

#mars solution
# A single model is built here; the original first created a default
# Earth() that was overwritten immediately without being used.
model = Earth(
    max_degree=2,
    penalty=1.0,
    minspan_alpha=0.01,
    endspan_alpha=0.01,
    endspan=5
)  #2nd degree formula is necessary to see interactions, penalty and alpha values for making model simple

model.fit(X_train, y_train)
# In-sample score; the value is not stored (it only displays in a notebook).
model.score(X_train, y_train)

y_pred = model.predict(X_test)
y_pred = np.exp(y_pred)  # inverse log transform the results
    )

    # Select target and feature dataset(s) --> [target, feature1, feature2, ... ]
    datasets = [
        Dataset('runoff', database),
        Dataset('runoff', database).normalized(),
        Dataset('temp', database).normalized(),
        Dataset('precip', database).normalized(),
        Dataset('season', database).normalized()
    ]

    # Select leadtimes for target and feature. negative:past/positive:future
    leadtimes = [[1, 3], [-4, -1], [-4, -1], [-4, -1], [1, 1]]

    # Select Model
    model_type = Earth(max_degree=10, smooth=True)
    #model_type= Lasso(alpha=0.05,normalize=True, max_iter=3000)
    #model_type = Regressor(
    #    layers=[
    #        Layer("Sigmoid",units=5),
    #        Layer("Linear", units=1)],
    #    learning_rate=0.1,
    #    n_iter=1000)

    # Set training interval
    startyear = DateFormat(1900, 1)
    endyear = DateFormat(2005, 36)
    training_daterange = DateFormat.decadal_daterange(startyear, endyear)

    # Set testing interval
    startyear = DateFormat(2006, 1)
Example #27
0
# Two noisy targets: y1 is driven by feature column 6 and y2 by column 5,
# each through a shifted, rectified sine plus scaled Gaussian noise.
y1 = 120 * np.abs(np.sin((X[:, 6]) / 6) - 1.0) + 15 * np.random.normal(size=m)
y2 = 120 * np.abs(np.sin((X[:, 5]) / 6) - 1.0) + 15 * np.random.normal(size=m)

# Standardize each target to zero mean and unit standard deviation.
y1 = (y1 - y1.mean()) / y1.std()
y2 = (y2 - y2.mean()) / y2.std()
# Stack the two targets side by side into a two-column response.
y_mix = np.concatenate((y1[:, np.newaxis], y2[:, np.newaxis]), axis=1)

# Weight of the first output for each run; the second output gets 1 - alpha.
alphas = [0.9, 0.8, 0.6, 0.4, 0.2, 0.1]
n_plots = len(alphas)
k = 1  # subplot position passed to plt.subplot below
fig = plt.figure(figsize=(10, 15))
for i, alpha in enumerate(alphas):
    # Fit an Earth model
    model = Earth(max_degree=5,
                  minspan_alpha=.05,
                  endspan_alpha=.05,
                  max_terms=10,
                  check_every=1,
                  thresh=0.)
    # Per-output weights handed to fit() for this run.
    output_weight = np.array([alpha, 1 - alpha])
    model.fit(X, y_mix, output_weight=output_weight)
    print(model.summary())

    # Plot the model
    y_hat = model.predict(X)

    # Mean squared error per output column.
    mse = ((y_hat - y_mix) ** 2).mean(axis=0)
    ax = plt.subplot(n_plots, 2, k)
    ax.set_ylabel("Run {0}".format(i + 1), rotation=0, labelpad=20)
    # First output: observations in red, predictions in blue, against column 6.
    plt.plot(X[:, 6], y_mix[:, 0], 'r.')
    plt.plot(X[:, 6], model.predict(X)[:, 0], 'b.')
    plt.title("MSE: {0:.3f}, Weight : {1:.1f}".format(mse[0], alpha))