Esempio n. 1
0
  def svm_regression(self, out_filename, delete=True):
    """Train SVM-light regression models and log R2 / MSE to a report file.

    The data set is exported with ``self.to_svmlight(out_filename)``; the
    method then works on ``<out_filename>.train`` and ``<out_filename>.val``
    (presumably written by that export -- confirm against ``to_svmlight``).
    For each kernel type in the sweep a model is learnt on the training
    file, both files are classified with it, and the scores are written to
    ``<self.base_file>_svm.txt``.

    Args:
      out_filename: Path prefix shared by the SVM-light data, model and
        prediction files.
      delete: Accepted for interface compatibility; not used by the code
        visible here.
    """
    def read_targets(path):
      # The target is the first space-separated token of every example line.
      with open(path, 'r') as f:
        return [float(line.split(' ', 1)[0]) for line in f]

    def read_predictions(path):
      # Prediction files hold one value per line.
      with open(path, 'r') as f:
        return [float(line.replace('\n', '')) for line in f]

    self.to_svmlight(out_filename)

    train_file = out_filename + ".train"
    test_file = out_filename + ".val"
    model_file = train_file + ".mod"
    classified_file = test_file + ".class"
    classified_file_original = test_file + ".class_orig"

    report_file = self.base_file + "_svm.txt"
    print("Writing output to " + report_file)
    with open(report_file, 'w') as fx:
      d = 1
      # Only t=0 is active; the full sweep was [0, 1, 1, 1, 1, 1, 2].
      # NOTE(review): z/t/d presumably mirror svm_learn's -z/-t/-d options
      # (regression mode, kernel type, polynomial degree) -- confirm in MSVMLight.
      for t in [0]:
        print("---")
        print("SVM Regression...")
        # Remember the degree used for THIS model: d is advanced right after
        # learning, and the report must not show the next run's degree.
        degree = d
        if t != 1:
          MSVMLight.learn(train_file, model_file, z='r', t=t)
        else:
          MSVMLight.learn(train_file, model_file, z='r', t=t, d=degree)
          d += 1
        print("Learnt Model")
        MSVMLight.classify(test_file, model_file, classified_file)
        MSVMLight.classify(train_file, model_file, classified_file_original)

        ytrue = read_targets(test_file)
        yguess = read_predictions(classified_file)
        ytrue_orig = read_targets(train_file)
        yguess_orig = read_predictions(classified_file_original)

        # First column: validation file; second column: training file.
        fx.write("---t=%d d=%d\n" % (t, degree))
        fx.write("R2 (val/test)\n")
        fx.write("%f %f \n" % (metrics.r2_score(ytrue, yguess), metrics.r2_score(ytrue_orig, yguess_orig)))
        fx.write("MSE (val/test)\n")
        fx.write("%f %f \n" % (metrics.mean_square_error(ytrue, yguess), metrics.mean_square_error(ytrue_orig, yguess_orig)))
        fx.write("---\n")
Esempio n. 2
0
def test_all_regressors():
    """Check that every factory in ``regressors`` clearly beats plain OLS.

    A data set built by ``make_friedman2(10000)`` is split with
    ``test_helpers.split_dataset``. A ``LinearRegression`` baseline is fitted
    first; each regressor must then reach a held-out MSE such that the OLS MSE
    is more than 1.1 times larger.
    """
    x, y = make_friedman2(10000)
    x_train, y_train, x_test, y_test = test_helpers.split_dataset(x, y)

    # Baseline: ordinary least squares.
    ols = LinearRegression()
    ols.fit(x_train, y_train)
    ols_pred = ols.predict(x_test)
    ols_mse = mean_square_error(y_test, ols_pred)

    for fn in regressors:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(fn)
        model = fn(x_train, y_train)
        print(model)
        pred = model.predict(x_test)
        mse = mean_square_error(y_test, pred)

        print("OLS MSE: %s  Current MSE: %s" % (ols_mse, mse))
        print("Ratio: %s" % (mse / ols_mse,))
        assert ols_mse > 1.1 * mse
Esempio n. 3
0
def test_all_regressors():
    """Check that every factory in ``regressors`` clearly beats plain OLS.

    The data comes from ``make_friedman2(10000)`` and is split by
    ``test_helpers.split_dataset``. The OLS mean square error on the held-out
    part serves as the baseline; the test fails as soon as one regressor does
    not bring the error below ``ols_mse / 1.1``.
    """
    x, y = make_friedman2(10000)
    x_train, y_train, x_test, y_test = test_helpers.split_dataset(x, y)

    # Reference error of a plain linear model.
    ols = LinearRegression()
    ols.fit(x_train, y_train)
    ols_mse = mean_square_error(y_test, ols.predict(x_test))

    for fn in regressors:
        # print(...) with one argument is valid on both Python 2 and 3.
        print(fn)
        model = fn(x_train, y_train)
        print(model)
        mse = mean_square_error(y_test, model.predict(x_test))

        print("OLS MSE: %s  Current MSE: %s" % (ols_mse, mse))
        print("Ratio: %s" % (mse / ols_mse,))
        assert ols_mse > 1.1 * mse
Esempio n. 4
0
# Boston housing demo: histogram of the targets, then a linear model fitted on
# the even-indexed samples and evaluated on the odd-indexed ones.
from matplotlib import pyplot as plt
import scipy as sp
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_square_error

boston = load_boston()

# Figure 1: distribution of the target values.
plt.figure(1)
plt.hist(boston.target)
plt.xlabel('price ($1000s)')
plt.ylabel('count')

# Every second sample (starting at 0) trains the model, the rest evaluates it.
fit_data, fit_target = boston.data[::2], boston.target[::2]
eval_data, eval_target = boston.data[1::2], boston.target[1::2]

clf = LinearRegression()
clf.fit(fit_data, fit_target)
predicted = clf.predict(eval_data)

# Figure 2: predictions against true values; the dashed diagonal is a perfect fit.
plt.figure(2)
plt.scatter(eval_target, predicted)
plt.plot([0, 50], [0, 50], '--k')
plt.axis('tight')
plt.xlabel('True price ($1000s)')
plt.ylabel('Predicted price ($1000s)')

print(mean_square_error(eval_target, predicted))

plt.show()
# Hold out 25% of the samples for evaluation (random_state=None: the split
# differs on every run).
data_train, data_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)

###################################################

# 3 - define the regression model
clf = linear_model.LinearRegression()

###################################################

# 4 - fit the model
clf.fit(data_train, y_train)
clf.coef_

# 5 - predict using the model
# Predict from the held-out FEATURES; the metrics below compare against y_test,
# so the predictions must have one entry per test sample.
y_predicted = clf.predict(data_test)

# 6 - validate the model
print(metrics.explained_variance_score(y_test, y_predicted))  #ES = SSR/SST
print(metrics.mean_absolute_error(y_test, y_predicted))       #MAE (l1)
print(metrics.mean_square_error(y_test, y_predicted))         #MSE (l2)
print(metrics.r2_score(y_test, y_predicted))                  #R2 = 1-SSE/SST

# 7 - print the result
# NOTE(review): plotting data_test on the x axis assumes a single feature -- confirm.
import matplotlib.pyplot as plt
plt.scatter(data_test, y_test,  color='black')
plt.plot(data_test, y_predicted, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
Esempio n. 6
0
def TRVP(Ym, Ypred):
    """Return the mean square error of ``Ypred`` against ``Ym`` divided by ``len(Ym)``."""
    error = mean_square_error(Ym, Ypred)
    sample_count = len(Ym)
    return error / sample_count

def correcting(m):
    """Return a list where the value at index ``a`` is increased by ``a`` percent.

    Element ``v`` at position ``a`` becomes ``v + v * 0.01 * a``, so the first
    element is unchanged and later ones grow with their position.
    """
    corrected = []
    for position, value in enumerate(m):
        corrected.append(value + value * 0.01 * position)
    return corrected
Esempio n. 7
0
def EVRP(Ym, Ypred):
    """Return the mean square error of ``Ypred`` against ``Ym`` over ``norm(Ym) ** 2``."""
    error = mean_square_error(Ym, Ypred)
    squared_norm = norm(Ym) ** 2
    return error / squared_norm

# Evaluate EVRP through apply_metrics, then hand the result to plot_metrics.
# NOTE(review): the second label reads 'ERVP' while the first is 'EVRP' --
# confirm whether that spelling is intentional.
evrp_scores = apply_metrics(EVRP)
plot_metrics('EVRP', 'ERVP', evrp_scores)
Esempio n. 8
0
# Score every feature on its own: fit a decision tree on that single column
# and record the ROC-AUC of its predictions on the test set.
roc_values = []
for feature in X_train.columns:
	train_column = X_train[feature].to_frame()
	test_column = X_test[feature].to_frame()
	clf = DecisionTreeClassifier()
	clf.fit(train_column, y_train)
	y_scored = clf.predict_proba(test_column)
	# Column 1 of predict_proba is used as the score for roc_auc_score.
	positive_scores = y_scored[:, 1]
	roc_values.append(roc_auc_score(y_test, positive_scores))

# Afterwards, rank the features and keep the ones with the highest scores

# 8.2 Univariate MSE for Regression (regression counterpart of the roc-auc ranking)
# Score every feature on its own: fit a regression tree on that single column
# and record the mean square error of its predictions on the test set.
mse_values = []
for feature in X_train.columns:
	train_column = X_train[feature].to_frame()
	test_column = X_test[feature].to_frame()
	clf = DecisionTreeRegressor()
	clf.fit(train_column, y_train)
	y_scored = clf.predict(test_column)
	feature_mse = mean_square_error(y_test, y_scored)
	mse_values.append(feature_mse)

	# Rank it! (for MSE, lower values are better)

################################# B. Wrapper Methods ############################################

# 1. Forward Selection: add one feature at a time recursively
# 2. Backward Selection: remove one feature at a time recursively
# 3. Exhaustive Search: searches across all possible feature combinations

## Procedure
# 1. Search for the subset of features
# 2. Build the Machine Learning Model on the selected feature subset
# 3. Evaluate Model Performance
# 4. Repeat
Esempio n. 9
0
            std = X_train.std(axis=0)
            mean = X_train.mean(axis=0)
            X_train = (X_train - mean) / std
            X_test = (X_test - mean) / std

            std = y_train.std(axis=0)
            mean = y_train.mean(axis=0)
            y_train = (y_train - mean) / std
            y_test = (y_test - mean) / std

            gc.collect()
            print "- benching ElasticNet"
            clf = ElasticNet(alpha=alpha, rho=0.5, fit_intercept=False)
            tstart = time()
            clf.fit(X_train, y_train)
            elnet_results[i, j, 0] = mean_square_error(clf.predict(X_test),
                                                       y_test)
            elnet_results[i, j, 1] = time() - tstart

            gc.collect()
            print "- benching SGD"
            n_iter = np.ceil(10 ** 4.0 / n_train)
            clf = SGDRegressor(alpha=alpha, fit_intercept=False,
                               n_iter=n_iter, learning_rate="invscaling",
                               eta0=.01, power_t=0.25)

            tstart = time()
            clf.fit(X_train, y_train)
            sgd_results[i, j, 0] = mean_square_error(clf.predict(X_test),
                                                     y_test)
            sgd_results[i, j, 1] = time() - tstart
Esempio n. 10
0
# <codecell>

# Permutation-test figure: histogram of the stored distribution with a red
# vertical line at the stored value.
permdata = np.load('100iter.npz')
null_distribution = permdata['distribution']
observed_value = permdata['value']

hist(null_distribution, 64, color=[0.6,0.6,0.6])
plot([observed_value, observed_value], [0, 12], color='r', linewidth=2)

# The displayed p-value is floored at 1/100
# (presumably the resolution of a 100-iteration test -- confirm).
displayed_p = max(1./100, (1-permdata['pvalue']))
title('p = %.3f' % displayed_p)
xlim([390, 1100])
xlabel('Mean square error (lower=better)')

# Save as vector and as high-resolution bitmap.
savefig("figures/permtest_hist.svg")
savefig("figures/permtest_hist.png", dpi=600)

# <codecell>

# One (mse, cv_mse) pair per entry of result_lsas; the second value comes from
# the entry at the same index in cvres['result'].
msedata = []
for idx, res in enumerate(result_lsas):
    lsa_mse = skm.mean_square_error(res[0], res[1])
    cv_entry = cvres['result'][idx]
    cv_mse = skm.mean_square_error(cv_entry[0], cv_entry[1])
    msedata.append((lsa_mse, cv_mse))

# <codecell>

# Per-row difference between the two MSE columns: tested with wilcoxon and
# shown as a box plot.
mse_differences = np.diff(msedata, axis=1)
print(wilcoxon(mse_differences.ravel()))
boxplot(mse_differences)

# <markdowncell>

# ##Amygdala responses

# <codecell>

# Load the amygdala response table from CSV.
# NOTE(review): recfromcsv is presumably numpy's, reached through a pylab
# star-import outside this view -- confirm.
amygdata = recfromcsv('AmygdalaResponses.csv', names=True)
Esempio n. 11
0
 def regression(self, type='PCA'):
   """Fit a linear regression with scikit-learn and with R, and write a report.

   The scikit-learn model is stored in ``self.lm``. The report goes to
   ``<self.base_file>_<type>.txt`` and contains R2 and MSE on the validation
   and fitting data, the coefficients and intercept, then the summary, AIC
   and R2 values of the same model fitted through ``R.r.lm``
   (``R`` is presumably rpy2's robjects -- confirm).

   Args:
     type: ``'PCA'`` uses ``self.Xt`` / ``self.Xvt`` as design matrices; any
       other value uses ``self.X`` / ``self.Xv``. ``'Linear'`` additionally
       lists ``self.select_x`` in the report. The value is also part of the
       report file name.
   """
   self.lm = linear_model.LinearRegression()
   if type == 'PCA':
     X, Xv = self.Xt, self.Xvt
   else:
     X, Xv = self.X, self.Xv
   self.lm.fit(X, self.Y)
   Ypv = self.lm.predict(Xv)
   Yp = self.lm.predict(X)

   report_file = self.base_file + "_" + type + ".txt"
   print("Writing output to " + report_file)
   with open(report_file, 'w') as f:
     if type == 'Linear':
       f.write("---\n")
       f.write("Linear Components\n")
       for i, x in enumerate(self.select_x):
         f.write("%d\t%s\n" % (i, x))
     f.write("---\n")
     f.write("%s Regression...\n" % type)
     # In each pair the first number is on the validation data, the second on
     # the data the model was fitted on.
     f.write("R2 Score (Val / Test) \n")
     f.write("%f %f \n" % (metrics.r2_score(self.Yv, Ypv), metrics.r2_score(self.Y, Yp)))
     # The header needs its own line, otherwise the numbers are glued to it.
     f.write("MSE (Val / Test)\n")
     f.write("%f %f \n" % (metrics.mean_square_error(self.Yv, Ypv), metrics.mean_square_error(self.Y, Yp)))
     f.write("Coefficients\n")
     f.write("%s\n" % self.lm.coef_)
     f.write("Intercept\n")
     f.write("%s\n" % self.lm.intercept_)
     f.write("---\n")

     # Do R Linear Regression: y ~ x0 + x1 + ... with one term per selected column.
     n_features = len(self.select_x)
     lm_string = "y ~ x0" + "".join(" + x%d" % i for i in range(1, n_features))
     data_frame_val = {}
     for i in range(n_features):
       # Fitting data goes into R's global environment (read by the formula);
       # validation data is collected for the newdata frame used further down.
       R.globalenv['x%d' % i] = R.FloatVector(X[:,i].tolist())
       data_frame_val['x%d' % i] = R.FloatVector(Xv[:,i].tolist())
     R.globalenv['y'] = R.FloatVector(self.Y)
     data_frame_val['y'] = R.FloatVector(self.Yv)
     data_frame_val = R.DataFrame(data_frame_val)
     fit = R.r.lm(lm_string)

     aic = R.r.AIC(fit)

     f.write("%s\n" % R.r.summary(fit))
     f.write("%s\n" % aic)

     # R2 of the R model on the data it was fitted on.
     YpR = list(R.r.predict(fit))
     f.write("Test: %s\n" % metrics.r2_score(self.Y, YpR))

     # R2 of the R model on the validation data.
     for i in range(n_features):
       R.globalenv['x%d' % i] = R.FloatVector(Xv[:,i].tolist())
     R.globalenv['y'] = R.FloatVector(self.Yv)
     YpvR = list(R.r.predict(fit, newdata=data_frame_val))
     f.write("Val: %s\n" % metrics.r2_score(self.Yv, YpvR))
Esempio n. 12
0
from sklearn.linear_model import LinearRegression
# Fit the model on the training split and predict the test split.
# (A pasted notebook repr of the estimator used to follow fit(); it built and
# discarded a second estimator and has been dropped.)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
y_pred

#VISUALIZE THE TRAIN RESULTS

# Training points in blue, fitted line in red.
train_fit = lr.predict(X_train)
plt.scatter(X_train, y_train, color='blue')
plt.plot(X_train, train_fit, color='red')
plt.title('Salary ~ Experience (Train Set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

#VISUALIZE THE TEST RESULTS

# Test points in blue, model predictions for the same X_test in red.
# x and y of the line must come from the same split; pairing X_test with
# predictions for X_train gives mismatched lengths.
plt.scatter(X_test, y_test, color = 'blue')
plt.plot(X_test, lr.predict(X_test), color = 'red')
plt.title('Salary Vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

#CALCULATING THE ERROR METRICS ON THE TEST SET
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test,y_pred))
print('MSE:', metrics.mean_square_error(y_test, y_pred))
# RMSE is the square root of the MSE (not of the MAE).
print('RMSE:', np.sqrt(metrics.mean_square_error(y_test, y_pred)))
Esempio n. 13
0

def hold_out(X):
    """Randomly split the items of ``X`` into a training and a held-out list.

    One ``random()`` draw is made per item, in iteration order; the item goes
    to the held-out list when ``int(random() * 4) == 0`` (one chance in four)
    and to the training list otherwise. Relative order is preserved.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    train, test = [], []
    for item in X:
        if int(random() * 4) == 0:
            test.append(item)
        else:
            train.append(item)
    return (train, test)


def read_data(file_name):
    """Parse a space-separated numeric file into a list of float lists.

    Each line is split on single spaces and the LAST token is discarded
    before conversion (with a trailing space before the newline that token is
    just the newline itself). The file is closed before returning, and real
    lists are returned so ``len(record)`` works on Python 2 and 3.
    """
    with open(file_name, 'r') as handle:
        return [[float(token) for token in line.split(' ')[0:-1]]
                for line in handle]
def data(records):
    """Return every field but the last of each record that has exactly 9 fields.

    Records of any other length are dropped.
    """
    return [record[0:-1] for record in records if len(record) == 9]
def labels(records):
    """Return the last field of each record that has exactly 9 fields.

    Records of any other length are dropped.
    """
    return [record[-1] for record in records if len(record) == 9]
def match_rate(classifier, data, labels, Paser):
    """Return the mean square error of ``classifier`` on ``data``.

    Despite the name this is an error, so lower is better. Each sample is
    predicted individually and the raw prediction is passed through ``Paser``
    before being compared with ``labels``.
    """
    predictions = [Paser(classifier.predict(sample)) for sample in data]
    return mean_square_error(labels, predictions)


def best_grid(train_data, train_labels, validation_data, validation_labels,
              grid, Classifier, Paser):
    """Return the value in ``grid`` giving the lowest validation error.

    For every candidate, ``Classifier(candidate)`` is fitted on the training
    data and scored with ``match_rate`` on the validation data. Ties keep the
    earliest candidate. ``0`` is returned when ``grid`` is empty or no
    candidate yields an error below infinity.
    """
    # float('inf') replaces sys.maxint, which does not exist on Python 3.
    best_error = float('inf')
    best_param = 0
    for param in grid:
        classifier = Classifier(param)
        classifier.fit(train_data, train_labels)
        error = match_rate(classifier, validation_data, validation_labels,
                           Paser)
        if error < best_error:
            best_error, best_param = error, param
    return best_param


# Load the records and split them at random into training and validation parts.
bank_records = read_data('./data/bank8FM.data')
train, validation = hold_out(bank_records)
Esempio n. 14
0
from sklearn.neighbors import KNeighborsRegressor
from rbf import RBF
from sklearn.metrics import mean_square_error
from numpy import mean
from random import random
import math

def hold_out(X):
    """Randomly split the items of ``X`` into a training and a held-out list.

    Each item triggers exactly one ``random()`` draw, in iteration order. The
    item is held out when ``int(random() * 4) == 0`` (one chance in four) and
    kept for training otherwise. Relative order is preserved in both lists.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    train, test = [], []
    for item in X:
        if int(random() * 4) == 0:
            test.append(item)
        else:
            train.append(item)
    return (train, test)

def read_data(file_name):
    """Parse a space-separated numeric file into a list of float lists.

    Every line is split on single spaces; the LAST token is always dropped
    before conversion (with a trailing space before the newline it is just
    the newline). The file is closed before returning, and real lists are
    returned so ``len(record)`` works on Python 2 and 3.
    """
    with open(file_name, 'r') as handle:
        return [[float(token) for token in line.split(' ')[0:-1]]
                for line in handle]
def data(records):
    """Return all fields except the last for each record with exactly 9 fields.

    Records of any other length are skipped.
    """
    return [record[0:-1] for record in records if len(record) == 9]
def labels(records):
    """Return the last field for each record with exactly 9 fields.

    Records of any other length are skipped.
    """
    return [record[-1] for record in records if len(record) == 9]
def match_rate(classifier, data, labels, Paser):
    """Return the mean square error of ``classifier`` on ``data``.

    Despite the name, the result is an error: lower is better. Samples are
    predicted one at a time and each raw prediction goes through ``Paser``
    before the comparison with ``labels``.
    """
    predictions = [Paser(classifier.predict(sample)) for sample in data]
    return mean_square_error(labels, predictions)
def best_grid(train_data, train_labels, validation_data, validation_labels, grid, Classifier, Paser):
    """Return the value in ``grid`` giving the lowest validation error.

    ``Classifier(candidate)`` is fitted on the training data for every
    candidate and scored with ``match_rate`` on the validation data. The
    earliest candidate wins ties. ``0`` is returned when ``grid`` is empty or
    no candidate yields an error below infinity.
    """
    # float('inf') needs no import; sys.maxint is Python-2-only and sys is
    # not among the imports visible in this snippet.
    best_error = float('inf')
    best_param = 0
    for param in grid:
        classifier = Classifier(param)
        classifier.fit(train_data, train_labels)
        error = match_rate(classifier, validation_data, validation_labels, Paser)
        if error < best_error:
            best_error, best_param = error, param
    return best_param

# Random train/validation split of the main file (differs on every run because
# hold_out draws from random()).
train, validation = hold_out(read_data('./data/bank8FM.data'))

# Separate the features from the label (last field) of every record.
train_data = data(train)
train_labels = labels(train)
validation_data = data(validation)
validation_labels = labels(validation)

# The test set lives in its own file and is used whole.
test = read_data('./data/bank8FM.test')
test_data = data(test)
test_labels = labels(test)