from sklearn.neighbors import KNeighborsRegressor

def fit_KNeighbors(features_train, labels_train, features_pred, n_neighbors=5):
    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    model.fit(features_train, labels_train)
    labels_pred = model.predict(features_pred)
    score = model.score(features_train, labels_train)
    print("KNeighbors - coefficient of determination R^2 of the prediction:", score)
    return labels_pred
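# A hypothetical usage sketch for the helper above, on synthetic data (the
# shapes and the target function are assumptions, not from the original):
import numpy as np

rng = np.random.RandomState(0)
features_train = rng.rand(100, 3)
labels_train = features_train.sum(axis=1)
features_pred = rng.rand(10, 3)
labels_pred = fit_KNeighbors(features_train, labels_train, features_pred, n_neighbors=3)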
Example #2
from scipy.optimize import differential_evolution
from sklearn.neighbors import KNeighborsRegressor

def knn(X, Y):
    neigh = KNeighborsRegressor()
    neigh.fit(X, Y)
    def explore(x):
        # negate the prediction and return a scalar, as the optimizer requires
        return float(-neigh.predict([x])[0])
    minimized = differential_evolution(explore, ((0, 1), (0, 1), (0, 1), (0, 1), (0, 1)))
    return {
        'X_min': list(minimized.x),
        'score': neigh.score(X, Y)
    }
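# A hedged usage sketch on synthetic 5-feature data (the data itself is an
# assumption, not from the original). Because explore() negates the KNN
# prediction, minimizing it with differential_evolution searches the unit
# hypercube [0, 1]^5 for the input where the predicted value is largest.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(50, 5)
Y_demo = X_demo.sum(axis=1)
print(knn(X_demo, Y_demo))  # e.g. {'X_min': [...], 'score': ...}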
class PriceModel(object):
    """Linear Regression Model used to predict future prices"""
    def __init__(self, algorithm='linear_regression'):
        if algorithm == 'knn':
            self.clf = KNeighborsRegressor(n_neighbors=2)
        else:
            self.clf = linear_model.LinearRegression()

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def score(self, X_test, y_test):
        return self.clf.score(X_test, y_test)
    def _random_search(self, random_iter, x, y):
        # Default Values
        n_neighbors = 5
        best_score = -sys.maxsize  # sys.maxint existed only in Python 2

        if random_iter > 0:
            sys.stdout.write("Do a random search %d times" % random_iter)
            n_list = [1, ]
            while n_list[-1]*2 < x.shape[0]/2:
                n_list.append(n_list[-1]*2)
            n_list.extend(range(1,11))
            param_dist = {"n_neighbors": n_list}
            param_list = [{"n_neighbors": n_neighbors}, ]
            param_list.extend(list(ParameterSampler(param_dist,
                                                    n_iter=random_iter-1,
                                                    random_state=self._rng)))
            for idx, d in enumerate(param_list):
                knn = KNeighborsRegressor(n_neighbors=int(d["n_neighbors"]),
                                          weights='uniform',
                                          algorithm='auto',
                                          leaf_size=30,
                                          p=2,
                                          metric='minkowski')
                train_x, test_x, train_y, test_y = \
                    train_test_split(x, y, test_size=0.5,
                                     random_state=self._rng)
                knn.fit(train_x, train_y)
                # print(test_x.shape, test_y.shape, d)
                sc = knn.score(test_x, test_y)
                # Tiny output
                m = "."
                if idx % 10 == 0:
                    m = "#"
                if sc > best_score:
                    m = "<"
                    best_score = sc
                    n_neighbors = d['n_neighbors']
                sys.stdout.write(m)
                sys.stdout.flush()
            sys.stdout.write("Using n_neighbors: %d\n" % n_neighbors)
        return n_neighbors
    X = loadtxt(train_file, usecols=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), unpack=True, delimiter=',').T
    Y = loadtxt(train_file, unpack=True, usecols=(11), delimiter=',')

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    print('Data loaded!')
    plot_simple_table(X_train.T[:, :30])

    best_params = find_best_params(X_train, y_train)

    model = KNeighborsRegressor(n_neighbors = best_params['n_neighbors'])
    model.fit(X_train, y_train)

    score = model.score(X_val, y_val)
    preds = model.predict(X_test)

    pred = preds.reshape(len(preds))
    real = y_test

    plot_table(real, pred)
    plot_scatter(X_train, y_train, X_val, y_val, X_test, y_test, preds)

    # Compute the mean squared error of our predictions.
    mse = (((pred - real) ** 2).sum()) / len(pred)

    print('Validation Score:', score)
    print('Mean Squared Error:', mse)
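    # Added sanity check (not in the original): the hand-rolled MSE above
    # should agree with sklearn's implementation.
    from sklearn.metrics import mean_squared_error
    assert abs(mse - mean_squared_error(real, pred)) < 1e-9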

    plot_hm(real, pred)
# get importance
importance_k = results.importances_mean
# summarize feature importance
for i, v in enumerate(importance_k):
    print('Feature: %0d, -> %s Score: %.5f' % (i, carrear_feature_names[i], v))
# plot feature importance
plt.bar([x for x in range(len(importance_k))], importance_k)
plt.xticks(np.arange(len(carrear_feature_names)),
           carrear_feature_names,
           rotation='vertical')
plt.show()

# In[33]:

print('R^2 of the knn regressor on the training set: {:.2f}'.format(
    knn.score(X_train, y_train)))
print('R^2 of the knn regressor on the test set: {:.2f}'.format(
    knn.score(X_test, y_test)))

# CART carrear features

# In[34]:

from sklearn.tree import DecisionTreeRegressor
# from matplotlib import pyplot

# define the model
modelcart = DecisionTreeRegressor()
# fit the model
modelcart.fit(X_train, y_train)
# get importance
Example #7
# df_avg == 3 year rolling average + yr4 stats
X,y = df_avg[['all_avg']].values, df_avg['all_yr'].values
X,y = df_avg[['all_prev']].values, df_avg['all_yr'].values
X,y = df_avg[['all_avg', 'all_prev']].values, df_avg['all_yr'].values

X,y = df_avg[['1D_avg', '2D_avg', '3D_avg', 'all_avg','1D_prev', '2D_prev', '3D_prev', 'all_prev']].values, df_avg['all_yr'].values
X_train, X_test, y_train, y_test = tts(X, y)

lin = LR(fit_intercept=False)
lin.fit(X,y)
lin.score(X,y)

knn = KNR(n_neighbors=5)
knn.fit(X_train,y_train)
print(knn.score(X_train,y_train))
print(knn.score(X_test,y_test))


ns = range(1,30,2)
scores = []
for n in ns:
	knn = KNR(n_neighbors=n)
	knn.fit(X_train,y_train)
	scores.append(knn.score(X_train,y_train))
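# Added aside (not in the original): the loop above records only training
# scores, which always favor small k (k=1 can simply memorize the training
# set). A hedged sketch that also tracks test scores and picks k by test R^2:
test_scores = []
for n in ns:
    knn = KNR(n_neighbors=n)
    knn.fit(X_train, y_train)
    test_scores.append(knn.score(X_test, y_test))
best_k = ns[test_scores.index(max(test_scores))]
print("best k by test R^2:", best_k)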


rf = RFR(n_estimators = 50)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)
Example #8
   def Regression(self, columns, predictName, portion=0.5, **argv):
      if type(columns) != list:
         raise Exception("First parameter must be a list.")
      if type(predictName) != str:
         raise Exception("Second parameter must be a string.")

      unknown_columns = set(columns)-set(self.keys())
      if unknown_columns != set([]):
         raise Exception("Invalid columns: " + str(unknown_columns))

      option = {}

      if argv.get('method') is None:
         method = 'LinearRegression'
      elif argv.get('method') == "LinearRegression":
         method = "LinearRegression"
      elif argv.get('method') == "RandomForest":
         method = "RandomForest"
      elif argv.get('method') == "SVM":
         method = "SVM"
      elif argv.get('method') == "KNeighbors":
         method = "KNeighbors"
      else:
         raise Exception("Unknown regression method: " + argv.get('method'))

      # specify portion
      if argv.get('portion') is None:
         portion = 0.5
      else:
         portion = argv.get('portion') 

      ## Select data
      ## need to fix the problem with NaN or infinite values
      x_rows = self.get(columns)
      # x_rows = Imputer().fit_transform(x_rows)
      y_rows = self.get(predictName)
      # y_rows = Imputer().fit_transform(y_rows)

      for x in x_rows.index:
         if numpy.any(numpy.isnan(x_rows.loc[x])):
            print("Dropped: ", x, x_rows.loc[x])
            x_rows = x_rows.drop(x)
            y_rows = y_rows.drop(x)

      xRowNum = len(x_rows)
      yRowNum = len(y_rows)

      if (xRowNum < yRowNum):
         y_rows = y_rows[:xRowNum]

      if (xRowNum > yRowNum):
         x_rows = x_rows[:yRowNum]

      # get portion
      if portion > 1:
         portion = 1
      elif portion < 0:
         portion = 0.5


      # specify neighbor
      if argv.get('n_neighbors') is None:
         n_neighbors = 3
      else:
         n_neighbors = argv.get('n_neighbors') 


      # round
      x_train, x_test, y_train, y_test = train_test_split(x_rows, y_rows, test_size = 1-portion)
      
      # perform the selected regression method
      if argv.get('method') == "LinearRegression":
         lin = lm.LinearRegression()
         lin.fit(x_train, y_train)
         self['linearPredict'] = ""
         for index in x_test.index:
            self['linearPredict'].loc[index] = lin.predict(x_test.loc[[index]])[0]
         print("Method: ", method, "\tCoefficients: ", lin.coef_,
               "\tVariance score: %.2f" % lin.score(x_test, y_test))

      elif argv.get('method') == "RandomForest":
         ran = RandomForestRegressor()
         ran.fit(x_train, y_train)
         self['RandomForestClassifier'] = ""
         for index in x_test.index:
            self['RandomForestClassifier'].loc[index] = ran.predict(x_test.loc[[index]])[0]
         print("Method: ", method, "\t Score: ", ran.score(x_test, y_test))

      elif argv.get('method') == "SVM":
         sssvm = svm.SVR()
         sssvm.fit(x_train, y_train)
         self['svmPredict'] = ""
         for index in x_test.index:
            self['svmPredict'].loc[index] = sssvm.predict(x_test.loc[[index]])[0]
         print("Method: ", method, "\t Score: ", sssvm.score(x_test, y_test))

      elif argv.get('method') == "KNeighbors":
         knn = KNeighborsRegressor(n_neighbors=n_neighbors)
         knn.fit(x_train, y_train)
         self['KNeighborsPredict'] = ""
         for index in x_test.index:
            self['KNeighborsPredict'].loc[index] = knn.predict(x_test.loc[[index]])[0]
         print("Method: ", method, "\t Score: ", knn.score(x_test, y_test))
# Using the mglearn library and k-nearest-neighbors regression
# Generate Dataset
import mglearn
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
X, y = mglearn.datasets.make_wave(n_samples=40)

# split the wave dataset into training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

#instantiate the model and set the number of neighbors to consider to 3
reg = KNeighborsRegressor(n_neighbors=3)
#fit the model using the training data and training targets
reg.fit(X_train, y_train)
print("Test set predictions: \n{}".format(reg.predict(X_test)))
print("Test set R^2: {:.2f}".format(reg.score(X_test, y_test)))

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# The models scored below, reconstructed: clfpoly2/clfpoly3 as defined in
# Example #26 below; clfreg is assumed to be a plain LinearRegression.
clfreg = LinearRegression()
clfreg.fit(X_train, y_train)

clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

## KNN (k-nearest neighbours) predicts a data point's value from its most similar neighbours
#KNN Regression
clfknn = KNeighborsRegressor(n_neighbors=2)
clfknn.fit(X_train, y_train)



#EVALUATION (this is done using the score method of each trained model)
# For regressors, score returns the coefficient of determination R^2 of
# self.predict(X) with respect to y on the test dataset

confidencereg = clfreg.score(X_test, y_test)    #for linear reg
confidencereg2 = clfpoly2.score(X_test, y_test) #for quadratic reg
confidencereg3 = clfpoly3.score(X_test, y_test) #for cubic reg
confidenceregknn = clfknn.score(X_test, y_test) #for knn reg

print(confidencereg,confidencereg2,confidencereg3,confidenceregknn)

# Scores (>0.95) for most of the models. However this does not mean we can blindly place our trades.
# There are still many issues to consider, especially with different companies that have different price trajectories over time
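# Added aside (not in the original): for regressors, score() is the coefficient
# of determination R^2 = 1 - SS_res/SS_tot, which we can verify by hand for the
# KNN model:
import numpy as np
y_hat = clfknn.predict(X_test)
ss_res = np.sum((np.asarray(y_test) - y_hat) ** 2)
ss_tot = np.sum((np.asarray(y_test) - np.mean(y_test)) ** 2)
print(1 - ss_res / ss_tot)  # should match confidenceregknn above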

forecast_set = clf.predict(X_lately)  #this needs check
dfreg['Forecast'] = np.nan

#result should be an array

##Prediction plot
last_date = dfreg.iloc[-1].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
##y_train = y_train[:1800]
##print x_train.column[1]
##print y_train.column[1]
####y_train_300.astype(dtype=np.float64)
####y_train_300 = np.dtype('f8')

####reg = DecisionTreeRegressor(max_depth=5)
####reg.fit(x_train,y_train)
####predicts = reg.predict(x_test)
####print "total_error:", metrics.mean_squared_error(y_test, predicts)
####print "accuracy:", reg.score(x_train, y_train)
reg2 = KNR()
reg2.fit(x_all, y_all)
####predicts2 = reg2.predict(x_test)
####print "total_error:", metrics.mean_squared_error(y_test, predicts2)
print "accuracy:", reg2.score(x_all, y_all)

####reg3 = svm.SVR()
####reg3.fit(x_train, y_train) 
####SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
####   kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
####predicts3 = reg3.predict(x_test)
####print "total_error:", metrics.mean_squared_error(y_test, predicts3)
####print "accuracy:", reg3.score(x_train, y_train)
##predictions = reg.predict(test_x_all)
##print predicts

###print "total error:", total_error
for d in [test_data]:
    d['Image'] = test_data['Image'].apply(lambda im: np.fromstring(im, sep=' '))
# stack all test images into one numpy array
Example #12
MapMean_list = np.array(new_model.get_MapMean_list()[:ratio])
RedMean_list = np.array(new_model.get_RedMean_list()[:ratio])
target_list = np.array(new_model.get_target_list()[:ratio])


interData_list = []
for i in range(len(MapMean_list)):
	interData_list.append(np.hstack((MapMean_list[i],RedMean_list[i])))


import pickle  # cPickle is Python 2 only

from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn.fit(MapFeature_list,MapMean_list)
with open('Map.model','wb') as MP:
    pickle.dump(knn,MP)
print(knn.score(MapFeature_list,MapMean_list))

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(RedFeature_list,RedMean_list)
with open('Reduce.model','wb') as RP:
    pickle.dump(regressor,RP)
print(regressor.score(RedFeature_list,RedMean_list))



knn2 = KNeighborsRegressor(p=1)
knn2.fit(interData_list,target_list)
with open('Job.model','wb') as JP:
    pickle.dump(knn2,JP)
print(knn2.score(interData_list,target_list))
res = mod_fit.resid
fig = sm.qqplot(res)
#plt.show()




#################


average_score = []                  #knn result
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
for k in [5, 10, 50, 100, 150, 200, 1000]:
    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.05, random_state=0)
    neigh = KNeighborsRegressor(n_neighbors=k)
    neigh.fit(X_train, y_train)
    average_score.append(neigh.score(X_test, y_test))
    
  

#Comparatively k = 10 is best  
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.05, random_state=0)
neigh = KNeighborsRegressor(n_neighbors=10)
neigh.fit(X_train, y_train)
predict_result1 = neigh.predict(data)




#############################classification

subre_list = ['videos', 'todayilearned', 'nba','funny', 'DestinyTheGame', 'AdviceAnimals','hockey','WTF', 'worldnews','pcmasterrace','soccer','anime','gaming','serialpodcast','GlobalOffensive','leagueoflegends','news','nfl','CFB','pics','movies','AskReddit','DotA2']
def knn(df1, features, pred_var, df2):
    cl = KNeighborsRegressor(n_neighbors=3)
    cl.fit(df1[features], df1[pred_var])
    print('KNN Score: ', cl.score(df2[features], df2[pred_var]))
Example #15
l = [
    'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'striker',
    'non-striker', 'total'
]
outliers_removed = dataset
for i in l:
    outliers_removed = remove_outliers(outliers_removed, i)
print(outliers_removed)

#dropeddata.to_csv('dropeddata.csv',index=False)
outliers_removed.to_csv('outliers_removed.csv', index=False)

# R squared = 88.95 and custom accuracy is 86.42
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=1)
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
score9 = neigh.score(X_train, y_train) * 100
print("R Squre value:", score9)
print("Custome accuracy for KNeighborsRegressor:",
      custom_accuracy(y_test, y_pred, 20))
# Testing with a custom input
import numpy as np
new_prediction = neigh.predict(sc.transform(np.array([[100, 0, 13, 50, 50]])))
print("Prediction score:", new_prediction)

models = [
    'RandomForestRegression', 'LinearRegression', 'Lasso', 'GaussianNB',
    'DecisionTreeRegressor', 'KNeighborsRegression', 'SupportVectorMachine'
]
acc_score = [0.77, 0.43, 0.27, 0.37, 0.78, 0.87, 0.49]
plt.rcParams['figure.figsize'] = (15, 7)
plt.bar(models, acc_score)
plt.show()

grade_slow = [
    features_train[ii][0] for ii in range(0, len(features_train))
    if labels_train[ii] == 1
]
bumpy_slow = [
    features_train[ii][1] for ii in range(0, len(features_train))
    if labels_train[ii] == 1
]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(features_train, labels_train)
accuracy = neigh.score(features_test, labels_test)
print(accuracy)

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass

# create two random wine sets
wine_set_1 = numpredict.wine_set_1()
wine_set_2 = numpredict.wine_set_2()

# break these sets into training data and testing data
train1, test1 = numpredict.divide_data(wine_set_1, test=0.07)
train2, test2 = numpredict.divide_data(wine_set_2, test=0.07)

# format the sets into numpy arrays suitable for scikit-learn
train1_X, train1_y = get_pair(train1)
test1_X, test1_y = get_pair(test1)

train2_X, train2_y = get_pair(train2)
test2_X, test2_y = get_pair(test2)

# create two regressors
knn1 = KNeighborsRegressor()
knn2 = KNeighborsRegressor()

# train them using the training sets
knn1.fit(train1_X, train1_y)
knn2.fit(train2_X, train2_y)

# check out their scores
print "Accuracy score for predications made on the first wine set:",
print "%0.2f%%" % (knn1.score(test1_X, test1_y) * 100)
print "Accuracy score for predications made on the second wine set:",
print "%0.2f%%" % (knn2.score(test2_X, test2_y) * 100)
Example #18
X, y = mglearn.datasets.make_wave(n_samples=100)
plt.scatter(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Instantiate the model, set the number of neighbors to consider to 5:
reg = KNeighborsRegressor(n_neighbors=5)
# Fit the model using the training data and training targets:
reg.fit(X_train, y_train)
# (notebook echo of the fitted estimator's repr)
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                     metric_params=None, n_jobs=1, n_neighbors=5, p=2,
#                     weights='uniform')

reg.score(X_test, y_test)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# create 1000 data points, evenly spaced between -3 and 3
line = np.linspace(-3, 3, 1000).reshape(-1, 1)
plt.suptitle("nearest_neighbor_regression")
for n_neighbors, ax in zip([1, 3, 9], axes):
    # make predictions using 1, 3 or 9 neighbors
    reg = KNeighborsRegressor(n_neighbors=n_neighbors).fit(X, y)
    ax.plot(X, y, 'o')
    ax.plot(X, -3 * np.ones(len(X)), 'o')
    ax.plot(line, reg.predict(line))
    ax.set_title("%d neighbor(s)" % n_neighbors)
Example #19
# Same loop as before
training_accuracy = []
test_accuracy = []


neighbors_settings = range(1, 51)


for n_neighbors in neighbors_settings:
    # build the model
    clf = KNeighborsRegressor(n_neighbors = n_neighbors)
    clf.fit(X_train, y_train)
    
    # record training set score (R^2, since this is a regressor)
    training_accuracy.append(clf.score(X_train, y_train))
    
    # record generalization (test set) score
    test_accuracy.append(clf.score(X_test, y_test))


plt.plot(neighbors_settings, training_accuracy, label = "training score")
plt.plot(neighbors_settings, test_accuracy, label = "test score")
plt.ylabel("R^2 score")
plt.xlabel("n_neighbors")
plt.legend()


# Print the n_neighbors value that achieves the highest test score
print(test_accuracy.index(max(test_accuracy)) + 1)
Example #20
    # print(mean_squared_error(y_test, predictColumn))

    # PCA
    pca = decomposition.PCA()
    pca.fit(X_train)
    print(pca.explained_variance_)

    pca.n_components = 5
    reducedTrainSet = pca.fit_transform(X_train)

    pcaClf = linear_model.LinearRegression()
    pcaClf.fit(reducedTrainSet,y_train)

    reducedTestSet = pca.transform(X_test)
    print(pcaClf.score(reducedTestSet,y_test))

    predictColumn = pcaClf.predict(reducedTestSet)
    print(mean_squared_error(y_test, predictColumn))

    print('KNN')
    neighReg = KNRegressor(n_neighbors = 10)
    neighReg.fit(X_train,y_train)
    print(neighReg.score(X_test,y_test))
    predictColumn = neighReg.predict(X_test)
    print(mean_squared_error(y_test, predictColumn))

    #
    # Linear Discriminant Analysis
    # linDA = LDA()
    # linDA.fit(X_train,y_train)
    
Example #21
def get_predictability(X, y, dtype='continuous'):
    """Returns scores for various models when given a dataframe and target set
    
    Arguments:
        X (dataframe)
        y (series)
        dtype (str): categorical or continuous
        
        Note: X and y must have the same number of rows 
        
    Returns:
        results (dataframe)
    """
    M = pd.concat([X, y], axis=1)
    fortrain = M.dropna()

    X_ft = fortrain.iloc[:, :-1]
    y_ft = fortrain.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X_ft,
                                                        y_ft,
                                                        test_size=0.1)

    # use mean as the prediction
    y_train_mean = y_train.mean()
    y_pred_mean = np.zeros(len(y_test))
    y_pred_mean.fill(y_train_mean)

    # use median as the prediction
    y_train_median = y_train.median()
    y_pred_median = np.zeros(len(y_test))
    y_pred_median.fill(y_train_median)

    # use mode as the prediction
    # zero index is required to return the first most common value
    y_train_mode = y_train.mode()[0]
    y_pred_mode = np.zeros(len(y_test))
    y_pred_mode.fill(y_train_mode)

    lm = LinearRegression()
    print("Fitting linear regression model")
    lm.fit(X_train, y_train)

    rf = RandomForestRegressor()
    print("Fitting random forest model")
    rf.fit(X_train, y_train)

    kN = KNeighborsRegressor()
    print("Fitting kNN model")
    kN.fit(X_train, y_train)

    # get the r2 score for each model
    mean_score = r2_score(y_test, y_pred_mean)
    median_score = r2_score(y_test, y_pred_median)
    mode_score = r2_score(y_test, y_pred_mode)
    lm_score = lm.score(X_test, y_test)
    rf_score = rf.score(X_test, y_test)
    kN_score = kN.score(X_test, y_test)

    # get the mse for each model
    mean_mse = mean_squared_error(y_test, y_pred_mean)
    median_mse = mean_squared_error(y_test, y_pred_median)
    mode_mse = mean_squared_error(y_test, y_pred_mode)

    lm_y_pred = lm.predict(X_test)
    rf_y_pred = rf.predict(X_test)
    kN_y_pred = kN.predict(X_test)
    lm_mse = mean_squared_error(y_test, lm_y_pred)
    rf_mse = mean_squared_error(y_test, rf_y_pred)
    kN_mse = mean_squared_error(y_test, kN_y_pred)

    # construct the dataframe to return to the user
    names = [
        'mean', 'median', 'mode', 'LinearRegression', 'RandomForestRegressor',
        'KNeighborsRegressor'
    ]
    scores = [
        mean_score, median_score, mode_score, lm_score, rf_score, kN_score
    ]
    losses = [mean_mse, median_mse, mode_mse, lm_mse, rf_mse, kN_mse]

    results = pd.DataFrame(data=list(zip(names, scores, losses)),
                           columns=['names', 'r2 score', 'loss'])
    # round R^2 to two decimals (rounding to zero decimals would collapse it to 0 or 1)
    results['r2 score'] = results['r2 score'].apply(lambda x: round(x, 2))
    results['loss'] = results['loss'].apply(lambda x: round(x, 0))
    return results
Example #22
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
n_dots = 80
X = 5 * np.random.rand(n_dots, 1)
y = np.cos(X).ravel()

y += 0.2 * np.random.rand(n_dots) - 0.1

k = 5
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X, y)

# generate densely spaced points and predict on them
T = np.linspace(0, 5, 500)[:, np.newaxis]
# np.newaxis adds an axis to a numpy.ndarray (multidimensional array)
y_pred = knn.predict(T)
knn.score(X, y)

# plot the fitted curve
plt.figure(figsize=(8, 6))
plt.scatter(X, y, c='g', label='data', s=100)         # plot the training samples
plt.plot(T, y_pred, c='k', label='prediction', lw=4)  # plot the fitted curve
plt.axis('tight')
plt.title("KNeighborsRegressor (k = %i)" % k)
plt.show()
Example #23
from sklearn.neighbors import KNeighborsRegressor
import mglearn
from sklearn.model_selection import train_test_split
X, y = mglearn.datasets.make_wave(n_samples=40)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# instantiate the model, set the number of neighbors to consider to 3
reg = KNeighborsRegressor(n_neighbors=3)

# Fit the model using the training data and training targets
reg.fit(X_train, y_train)

# (notebook echo of the fitted estimator's repr)
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                     metric_params=None, n_jobs=1, n_neighbors=3, p=2,
#                     weights='uniform')

print(reg.predict(X_test))

print(reg.score(X_test, y_test))
#res = DataFrame(res.toarray(), columns=features, index=observations)

#print len(res)
#print len(values)

clf = KNeighborsRegressor(n_neighbors=5,weights='distance')
clf.fit(df,values)


Y_train = clf.predict(df)

Y = clf.predict(test_df)

Y = [element for sublist in Y for element in sublist]

print(clf.score(df,values))
#output_file.write('Id,Hazard\n')

output_file.write('Actual,Predicted\n')

diff_count = 0


for id,predicted in zip(list(test_df_id.values.flatten()),Y):
    output_file.write(str(id)+','+str(predicted)+'\n')


#for id,predicted in zip(list(test_df_id.values.flatten()),Y_train):
#    output_file.write(str(id)+','+str(predicted)+'\n')

plot_two_class_knn(X_train, y_train, 1, 'uniform', X_test, y_test)
plot_two_class_knn(X_train, y_train, 3, 'uniform', X_test, y_test)
plot_two_class_knn(X_train, y_train, 11, 'uniform', X_test, y_test)

# ### Regression

# In[ ]:

from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)

knnreg = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

print(knnreg.predict(X_test))
print('R-squared test score: {:.3f}'.format(knnreg.score(X_test, y_test)))

# In[ ]:

fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))
X_predict_input = np.linspace(-3, 3, 50).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5],
                                                    y_R1[0::5],
                                                    random_state=0)

for thisaxis, K in zip(subaxes, [1, 3]):
    knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    thisaxis.set_xlim([-2.5, 0.75])
    thisaxis.plot(X_predict_input,
                  y_predict_output)
Example #26
# Quadratic Regression (degree-2 polynomial)
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

# Cubic Regression (degree-3 polynomial)
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

# KNN Regression
clfknn = KNeighborsRegressor(n_neighbors=2)
clfknn.fit(X_train, y_train)

confidencereg = clfreg.score(X_test, y_test)
confidencepoly2 = clfpoly2.score(X_test, y_test)
confidencepoly3 = clfpoly3.score(X_test, y_test)
confidenceknn = clfknn.score(X_test, y_test)

# Test each model on the held-out 20% split (80-20 rule) and compare confidence scores
print('The linear regression confidence is ', confidencereg)
print('The quadratic regression 2 confidence is ', confidencepoly2)
print('The quadratic regression 3 confidence is ', confidencepoly3)
print('The knn regression confidence is ', confidenceknn)

# this could be any prediction model.
forecast_set = clfreg.predict(X_lately)

dfreg['Forecast'] = np.nan

last_date = dfreg.iloc[-1].name
print(last_date)
last_unix = last_date
Example #27
l_svr = SVR(kernel='linear')
l_svr.fit(X_train, Y_train)
print(l_svr.score(X_test, Y_test))

n_svr = SVR(kernel="poly")
n_svr.fit(X_train, Y_train)
print(n_svr.score(X_test, Y_test))

r_svr = SVR(kernel="rbf")
r_svr.fit(X_train, Y_train)
print(r_svr.score(X_test, Y_test))

from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(weights="uniform")
knn.fit(X_train, Y_train)
print(knn.score(X_test, Y_test))

from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train, Y_train)
print(rfr.score(X_test, Y_test))

lr = LinearRegression()
lr.fit(X_train, Y_train)
print(lr.score(X_test, Y_test))
# (fragment from a separate classification example, kept commented out)
# load the models
# models = {}
# models['LR'] = LogisticRegression()
# models['LDA'] = LinearDiscriminantAnalysis()
# models['KNN'] = KNeighborsClassifier()
############################################################# # Part 4

# In[]  implementing knn regression

from sklearn.neighbors import KNeighborsRegressor

x_train80, x_test80, y_train80, y_test80 = train_test_split(x_home,
                                                            y_home,
                                                            random_state=0)

knnreg = KNeighborsRegressor(n_neighbors=5).fit(x_train80, y_train80)

print(knnreg.predict(x_test80))

print('R-squared test score: {:.3f}'.format(knnreg.score(x_test80, y_test80)))

# In[]: selecting the best value of k

# plot k-NN regression on sample dataset for different values of K
fig, subaxes = plt.subplots(5, 1, figsize=(5, 20))
#X_predict_input = np.linspace(-3, 3, 500).reshape(-1,1)
X_train98, X_test98, y_train98, y_test98 = train_test_split(x_home,
                                                            y_home,
                                                            test_size=0.20,
                                                            random_state=0)
X_train99, X_test99, y_train99, y_test99 = train_test_split(x_home,
                                                            y_home,
                                                            test_size=0.25,
                                                            random_state=0)
X_train100, X_test100, y_train100, y_test100 = train_test_split(x_home,
Example #29
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import mglearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
X,y = mglearn.datasets.make_wave(n_samples=40)
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0)

# Analyzing KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
fig, axes = plt.subplots(1,3,figsize=(15,4))
# create 1,000 data points evenly spaced between -3 and 3 (linspace)
line = np.linspace(-3,3,1000).reshape(-1,1)
for n_neighbors,ax in zip([1,3,9], axes):
    # make predictions using 1, 3, or 9 neighbors
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train, y_train)  # fit
    ax.plot(line,reg.predict(line))  # predict
    ax.plot(X_train,y_train, '^', c=mglearn.cm2(0), markersize=8)
    ax.plot(X_test,y_test,'v',c=mglearn.cm2(1),markersize=8)
    ax.set_title("{} neighbor(s)\n train score:{:.2f} test score:{:.2f}".format(n_neighbors, reg.score(X_train,y_train),reg.score(X_test,y_test)))
    ax.set_xlabel("Feature")
    ax.set_ylabel("Target")
axes[0].legend(["Model predictions", "Training data/target", "Test data/target"], loc="best")
plt.show()
Example #30
class AutoEnRegressor:
    def __init__(self,
                 LA=True,
                 SVR=False,
                 RF=True,
                 AB=False,
                 KNN=False,
                 random_state=0,
                 GridSearch=False,
                 scoring='r2'):

        self.__LA = LA
        self.__SVR = SVR
        self.__RF = RF
        self.__AB = AB
        self.__KNN = KNN
        self.__random_state = random_state
        self.__GridSearch = GridSearch
        if not GridSearch:
            warnings.warn('model will use RandomizedSearch')
        self.__scoring = scoring

    def fit(self,
            X_train,
            y_train,
            validation_split=0.2,
            validation_data=False):

        self.__storing_model_names = []
        self.__X_train = X_train
        self.__y_train = y_train
        if validation_data:
            self.__X_test = validation_data[0]
            self.__y_test = validation_data[1]
        else:
            self.__X_train, self.__X_test, self.__y_train, self.__y_test = train_test_split(
                X_train,
                y_train,
                test_size=validation_split,
                random_state=self.__random_state)

        if self.__LA:
            AutoEnRegressor.LA_model_fit(self, param_grid=None)
            self.__storing_model_names.append('LA_score')
        if self.__SVR:
            AutoEnRegressor.SVR_model_fit(self, param_grid=None)
            self.__storing_model_names.append('SVR_score')
        if self.__RF:
            AutoEnRegressor.RF_model_fit(self, param_grid=None)
            self.__storing_model_names.append('RF_score')
        if self.__AB:
            AutoEnRegressor.AB_model_fit(self, param_grid=None)
            self.__storing_model_names.append('AB_score')
        if self.__KNN:
            AutoEnRegressor.KNN_model_fit(self, list_neighbors=None)
            self.__storing_model_names.append('KNN_score')

        AutoEnRegressor.find_best(self)

    def LA_model_fit(self, param_grid=None):
        from sklearn.linear_model import Lasso
        LA_model = Lasso()
        if param_grid is None:
            parameters = {'alpha': [0.01, 0.5, 1, 2, 5]}
            if self.__GridSearch:
                self.__LA_model = GridSearchCV(estimator=LA_model,
                                               param_grid=parameters,
                                               cv=5,
                                               scoring=self.__scoring,
                                               n_jobs=-1)
            else:
                self.__LA_model = RandomizedSearchCV(
                    estimator=LA_model,
                    param_distributions=parameters,
                    cv=5,
                    scoring=self.__scoring,
                    n_jobs=-1)
        else:
            if self.__GridSearch:
                self.__LA_model = GridSearchCV(estimator=LA_model,
                                               param_grid=param_grid,
                                               cv=5,
                                               scoring=self.__scoring,
                                               n_jobs=-1)
            else:
                self.__LA_model = RandomizedSearchCV(
                    estimator=LA_model,
                    param_distributions=param_grid,
                    cv=5,
                    scoring=self.__scoring,
                    n_jobs=-1)
        self.__LA_model.fit(self.__X_train, self.__y_train)
        print(
            f'LA_score : {r2_score(self.__y_test,self.__LA_model.predict(self.__X_test))}'
        )

    def SVR_model_fit(self, param_grid=None):
        from sklearn.svm import SVR
        SVR_model = SVR()
        if param_grid is None:
            parameters = [{
                'kernel': ['rbf', 'poly'],
                'gamma': [1e-3, 1e-4],
                'C': [1, 10, 100, 1000]
            }]

            if self.__GridSearch:
                self.__SVR_model = GridSearchCV(estimator=SVR_model,
                                                param_grid=parameters,
                                                cv=5,
                                                scoring=self.__scoring,
                                                n_jobs=-1)
            else:
                self.__SVR_model = RandomizedSearchCV(
                    estimator=SVR_model,
                    param_distributions=parameters,
                    cv=5,
                    scoring=self.__scoring,
                    n_jobs=-1)
        else:
            if self.__GridSearch:
                self.__SVR_model = GridSearchCV(estimator=SVR_model,
                                                param_grid=param_grid,
                                                cv=5,
                                                scoring=self.__scoring,
                                                n_jobs=-1)
            else:
                self.__SVR_model = RandomizedSearchCV(
                    estimator=SVR_model,
                    param_distributions=param_grid,
                    cv=5,
                    scoring=self.__scoring,
                    n_jobs=-1)
        self.__SVR_model.fit(self.__X_train, self.__y_train)
        print(
            f'SVR_score : {r2_score(self.__y_test,self.__SVR_model.predict(self.__X_test))}'
        )

    def RF_model_fit(self, param_grid=None):
        from sklearn.ensemble import RandomForestRegressor
        RF_model = RandomForestRegressor()
        if param_grid is None:
            parameters = {
                'n_estimators': [10, 50, 100, 500],
                'max_depth': [4, 8, 10, 12, 16],
                'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5]
            }

            if self.__GridSearch:
                self.__RF_model = GridSearchCV(estimator=RF_model,
                                               param_grid=parameters,
                                               cv=5,
                                               scoring=self.__scoring,
                                               n_jobs=-1)
            else:
                self.__RF_model = RandomizedSearchCV(
                    estimator=RF_model,
                    param_distributions=parameters,
                    cv=5,
                    scoring=self.__scoring,
                    n_jobs=-1)
        else:
            if self.__GridSearch:
                self.__RF_model = GridSearchCV(estimator=RF_model,
                                               param_grid=param_grid,
                                               cv=5,
                                               scoring=self.__scoring,
                                               n_jobs=-1)
            else:
                self.__RF_model = RandomizedSearchCV(
                    estimator=RF_model,
                    param_distributions=param_grid,
                    cv=5,
                    scoring=self.__scoring,
                    n_jobs=-1)

        self.__RF_model.fit(self.__X_train, self.__y_train)
        print(
            f'RF_score : {r2_score(self.__y_test,self.__RF_model.predict(self.__X_test))}'
        )

    def AB_model_fit(self, param_grid=None):
        from sklearn.ensemble import AdaBoostRegressor
        AB_model = AdaBoostRegressor()
        if param_grid is None:
            parameters = {
                'n_estimators': [10, 50, 100, 500],
                'learning_rate': [0.01, 0.5, 0.1, 0.15, 0.2],
            }

            if self.__GridSearch:
                self.__AB_model = GridSearchCV(estimator=AB_model,
                                               param_grid=parameters,
                                               cv=5,
                                               scoring=self.__scoring,
                                               n_jobs=-1)
            else:
                self.__AB_model = RandomizedSearchCV(
                    estimator=AB_model,
                    param_distributions=parameters,
                    cv=5,
                    scoring=self.__scoring,
                    n_jobs=-1)
        else:
            if self.__GridSearch:
                self.__AB_model = GridSearchCV(estimator=AB_model,
                                               param_grid=param_grid,
                                               cv=5,
                                               scoring=self.__scoring,
                                               n_jobs=-1)
            else:
                self.__AB_model = RandomizedSearchCV(
                    estimator=AB_model,
                    param_distributions=param_grid,
                    cv=5,
                    scoring=self.__scoring,
                    n_jobs=-1)

        self.__AB_model.fit(self.__X_train, self.__y_train)
        print(
            f'AB_score : {r2_score(self.__y_test,self.__AB_model.predict(self.__X_test))}'
        )

    def KNN_model_fit(self, list_neighbors=None):
        from sklearn.neighbors import KNeighborsRegressor
        if list_neighbors is None:
            list_neighbors = [3, 5, 7, 9, 11, 13, 15]
            n_neighbor_score_model = [None, 0, None]
            for neighbor in list_neighbors:
                self.__KNN_model = KNeighborsRegressor(n_neighbors=neighbor)
                self.__KNN_model = self.__KNN_model.fit(
                    self.__X_train, self.__y_train)
                model_score = self.__KNN_model.score(self.__X_test,
                                                     self.__y_test)
                if model_score > n_neighbor_score_model[1]:
                    n_neighbor_score_model[0] = neighbor
                    n_neighbor_score_model[1] = model_score
                    n_neighbor_score_model[2] = self.__KNN_model

            self.__KNN_model = n_neighbor_score_model[2]
            y_predict = self.__KNN_model.predict(self.__X_test)
            print(
                f'KNN_score with {n_neighbor_score_model[0]} neighbors: {r2_score(self.__y_test,y_predict)}'
            )

    def find_best(self):

        global combinations
        combinations = []

        Total_models = self.__LA + self.__SVR + self.__RF + self.__KNN + self.__AB

        combinations = np.array(find_all_combinations(Total_models))
        all_proba = []
        count = 1

        self.__best_score = [0] + [None] * Total_models

        if self.__LA:

            LA_model_y_predict_proba = self.__LA_model.predict(self.__X_test)
            all_proba.append(LA_model_y_predict_proba)
            if self.__best_score[count] is None:
                count += 1

        if self.__SVR:

            SVR_model_y_predict_proba = self.__SVR_model.predict(self.__X_test)
            all_proba.append(SVR_model_y_predict_proba)
            if self.__best_score[count] is None:
                count += 1

        if self.__RF:

            RF_model_y_predict_proba = self.__RF_model.predict(self.__X_test)
            all_proba.append(RF_model_y_predict_proba)
            if self.__best_score[count] is None:
                count += 1

        if self.__AB:
            AB_model_y_predict_proba = self.__AB_model.predict(self.__X_test)
            all_proba.append(AB_model_y_predict_proba)
            if self.__best_score[count] is None:
                count += 1

        if self.__KNN:
            KNN_model_y_predict_proba = self.__KNN_model.predict(self.__X_test)
            all_proba.append(KNN_model_y_predict_proba)
            if self.__best_score[count] is None:
                count += 1

        all_proba = np.array(all_proba)

        all_proba = np.sum(np.multiply(combinations.T,
                                       np.array([all_proba]).T).T,
                           axis=1)

        for y_predict, comb in zip(all_proba, combinations):

            latest_score = r2_score(self.__y_test, y_predict)

            if latest_score > self.__best_score[0]:

                self.__best_score[0] = latest_score
                for i in range(0, len(comb)):
                    self.__best_score[i + 1] = comb[i]

        print(f'AutoEn_score : {self.__best_score[0]}')
        for i in range(len(self.__storing_model_names)):

            print(
                f'weight for {self.__storing_model_names[i]} : {self.__best_score[i+1]}'
            )

    def predict(self, X_test):
        all_proba = []
        count = 1
        try:

            if self.__LA:
                LA_model_y_predict = self.__LA_model.predict(X_test)
                LA_model_y_predict = np.multiply(LA_model_y_predict,
                                                 self.__best_score[count])
                all_proba.append(LA_model_y_predict)
                count += 1

            if self.__SVR:
                SVR_model_y_predict = self.__SVR_model.predict(X_test)
                SVR_model_y_predict = np.multiply(SVR_model_y_predict,
                                                  self.__best_score[count])
                all_proba.append(SVR_model_y_predict)
                count += 1

            if self.__RF:
                RF_model_y_predict = self.__RF_model.predict(X_test)
                RF_model_y_predict = np.multiply(RF_model_y_predict,
                                                 self.__best_score[count])
                all_proba.append(RF_model_y_predict)
                count += 1

            if self.__AB:
                AB_model_y_predict = self.__AB_model.predict(X_test)
                AB_model_y_predict = np.multiply(AB_model_y_predict,
                                                 self.__best_score[count])
                all_proba.append(AB_model_y_predict)
                count += 1

            if self.__KNN:
                KNN_model_y_predict = self.__KNN_model.predict(X_test)
                KNN_model_y_predict = np.multiply(KNN_model_y_predict,
                                                  self.__best_score[count])
                all_proba.append(KNN_model_y_predict)
                count += 1

            y_predict = np.sum(all_proba, axis=0)

        except AttributeError:
            print('model not fitted yet')
            return None

        except:
            print('something went wrong')
            return None

        return y_predict
Example #31
mlr_train_pred = mlr.predict(x_train)
mlr_test_pred = mlr.predict(x_test)

# Evaluate the MLR model
mlr_rsq = mlr.score(x_train, y_train)
mlr_test_rsq = mlr.score(x_test, y_test)
mlr_rmse = np.sqrt(mean_squared_error(y_train, mlr_train_pred))
mlr_test_rmse = np.sqrt(mean_squared_error(y_test, mlr_test_pred))

# Create the nonlinear model using the KNN
knn = KNeighborsRegressor().fit(x_train, y_train)
knn_train_pred = knn.predict(x_train)
knn_test_pred = knn.predict(x_test)

# Evaluate the KNN model
knn_rsq = knn.score(x_train, y_train)
knn_test_rsq = knn.score(x_test, y_test)
knn_rmse = np.sqrt(mean_squared_error(y_train, knn_train_pred))
knn_test_rmse = np.sqrt(mean_squared_error(y_test, knn_test_pred))

# Model evaluation using the R2 and the RMSE metrics
print(
    '\nMLR R-Squared : {:.3f} for the training set, and {:.3f} for the testing set'
    .format(mlr_rsq, mlr_test_rsq))
print(
    'MLR RMSE : {:.3f} for the training set, and {:.3f} for the testing set\n'.
    format(mlr_rmse, mlr_test_rmse))
print(
    'KNN R-Squared : {:.3f} for the training set, and {:.3f} for the testing set'
    .format(knn_rsq, knn_test_rsq))
print('KNN RMSE : {:.3f} for the training set, and {:.3f} for the testing set'.
      format(knn_rmse, knn_test_rmse))
Example #32
data = np.array(df)
inputs = data[:, 0:2]
outputs = data[:, 8]

#Split into train and test set

from sklearn.model_selection import train_test_split  # cross_validation was removed from sklearn
X, X_test, y, y_test = train_test_split(inputs, outputs)

#Weighted KNN

from sklearn.neighbors import KNeighborsRegressor
reg = KNeighborsRegressor()
reg.fit(X, y)
y_pred = reg.predict(X_test)
print('R^2 Score : ', reg.score(X_test, y_test))
accuracy = (np.sum(1 - abs((y_test - y_pred) / y_test)) / (y_test.size)) * 100
print('Accuracy : ', accuracy)

#Plot of Cu variation along LAT and LONG

fig0 = plt.figure(figsize=(20, 10))
fig0.canvas.set_window_title('Variation Of Cu with Latitude and Longitude')
plt.subplots_adjust(hspace=0.5)
plt.subplot(2, 1, 1)
plt.scatter(X[:, 0], y)
plt.title('Variation of Cu with Latitude')
plt.xlabel('Latitude')
plt.ylabel('Cu')

plt.subplot(2, 1, 2)
Example #33
# plt.plot(kneighbors, testscore, label='test')
# plt.legend()
# plt.show() # the graph shows that k = 3 is a reasonable choice

# forge dataset - KNN classification: classifies points into classes 0 and 1
# X,y = mglearn.datasets.make_forge()
# mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
# plt.show()

# wave dataset - KNN regression: predicts a real value, i.e. the mean of the
# neighbors' targets becomes the prediction
X, y = mglearn.datasets.make_wave(n_samples=40)
plt.plot(X, y, 'o')
plt.show()

mglearn.plots.plot_knn_classification(n_neighbors=3)
plt.show()

mglearn.plots.plot_knn_regression(n_neighbors=3)
plt.show()

# a simple regression example
X = [[1], [2], [3], [4], [5]]
y = [0, 0, 1, 1, 1.5]

rgr = KNeighborsRegressor(n_neighbors=3)
rgr.fit(X, y)

print('training R^2:', rgr.score(X, y))

# validate the fitted regression model
print(rgr.predict([[1.6], [1.7], [2.3], [3.5]]))
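# Added aside (not in the original): with k=3, the prediction at 1.6 is simply
# the mean of the targets of its three nearest training points (X = 1, 2, 3):
# (0 + 0 + 1) / 3 = 0.333..., which matches rgr.predict([[1.6]]).
import numpy as np
print(np.mean([0, 0, 1]))       # 0.3333...
print(rgr.predict([[1.6]])[0])  # 0.3333...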
                       names=l3,
                       na_values="?")
x3 = dataset3.iloc[:, 0:25].values
y3 = dataset3["Price"].values

Imp = SimpleImputer(
    strategy="most_frequent")  # Simple Imputer (use before label encoding)
x3[:, 0:25] = Imp.fit_transform(x3[:, 0:25])
y3 = Imp.fit_transform(y3.reshape(-1, 1))

labelencoder = LabelEncoder()
for col in [2, 3, 4, 5, 6, 7, 8, 14, 15, 17]:
    x3[:, col] = labelencoder.fit_transform(x3[:, col].astype(str))

X_train, X_test, Y_train, Y_test = train_test_split(x3, y3)

knn_model = KNeighborsRegressor(n_neighbors=3)
knn_model.fit(X_train, Y_train)
knn_model.score(X_train, Y_train)
knn_model.score(X_test, Y_test)

from sklearn.datasets import fetch_openml  # fetch_mldata was removed from sklearn
mnist = fetch_openml('mnist_784', version=1)
Example #35
print(f"y_test[0:20]:{y_test[0:20]}")
print(f"y_pre[0:20]:{y_pre[0:20]}")



# model setup 2
print("-"*30)
print("KNeighborsRegressor")

model2 = KNeighborsRegressor(n_neighbors=1)

# training

# model.compile(loss="categorical_crossentropy", metrics=["acc"])
model2.fit(x_train,y_train)
score = model2.score(x_test,y_test)
#test

# loss,ac=model.evaluate(x_test,y_test)

y_pre = model2.predict(x_test)
# print(f'r2:{r2_score(y_test,y_pre)}')

print(f"score:{score}")
print(f"r2:{r2_score(y_test,y_pre)}")
print(f"acc:{acc(y_test,y_pre)}")
print(f"y_test[0:20]:{y_test[0:20]}")
print(f"y_pre[0:20]:{y_pre[0:20]}")


# model setup 3
Example #36
print(test_array.shape)

test_array = test_array.reshape(2, 2)
print(test_array.shape)

train_input = train_input.reshape(-1, 1)
test_input = test_input.reshape(-1, 1)

print(train_input.shape, test_input.shape)

# coefficient of determination R^2
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()
knr.fit(train_input, train_target)

knr.score(test_input, test_target)
# 0.9928

from sklearn.metrics import mean_absolute_error
test_prediction = knr.predict(test_input)
mae = mean_absolute_error(test_target, test_prediction)
print(mae)  # 19.157

# overfitting vs. underfitting
print(knr.score(train_input, train_target))
# 0.9698
knr.n_neighbors = 3
knr.fit(train_input, train_target)
print(knr.score(train_input, train_target))
# 0.980
print(knr.score(test_input, test_target))
Example #37
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

#gradientBoost
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
gb.score(X_train, y_train)
gb.score(X_test, y_test)

#KNN
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(algorithm='brute')
knn.fit(X_train, y_train)
knn.score(X_train, y_train)
knn.score(X_test, y_test)

#votingRegressor
from sklearn.ensemble import VotingRegressor
reg1 = GradientBoostingRegressor()
reg2 = RandomForestRegressor()
reg3 = LinearRegression()
reg4 = DecisionTreeRegressor()
reg5 = KNeighborsRegressor()
reg6 = AdaBoostRegressor()
ereg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2)])
ereg = ereg.fit(X_train, y_train)
ereg.score(X_train, y_train)
ereg.score(X_test, y_test)
Example #38
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=11)

# instantiate the model and set the number of neighbors to consider to 3
reg_knn = KNeighborsRegressor(n_neighbors=3)
# fit the model using the training data and training targets
reg_knn.fit(X_train, y_train)

print("Test set predictions:\n", reg_knn.predict(X_test))
print("Test set R^2: {:.2f}".format(reg_knn.score(X_test, y_test)))


tree = DecisionTreeRegressor(max_depth=10).fit(X_train, y_train)

print("Test set predictions:\n", tree.predict(X_test))
print("Test set R^2: {:.2f}".format(tree.score(X_test, y_test)))

check = X_test.copy()

check['Y_hat'] = tree.predict(X_test)
check['Y_test'] = y_test

check['diff'] = check['Y_test'] - check['Y_hat']
# k-Nearest Neighbor Regression
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsRegressor
# load the datasets
dataset = datasets.load_diabetes()
# fit a model to the data
model = KNeighborsRegressor()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
mse = np.mean((predicted-expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))
# instantiate the model and set the number of neighbors to consider to 3
reg=KNeighborsRegressor(n_neighbors=3)

# fit the model using the training data and training targets
reg.fit(X_train,y_train)

print('test set predictions : \n{}'.format(reg.predict(X_test)))
"""
We can also evaluate the model using the score method, which for regressors returns
the R2 score. The R2 score, also known as the coefficient of determination, is a measure
of goodness of a prediction for a regression model, and yields a score between 0
and 1. A value of 1 corresponds to a perfect prediction, and a value of 0 corresponds
to a constant model that just predicts the mean of the training set responses, y_train
"""
print('Test set R^2 : {:.2f}'.format(reg.score(X_test,y_test)))
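# Added aside (not in the original): a quick check of the claim above -- a
# constant model that always predicts the training-set mean scores about 0
# (exactly 0 only if the test mean equals the training mean):
import numpy as np
y_const = np.full(len(y_test), np.mean(y_train))
ss_res = np.sum((y_test - y_const) ** 2)
ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
print(1 - ss_res / ss_tot)  # close to 0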

# Analyzing KNeighborsRegressor------------------------------------------------
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# create 1,000 data points, evenly spaced between -3 and 3
line = np.linspace(-3, 3, 1000).reshape(-1, 1)
for n_neighbors, ax in zip([1, 3, 9], axes):
    # make predictions using 1, 3, or 9 neighbors
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train, y_train)
    ax.plot(line, reg.predict(line))
    ax.plot(X_train, y_train, '^', c=mglearn.cm2(0), markersize=8)
    ax.plot(X_test, y_test, 'v', c=mglearn.cm2(1), markersize=8)
    ax.set_title("{} neighbor(s)\n train score: {:.2f} test score: {:.2f}".format(
    n_neighbors, reg.score(X_train, y_train),
    reg.score(X_test, y_test)))
Example #41
mse = mean_squared_error(y_test, lasso_pred)
print("Root Mean Squared Error: ", np.sqrt(mse))

fig = plt.figure(figsize=[10, 8])
ax = plt.subplot(111)
ax.plot(y_test.index, lasso_pred, label='Predicted')
ax.plot(y_test, label='Test')
ax.legend()
plt.show()

# Evaluation
confidence_lr = lr.score(X_test, y_test)
confidence_poly2 = poly2.score(X_test, y_test)
confidence_poly3 = poly3.score(X_test, y_test)
confidence_knn = knn.score(X_test, y_test)
confidence_lasso = lasso.score(X_test, y_test)

print("Results: ", confidence_lr, confidence_poly2, confidence_poly3,
      confidence_knn, confidence_lasso)

# all on one graph

fig = plt.figure(figsize=[10, 8])
ax = plt.subplot(111)
ax.plot(y_test.index, lasso_pred, label='Lasso', color='red')
ax.plot(y_test.index, knn_pred, label='KNN', color='blue')
ax.plot(y_test.index, poly2_pred, label='Poly2', color='green')
ax.plot(y_test.index, poly3_pred, label='Poly3', color='orange')
ax.plot(y_test.index, y_pred_lr, label='LR', color='cyan')
ax.plot(y_test, label='Test', color='magenta')
Example #42
# Linear Regression
lin3 = LR()
#lin3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values)
lin3.fit(X_train, y_train)
print("Train: ", lin3.score(X_train, y_train))
print("Test: ", lin3.score(X_test, y_test))
print("Intercept: ", lin3.intercept_)
for k, v in enumerate(lin3.coef_[0]):
	print(threeYrXcol[k], ": ", v)

# KNeighborsRegressor
kn3 = KNReg(weights='uniform')
#kn3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values)
kn3.fit(X_train, y_train)
print("Train: ", kn3.score(X_train, y_train))
print("Test: ", kn3.score(X_test, y_test))
# print(kn3.score(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values))

# RadiusNeighborsRegressor
rn3 = RNReg(radius=7.0)
#rn3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values)
rn3.fit(X_train, y_train)
print("Train: ", rn3.score(X_train, y_train))
print("Test: ", rn3.score(X_test, y_test))
print(rn3.score(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values))

# Test 2010/11/12 stats and 2013 projections against 2013 actuals
y = 2013
y3 = [y - 1, y - 2, y - 3]
tms_include = np.intersect1d(df[df.Year == y3[0]].Team.values, df[df.Year == y3[2]].Team.values)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
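
# Scaling matters for KNN because it is distance-based: features with large
# ranges would otherwise dominate the neighbor search. Note the scaler is fit
# on the training split only and merely applied to the test split.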

# Initialize a K-nearest-neighbors regressor configured for uniform-weighted (averaging) regression
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, Y_train)
uni_knr_Y_predict = uni_knr.predict(X_test)

# Initialize a K-nearest-neighbors regressor configured for distance-weighted regression
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, Y_train)
dis_knr_Y_predict = dis_knr.predict(X_test)

print('The R-squared value of uniform-weighted KNeighborsRegression is:',
      uni_knr.score(X_test, Y_test))
# Evaluate with the mean_squared_error module and print the result
print("The mean squared error of uniform-weighted KNeighborsRegression is:",
      mean_squared_error(Y_test, uni_knr_Y_predict))
# Evaluate with the mean_absolute_error module and print the result
print("The mean absolute error of uniform-weighted KNeighborsRegression is:",
      mean_absolute_error(Y_test, uni_knr_Y_predict))

print('The R-squared value of distance-weighted KNeighborsRegression is:',
      dis_knr.score(X_test, Y_test))
# Evaluate with the mean_squared_error module and print the result
print("The mean squared error of distance-weighted KNeighborsRegression is:",
      mean_squared_error(Y_test, dis_knr_Y_predict))
# Evaluate with the mean_absolute_error module and print the result
print("The mean absolute error of distance-weighted KNeighborsRegression is:",
      mean_absolute_error(Y_test, dis_knr_Y_predict))
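
# A minimal toy sketch (independent of the data above) of how the two
# weighting schemes differ; X_toy and y_toy are made-up values:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
X_toy = np.array([[0.0], [1.0], [3.0]])
y_toy = np.array([0.0, 1.0, 3.0])
uni = KNeighborsRegressor(n_neighbors=2, weights='uniform').fit(X_toy, y_toy)
dis = KNeighborsRegressor(n_neighbors=2, weights='distance').fit(X_toy, y_toy)
print(uni.predict([[0.25]]))  # plain average of the two nearest targets: (0 + 1) / 2 = 0.5
print(dis.predict([[0.25]]))  # inverse-distance weighting lets the closer neighbor dominate: 0.25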
Beispiel #44
# Import KNeighborsRegressor (the K-nearest-neighbors regressor) from sklearn.neighbors.
from sklearn.neighbors import KNeighborsRegressor

# Initialize a K-nearest-neighbors regressor configured for uniform-weighted (averaging) regression: weights='uniform'.
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

# Initialize a K-nearest-neighbors regressor configured for distance-weighted regression: weights='distance'.
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


# Evaluate the uniform-weighted KNN model on the test set with R-squared, MSE, and MAE.
print('R-squared value of uniform-weighted KNeighborsRegression:', uni_knr.score(X_test, y_test))
print('The mean squared error of uniform-weighted KNeighborsRegression:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(uni_knr_y_predict)))
print('The mean absolute error of uniform-weighted KNeighborsRegression:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(uni_knr_y_predict)))


# Evaluate the distance-weighted KNN model on the test set with R-squared, MSE, and MAE.
print('R-squared value of distance-weighted KNeighborsRegression:', dis_knr.score(X_test, y_test))
print('The mean squared error of distance-weighted KNeighborsRegression:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict)))
print('The mean absolute error of distance-weighted KNeighborsRegression:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict)))
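
# Note: ss_y is a StandardScaler fitted upstream on the training targets; the
# inverse_transform calls above report MSE/MAE on the original target scale.
# A sketch of that (assumed) setup:
# ss_y = StandardScaler()
# y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
# y_test = ss_y.transform(y_test.reshape(-1, 1))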
Beispiel #45
# Setup
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsRegressor

my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']

# Training
my_model = KNeighborsRegressor()
my_model.fit(X, y)

# Prepare data for visualization
tmp = pd.DataFrame(
    {'speed': np.linspace(min(my_data.speed), max(my_data.speed), 100)})
tmp['model'] = my_model.predict(tmp)

pd.concat([my_data, tmp]).plot(x='speed', style=['o', '-'])

y_ = my_model.predict(X)

((y - y_)**2).mean()**0.5
#> 13.087184571174962 # RMSE

my_model.score(X, y)
#> 0.7368165812204317 # coefficient of determination (definition 1)

np.corrcoef(y, y_)[0, 1]**2
#> 0.7380949412509705 # coefficient of determination (definition 6)
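
# For reference, sklearn.metrics.r2_score reproduces the first definition (it
# is exactly what score() returns); the squared correlation is a different
# definition and coincides with it only for a least-squares fit with intercept.
from sklearn.metrics import r2_score
r2_score(y, y_)
#> 0.7368165812204317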

# find the most correlated variables
cols_x = ['number_of_open_credit_lines_and_loans', 'number_of_dependents', 'number_real_estate_loans_or_lines']

train_not_null = train[cols_x][rows_train]
test_not_null = test[cols_x][rows_test]


lr = LinearRegression()
lr.fit(train_not_null, train['monthly_income'][rows_train])
print(lr.score(test_not_null, test['monthly_income'][rows_test]))
# score: 0.0478125755667
knn = KNeighborsRegressor(n_neighbors=120)
knn.fit(train_not_null, train['monthly_income'][rows_train])
print(knn.score(test_not_null, test['monthly_income'][rows_test]))
# score: 0.00680687486842

# use linear regression model as imputer


# In[10]:

train[rows_train].corr().iloc[:, 5]


# In[11]:

train_null = train[cols_x][~rows_train]
test_null = test[cols_x][~rows_test]
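
# A sketch of the imputation step hinted at above: fill the missing
# monthly_income values with predictions from the model fitted on the
# non-null rows (hypothetical usage of the masks defined earlier).
train.loc[~rows_train, 'monthly_income'] = lr.predict(train_null)
test.loc[~rows_test, 'monthly_income'] = lr.predict(test_null)
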
# k-Nearest Neighbor Regression
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsRegressor

# load the datasets
dataset = datasets.load_diabetes()

# fit a model to the data
model = KNeighborsRegressor()
model.fit(dataset.data, dataset.target)
print(model)

# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

# summarize the fit of the model
mse = np.mean((predicted - expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))
Beispiel #48
def scicrossvalidate(data, k=3):
    # Split inputs/results 60/40, then score a KNN regressor that weights
    # neighbors with the user-supplied scigaussian function.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        [d['input'] for d in data], [d['result'] for d in data],
        test_size=0.4, random_state=0)
    neigh = KNeighborsRegressor(n_neighbors=k, weights=scigaussian)
    neigh.fit(X_train, y_train)
    return neigh.score(X_test, y_test)
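
# scigaussian is defined elsewhere in this project; KNeighborsRegressor
# accepts any callable that maps an array of neighbor distances to an array
# of weights, so a Gaussian kernel could look like this sketch (sigma is a
# hypothetical bandwidth choice):
import numpy as np

def scigaussian(distances, sigma=1.0):
    return np.exp(-(distances ** 2) / (2 * sigma ** 2))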
Beispiel #49
ax4.scatter(range(len(y_test)),y_test,label='data')
ax4.plot(range(len(y_test)),y_pred_gb,color='black',label='GB model')
ax4.legend()

f2,(ax5,ax6) = plt.subplots(1,2,figsize=(30,10))
# quadratic (poly 2) model
ax5.scatter(range(len(y_test)),y_test,label='data')
ax5.plot(range(len(y_test)),y_pred_qd,color='blue',label='Quadratic model')
ax5.legend()

# KNN
ax6.scatter(range(len(y_test)),y_test,label='data')
ax6.plot(range(len(y_test)),y_pred_knn,color='black',label='KNN model')
ax6.legend()

print("Accuracy of Linear Regerssion Model:",clf_lr.score(x_test,y_test))
print("Accuracy of SVM-RBF Model:",clf_svr.score(x_test,y_test))
print("Accuracy of Random Forest Model:",clf_rf.score(x_test,y_test))
print("Accuracy of Gradient Boosting Model:",clf_gb.score(x_test,y_test))
print("Accuracy of quadratic model:",clfpoly2.score(x_test,y_test))
print("Accuracy of knn Model:",clfknn.score(x_test,y_test))

Beispiel #50
# distance metrics: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html#sklearn.neighbors.DistanceMetric
# KNN http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html
print "Regression test..."
games_train, games_test, regress_train, regress_test = train_test_split(
        games_matrix, regress_vec, test_size=TRAIN_SPLIT)
n = KNeighborsRegressor(
        n_neighbors=NEIGHBORS,         # saw highest accuracy with 20
        algorithm='kd_tree',
        weights='uniform',      # saw highest accuracy with uniform
        #weights='distance',
        #metric='minkowski', p=2,
        n_jobs = 3, # number of CPU cores (-1 for all)
        )
n.fit(games_train, regress_train)
print "Accuracy training data:", n.score(games_train, regress_train)
print "Accuracy test data:", n.score(games_test, regress_test)
print
if not RUN_TOURNAMENT:
    print "Some predictions:"
    for i in range(0,10):
        print "  ", n.predict([games_test[i]]), regress_test[i]
    print

print "Classification test..."
games_train, games_test, class_train, class_test = train_test_split(
        games_matrix, class_vec, test_size=TRAIN_SPLIT)
n = KNeighborsClassifier(
        n_neighbors=NEIGHBORS,
        algorithm='kd_tree',
        weights='uniform',      # saw highest accuracy with uniform
Beispiel #51
class DataHandler(object):
    """
    Handle the diabetes data set
    """
    def __init__(self):
        self.times = []
        self.labels = None
        self.features = None
        self.data_list = parse_all()
        self.user = self.data_list[0]['data']
        self.activities = []
        self.dates = []
        self.glucose = []
        self.k_fold = []
        self.classifier = KNeighborsRegressor()

    def parse_data(self):
        """
        Parse the data_list into our features
        """
        train_data = []
        test_data = []

        for user in self.data_list:
            for data in user['data']:
# I want to compress the whole dataset into a 24-hour timeframe
                time = data['date'].replace(year=2000, month=1, day=1)

                # Save the features
                self.dates.append(time)
                self.activities.append(data['activity'])
                self.glucose.append(data['glucose'])
                train_data.append([int(time.strftime("%H")), int(time.strftime("%M")), data['activity']])
                test_data.append([float(data['glucose'])])

        self.features = np.array(train_data)
        self.labels = np.array(test_data)

    def load_k_fold(self):
        """
        Load the test cases into K folds
        """
        self.k_fold = KFold(len(self.data_list), n_folds=int(len(self.data_list) * 0.2))

    def fit(self):
        """
        Train the classifier with the train data
        """
        self.classifier.fit(self.features, self.labels)

    def print_score(self):
        """
        calculates the score on the given kfold test cases
        """
        for train, test in self.k_fold:
            print(self.classifier.score(self.features[test], self.labels[test]))

    def predict(self, date, activity):
        """
        Predict what will be the glucose rate on a given activity and data
        """
        return self.classifier.predict([[int(date.strftime('%H')), int(date.strftime('%M')), activity]])

    def plot(self):
        """
        Matplotlib it!
        """
        plt.plot(self.dates, self.glucose, 'go')
        plt.plot(self.dates, self.activities, 'ro')
        plt.show()
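
# A hypothetical driver for the class above (parse_all and the underlying
# data files are assumed to be available):
if __name__ == '__main__':
    from datetime import datetime
    handler = DataHandler()
    handler.parse_data()
    handler.fit()
    print(handler.predict(datetime(2000, 1, 1, 8, 30), activity=2))
    handler.plot()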