Example no. 1
def fit_KNeighbors(features_train, labels_train, features_pred, n_neighbors=5):
    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    model.fit(features_train, labels_train)
    labels_pred = model.predict(features_pred)
    score = model.score(features_train, labels_train)  # R^2 measured on the training data
    print("KNeighbors - coefficient of determination R^2 of the prediction:", score)
    return labels_pred
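A minimal usage sketch for the helper above; the synthetic data and the import line are assumptions added for illustration, not part of the original snippet:

# Hypothetical usage of fit_KNeighbors; assumes NumPy and scikit-learn are installed.
import numpy as np
from sklearn.neighbors import KNeighborsRegressor  # also needed by fit_KNeighbors itself

rng = np.random.RandomState(0)
features_train = rng.rand(100, 2)          # 100 samples, 2 features
labels_train = features_train.sum(axis=1)  # toy target
features_pred = rng.rand(5, 2)             # points to predict

labels_pred = fit_KNeighbors(features_train, labels_train, features_pred, n_neighbors=3)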
Example no. 2
def fill_income(df):

    income_imputer = KNeighborsRegressor(n_neighbors=2)
    df_w_monthly_income = df[df.monthly_income.notnull()].copy()
    df_w_null_monthly_income = df[df.monthly_income.isnull()].copy()
    cols = ["number_real_estate_loans_or_lines", "number_of_open_credit_lines_and_loans"]
    income_imputer.fit(df_w_monthly_income[cols], df_w_monthly_income.monthly_income)
    new_values = income_imputer.predict(df_w_null_monthly_income[cols])
    df_w_null_monthly_income.loc[:, "monthly_income"] = new_values
    df2 = pd.concat([df_w_monthly_income, df_w_null_monthly_income])  # DataFrame.append was removed in pandas 2.0
    return df2
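scikit-learn also ships a ready-made KNN imputer; a hedged sketch of the same fill with sklearn.impute.KNNImputer (note it measures distances over all listed columns, which differs slightly from regressing monthly_income on the two loan counts):

import pandas as pd
from sklearn.impute import KNNImputer

cols = ["number_real_estate_loans_or_lines",
        "number_of_open_credit_lines_and_loans",
        "monthly_income"]
imputer = KNNImputer(n_neighbors=2)
df_filled = df.copy()
df_filled[cols] = imputer.fit_transform(df[cols])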
Example no. 3
def knnPredictor(df):

    dataTrainX, dataTrainY, dataTestX, dataTestY = sample(df)
    corelationCoefficiantDictionary = {}
    corelationCoefficiantArray = []

    for k in range(1, 200, 1):
        knnModel = KNeighborsRegressor(n_neighbors=k)

        knnModel.fit(dataTrainX, dataTrainY)

        knnpredicted = knnModel.predict(dataTestX)
        corelationCoefficient = pearsonr(dataTestY, knnpredicted)
        corelationCoefficiantDictionary[k] = corelationCoefficient[0]
        corelationCoefficiantArray.append(corelationCoefficient[0])

    # plotter.plot(corelationCoefficiantArray)
    bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)

    knnModelBest = KNeighborsRegressor(n_neighbors=bestK)
    knnModelBest.fit(dataTrainX, dataTrainY)
    print("K =", bestK)
    print("Correlation coeff:", corelationCoefficiantDictionary[bestK])

    knnpredictedBest = knnModelBest.predict(dataTestX)

    fig, ax = plotter.subplots()
    corelationCoefficient = pearsonr(dataTestY, knnpredictedBest)
    print(corelationCoefficient[0])
    ax.set_ylabel('Predicted KNN Weekly')
    ax.scatter(dataTestY, knnpredictedBest)
    ax.set_xlabel('Measured')
    plotter.show()
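Note that the loop above picks k by maximizing the correlation on the same test set it then evaluates, which can overfit the choice of k. A cross-validated alternative is sketched below (a sketch under assumptions, reusing the dataTrainX/dataTrainY names):

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

search = GridSearchCV(KNeighborsRegressor(),
                      param_grid={"n_neighbors": list(range(1, 200))},
                      scoring="r2",
                      cv=5)
search.fit(dataTrainX, dataTrainY)
print("best k:", search.best_params_["n_neighbors"])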
Example no. 4
def predictKnn(data, priceToPredict):
    corelationCoefficiantDictionary = {}
    corelationCoefficiantArray = []
    openingPriceTrain, openingPriceTest, closingPriceTrain, closingPriceTest = \
        data["openingPriceTrain"], data["openingPriceTest"], data["closingPriceTrain"], data["closingPriceTest"]

    for k in range(1, 100):
        neigh = KNeighborsRegressor(n_neighbors=k)
        #n = 7 best fits
        neigh.fit(openingPriceTrain, closingPriceTrain)

        closingPriceTestArray = np.reshape(closingPriceTest,-1)
        knnpr = neigh.predict(openingPriceTest)
        predictedArray = np.reshape(knnpr,-1)

        corelationCoefficient = pearsonr(closingPriceTestArray,predictedArray)
        corelationCoefficiantDictionary[k] = corelationCoefficient[0]
        corelationCoefficiantArray.append(corelationCoefficient[0])
    plotter.plot(corelationCoefficiantArray)
    # plotter.show()

    bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)
    neighBest = KNeighborsRegressor(n_neighbors=bestK)
    neighBest.fit(openingPriceTrain, closingPriceTrain)
    openingPriceToPredict = np.array([priceToPredict])
    print("K =", bestK)
    print(neighBest.predict(openingPriceToPredict))
Example no. 5
class PersonalityPredictor(object):
    def __init__(self, nn):
        self.nn = nn
        self.o_clf = KNeighborsRegressor(n_neighbors=self.nn)
        self.c_clf = KNeighborsRegressor(n_neighbors=self.nn)
        self.e_clf = KNeighborsRegressor(n_neighbors=self.nn)
        self.a_clf = KNeighborsRegressor(n_neighbors=self.nn)
        self.n_clf = KNeighborsRegressor(n_neighbors=self.nn)
        self.features = []
        self.o_value = []
        self.c_value = []
        self.e_value = []
        self.a_value = []
        self.n_value = []

    def register(self, data):
        for user_id in data:
            if 'f' in data[user_id]:
                self.o_value.append(self.make_float(data[user_id]['o']))
                self.c_value.append(self.make_float(data[user_id]['c']))
                self.e_value.append(self.make_float(data[user_id]['e']))
                self.a_value.append(self.make_float(data[user_id]['a']))
                self.n_value.append(self.make_float(data[user_id]['n']))
                self.features.append(data[user_id]['f'])
            else:
                break

    def make_float(self, value):
        if isinstance(value, str):  # Python 3: basestring no longer exists
            return float(re.sub("[^0-9.]", "", value))
        else:
            return float(value)

    def train(self):
        self.features = normalize(self.features)
        self.o_clf.fit(self.features, self.o_value)
        self.c_clf.fit(self.features, self.c_value)
        self.e_clf.fit(self.features, self.e_value)
        self.a_clf.fit(self.features, self.a_value)
        self.n_clf.fit(self.features, self.n_value)

    def predict(self, features):
        o = self.o_clf.predict([features]).tolist()[0]
        c = self.c_clf.predict([features]).tolist()[0]
        e = self.e_clf.predict([features]).tolist()[0]
        a = self.a_clf.predict([features]).tolist()[0]
        n = self.n_clf.predict([features]).tolist()[0]
        return {
                'o': o,
                'c': c,
                'e': e,
                'a': a,
                'n': n
                }
Example no. 6
 def add_geo(self):
     model = KNeighborsRegressor(n_neighbors=1)
     x = self.zip[['Lat', 'Long']].values
     derived = ['TaxReturnsFiled', 'EstimatedPopulation', 'EstWages', 'DependencyRatio']
     y = self.zip[derived].values
     model.fit(x, y)
     train_feats = model.predict(self.train[['latitude', 'longitude']].values)
     test_feats = model.predict(self.test[['latitude', 'longitude']].values)
     tr = pandas.DataFrame(train_feats, columns=derived)
     te = pandas.DataFrame(test_feats, columns=derived)
     self.merge(tr, self.train)
     self.merge(te, self.test)
Example no. 7
def predict_missing_data_for_column(features, missing_column, params, num_boost_round, test_size, train_file_name, test_file_name):
    print("## Train a XGBoost model for filling missing column : " + str(missing_column))
    
    X_missing_data_train = train[train[missing_column].isnull()]
    X_missing_data_test = test[test[missing_column].isnull()]
    
    X_data_train = train[np.isfinite(train[missing_column])]
    X_data_test = test[np.isfinite(test[missing_column])]
    
    X_data = pd.concat([X_data_train,X_data_test])
    X_data = X_data.iloc[np.random.permutation(len(X_data))]
        
    #print(X_missing_data[missing_column])
    #print(X_data[missing_column])
    
    y_data = X_data[missing_column]

    # -- Replacing with KNN --
    # dtrain = xgb.DMatrix(X_train[features], y_train)
    # dvalid = xgb.DMatrix(X_valid[features], y_valid)
    #
    # watchlist = [(dtrain, 'train'),(dvalid, 'eval')]
    # fgbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=50, verbose_eval=True)
    #
    # print("## Predicting missing data for column : " + str(missing_column))
    #
    # if not X_missing_data_train.empty:
    #     fpreds = fgbm.predict(xgb.DMatrix(X_missing_data_train[features]),ntree_limit=fgbm.best_ntree_limit)
    #     train.loc[train[missing_column].isnull(),missing_column] = fpreds
    #
    # if not X_missing_data_test.empty:
    #     fpreds = fgbm.predict(xgb.DMatrix(X_missing_data_test[features]),ntree_limit=fgbm.best_ntree_limit)
    #     test.loc[test[missing_column].isnull(),missing_column] = fpreds
    # --------------------------

    ngbr = KNeighborsRegressor()
    ngbr.fit(X_data[features], y_data)

    # print("## Predicting missing data for column : " + str(missing_column))
    if not X_missing_data_train.empty:
        fpreds = ngbr.predict(X_missing_data_train[features])
        train.loc[train[missing_column].isnull(),missing_column] = fpreds
    if not X_missing_data_test.empty:
        fpreds = ngbr.predict(X_missing_data_test[features])
        test.loc[test[missing_column].isnull(),missing_column] = fpreds
    
    # train.to_csv(train_file_name, index=False)
    # test.to_csv(test_file_name, index=False)
    
    print("##########################################################################################################################")
Example no. 8
def neighbors_model(x_train, y_train, x_test, x_valid, cache_name, use_cache=False):
    model = KNeighborsRegressor(n_neighbors=25)
    if use_cache:
        fhand = open(cache_name, 'rb')  # pickle requires binary mode
        data_dict = pickle.load(fhand)
        return data_dict['test_pred'], data_dict['valid_pred']
    np.random.seed(seed=123)
    model.fit(x_train, np.log(y_train))
    test_pred = np.exp(model.predict(x_test))
    valid_pred = np.exp(model.predict(x_valid))
    data_dict = {'test_pred': test_pred, 'valid_pred': valid_pred}
    fhand = open(cache_name, 'wb')  # pickle requires binary mode
    pickle.dump(data_dict, fhand)
    fhand.close()
    return test_pred, valid_pred
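The cache above depends on pickling in binary mode; joblib is another common choice for caching NumPy-heavy dictionaries. A sketch, assuming joblib is installed (it is a scikit-learn dependency):

import joblib

joblib.dump(data_dict, cache_name)   # write the cached predictions
data_dict = joblib.load(cache_name)  # read them back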
Example no. 9
def kNN(X_train, y_train, X_test, y_test, uselog=False):
  '''
  Scale the features, fit a k-NN regressor on the training set, and predict X_test.

  :param X_train: training features
  :param y_train: training targets
  :param X_test: test features
  :param y_test: test targets (unused here)
  :return: predictions for X_test
  '''

  scaler = StandardScaler()
  print(X_train.shape)
  print(X_test.shape)

  X = scaler.fit_transform(X_train)
  test = scaler.transform(X_test)

  clf = KNeighborsRegressor(n_neighbors=550)

  clf.fit(X, y_train)

  result = clf.predict(test)

  if uselog:
    result = [math.log(1 + x) for x in result]  # a list, not a lazy map object

  return result
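Chaining the scaler and the regressor keeps the scaling fit to the training data automatically; a minimal Pipeline sketch equivalent to the function above:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

model = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=550))
model.fit(X_train, y_train)
result = model.predict(X_test)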
Example no. 10
    def transform(self, X, y=None):
        """
        :param X: multidimensional numpy array like.
        """
        rows, features = X.shape

        mask = np.isnan(X).any(axis=1)  # rows containing at least one NaN
        criteria_for_bad = np.where(mask)[0]
        criteria_for_good = np.where(~mask)[0]

        X_bad = X[criteria_for_bad]
        X_good = X[criteria_for_good]

        knn = KNeighborsRegressor(n_neighbors=self.k)

        for idx, x_bad in zip(criteria_for_bad.tolist(), X_bad):
            missing = np.isnan(x_bad)
            bad_dim = np.where(missing)[0]
            good_dim = np.where(missing == False)[0]

            for d in bad_dim:
                x = X_good[:, good_dim]
                y = X_good[:, d]
                knn.fit(x, y)

                X[idx, d] = knn.predict(x_bad[good_dim].reshape(1, -1))[0]  # predict() expects a 2D array

        return X
Example no. 11
	def __init__(self,dataFrame):
		self.dataFrameKNN = {}
		self.KNNWeightage = {'Avg-High Ratio':100,'Avg-Low Ratio':100,'Deliverable Qty':300,'Turnover':100,'Growth':150,'Trend':100,'Output':100}
		self.valid = True
		self.KNNModelHash = {}
		self.dataFrameKNN = pd.DataFrame()
		self.dataFrameKNN['Avg-High Ratio'] = dataFrame['High Price'][1:] - dataFrame['Average Price'][1:]
		self.dataFrameKNN['Avg-Low Ratio'] = dataFrame['Average Price'][1:] - dataFrame['Low Price'][1:]
		self.dataFrameKNN['Deliverable Qty'] = dataFrame['Deliverable Qty'][1:]
		self.dataFrameKNN['Turnover'] = dataFrame['Turnover in Lacs'][1:]
		self.dataFrameKNN['Growth'] = dataFrame['Close Price'][1:]-dataFrame['Prev Close'][1:]
		self.dataFrameKNN['Trend'] = dataFrame['Turnover in Lacs'][1:]
		self.dataFrameKNN['Output'] = dataFrame['High Price'][1:]-dataFrame['Prev Close'][1:]
		self.KNNModelHash['mean'] = self.dataFrameKNN['Output'].mean()
		self.KNNModelHash['std'] = self.dataFrameKNN['Output'].std()
		for key in self.dataFrameKNN:
			self.normalizeKNNModel(key)
		#trainData has the data to be trained, but the last data is the testData
		trainData =	self.dataFrameKNN[['Avg-High Ratio','Avg-Low Ratio','Deliverable Qty','Growth']][:-1].values
		testData = self.dataFrameKNN[['Avg-High Ratio','Avg-Low Ratio','Deliverable Qty','Growth']][-1:].values
		#trainOutput contains the output corresponding to train Data but the first one is garbage
		trainOutput = self.dataFrameKNN['Output'][1:].values
		KNNModel = KNeighborsRegressor(n_neighbors=3,weights = 'distance')
		KNNModel.fit(trainData[100:400],trainOutput[100:400])
		prediction = KNNModel.predict(trainData[400:450])
		weightage = self.KNNWeightage['Output']
		for i in range(50):
			prediction[i] = ((prediction[i]*self.KNNModelHash['std'])+self.KNNModelHash['mean'])/weightage
			trainOutput[400+i] = ((trainOutput[400+i]*self.KNNModelHash['std'])+self.KNNModelHash['mean'])/weightage
			print("%-40s %-40s " % (prediction[i], trainOutput[400+i]))
Example no. 12
def calc_linear_regression(reg_training_path):
    dataset = read_reg_train_data(reg_training_path)
    rmse = 0
    n_folds = 5
    folds = KFold(n_splits=n_folds, shuffle=False)  # sklearn.model_selection.KFold

    fold = 0
    for train_indices, test_indices in folds.split(dataset):
        fold += 1
        training_set = [dataset[i] for i in train_indices]
        test_set = [dataset[i] for i in test_indices]
        training_dataframe = get_data_frame(training_set)
        test_dataframe = get_data_frame(test_set)
        column_names = ['cf_item', 'cf_user', 'svd', 'content_item', 'actual_rating']
        training_dataframe.columns = column_names
        test_dataframe.columns = column_names

        actual_rating_training_column = training_dataframe['actual_rating']
        #actual_rating_test_column = test_dataframe['actual_rating']

        training_dataframe = training_dataframe.drop('actual_rating', axis=1)
        test_dataframe = test_dataframe.drop('actual_rating', axis=1)

        neigh = KNeighborsRegressor(n_neighbors=10)
        #print('Initialized k nearest neighbors regressor with k =', i)
        neigh.fit(training_dataframe, actual_rating_training_column)
        #print('Fit data models')
        predict_set = neigh.predict(test_dataframe)
        print(predict_set)
        rmse += mean_squared_error([rec[4] for rec in test_set], predict_set) ** 0.5
        print("Fold (%d) finished with accumulated RMSE of (%f) (%s)" % (fold, rmse, time.strftime('%y_%m_%d_%H_%M_%S')))
    return rmse / float(n_folds)
Example no. 13
 def smooth(self, X, y):
   # KNN smoothing: replace each y by the average over the 20 nearest neighbours in X
   nbrs = KNeighborsRegressor(n_neighbors=20)
   X = X.reshape(-1, 1)
   nbrs.fit(X, y)
   proba = nbrs.predict(X)
   return proba
Example no. 14
def knn_model(train, y_train, test):
    model = KNeighborsRegressor(n_neighbors = 10, weights='distance', n_jobs=-1)
    model.fit(train, y_train)
    test_probs = model.predict(test)
    indices = test_probs < 0
    test_probs[indices] = 0
    return test_probs
Example no. 15
def run_network(mdl=None, data=None):
    global_start_time = time.time()
    sequence_length = 10

    if data is None:
        print('Loading data... ')
        X_train, y_train, X_test, y_test = train_test_traffic_data(15773, sequence_length)
    else:
        X_train, y_train, X_test, y_test = data

    print('\nData Loaded...\n')

    if mdl is None:
        mdl = KNeighborsRegressor(5, weights='distance')

    try:
        mdl.fit(X_train, y_train)
        predicted_traffic = mdl.predict(X_test)
    except KeyboardInterrupt:
        print('Training duration (s) : ', time.time() - global_start_time)
        return mdl, y_test, 0

    print('Training duration (s) : ', time.time() - global_start_time)

    return mdl, y_test, predicted_traffic
Example no. 16
    def fit(self, start_date, end_date):

        for ticker in self.tickers:
            self.stocks[ticker] = Stock(ticker)

        params_knn = [{
            'n_neighbors': [2, 5, 10, 15]}]
        params = ParameterGrid(params_knn)

        # Find the split for training and CV
        mid_date = train_test_split(start_date, end_date)
        for ticker, stock in self.stocks.items():

            # pdb.set_trace()
            X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
            X_cv, y_cv = stock.get_data(mid_date, end_date)

            lowest_mse = np.inf
            for i, param in enumerate(params):
                knn = KNeighborsRegressor(**param)
                # ada = AdaBoostRegressor(knn)
                knn.fit(X_train.values, y_train.values)
                mse = mean_squared_error(y_cv, knn.predict(X_cv.values))
                if mse <= lowest_mse:
                    lowest_mse = mse  # keep the best model, not just the last one tried
                    self.models[ticker] = knn

        return self
Example no. 17
def opt_ex1():
    from sklearn.neighbors import KNeighborsRegressor
    knn = KNeighborsRegressor(12, weights='distance')
    #Train the KNN
    knn.fit(mag_train, z_train)
    #Test it!
    z_fit_train = knn.predict(mag_train)
    z_fit = knn.predict(mag_test)
    #Compute rms in the training set and test set
    rms_train = np.sqrt(np.mean((z_fit_train - z_train) ** 2))
    rms_test = np.sqrt(np.mean((z_fit - z_test) ** 2))
    plt.scatter(z_test,z_fit, color='k', s=0.1)
    plt.plot([-0.1, 6], [-0.1, 6], ':k')
    plt.text(0.04, 5, "rms = %.3f" % (rms_test))
    plt.xlabel('$z_{true}$')
    plt.ylabel('$z_{fit}$')
Example no. 18
def main(featureFile, outputfolder):
    with open(featureFile, 'r') as csvfile:
        my_data = pd.read_csv(csvfile, delimiter="\t", low_memory=False)

    random_indices = permutation(my_data.index)
    # hold out a third of the rows for the test set
    test_cutoff = math.floor(len(my_data)/3)
    test = my_data.loc[random_indices[:test_cutoff]]

    # Generate the training set with the rest of the data.
    train = my_data.loc[random_indices[test_cutoff:]]

    x_columns = ["Row", "Student ID", "Problem Hierarchy", "Problem Name", "Problem View", "Step Name",
            "KC(Default)", "Opportunity (Default)"]
    # y columns show the predicted feature, in this case, the correct first attempt
    y_column = ["Correct First Attempt"]

    # Look at the Ten closest neighbors, to offset potential noise in the data
    knn = KNeighborsRegressor(n_neighbors=10)
    knn.fit(train[x_columns], train[y_column])

    # Make point predictions on the test set using the fit model.
    predictions = knn.predict(test[x_columns])
    actual = test[y_column]
    result = test[['Anon Student Id','Correct First Attempt']]
    result.to_csv(outputfolder, sep='\t')

    # Compute the root mean squared error of our predictions.
    rmse = math.sqrt((((predictions - actual) ** 2).sum()) / len(predictions))
    print('RMSE=')
    print(rmse)
Example no. 19
def apply_knn():
    regr = KNeighborsRegressor()
    regr.fit(Xtr, Ytr)

    pred = regr.predict(Xte)
    temp = mean_squared_error(Yte, pred)
    return pred, temp
Example no. 20
def Round2(X, y):
    # Set parameters
    min_score = {}
    for neigh in [5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:

        model = KNeighborsRegressor(n_neighbors=neigh)
        n = len(y)

        # Perform 5-fold cross validation
        scores = []
        kf = KFold(n_splits=5, shuffle=True)  # sklearn.model_selection.KFold

        # Calculate mean absolute deviation for train/test for each fold
        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            # score = model.score(X_test, y_test)
            scores.append(rmse)
        if len(min_score) == 0:
            min_score['neighbor'] = neigh
            min_score['scores'] = scores
        else:
            if np.mean(scores) < np.mean(min_score['scores']):
                min_score['neighbor'] = neigh
                min_score['scores'] = scores
        print("Neighbors:", neigh)
        print(scores)
        print(np.mean(scores))
    return min_score
Example no. 21
def run_kNeighbors(distances, loadings, test_vars,
                   weightings=('uniform',), k_list=(3,)):
    """
    Run K-nearest neighbors using precomputed distances to create an ontological mapping
    
    Args:
        distances: square distance matrix to pass to KNeighborsRegressors
        loadings: loading matrix for training
        test_vars: variable to reconstruct
        weightings: (optional) list of weightings to pass to KNeighbors
        k_list: list of k values to pass to KNeighbors as n_neighbors
    """
    train_distances = distances.loc[loadings.index, loadings.index]
    test_distances = distances.loc[test_vars, loadings.index]
    to_return = pd.DataFrame()
    for weighting in weightings:
        for k in k_list:
            clf = KNeighborsRegressor(metric='precomputed', n_neighbors=k, weights=weighting)
            clf.fit(train_distances, loadings)
            out = clf.predict(test_distances)
            out = pd.DataFrame(out, columns=loadings.columns)
            out['var'] = test_vars
            out['k'] = k
            out['weighting'] = weighting
            # add neighbors and distances
            neighbors = clf.kneighbors(test_distances)
            out['distances'] = tuple(neighbors[0])
            out['neighbors'] = tuple(test_distances.columns[neighbors[1]])
            to_return = pd.concat([to_return, out], sort=False)
    return to_return
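With metric='precomputed', fit expects a square train-to-train distance matrix and predict expects a test-to-train matrix. A sketch of preparing such inputs with SciPy; all data and names here are invented for illustration:

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
train_X = rng.rand(20, 4)           # hypothetical training features
test_X = rng.rand(5, 4)
y = rng.rand(20)

train_D = cdist(train_X, train_X)   # (n_train, n_train), passed to fit()
test_D = cdist(test_X, train_X)     # (n_test, n_train), passed to predict()

clf = KNeighborsRegressor(metric='precomputed', n_neighbors=3)
clf.fit(train_D, y)
print(clf.predict(test_D))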
Example no. 22
    def predictDayType(self, week, day):
        
        knn = KNeighborsRegressor(n_neighbors=5)
        knn.fit(self.rawData, self.dayType)

        X = np.array([[week, day]])  # predict() expects a 2D array
        predictions = knn.predict(X)
        return predictions
Example no. 23
def nnVerify_2(city_data,x,y):
    """ Using SKLearn's KNeighborsRegressor """
    X,Y = city_data.data, city_data.target
    clf = KNeighborsRegressor(n_neighbors=2)
    clf.fit(X,Y)
    y_pred = clf.predict(x)
    print("KNeighborsRegressor")
    print("Y pred(KNN) : ", y_pred)
Example no. 24
def main():
    # read the images
    image_from = io.imread(name_from) / 256
    image_to = io.imread(name_to) / 256

    # change to hsv domain (if requested)
    if args.use_hsv:
        image_from[:] = rgb2hsv(image_from)
        image_to[:] = rgb2hsv(image_to)

    # get shapes
    shape_from = image_from.shape
    shape_to = image_to.shape

    # flatten
    X_from = im2mat(image_from)
    X_to = im2mat(image_to)

    # number of pixels
    n_pixels_from = X_from.shape[0]
    n_pixels_to = X_to.shape[0]

    # subsample
    X_from_ss = X_from[np.random.randint(0, n_pixels_from-1, n_pixels),:]
    X_to_ss = X_to[np.random.randint(0, n_pixels_to-1, n_pixels),:]

    if save_col_distribution:
        import matplotlib.pyplot as plt
        import seaborn as sns
        sns.set_style('white')

        fig, axes = plt.subplots(nrows=2, figsize=(5, 10))
        for ax, X in zip(axes, [X_from_ss, X_to_ss]):
            ax.scatter(X[:,0], X[:,1], color=X)
            if args.use_hsv:
                ax.set_xlabel('hue')
                ax.set_ylabel('value')
            else:
                ax.set_xlabel('red')
                ax.set_ylabel('green')
        axes[0].set_title('distr. from')
        axes[1].set_title('distr. to')
        fig.tight_layout()
        fig.savefig('color_distributions.png')

    # optimal transportation
    ot_color = OptimalTransport(X_to_ss, X_from_ss, lam=lam,
                                    distance_metric=distance_metric)

    # model transfer
    transfer_model = KNeighborsRegressor(n_neighbors=n_neighbors)
    transfer_model.fit(X_to_ss, n_pixels * ot_color.P @ X_from_ss)
    X_transfered = transfer_model.predict(X_to)

    image_transferd = minmax(mat2im(X_transfered, shape_to))
    if args.use_hsv:
        image_transferd[:] = hsv2rgb(image_transferd)
    io.imsave(name_out, image_transferd)
Example no. 25
class Knn(ContextEngineBase):
    y_Test = np.empty([0])
    # Knn object
    knnRegressor = None

    def __init__(self, numInputs, outputClassifier, inputClassifiers, appFieldsDict):
        ContextEngineBase.__init__(self, numInputs, outputClassifier, inputClassifiers, appFieldsDict)
        # Passed parameters
        self.n_neighbors = appFieldsDict['n_neighbors']
        self.weights = appFieldsDict['weights']
        self.algorithm = appFieldsDict['algorithm']
        self.n_jobs = appFieldsDict['n_jobs']
        # Defining a Knn object with given parameters
        self.knnRegressor = KNeighborsRegressor(n_neighbors = self.n_neighbors, 
                                                weights = self.weights,
                                                algorithm = self.algorithm,
                                                n_jobs = self.n_jobs)

    #  Add a set of training observations, with newInputObsMatrix being a
    #  matrix of doubles whose column count must match the number of inputs
    #  and whose row count matches the number of observations,
    #  and newOutputVector being a column vector of doubles
    def addBatchObservations(self, newInputObsMatrix, newOutputVector):
        if(len(newInputObsMatrix.shape) == 2 and newInputObsMatrix.shape[1] == self.numInputs
            and newOutputVector.shape[0] == newInputObsMatrix.shape[0]):
            # print("All good!")
            newOutputVector = newOutputVector.ravel()
            i = 0
            for newInputVector in newInputObsMatrix:
                newOutputValue = newOutputVector[i]
                self.addSingleObservation(newInputVector, newOutputValue)
                i += 1
        else:
            print("Wrong dimensions!")

    #  Train the coefficients on the existing observation matrix if there are
    #  enough observations.
    def train(self):
        if (self.numObservations > 0):
            # print("Training started")
            self.knnRegressor.fit(self.observationMatrix, self.outputVector)
            return True
        else:
            print("Not enough observations to train!")
            return False

    #  Execute the trained matrix against the given input observation
    #  inputObsVector is a row vector of doubles
    def execute(self, inputObsVector):
        if(len(inputObsVector) == self.numInputs):
            # print("Begin execute")
            #x_Test = np.vstack((self.x_Test,inputObsVector))
            x_Test = np.reshape(inputObsVector,(1,self.numInputs))
            self.y_Test = self.knnRegressor.predict(x_Test)
            return self.y_Test[0]
        else:
            print("Wrong dimensions, fail to execute")
            return None
Example no. 26
def impute_KNN(df, var, features, k):
    var_imputer = KNeighborsRegressor(n_neighbors=k)
    df_full = df[df[var].notnull()]
    df_null = df[df[var].isnull()].copy()
    var_imputer.fit(df_full[features], df_full[var])
    df_null[var] = var_imputer.predict(df_null[features])
    df = pd.concat([df_full, df_null])  # DataFrame.append was removed in pandas 2.0
    return df
Example no. 27
class kNN():
    '''
        kNN regressor
        -------------
    '''

    def __init__(self,N_i,N_o,k=5,n=20):
        # note: N_o=1 assumed for now
        self.N_i = N_i
        self.n = n
        self.i = 0
        self.k = k
        self.X = zeros((self.n,N_i))
        self.y = zeros((self.n))
        self.h = KNeighborsRegressor(n_neighbors=k, weights='distance')
        self.c = 0
        #self.error_rate = 0

    def predict(self,x):
        '''
            Predict
            --------------
        '''

        if self.c < 1.:
            print("[Warning!] No training examples!")
            return 0.0
        elif self.c <= self.k:
            dist,ind = self.h.kneighbors(self.X[0:self.c],n_neighbors=1)
            i_max = argmax(ind)
            return self.y[i_max]

        return self.h.predict(x)#.reshape(1,-1))

#    def samples_X(self):
#        ''' return samples of the WEIGHTS '''
#        if self.c <= 0:
#            return self.X[0,:]
#        return self.X[0:self.c,:]

    def update(self, x, y):
        '''
            Update
            --------------
        '''
        self.X[self.i,:] = x
        self.y[self.i] = y

        #self.error_rate = (y - self.predict(x))**2

        self.i = (self.i + 1) % self.n

        if self.c < self.n:
            self.c = self.c + 1

        self.h.fit(self.X[0:self.c,:], self.y[0:self.c])
Example no. 28
File: data.py Project: dssg/drain
def nearest_neighbors_impute(df, coordinate_columns, data_columns, knr_params={}):
    from sklearn.neighbors import KNeighborsRegressor
    for column in data_columns:
        not_null = df[column].notnull()
        if (~not_null).sum() == 0:
            continue
        knr = KNeighborsRegressor(**knr_params)
        knr.fit(df.loc[not_null,coordinate_columns], df.loc[not_null,[column]])
        predicted = knr.predict(df.loc[~not_null,coordinate_columns])
        df.loc[ (~not_null),[column]] = predicted
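A hypothetical call to nearest_neighbors_impute on a toy frame, to show the in-place fill; the columns are invented:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'lat':    [0.0, 0.1, 0.2, 0.9],
    'lon':    [0.0, 0.1, 0.2, 0.9],
    'income': [10.0, 11.0, np.nan, 40.0],
})
nearest_neighbors_impute(df, ['lat', 'lon'], ['income'],
                         knr_params={'n_neighbors': 2})
print(df)  # the NaN becomes the mean income of the two nearest coordinates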
Example no. 29
 def addJKRegionLabels(self):
     data = list(zip(self.data['RA'], self.data['DEC']))  # zip() is lazy in Python 3
     randoms = list(zip(self.randoms['RA'], self.randoms['DEC']))
     
     finder = KMeans(n_clusters=self.config['n_jackknife'])
     self.data_jk_indices = finder.fit_predict(data)
     
     nbrs = KNeighborsRegressor(n_neighbors=1)
     nbrs.fit(data,self.data_jk_indices)
     self.random_jk_indices = nbrs.predict(randoms)
Example no. 30
def compute_mse(regressor, horizon):
    # get wind park and corresponding target. forecast is for the target
    # turbine
    park_id = NREL.park_id['tehachapi']
    windpark = NREL().get_windpark(park_id, 3, 2004, 2005)
    target = windpark.get_target()

    # use power mapping for pattern-label mapping. Feature window length
    # is 3 time steps and time horizon (forecast) is 3 time steps.
    feature_window = 3
    mapping = PowerMapping()
    X = mapping.get_features_park(windpark, feature_window, horizon)
    Y = mapping.get_labels_turbine(target, feature_window, horizon)

    # train roughly for the year 2004.
    train_to = int(math.floor(len(X) * 0.5))

    # test roughly for the year 2005.
    test_to = len(X)

    # train and test only every fifth pattern, for performance.
    train_step, test_step = 5, 5

    if(regressor == 'linear'):
        # fitting the pattern-label pairs
        reg = linear_model.LinearRegression()
        reg = reg.fit(X[0:train_to:train_step], Y[0:train_to:train_step])
        y_hat = reg.predict(X[train_to:test_to:test_step])
    elif(regressor == 'knn'):
        k_neighbors = 10
        reg = KNeighborsRegressor(n_neighbors=k_neighbors, weights='uniform')  # weights is keyword-only in recent scikit-learn
        # fitting the pattern-label pairs
        reg = reg.fit(X[0:train_to:train_step], Y[0:train_to:train_step])
        y_hat = reg.predict(X[train_to:test_to:test_step])
    else:
        raise Exception("No regressor set.")

    # naive is also known as the persistence model.
    naive_hat = zeros(len(y_hat), dtype = float32)
    for i in range(0, len(y_hat)):
        # naive label is the label as horizon time steps before.
        # we have to consider to use only the fifth label here, too.
        naive_hat[i] = Y[train_to + (i * test_step) - horizon]

    # computing the mean squared errors of Linear and naive prediction.
    mse_y_hat, mse_naive_hat = 0, 0
    for i in range(0, len(y_hat)):
        y = Y[train_to + (i * test_step)]
        mse_y_hat += (y_hat[i] - y) ** 2
        mse_naive_hat += (naive_hat[i] - y) ** 2

    mse_y_hat /= float(len(y_hat))
    mse_naive_hat /= float(len(y_hat))

    return mse_y_hat, mse_naive_hat
Example no. 31
# Importing the dataset
dataset = pd.read_csv('hazelnut.csv')

X = dataset.iloc[:, [0,1,3,4,6,7,8,9,10]]
y = dataset.iloc[:, 11].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)


from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=3)
# Fit the classifier to the data
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
y_pred
knn.score(X_test, y_test)

df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df

plot = sns.distplot(y_test, hist=False, color="r", label="Actual Value")
sns.distplot(y_pred, hist=False, color="b", label="Fitted Values" , ax=plot)

from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Example no. 32
split_one = dc_listings.iloc[0:1862]
split_two = dc_listings.iloc[1862:]

## 2. Holdout Validation ##

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

train_one = split_one.copy()  # copies avoid SettingWithCopyWarning on the assignments below
test_one = split_two.copy()
train_two = split_two.copy()
test_two = split_one.copy()
# First half
model = KNeighborsRegressor()
model.fit(train_one[["accommodates"]], train_one["price"])
test_one["predicted_price"] = model.predict(test_one[["accommodates"]])
iteration_one_rmse = mean_squared_error(test_one["price"],
                                        test_one["predicted_price"])**(1 / 2)

# Second half
model.fit(train_two[["accommodates"]], train_two["price"])
test_two["predicted_price"] = model.predict(test_two[["accommodates"]])
iteration_two_rmse = mean_squared_error(test_two["price"],
                                        test_two["predicted_price"])**(1 / 2)

avg_rmse = np.mean([iteration_two_rmse, iteration_one_rmse])

print(iteration_one_rmse, iteration_two_rmse, avg_rmse)

## 3. K-Fold Cross Validation ##
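The k-fold code itself did not survive here; a minimal sketch of the same average-RMSE idea with scikit-learn's KFold, assuming the dc_listings frame from above:

from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=5, shuffle=True, random_state=1)
model = KNeighborsRegressor()
mses = cross_val_score(model, dc_listings[["accommodates"]], dc_listings["price"],
                       scoring="neg_mean_squared_error", cv=kf)
avg_rmse = np.mean(np.sqrt(np.abs(mses)))
print(avg_rmse)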
Example no. 33
clf_LB=KNeighborsRegressor(n_neighbors=80,weights='uniform',n_jobs=-1)
clf_HB=KNeighborsRegressor(n_neighbors=30,weights='uniform',n_jobs=-1)
clf_TRI=KNeighborsRegressor(n_neighbors=95,weights='uniform',n_jobs=-1)
clf_HDL=KNeighborsRegressor(n_neighbors=35,weights='uniform',n_jobs=-1)
clf_LDL=KNeighborsRegressor(n_neighbors=35,weights='uniform',n_jobs=-1)

clf_LB.fit(X_train,y_train_LB)
clf_HB.fit(X_train,y_train_HB)
clf_TRI.fit(X_train,y_train_TRI)
clf_HDL.fit(X_train,y_train_HDL)
clf_LDL.fit(X_train,y_train_LDL)


y_pred_LB,y_pred_HB,y_pred_TRI,y_pred_HDL,y_pred_LDL=\
    clf_LB.predict(X_test),clf_HB.predict(X_test),clf_TRI.predict(X_test),\
    clf_HDL.predict(X_test),clf_LDL.predict(X_test)
y_pred_LB,y_pred_HB,y_pred_TRI,y_pred_HDL,y_pred_LDL=\
    pd.DataFrame(y_pred_LB),pd.DataFrame(y_pred_HB),pd.DataFrame(y_pred_TRI),\
    pd.DataFrame(y_pred_HDL),pd.DataFrame(y_pred_LDL)


y_LB,y_HB,y_TRI,y_HDL,y_LDL=\
    clf_LB.predict(X_train),clf_HB.predict(X_train),clf_TRI.predict(X_train),\
    clf_HDL.predict(X_train),clf_LDL.predict(X_train)


print("MSLE_LB",mean_squared_log_error(y_train_LB,y_LB))
print("MSLE_HB",mean_squared_log_error(y_train_HB,y_HB))
print("MSLE_TRI",mean_squared_log_error(y_train_TRI,y_TRI))
print("MSLE_HDL",mean_squared_log_error(y_train_HDL,y_HDL))
Example no. 34
    r.append(np.sqrt(r[0]))
    r.append(r2_score(y_test, y_pred))
    r.append(round(r2_score(y_test, y_pred) * 100, 4))
    return (r)


""" dataframe that store the performance of each model """
accu = pd.DataFrame(index=['MSLE', 'Root MSLE', 'R2 Score', 'Accuracy(%)'])
"""  KNN method  """

#estimating MSLE for k=1-9
R_MSLE = []
for i in range(1, 10):
    KNN = KNeighborsRegressor(n_neighbors=i)
    KNN.fit(X_train, y_train)
    y_pred = KNN.predict(X_test)
    error = np.sqrt(mean_squared_log_error(y_test, y_pred))
    R_MSLE.append(error)
    print("K =", i, " , Root MSLE =", error)
""" plotting error """

curve = pd.DataFrame(R_MSLE)  #elbow curve
plt.figure(figsize=(8, 4))
plt.xticks(list(range(1, 10)), list(range(1, 10)), rotation='horizontal')
plt.plot(list(range(1, 10)), R_MSLE)
plt.xlabel('K')
plt.ylabel('MSLE')
plt.title('Error Plot for Each K')
plt.savefig('KNN-Error-Plot.jpg')
plt.show()
Example no. 35
""" model implementation """
regr = KNeighborsRegressor(n_neighbors=5,
                           algorithm='ball_tree',
                           leaf_size=1000,
                           weights='distance',
                           p=1)

regr = MultiOutputRegressor(estimator=regr)

t0 = time.time()
regr.fit(x_train, y_train)
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" %
      regr_fit)

t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict))

x_test_dim = sc_x.inverse_transform(x_test)
y_test_dim = sc_y.inverse_transform(y_test)
y_regr_dim = sc_y.inverse_transform(y_regr)

plt.scatter(x_test_dim,
            y_test_dim[:, 5],
            s=2,
            c='k',
            marker='o',
            label='Matlab')
plt.scatter(x_test_dim,
            y_regr_dim[:, 5],
Example no. 36
numpy.random.seed(seed)
tf.set_random_seed(seed)

# 2. Load the data

print(os.getcwd())

dataset = numpy.loadtxt("./data/pima-indians-diabetes.csv", delimiter=",")
X = dataset[:, 0:8]
Y = dataset[:, 8]

# 3. Set up the model
# model = KNeighborsClassifier(n_neighbors=1)
model = KNeighborsRegressor(n_neighbors=1)

# model = SVC()
# compile the model
# loss='binary_crossentropy' => used with a sigmoid output.

# 4. Run the model
model.fit(X, Y)

# 5. Print the results
x_test = X
y_test = Y
y_predict = model.predict(x_test)

print(x_test, "prediction:", y_predict)

# accuracy_score needs discrete labels; 1-NN predicting its own training data reproduces them exactly
print("acc = ", accuracy_score(y_test, y_predict))
Example no. 37
def k_nearest(X_train, y_train, X_test, y_test, val):
    kernel = KNeighborsRegressor(n_neighbors=val)
    kernel.fit(X_train, y_train)
    y_pre = kernel.predict(X_test)
    r2, mse = show_metrics('k nearest neighbors regressor', y_test, y_pre)
    return r2, mse
Example no. 38
# print("------------------------------------------------------")

# the best score was obtained with this model
# X_new has fewer features, the least important ones are cut off
X_new = SelectKBest(f_regression, k=9).fit_transform(X_total, Y_total)
X_train_new, X_test_new, Y_train_new, Y_test_new = train_test_split(
    X_new, Y_total, test_size=0.25)
reg10best = KNeighborsRegressor(n_neighbors=7, weights="distance").fit(
    X_train_new, Y_train_new)

print("KNN regression train score with 10 best features: ",
      reg10best.score(X_train_new, Y_train_new))
print("KNN regression test score with 10 best features: ",
      reg10best.score(X_test_new, Y_test_new))

Y_predictions = reg10best.predict(X_test_new)
x_ticks = numpy.arange(1., len(Y_predictions) + 1, 1)

errors = []
for a, b in zip(Y_predictions, Y_test_new):
    errors.append(abs(a - b))

# plot true values versus predicted values
predictions = mpatches.Patch(color="green", label="Predictions")
actual_data = mpatches.Patch(color="cyan", label="Actual data")
plt.plot(x_ticks, Y_predictions, "g", x_ticks, Y_test_new, "c")
plt.legend(handles=[predictions, actual_data])
plt.title("Predictions compared to actual results")
plt.show()

# plot error for each point
Example no. 39
def data_cleaning(df):
    #Removes Unwatned Columns
    #Removes Unwanted Weight Classes
    #Calculate KNN missing Reach Vals
    #Reduce Outlier Data (Weights)
    #Fills Null Values (AGE)
    #Converts Categorical to Dummies
    #Converts Binary to Boolean

    #define wanted columns from OG data
    desired_cols = [
        'Winner',
        'weight_class',
        'B_age',
        'B_Height_cms',
        'B_Reach_cms',
        'B_Weight_lbs',
        'R_Height_cms',
        'R_Reach_cms',
        'R_Weight_lbs',
        'R_age',
    ]

    df = df[desired_cols]

    #Fill NaN age values with column means ¯\_(ツ)_/¯
    df["B_age"] = df["B_age"].fillna(df["B_age"].mean())
    df["R_age"] = df["R_age"].fillna(df["R_age"].mean())

    #Replace Missing Values using KNN
    #Combine all B and R values together for single master list
    r_cols = ["R_Height_cms", "R_Reach_cms"]
    b_cols = ["B_Height_cms", "B_Reach_cms"]
    header = ["Height", "Reach"]

    R_heights_to_reach = df[r_cols]
    R_heights_to_reach.columns = header
    B_heights_to_reach = df[b_cols]
    B_heights_to_reach.columns = header
    MasterHR = pd.concat([R_heights_to_reach, B_heights_to_reach], ignore_index=True)  # append was removed in pandas 2.0

    #Train the KNN Model
    num_neighbors = 3
    trainer = MasterHR.dropna()
    X = np.array(list(trainer["Height"])).reshape(len(trainer), 1)
    y = np.array(list(trainer["Reach"])).reshape(len(trainer), 1)
    nay = KNeighborsRegressor(n_neighbors=num_neighbors).fit(X, y)

    #Replace vals with KNN predictions
    df["R_Reach_cms"] = df.apply(
        lambda x: nay.predict(np.array(x["R_Height_cms"]).reshape(1, 1))[0][0]
        if math.isnan(x["R_Reach_cms"]) else x["R_Reach_cms"],
        axis=1)
    df["B_Reach_cms"] = df.apply(
        lambda x: nay.predict(np.array(x["B_Height_cms"]).reshape(1, 1))[0][0]
        if math.isnan(x["B_Reach_cms"]) else x["B_Reach_cms"],
        axis=1)
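    # A vectorized alternative to the two apply() calls above (a sketch, not the
    # original code; it reuses the fitted model 'nay' and touches only NaN rows):
    # mask = df["R_Reach_cms"].isna()
    # df.loc[mask, "R_Reach_cms"] = nay.predict(
    #     df.loc[mask, "R_Height_cms"].to_numpy().reshape(-1, 1)).ravel()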

    #Remove unwanted weight divisions (ROWS)
    to_drop = [
        'Open Weight', 'Catch Weight', "Women's Strawweight",
        "Women's Flyweight", "Women's Bantamweight", "Women's Featherweight"
    ]
    for each in to_drop:
        df = df[df["weight_class"] != each]

    #Convert Binary Winner to Boolean
    df["Winner"] = df["Winner"].apply(lambda x: True if x == "Red" else False)

    #Calculate Delta Values (RED WINNER MINUS BLUE LOSER)
    df["Reach_Delta"] = df["R_Reach_cms"] - df["B_Reach_cms"]
    df["Height_Delta"] = df["R_Height_cms"] - df["B_Height_cms"]
    df["Weight_Delta"] = df["R_Weight_lbs"] - df["B_Weight_lbs"]
    df["age_Delta"] = df["R_age"] - df["B_age"]

    #Drop Fighters over the 265 lb Heavyweight limit
    df = df[df["R_Weight_lbs"] <= 265]
    df = df[df["B_Weight_lbs"] <= 265]

    #Drop Red vs Blue Data Columns
    cols = [
        "R_Reach_cms", "B_Reach_cms", "R_Height_cms", "B_Height_cms",
        "R_Weight_lbs", "B_Weight_lbs", "R_age", "B_age"
    ]
    df = df.drop(columns=cols)

    df = dummies(df, "weight_class")

    return df
Example no. 40
print(f"Printing MAE error(avg abs residual): {metrics.mean_absolute_error(y_test, prediction)}")
print(f"Printing MSE error: {metrics.mean_squared_error(y_test, prediction)}")
print(f"Printing RMSE error: {np.sqrt(metrics.mean_squared_error(y_test, prediction))}")
print(f"Printing r2 score linear regression: {metrics.r2_score(y_test, prediction)}")

#########################################################################

kreg = KNeighborsRegressor()
kreg.fit(X_train, y_train)

# print(f"Intercept2: {linear.intercept_}\n")
# print(f"Coeficients2: {linear.coef_}\n")
# print(f"Named Coeficients2: {pd.DataFrame(linear.coef_, columns_names)}")

prediction2 = kreg.predict(X_test)


for (real, predicted) in list(zip(y_test, prediction2)):
    print(f"Value: {real:.2f}, pred: {predicted:.2f}, diff: {(real - predicted):.2f}")

sns.set(palette="inferno")

sns.scatterplot(y_test, prediction2)
plt.plot([0, 50], [0, 50], '--')
plt.title('(KNeighbors)')
plt.xlabel('Real Value')
plt.ylabel('Predicted Value')
plt.show()

residuals = y_test - prediction2
Example no. 41
            'preprocessing.{}().fit_transform(XTransaction)'.format(scaler))

        XPS_train, XPS_test, yP_train, yP_test = train_test_split(
            XPricingS, yPricing, test_size=0.3)
        XTS_train, XTS_test, yT_train, yT_test = train_test_split(
            XTransactionS, yTransaction, test_size=0.3)

        knnP = KNeighborsRegressor(n_neighbors=20)
        knnP.fit(XPS_train, yP_train)

        # print(knnP.predict(XPS_test[:10]))
        # print(np.array(yP_test[:10]))
        # print(knnP.score(XPS_test, yP_test))
        pricingScores[scalerIndex] += knnP.score(XPS_test, yP_test)

        plt.scatter(knnP.predict(XPS_test), np.array(yP_test))
        plt.title(
            'Prediction on Pricing using Normalization of {}'.format(scaler))
        plt.xlabel('Predicted Pricing Price')
        plt.ylabel('Actual Pricing Price')
        plt.figtext(
            0.6, 0.8,
            'KNN Score: {}'.format(round(knnP.score(XPS_test, yP_test), 3)))
        plt.savefig('scaler graphs/{} Prediction'.format(scaler))
        plt.clf()

        knnT = KNeighborsRegressor(n_neighbors=20)
        knnT.fit(XTS_train, yT_train)

        # print(knnT.predict(XTS_test[:10]))
        # print(np.array(yT_test[:10]))
Example no. 42
class CipPredictor(CipDatabase):
    """Extend the database with a predictor."""
    
    def __init__(self):
        CipDatabase.__init__(self)
        
        self.num_scores_fitted = 0
        self.X = []
        self.y = []

        #self.predictor = SGDRegressor()
        self.predictor = KNeighborsRegressor()
    
    
    def create_features(self):
        """Create feature vectors between database pairs."""
        self.X = []
        self.y = []
        
        self.cip_fvs = self.vectorizer.transform_single(self.cip_graphs)
        
        for interface, core_start, core_end, scores in self.get_items():
            pos_start = self.graph2position[interface][core_start]
            pos_end = self.graph2position[interface][core_end]    
                        
            vector_start = self.cip_fvs[pos_start]
            vector_end = self.cip_fvs[pos_end]

            feature_vector = vector_start - vector_end

            if len(feature_vector.data):
                score = median(scores)
                max_drift = max(score - min(scores), max(scores) - score) 
                        
                self.X.append(feature_vector)
                self.y.append(score)
      
        self.X = vstack(self.X)
        
        
    def cip_fit(self):
        """Fit the predictor."""
        
        self.predictor.fit(self.X, self.y)
        self.num_scores_fitted = self.num_scores
        

    def _predicted_cips(self, original_cip, candidate_cips):
        """Return average scores of a list of candidate cips."""
        original_fv = self.vectorizer.transform_single(original_cip.graph)
        original_fv = original_fv[0]
            
        candidate_graphs = [candidate_cip.graph for candidate_cip in candidate_cips]
        candidate_fvs = self.vectorizer.transform_single(candidate_graphs)
            
        pairwise_fvs = [original_fv - candidate_fv for candidate_fv in candidate_fvs]
        pairwise_fvs = vstack(pairwise_fvs)

        y = self.predictor.predict(pairwise_fvs)

        return zip(y, candidate_cips)
    
    
    def save_cip_data(self):
        """Save database and feature vectors to files."""
        CipDatabase.save_cip_data(self)
        
        dump_svmlight_file(self.X, self.y, 'cip_rank_regression.data', zero_based=False)


    def load_cip_data(self):
        """ Load database and feature vectors from files."""
        CipDatabase.load_cip_data(self)
        self.X, self.y = load_svmlight_file('cip_rank_regression.data', n_features=self.vectorizer.feature_size, zero_based=False)
Example no. 43
mpg.dropna(inplace=True)
#print(mpg.shape)
mpg_target = mpg["mpg"]
mpg_target = np.asarray(mpg_target)
mpg_data = mpg.iloc[:, 1:7]

mpg_train_data, mpg_test_data, mpg_train_target, mpg_test_target = train_test_split(mpg_data, mpg_target, train_size = .7, test_size = 0.3, random_state = 48, shuffle = True)

std_scale = preprocessing.StandardScaler().fit(mpg_train_data)
mpg_train_data_std = std_scale.transform(mpg_train_data)
mpg_test_data_std = std_scale.transform(mpg_test_data)
print()
for i in range(3,9):
    regr = KNeighborsRegressor(n_neighbors=i)
    regr.fit(mpg_train_data_std, mpg_train_target)
    mpg_prediction = regr.predict(mpg_test_data_std)
    accuracy = regr.score(mpg_test_data_std, mpg_test_target)
    #print("When k = {} Accuracy: {:.2f}%".format(i, accuracy * 100))
    
#The Student Math Data
    
binary = {"school" : {"GP":1, "MS":0},
          "sex" : {"M":1, "F":0},
          "address" : {"R":1, "U":0},
          "famsize" : {"GT3":1, "LE3":0},
          "Pstatus" : {"T":1, "A":0},
          "schoolsup" : {"yes":1, "no":0},
          "famsup" : {"yes":1, "no":0},
          "paid" : {"yes":1, "no":0},
          "activities" : {"yes":1, "no":0},
          "nursery" : {"yes":1, "no":0},
Example no. 44
data = load_boston()  # note: load_boston was removed in scikit-learn 1.2
# split the data
train_x, test_x, train_y, test_y = train_test_split(data.data,
                                                    data.target,
                                                    test_size=0.25,
                                                    random_state=33)
# AdaBoost regression model
regressor = AdaBoostRegressor()
regressor.fit(train_x, train_y)
pred_y = regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
# decision tree regression model
dec_regressor = DecisionTreeRegressor()
dec_regressor.fit(train_x, train_y)
pred_y = dec_regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print("Decision tree MSE = ", round(mse, 2))
'''
Decision tree MSE = 28.19
'''
# KNN regression model
knn_regressor = KNeighborsRegressor()
knn_regressor.fit(train_x, train_y)
pred_y = knn_regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print("KNN MSE = ", round(mse, 2))
'''
KNN MSE = 27.87
'''
Example no. 45
# Create numpy arrays and give the data the correct dimensions
x = np.asarray(x, dtype=float).reshape(-1, 1)
y = np.asarray(y, dtype=float).reshape(-1, 1)
forecast = np.asarray(forecast, dtype=float).reshape(-1, 1)

# Train
knnReg = KNeighborsRegressor(n_neighbors=10, weights='uniform').fit(x, y)

# Predict
prediction = knnReg.predict(forecast)

# Write to file
printlist = read.convert(prediction)
read.writeToFile("ForecastTemplate1-kNN.csv", dates, printlist)

# Calculate RMSE
sum_errors = 0
for i in range(len(prediction)):
    sum_errors += (float(prediction[i]) - float(solution[i])) ** 2  # math.pow(2, x) computed 2**x, not x**2

rmse = math.sqrt(sum_errors / len(prediction))

print(" ")
print("Prediction done using K-Nearest Neighbor")
print("RMSE: " + str(rmse))
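The same RMSE is available more directly from scikit-learn's metrics; a sketch under the names used above:

from sklearn.metrics import mean_squared_error

rmse = math.sqrt(mean_squared_error(solution, prediction))
print("RMSE: " + str(rmse))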
Example no. 46
from sklearn.tree import DecisionTreeRegressor

N = 200
X = np.linspace(0, 10, N).reshape(N, 1)
Y = np.sin(X)

Ntrain = 20
idx = np.random.choice(N, Ntrain)
Xtrain = X[idx]
Ytrain = Y[idx]

# it weights the neighbors by 'distance' instead
# of just averaging the neighbors

knn = KNeighborsRegressor(n_neighbors=2, weights='distance')
knn.fit(Xtrain, Ytrain)
Yknn = knn.predict(X)  # keep the model object; the plot below expects Yknn

# because we didn't set max_depth of the tree during
# training, it overfits the training data
dt = DecisionTreeRegressor()
dt.fit(Xtrain, Ytrain)
Ydt = dt.predict(X)

plt.scatter(Xtrain, Ytrain)  # show the training points
plt.plot(X, Y)  # show the original data
plt.plot(X, Yknn, label='KNN')
plt.plot(X, Ydt, label='Decision Tree')
plt.legend()
plt.show()
Example no. 47
#%% kNN Regression - Finding optimal k
mse_mean_list = []
mse_var_list = []
r2_mean_list = []
r2_var_list = []
gamma = 25
k_space = []

kf = KFold(n_splits=10, shuffle=True)
for k in range(2, 10, 1):
    mse_list = []
    r2_list = []  # used below but never initialized in the original
    k_space.append(k)
    for train, test in kf.split(X):
        model_knn = KNeighborsRegressor(n_neighbors=k, weights='distance').fit(
            X[train], Y[train])
        ypred_knn = model_knn.predict(X[test])
        mse_list.append(mean_squared_error(Y[test], ypred_knn))
        r2_list.append(r2_score(Y[test], ypred_knn))

    mse_mean_list.append(np.mean(mse_list))
    mse_var_list.append(np.var(mse_list))

    r2_mean_list.append(np.mean(r2_list))
    r2_var_list.append(np.var(r2_list))
#%%
fig03 = plt.figure(figsize=(10, 5))
ax03 = fig03.add_subplot(1, 1, 1)
Ctext03 = "Variation in Mean Squared Error with $k$ for kNN Regression, weighted by distance"
ax03.set_title(Ctext03, fontweight="bold", fontsize=13)
ax03.set_xlabel('Number of Nearest Neighbours ($k$)',
                fontweight="bold",
Example no. 48
#     print("The MSE is:", a, k)

#5 MSE = 33.529

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)  # reuse the training-set scaling; don't refit on the test set

knnr = KNeighborsRegressor(n_neighbors=5, p=2)
knnr.fit(X_train, y_train)
#y_pred = classifier.predict(X_test)
y_pred = knnr.predict(X_test)

a = mean_squared_error(y_test, y_pred)
print("The MSE is:", a)

# Vote share of incumbent president

fig = plt.figure(figsize=(12, 9))
ax = sns.regplot(y_test, y_pred, marker='o', color='blue')
ax.set_title('KNN Regression', fontsize=20)
ax.set_xlabel('Actual Democratic Vote Share', fontsize=20)
ax.set_ylabel('Predicted Democratic Candidate Vote Share', fontsize=20)
plt.show()

#>> Final Prediction <<
#Predict out of sample data
Example no. 49
#mdl2 = KNeighborsRegressor(5);
#mdl2.fit(X2,Y2);

#print(mdl2.score(tX2,tY2))
#print(mdl2.score(X2,Y2))
#print(mdl.score(tX2,tY2))

#plt.scatter(T,P, color='blue')
#plt.plot([0,40],[b,m*40+b],'r')
#plt.title('Vasai Median Prices', fontsize = 15)
#plt.xlabel('Quarter (0 = 2009-Q1, 40 = 2022-Q2)', fontsize = 15)
#plt.ylabel('rs/sq.ft', fontsize = 15)
#plt.show()

tYY = tY.values
hy = mdl.predict(tX)
dtot = 0
tot = 0
mae_num = 0
mae_den = 0
mape = 0

for i in range(0, 1026):

    cu = 0
    if (tYY[i][0] > hy[i][0]):
        cu = (tYY[i][0] - hy[i][0])
    else:
        cu = -(tYY[i][0] - hy[i][0])
    mae_num = mae_num + cu
    mae_den = mae_den + tYY[i][0]
Example no. 50
#X_test, y_test = X[offset:], y[offset:]

# We will change k from 1 to 30
k_range = arange(1, 30)
train_err = zeros(len(k_range))
test_err = zeros(len(k_range))

for i, k in enumerate(k_range):
    # Set up a KNN model that regresses over k neighbors
    neigh = KNeighborsRegressor(n_neighbors=k)
    
    # Fit the learner to the training data
    neigh.fit(X_train, y_train)

    # Find the MSE on the training set
    train_err[i] = mean_squared_error(y_train, neigh.predict(X_train))
    # Find the MSE on the testing set
    test_err[i] = mean_squared_error(y_test, neigh.predict(X_test))

# Plot training and test error as a function of k
pl.figure()
pl.title('kNN: Error as a function of k')
pl.plot(k_range, test_err, lw=2, label = 'test error')
pl.plot(k_range, train_err, lw=2, label = 'training error')
pl.legend()
pl.xlabel('k')
pl.ylabel('MSE')
pl.show()

Example no. 51
thirdAirline = input('-->')

# Setting up data and target values
data = dataset.iloc[:, [4, 5, 7, 8, 15]]
target = dataset.iloc[:,10:12].values

# Unique test cases
predictX = [
						[35.220448, -80.94377, 40.77289, -73.868805, airlineMap[firstAirline]],
						[47.44359, -122.302505, 33.640545, -84.43341, airlineMap[secondAirline]],
						[40.69297, -74.17799, 37.616714, -122.38709, airlineMap[thirdAirline]],
						]

# Here we are using a KNN Regression to deal with the multitarget output
# If there was not a multitarget output, a normal KNN Classifier would have worked fine
# Using 1 cluster to guarantee that the resulting layover shows an actual airport location
knn = KNeighborsRegressor(n_neighbors = 1)
knn.fit(data, target)


predictionResults = knn.predict(predictX)

# Demonstration of the results
for i in range(3):
	
	if airportMap[predictionResults[i][0]] == 'ZZZ':
		airportMap[predictionResults[i][0]] = 'no airport!'

print("Flying from CLT to LGA on", firstAirline, "would likely result in a layover in",airportMap[predictionResults[0][0]])
print("Flying from SEA to ATL on", secondAirline, "would likely result in a layover in",airportMap[predictionResults[1][0]])
print("Flying from EWR to SFO on", thirdAirline, "would likely result in a layover in", airportMap[predictionResults[2][0]])
Example no. 52
# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

#KNN REGRESSOR
from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=10, metric='chebyshev')
neigh.fit(X_train, y_train)
knn_pred = neigh.predict(X_test)
mean_squared_error(y_test, knn_pred)

#random forest
from sklearn.ensemble import RandomForestRegressor

regressor1 = RandomForestRegressor(n_estimators=250, random_state=0)
regressor1.fit(X_train, y_train)
y_pred2 = regressor1.predict(X_test)
mean_squared_error(y_test, y_pred2)

from sklearn.linear_model import LinearRegression

regressor2 = LinearRegression()
regressor2.fit(X_train, y_train)
y_pred3 = regressor2.predict(X_test)
Example no. 53
df = pd.read_csv('dataset.csv', delimiter=',', decimal=',')
df = df.dropna()

print(df.columns)

Y = df['% Silica Concentrate']
X = df.drop(['% Silica Concentrate', 'date'], axis=1)

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=42)

start_time = time.time()

model = KNeighborsRegressor(n_neighbors=5)

model.fit(X_train, Y_train)

end_time = time.time()

print("Time elapsed: ", end_time - start_time)

Y_pred = model.predict(X_test)

print("Time elapsed: ", time.time() - end_time)

error = mean_squared_error(Y_pred, Y_test)

print(error)
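# Note (sketch): k-NN is a lazy learner, so fit() mostly just stores the
# training set and predict() does the real distance work; the prediction time
# printed above is typically the larger of the two. RMSE is often easier to
# read than MSE because it is in the target's units:
print("RMSE:", error ** 0.5)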
Example n. 54
def run_models(grid_y, grid_x):
    X, Y = create_training_and_testing_data(grid_x, grid_y)
    data = Table(X, Y)
    # print(data.Y)
    # np.savetxt('data/' + str(grid_x) + '_' + str(grid_y) + '.csv', np.array(data), delimiter=',', fmt='%10.5f')
    # print(out_data.domain)
    # print(out_data.Y)

    # feature_method = og.preprocess.score.UnivariateLinearRegression()
    # selector = og.preprocess.SelectBestFeatures(method=feature_method, k=10)
    # out_data2 = selector(data)
    # plot_input(out_data2.X, out_data2.Y)
    # print(out_data2.domain)

    # pca = PCA(n_components=5)
    # model = pca(out_data2)
    # out_data = model(out_data2)
    # print(out_data.domain)

    test = og.data.Table(data.domain, random.sample(data, 60))
    train = og.data.Table(data.domain, [d for d in data if d not in test])

    lin = og.regression.linear.LinearRegressionLearner()
    rf = og.regression.random_forest.RandomForestRegressionLearner()
    nnr = og.regression.NNRegressionLearner()
    svm = og.regression.SVRLearner()
    knn = KNeighborsRegressor(n_neighbors=3)

    learners = [lin, rf, nnr, svm]
    regressors = [learner(train) for learner in learners]
    knn.fit(train.X, train.Y)

    # Pickle each model under a name encoding the grid cell
    for name, obj in [("lin", lin), ("rf", rf), ("nnr", nnr), ("svm", svm),
                      ("knn", knn)]:
        with open(
                "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
                "_" + name + ".pickle", "wb") as f:
            pickle.dump(obj, f)

    # print((r(test)[0] for r in regressors))
    linPredict = regressors[0](test)
    rfPredict = regressors[1](test)
    nnrPredict = regressors[2](test)
    svmPredict = regressors[3](test)
    knnPredict = knn.predict(test.X)

    predictions = [linPredict, rfPredict, nnrPredict, svmPredict, knnPredict]

    # print(knnPredict)

    # print("y   ", " ".join("%5s" % l.name for l in regressors))
    # for d in test:
    #     print(("{:<5}" + " {:5.1f}" * len(regressors)).format(d.get_class(), *(r(d)[0] for r in regressors)))

    # res = og.evaluation.CrossValidation(test, learners, k=10)
    # rmse = og.evaluation.RMSE(res)
    # mae = og.evaluation.MAE(res)
    # r2 = og.evaluation.R2(res)

    rmse = []
    mae = []
    for pred in predictions:
        rmse.append(math.sqrt(mean_squared_error(test.Y, pred)))
        mae.append(mean_absolute_error(test.Y, pred))

    return np.array(mae), np.array(rmse), np.array(predictions), test
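# Hypothetical usage sketch for run_models (grid coordinates are illustrative):
mae, rmse, predictions, test = run_models(grid_y=10, grid_x=20)
for name, m, r in zip(["lin", "rf", "nnr", "svm", "knn"], mae, rmse):
    print("{}: MAE={:.3f}  RMSE={:.3f}".format(name, m, r))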
Example n. 55
def run_baseline_model(epa_data, modis_means):

    dates = set()
    for date in epa_data['Date']:
        dates.add(date)

    MSE = 0
    num_predictions = 0

    all_date_y_train = []
    all_date_y_test = []
    all_date_y_pred = []
    all_date_y_train_preds = []
    all_date_epa_site_train_order = []
    all_date_epa_site_test_order = []
    all_dates_train = []
    all_dates_test = []

    # Goes date by date to get MSE
    # Each date has a list of stations that have measurements from that date

    for idx, date in enumerate(dates):
        if idx % 10 == 0:
            print("Processing date {}: {} ".format(idx, date))

        date_df = epa_data[epa_data['Date'] == date]

        # X holds latitude/longitude; y is PM2.5; epa_site_ids tracks corresponding site IDs
        X = []
        y = []
        epa_site_ids = []
        cur_date = []

        for i in range(len(date_df)):
            lat = np.radians(date_df['SITE_LATITUDE'][date_df.index[i]])
            long = np.radians(date_df['SITE_LONGITUDE'][date_df.index[i]])
            pm = date_df['Daily Mean PM2.5 Concentration'][date_df.index[i]]
            epa_site_id = date_df['Site ID'][date_df.index[i]]
            X.append([lat, long])
            y.append(pm)
            epa_site_ids.append(epa_site_id)
            cur_date.append(date)

        # Shuffle data and split into train/test sets
        X, y, epa_site_ids = shuffle(X, y, epa_site_ids)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, shuffle=False)  # already shuffled
        X_train_, X_test_, epa_site_train, epa_site_test = train_test_split(
            X, epa_site_ids, test_size=0.3, shuffle=False)
        _, _, cur_date_train, cur_date_test = train_test_split(X,
                                                               cur_date,
                                                               test_size=0.3,
                                                               shuffle=False)

        # nearest neighbors, as determined by haversine (distance between latitude,longitude coordinate pairs)
        knn = KNeighborsRegressor(n_neighbors=1, metric="haversine")
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        # Combine PM prediction from nearest neighbor with 2x2 aod data in simple linear regression model

        # Get the nearest neighbor of each training point (excluding the point
        # itself): kneighbors() with no argument queries the fitted training
        # set and leaves each point out of its own neighbor list
        y_train_nn_indices = knn.kneighbors()[1]
        y_train_nn_indices = [y for x in y_train_nn_indices
                              for y in x]  # flatten the list
        y_train_preds = np.asarray(y_train)[y_train_nn_indices]

        all_date_y_train_preds.append(y_train_preds.tolist())
        all_date_y_pred.append(y_pred)
        all_date_y_train.append(y_train)
        all_date_y_test.append(y_test)
        all_date_epa_site_train_order.append(epa_site_train)
        all_date_epa_site_test_order.append(epa_site_test)
        all_dates_train.append(cur_date_train)
        all_dates_test.append(cur_date_test)

    # Flatten all of the per-date lists
    all_date_y_train_preds = flatten(all_date_y_train_preds)
    all_date_y_pred = flatten(all_date_y_pred)
    all_date_y_train = flatten(all_date_y_train)
    all_date_y_test = flatten(all_date_y_test)
    all_date_epa_site_train_order = flatten(all_date_epa_site_train_order)
    all_date_epa_site_test_order = flatten(all_date_epa_site_test_order)
    all_dates_train = flatten(all_dates_train)
    all_dates_test = flatten(all_dates_test)

    X_aod_train = np.asarray(all_date_y_train_preds).reshape(-1, 1)
    X_aod_test = np.asarray(all_date_y_pred).reshape(-1, 1)

    num_sites_for_all_dates_train = len(all_date_epa_site_train_order)
    num_sites_for_all_dates_test = len(all_date_epa_site_test_order)

    green_means_train = np.zeros((num_sites_for_all_dates_train, 1))
    blue_means_train = np.zeros((num_sites_for_all_dates_train, 1))
    green_means_test = np.zeros((num_sites_for_all_dates_test, 1))
    blue_means_test = np.zeros((num_sites_for_all_dates_test, 1))

    print("Beginning mean lookup")

    for idx, epa_site in enumerate(all_date_epa_site_train_order):
        modis_filename = epa_to_modis_file_name(all_dates_train[idx], epa_site)
        modis_row = modis_means[modis_means['Filename'] == modis_filename]
        green_mean = modis_row['Green mean'][modis_row.index[0]]
        blue_mean = modis_row['Blue mean'][modis_row.index[0]]
        green_means_train[idx] = green_mean
        blue_means_train[idx] = blue_mean

    for idx, epa_site in enumerate(all_date_epa_site_test_order):
        modis_filename = epa_to_modis_file_name(all_dates_test[idx], epa_site)
        modis_row = modis_means[modis_means['Filename'] == modis_filename]
        green_mean = modis_row['Green mean'][modis_row.index[0]]
        blue_mean = modis_row['Blue mean'][modis_row.index[0]]
        green_means_test[idx] = green_mean
        blue_means_test[idx] = blue_mean

    print("Finished mean lookup")

    X_aod_train = np.concatenate(
        (X_aod_train, green_means_train, blue_means_train), axis=1)
    X_aod_test = np.concatenate(
        (X_aod_test, green_means_test, blue_means_test), axis=1)

    print("Training LR")
    reg = LinearRegression().fit(X_aod_train, all_date_y_train)

    r2_score_train = reg.score(X_aod_train, all_date_y_train)
    r2_score_test = reg.score(X_aod_test, all_date_y_test)

    print("R2 train: {}".format(r2_score_train))
    print("R2 test: {}".format(r2_score_test))

    y_pred_lr = reg.predict(X_aod_test)

    diff = np.square(np.asarray(y_pred_lr) - np.asarray(all_date_y_test))
    MSE = diff.sum()
    num_predictions = len(all_date_y_test)
    #print("Adding squared error of {} for date {}.".format(diff.sum()/len(y_test), date))

    MSE = MSE / num_predictions
    print("Mean squared error across all dates:  {}".format(MSE))
Example n. 56
    df['healsPerWalkDistance'].fillna(0, inplace=True)
    df['healsAndBoostsPerWalkDistance'] = df['healsAndBoosts'] / (
        df['walkDistance'] + 1)
    df['healsAndBoostsPerWalkDistance'].fillna(0, inplace=True)

    df['killsPerWalkDistance'] = df['kills'] / (df['walkDistance'] + 1)
    df['killsPerWalkDistance'].fillna(0, inplace=True)

    return df


train = addFeatures(pd.read_csv('inputs/train_V2.csv'))
test = addFeatures(pd.read_csv('inputs/test_V2.csv'))

from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=3)

neigh.fit(
    train[[
        'weaponsAcquired', 'killPlace', 'totalDistance',
        'killsPerWalkDistance', 'healsAndBoostsPerWalkDistance'
    ]][:700000], train['winPlacePerc'][:700000])
predicted = neigh.predict(train[[
    'weaponsAcquired', 'killPlace', 'totalDistance', 'killsPerWalkDistance',
    'healsAndBoostsPerWalkDistance'
]][800000:890000])

from sklearn.metrics import explained_variance_score
EVS = explained_variance_score(train['winPlacePerc'][800000:890000], predicted)
print(EVS)
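# Alternative split sketch (assumes `train` from above): the fixed [:700000]
# and [800000:890000] slices can pick up ordering effects in the CSV; a
# shuffled split is usually safer.
from sklearn.model_selection import train_test_split

features = ['weaponsAcquired', 'killPlace', 'totalDistance',
            'killsPerWalkDistance', 'healsAndBoostsPerWalkDistance']
X_tr, X_val, y_tr, y_val = train_test_split(
    train[features], train['winPlacePerc'], test_size=0.1, random_state=0)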
Example n. 57
[[10191     0     0     0     0     0]
 [    1    11     0     0     0     0]
 [    0     0    43     0     0     0]
 [    3     0     2  2065     0     0]
 [    0     0     0     0   731     0]
 [    0     0     0     0     0     2]]

from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(model_DT,x_test_std,y_test)

## for confusion matrix plot, refer to coding on Google Colab ##

#4) Prediction on unknown data

#4.1) Using predict() with Decision Trees
from sklearn.tree import DecisionTreeRegressor
model_DTR = DecisionTreeRegressor(max_depth=5).fit(x_train,y_train)
DT_predict = model_DTR.predict(x_test) #Predictions on Testing data
print(DT_predict)

## [1.0020645 4.        1.0020645 ... 1.0020645 1.0020645 1.       ]

#4.2) Using predict() with KNN
from sklearn.neighbors import KNeighborsRegressor
KNN_model = KNeighborsRegressor(n_neighbors=3).fit(x_train,y_train)

KNN_predict = KNN_model.predict(x_test) #Predictions on Testing data
print(KNN_predict)

## [1. 4. 1. ... 1. 1. 1.]
Example n. 58
'''
Created on 2017. 8. 6.

@author: jaehyeong
'''
import mglearn
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

# dataset
X, y = mglearn.datasets.make_wave(n_samples=40)

# train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

# k = 3, create the regressor
reg = KNeighborsRegressor(n_neighbors=3)
# train
reg.fit(X_train, y_train)

# predictions on the test set
print('test set predictions : ', reg.predict(X_test))
# score (R^2: coefficient of determination)
print('test set R^2 : ', reg.score(X_test, y_test))     # 0.834417244625
Example n. 59
def regression_data():

    cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF', '#000000'])
    plt.figure(1)
    plt.title('Sample regression problem with one input variable')
    X_R1, y_R1 = make_regression(n_samples=100,
                                 n_features=1,
                                 n_informative=1,
                                 bias=150.0,
                                 noise=30,
                                 random_state=0)
    plt.scatter(X_R1, y_R1, marker='o', s=50)
    plt.show()

    X_train, X_test, y_train, y_test = train_test_split(X_R1,
                                                        y_R1,
                                                        random_state=0)
    knnreg = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
    print(knnreg.predict(X_test))
    print('R-squared test score: {:.3f}'.format(knnreg.score(X_test, y_test)))

    fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))
    X_predict_input = np.linspace(-3, 3, 50).reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5],
                                                        y_R1[0::5],
                                                        random_state=0)

    for thisaxis, K in zip(subaxes, [1, 3]):
        knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
        y_predict_output = knnreg.predict(X_predict_input)
        thisaxis.set_xlim([-2.5, 0.75])
        thisaxis.plot(X_predict_input,
                      y_predict_output,
                      '^',
                      markersize=10,
                      label='Predicted',
                      alpha=0.8)
        thisaxis.plot(X_train, y_train, 'o', label='True Value', alpha=0.8)
        thisaxis.set_xlabel('Input feature')
        thisaxis.set_ylabel('Target value')
        thisaxis.set_title('KNN regression (K={})'.format(K))
        thisaxis.legend()
    plt.tight_layout()
    plt.show()

    fig, subaxes = plt.subplots(5, 1, figsize=(5, 20))
    X_predict_input = np.linspace(-3, 3, 500).reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X_R1,
                                                        y_R1,
                                                        random_state=0)

    for thisaxis, K in zip(subaxes, [1, 3, 7, 15, 55]):
        knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
        y_predict_output = knnreg.predict(X_predict_input)
        train_score = knnreg.score(X_train, y_train)
        test_score = knnreg.score(X_test, y_test)
        thisaxis.plot(X_predict_input, y_predict_output)
        thisaxis.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
        thisaxis.plot(X_test, y_test, '^', alpha=0.9, label='Test')
        thisaxis.set_xlabel('Input feature')
        thisaxis.set_ylabel('Target value')
        thisaxis.set_title(
            'KNN Regression (K={}) Train $R^2 = {:.3f}$,  Test $R^2 = {:.3f}$'.
            format(K, train_score, test_score))
        thisaxis.legend()
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    plt.show()
Example n. 60
euclidean_distances = nba_normalized.apply(
    lambda row: distance.euclidean(row, lebron_normalized), axis=1)
sort_distances = euclidean_distances.sort_values()
closest = sort_distances.iloc[1:2]
most_similar_to_lebron = nba.iloc[closest.index[0]]['player']
print(most_similar_to_lebron)

## 7. Using sklearn ##

# The columns that we will be making predictions with.
x_columns = [
    'age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p',
    'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast',
    'stl', 'blk', 'tov', 'pf'
]
# The column that we want to predict.
y_column = ["pts"]

from sklearn.neighbors import KNeighborsRegressor
# Create the knn model.
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training data.
knn.fit(train[x_columns], train[y_column])
# Make predictions on the test set using the fit model.
predictions = knn.predict(test[x_columns])

## 8. Computing error ##

actual = test[y_column]
mse = (((predictions - actual)**2).sum()) / len(predictions)
print(mse)
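# Equivalence check (sketch, assumes predictions/actual above): sklearn's
# mean_squared_error computes the same quantity as the hand-rolled mse.
from sklearn.metrics import mean_squared_error
print(mean_squared_error(actual, predictions))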