Example #1
def fit_KNeighbors(features_train, labels_train, features_pred, n_neighbors=5):
	model = KNeighborsRegressor(n_neighbors=n_neighbors)
	model.fit(features_train, labels_train)
	labels_pred = model.predict(features_pred)
	score = model.score(features_train, labels_train)
	print("KNeighbors - coefficient of determination R^2 on the training data:", score)
	return labels_pred
Example #2
def fill_income(df):

    income_imputer = KNeighborsRegressor(n_neighbors=2)
    df_w_monthly_income = df[df.monthly_income.notnull()].copy()
    df_w_null_monthly_income = df[df.monthly_income.isnull()].copy()
    cols = ["number_real_estate_loans_or_lines", "number_of_open_credit_lines_and_loans"]
    income_imputer.fit(df_w_monthly_income[cols], df_w_monthly_income.monthly_income)
    new_values = income_imputer.predict(df_w_null_monthly_income[cols])
    df_w_null_monthly_income.loc[:, "monthly_income"] = new_values
    df2 = df_w_monthly_income.append(df_w_null_monthly_income)
    return df2
Example #3
def main(featureFile, outputfolder):
    with open(featureFile, 'r') as csvfile:
        my_data = pd.read_csv(csvfile, delimiter="\t", low_memory=False)

    random_indices = permutation(my_data.index)
    # How much of the data do we want in our test set?
    test_cutoff = math.floor(len(my_data)/3)
    # Generate the test set from the first third of the shuffled indices.
    test = my_data.loc[random_indices[:test_cutoff]]

    # Generate the training set with the rest of the data.
    train = my_data.loc[random_indices[test_cutoff:]]

    x_columns = ["Row", "Student ID", "Problem Hierarchy", "Problem Name", "Problem View", "Step Name",
            "KC(Default)", "Opportunity (Default)"]
    # y columns show the predicted feature, in this case, the correct first attempt
    y_column = ["Correct First Attempt"]

    # Look at the Ten closest neighbors, to offset potential noise in the data
    knn = KNeighborsRegressor(n_neighbors=10)
    knn.fit(train[x_columns], train[y_column])

    # Make point predictions on the test set using the fit model.
    predictions = knn.predict(test[x_columns])
    actual = test[y_column]
    result = test[['Anon Student Id','Correct First Attempt']]
    result.to_csv(outputfolder, sep='\t')

    # Compute the root mean squared error of our predictions.
    rmse = math.sqrt((((predictions - actual) ** 2).sum()) / len(predictions))
    print('RMSE =', rmse)
Example #4
def knn_model(train, y_train, test):
    model = KNeighborsRegressor(n_neighbors = 10, weights='distance', n_jobs=-1)
    model.fit(train, y_train)
    test_probs = model.predict(test)
    # Clip negative predictions to zero
    indices = test_probs < 0
    test_probs[indices] = 0
    return test_probs
Example #5
def run_network(mdl=None, data=None):
    global_start_time = time.time()
    sequence_length = 10

    if data is None:
        print('Loading data... ')
        X_train, y_train, X_test, y_test = train_test_traffic_data(15773, sequence_length)
    else:
        X_train, y_train, X_test, y_test = data

    print('\nData Loaded...\n')

    if mdl is None:
        mdl = KNeighborsRegressor(5, weights='distance')

    try:
        mdl.fit(X_train, y_train)
        predicted_traffic = mdl.predict(X_test)
    except KeyboardInterrupt:
        print('Training duration (s) : ', time.time() - global_start_time)
        return mdl, y_test, 0

    print('Training duration (s) : ', time.time() - global_start_time)

    return mdl, y_test, predicted_traffic
Example #6
    def fit(self, start_date, end_date):

        for ticker in self.tickers:
            self.stocks[ticker] = Stock(ticker)

        params_knn = [{
            'n_neighbors': [2, 5, 10, 15]}]
        params = ParameterGrid(params_knn)

        # Find the split for training and CV
        mid_date = train_test_split(start_date, end_date)
        for ticker, stock in self.stocks.items():

            # pdb.set_trace()
            X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
            X_cv, y_cv = stock.get_data(mid_date, end_date)

            lowest_mse = np.inf
            for i, param in enumerate(params):
                knn = KNeighborsRegressor(**param)
                # ada = AdaBoostRegressor(knn)
                knn.fit(X_train.values, y_train.values)
                mse = mean_squared_error(y_cv, knn.predict(X_cv.values))
                if mse <= lowest_mse:
                    lowest_mse = mse
                    self.models[ticker] = knn

        return self
Example #7
    def train(self, x, y, param_names, random_search=100, **kwargs):
        start = time.time()
        scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

        # Check that each input is between 0 and 1
        self._check_scaling(scaled_x=scaled_x)

        if self._debug:
            print("Shape of training data:", scaled_x.shape)
            print("Param names:", self._used_param_names)
            print("First training sample\n", scaled_x[0])
            print("Encode:", self._encode)

        # Do a random search for the best number of neighbors
        n_neighbors = self._random_search(random_iter=random_search, x=scaled_x, y=y)

        # Now train model
        knn = KNeighborsRegressor(n_neighbors=n_neighbors,
                                  weights='uniform',
                                  algorithm='auto',
                                  leaf_size=30,
                                  p=2,
                                  metric='minkowski')
        knn.fit(scaled_x, y)
        self._model = knn

        duration = time.time() - start
        self._training_finished = True
        return duration
Example #8
def calc_linear_regression(reg_training_path):  # note: despite the name, this fits a k-nearest-neighbors regressor
    dataset = read_reg_train_data(reg_training_path)
    rmse = 0
    n_folds = 5
    folds = KFold(n=len(dataset), n_folds=n_folds, shuffle=False)

    fold = 0
    for train_indices, test_indices in folds:
        fold += 1
        training_set = [dataset[i] for i in train_indices]
        test_set = [dataset[i] for i in test_indices]
        training_dataframe = get_data_frame(training_set)
        test_dataframe = get_data_frame(test_set)
        column_names = ['cf_item', 'cf_user', 'svd', 'content_item', 'actual_rating']
        training_dataframe.columns = column_names
        test_dataframe.columns = column_names

        actual_rating_training_column = training_dataframe['actual_rating']
        #actual_rating_test_column = test_dataframe['actual_rating']

        training_dataframe = training_dataframe.drop('actual_rating', axis=1)
        test_dataframe = test_dataframe.drop('actual_rating', axis=1)

        neigh = KNeighborsRegressor(n_neighbors=10)
        #print('Initialized k nearest neighbors regressor with k =', i)
        neigh.fit(training_dataframe, actual_rating_training_column)
        #print('Fit data models')
        predict_set = neigh.predict(test_dataframe)
        print(predict_set)
        rmse += mean_squared_error([rec[4] for rec in test_set], predict_set) ** 0.5
        print("Fold (%d) finished with accumulated RMSE of (%f) (%s)" % (fold, rmse, time.strftime('%y_%m_%d_%H_%M_%S')))
    return rmse / float(n_folds)
Example #9
def run_kNeighbors(distances, loadings, test_vars,
                   weightings=('uniform',), k_list=(3,)):
    """
    Run k-nearest neighbors using precomputed distances to create an ontological mapping
    
    Args:
        distances: square distance matrix to pass to KNeighborsRegressors
        loadings: loading matrix for training
        test_vars: variable to reconstruct
        weightings: (optional) list of weightings to pass to KNeighbors
        k_list: list of k values to pass to KNeighbors as n_neighbors
    """
    train_distances = distances.loc[loadings.index, loadings.index]
    test_distances = distances.loc[test_vars, loadings.index]
    to_return = pd.DataFrame()
    for weighting in weightings:
        for k in k_list:
            clf = KNeighborsRegressor(metric='precomputed', n_neighbors=k, weights=weighting)
            clf.fit(train_distances, loadings)
            out = clf.predict(test_distances)
            out = pd.DataFrame(out, columns=loadings.columns)
            out['var'] = test_vars
            out['k'] = k
            out['weighting'] = weighting
            # add neighbors and distances
            neighbors = clf.kneighbors(test_distances)
            out['distances'] = tuple(neighbors[0])
            out['neighbors'] = tuple(test_distances.columns[neighbors[1]])
            to_return = pd.concat([to_return, out], sort=False)
    return to_return
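The detail that makes run_kNeighbors work is metric='precomputed': fit takes a square train-to-train distance matrix, and predict takes a rectangular query-to-train matrix. Below is a minimal self-contained sketch of that pattern; the data and names are synthetic, invented purely for illustration.

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.default_rng(0)
points = rng.normal(size=(10, 3))      # 8 training points + 2 query points (synthetic)
targets = rng.normal(size=(8, 2))      # e.g. factor loadings for the 8 training variables

# Pairwise Euclidean distances, computed once up front
dist = np.linalg.norm(points[:, None] - points[None, :], axis=-1)
train_d = dist[:8, :8]                 # square: train vs. train
test_d = dist[8:, :8]                  # rectangular: queries vs. train

clf = KNeighborsRegressor(metric='precomputed', n_neighbors=3)
clf.fit(train_d, targets)              # fit takes the square train-train matrix
pred = clf.predict(test_d)             # predict takes query-train distances
print(pred.shape)                      # (2, 2): one row of predicted loadings per query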
Example #10
def apply_knn():
    regr = KNeighborsRegressor()
    regr.fit(Xtr, Ytr)

    pred = regr.predict(Xte)
    temp = mean_squared_error(Yte, pred)
    return pred, temp
Example #11
def kNN(X_train, y_train, X_test, y_test, uselog=False):
  '''
  Fit a k-nearest-neighbors regressor on scaled training data and predict on the test set.

  :param X_train: training features
  :param y_train: training targets
  :param X_test: test features
  :param y_test: test targets (unused)
  :return: predicted values for X_test
  '''

  scaler = StandardScaler()
  print(X_train.shape)
  print(X_test.shape)

  X = scaler.fit_transform(X_train)
  test = scaler.transform(X_test)

  clf = KNeighborsRegressor(n_neighbors=550)

  clf.fit(X, y_train)

  result = clf.predict(test)

  if uselog:
    result = [math.log(1 + x) for x in result]

  return result
Example #12
    def transform(self, X, y=None):
        """
        :param X: multidimensional numpy array like.
        """
        rows, features = X.shape

        # Rows containing at least one NaN need imputation; fully observed rows are the training pool.
        mask = np.isnan(X).any(axis=1)
        criteria_for_bad = np.where(mask)[0]
        criteria_for_good = np.where(~mask)[0]

        X_bad = X[criteria_for_bad]
        X_good = X[criteria_for_good]

        knn = KNeighborsRegressor(n_neighbors=self.k)

        for idx, x_bad in zip(criteria_for_bad.tolist(), X_bad):
            missing = np.isnan(x_bad)
            bad_dim = np.where(missing)[0]
            good_dim = np.where(~missing)[0]

            for d in bad_dim:
                x = X_good[:, good_dim]
                y = X_good[:, d]
                knn.fit(x, y)

                X[idx, d] = knn.predict(x_bad[good_dim].reshape(1, -1))

        return X
Example #13
 def smooth(self, X, y):
   # KNN algorithm for smooth
   nbrs = KNeighborsRegressor(n_neighbors = 20)
   X = X.reshape(-1, 1)
   nbrs.fit(X, y)
   proba = nbrs.predict(X)
   return proba
Example #14
def k_nearest_neighbours():
    filepath = "bondchanges.arff"
    all_data = arff_read_to_array(filepath)
    X_data = all_data["data"]
    Y_data = all_data["target"]
    # Map each target label prefix to a numeric class id
    Y_data_map = {}
    new_Y_data = np.array([])
    i = 1
    for index, data in enumerate(Y_data):
        data1 = data.split('_')[0]
        split_data = (".").join(data1.split('.')[:1])
        if split_data not in Y_data_map:
            Y_data_map[split_data] = i
            i += 1
        print(split_data)
        new_Y_data = np.append(new_Y_data, [Y_data_map[split_data]], 0) #Create
    # First 90% for training, remaining 10% for testing
    cutoff = int(0.9 * len(X_data))
    X_training = X_data[:cutoff]
    Y_training = new_Y_data[:cutoff]
    print(X_training)
    print()
    print(Y_training)
    X_test = X_data[cutoff:]
    Y_test = new_Y_data[cutoff:]
    #svc = svm.SVC(C=1, kernel='')
    knn = KNeighborsClassifier()
    knnr = KNeighborsRegressor(n_neighbors=20000)
    print(knnr.fit(X_training, Y_training).score(X_test, Y_test))
Example #15
def Round2(X, y):
    # Set parameters
    min_score = {}
    for neigh in [5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:

        model = KNeighborsRegressor(n_neighbors=neigh)
        n = len(y)

        # Perform 5-fold cross validation
        scores = []
        kf = KFold(n, n_folds=5, shuffle=True)

        # Calculate mean absolute deviation for train/test for each fold
        for train_idx, test_idx in kf:
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            # score = model.score(X_test, y_test)
            scores.append(rmse)
        if len(min_score) == 0:
            min_score['neighbor'] = neigh
            min_score['scores'] = scores
        else:
            if np.mean(scores) < np.mean(min_score['scores']):
                min_score['neighbor'] = neigh
                min_score['scores'] = scores
        print("Neighbors:", neigh)
        print(scores)
        print(np.mean(scores))
    return min_score
Example #16
	def __init__(self,dataFrame):
		self.dataFrameKNN = {}
		self.KNNWeightage = {'Avg-High Ratio':100,'Avg-Low Ratio':100,'Deliverable Qty':300,'Turnover':100,'Growth':150,'Trend':100,'Output':100}
		self.valid = True
		self.KNNModelHash = {}
		self.dataFrameKNN = pd.DataFrame()
		self.dataFrameKNN['Avg-High Ratio'] = dataFrame['High Price'][1:] - dataFrame['Average Price'][1:]
		self.dataFrameKNN['Avg-Low Ratio'] = dataFrame['Average Price'][1:] - dataFrame['Low Price'][1:]
		self.dataFrameKNN['Deliverable Qty'] = dataFrame['Deliverable Qty'][1:]
		self.dataFrameKNN['Turnover'] = dataFrame['Turnover in Lacs'][1:]
		self.dataFrameKNN['Growth'] = dataFrame['Close Price'][1:]-dataFrame['Prev Close'][1:]
		self.dataFrameKNN['Trend'] = dataFrame['Turnover in Lacs'][1:]
		self.dataFrameKNN['Output'] = dataFrame['High Price'][1:]-dataFrame['Prev Close'][1:]
		self.KNNModelHash['mean'] = self.dataFrameKNN['Output'].mean()
		self.KNNModelHash['std'] = self.dataFrameKNN['Output'].std()
		for key in self.dataFrameKNN:
			self.normalizeKNNModel(key)
		#trainData has the data to be trained, but the last data is the testData
		trainData =	self.dataFrameKNN[['Avg-High Ratio','Avg-Low Ratio','Deliverable Qty','Growth']][:-1].values
		testData = self.dataFrameKNN[['Avg-High Ratio','Avg-Low Ratio','Deliverable Qty','Growth']][-1:].values
		#trainOutput contains the output corresponding to train Data but the first one is garbage
		trainOutput = self.dataFrameKNN['Output'][1:].values
		KNNModel = KNeighborsRegressor(n_neighbors=3,weights = 'distance')
		KNNModel.fit(trainData[100:400],trainOutput[100:400])
		prediction = KNNModel.predict(trainData[400:450])
		weightage = self.KNNWeightage['Output']
		for i in range(50):
			prediction[i] = ((prediction[i]*self.KNNModelHash['std'])+self.KNNModelHash['mean'])/weightage
			trainOutput[400+i] = ((trainOutput[400+i]*self.KNNModelHash['std'])+self.KNNModelHash['mean'])/weightage
			print("%-40s %-40s " % (prediction[i], trainOutput[400+i]))
Example #17
def calculateKNearestNeighborsModel(data, numberOfNeighbors):
	# Select input variables as x and typecast to numpy array
	x = np.array(data.iloc[0:,0:11])
	# Select output variable (quality) as y and typecast to numpy array
	y = np.array(data.quality)
	neighbors = KNeighborsRegressor(n_neighbors=numberOfNeighbors)
	neighbors.fit(x, y)
	return neighbors
Example #18
    def predictDayType (self,week,day):
        
        knn = KNeighborsRegressor(n_neighbors=5)
        knn.fit(self.rawData, self.dayType)

        X = np.array([[week, day]])   # predict expects a 2D array
        predictions = knn.predict(X)
        return predictions
Example #19
class ModelNNReg(ScikitPredictor):
    '''Nearest neighbor regression'''

    def generate_model(self):
        self.model = KNeighborsRegressor(**self.model_kwargs)

    def fit_model(self, x, y):
        self.model.fit(x, y)
Example #20
def nnVerify_2(city_data,x,y):
    """ Using SKLearn's KNeighborsRegressor """
    X,Y = city_data.data, city_data.target
    clf = KNeighborsRegressor(n_neighbors=2)
    clf.fit(X,Y)
    y_pred = clf.predict(x)
    print("KNeighborsRegressor")
    print("Y pred(KNN) : ", y_pred)
Example #21
def main():
    # read the images
    image_from = io.imread(name_from) / 256
    image_to = io.imread(name_to) / 256

    # change to hsv domain (if requested)
    if args.use_hsv:
        image_from[:] = rgb2hsv(image_from)
        image_to[:] = rgb2hsv(image_to)

    # get shapes
    shape_from = image_from.shape
    shape_to = image_to.shape

    # flatten
    X_from = im2mat(image_from)
    X_to = im2mat(image_to)

    # number of pixels
    n_pixels_from = X_from.shape[0]
    n_pixels_to = X_to.shape[0]

    # subsample
    X_from_ss = X_from[np.random.randint(0, n_pixels_from-1, n_pixels),:]
    X_to_ss = X_to[np.random.randint(0, n_pixels_to-1, n_pixels),:]

    if save_col_distribution:
        import matplotlib.pyplot as plt
        import seaborn as sns
        sns.set_style('white')

        fig, axes = plt.subplots(nrows=2, figsize=(5, 10))
        for ax, X in zip(axes, [X_from_ss, X_to_ss]):
            ax.scatter(X[:,0], X[:,1], color=X)
            if args.use_hsv:
                ax.set_xlabel('hue')
                ax.set_ylabel('value')
            else:
                ax.set_xlabel('red')
                ax.set_ylabel('green')
        axes[0].set_title('distr. from')
        axes[1].set_title('distr. to')
        fig.tight_layout()
        fig.savefig('color_distributions.png')

    # optimal transportation
    ot_color = OptimalTransport(X_to_ss, X_from_ss, lam=lam,
                                    distance_metric=distance_metric)

    # model transfer
    transfer_model = KNeighborsRegressor(n_neighbors=n_neighbors)
    transfer_model.fit(X_to_ss, n_pixels * ot_color.P @ X_from_ss)
    X_transferred = transfer_model.predict(X_to)

    image_transferred = minmax(mat2im(X_transferred, shape_to))
    if args.use_hsv:
        image_transferred[:] = hsv2rgb(image_transferred)
    io.imsave(name_out, image_transferred)
Example #22
class Knn(ContextEngineBase):
    y_Test = np.empty([0])
    # Knn object
    knnRegressor = None

    def __init__(self, numInputs, outputClassifier, inputClassifiers, appFieldsDict):
        ContextEngineBase.__init__(self, numInputs, outputClassifier, inputClassifiers, appFieldsDict)
        # Passed parameters
        self.n_neighbors = appFieldsDict['n_neighbors']
        self.weights = appFieldsDict['weights']
        self.algorithm = appFieldsDict['algorithm']
        self.n_jobs = appFieldsDict['n_jobs']
        # Defining a Knn object with given parameters
        self.knnRegressor = KNeighborsRegressor(n_neighbors = self.n_neighbors, 
                                                weights = self.weights,
                                                algorithm = self.algorithm,
                                                n_jobs = self.n_jobs)

    #  Add a set of training observations, with the newInputObsMatrix being a
    #  matrix of doubles, where the row magnitude must match the number of inputs,
    #  and the column magnitude must match the number of observations.
    #  and newOutputVector being a column vector of doubles
    def addBatchObservations(self, newInputObsMatrix, newOutputVector):
        if(len(newInputObsMatrix.shape) == 2 and newInputObsMatrix.shape[1] == self.numInputs
            and newOutputVector.shape[0] == newInputObsMatrix.shape[0]):
            # print("All good!")
            newOutputVector = newOutputVector.ravel()
            i = 0
            for newInputVector in newInputObsMatrix:
                newOutputValue = newOutputVector[i]
                self.addSingleObservation(newInputVector, newOutputValue)
                i += 1
        else:
            print("Wrong dimensions!")

    #  Train the coefficients on the existing observation matrix if there are
    #  enough observations.
    def train(self):
        if (self.numObservations > 0):
            # print("Training started")
            self.knnRegressor.fit(self.observationMatrix, self.outputVector)
            return True
        else:
            print("Not enough observations to train!")
            return False

    #  Execute the trained matrix against the given input observation
    #  inputObsVector is a row vector of doubles
    def execute(self, inputObsVector):
        if(len(inputObsVector) == self.numInputs):
            # print("Begin execute")
            #x_Test = np.vstack((self.x_Test,inputObsVector))
            x_Test = np.reshape(inputObsVector,(1,self.numInputs))
            self.y_Test = self.knnRegressor.predict(x_Test)
            return self.y_Test[0]
        else:
            print("Wrong dimensions, fail to execute")
            return None
Example #23
def knn_regressor(features, solutions, verbose=0):
    columns = solutions.columns

    clf = KNeighborsRegressor(n_neighbors=5, weights='distance')

    print('Training Model... ')
    clf.fit(features, solutions)
    print('Done Training')
    return (clf, columns)
Example #24
def impute_KNN(df, var, features, k):
    var_imputer = KNeighborsRegressor(n_neighbors=k)
    df_full = df[df[var].notnull()].copy()
    df_null = df[df[var].isnull()].copy()
    var_imputer.fit(df_full[features], df_full[var])
    impute = var_imputer.predict(df_null[features])
    df_null[var] = impute
    df = df_full.append(df_null)
    return df
Example #25
class kNN():
    '''
        kNN regressor
        -------------
    '''

    def __init__(self,N_i,N_o,k=5,n=20):
        # note: N_o=1 assumed for now
        self.N_i = N_i
        self.n = n
        self.i = 0
        self.k = k
        self.X = zeros((self.n,N_i))
        self.y = zeros((self.n))
        self.h = KNeighborsRegressor(n_neighbors=k, weights='distance')
        self.c = 0
        #self.error_rate = 0

    def predict(self,x):
        '''
            Predict
            --------------
        '''

        if self.c < 1.:
            print("[Warning!] No training examples!")
            return 0.0
        elif self.c <= self.k:
            dist,ind = self.h.kneighbors(self.X[0:self.c],n_neighbors=1)
            i_max = argmax(ind)
            return self.y[i_max]

        return self.h.predict(x)#.reshape(1,-1))

#    def samples_X(self):
#        ''' return samples of the WEIGHTS '''
#        if self.c <= 0:
#            return self.X[0,:]
#        return self.X[0:self.c,:]

    def update(self, x, y):
        '''
            Update
            --------------
        '''
        self.X[self.i,:] = x
        self.y[self.i] = y

        #self.error_rate = (y - self.predict(x))**2

        self.i = (self.i + 1) % self.n

        if self.c < self.n:
            self.c = self.c + 1

        self.h.fit(self.X[0:self.c,:], self.y[0:self.c])
Example #26
 def addJKRegionLabels(self):
     data = list(zip(self.data['RA'], self.data['DEC']))
     randoms = list(zip(self.randoms['RA'], self.randoms['DEC']))
     
     finder = KMeans(n_clusters=self.config['n_jackknife'])
     self.data_jk_indices = finder.fit_predict(data)
     
     nbrs = KNeighborsRegressor(n_neighbors=1)
     nbrs.fit(data,self.data_jk_indices)
     self.random_jk_indices = nbrs.predict(randoms)
Example #27
def nearest_neighbors_impute(df, coordinate_columns, data_columns, knr_params={}):
    from sklearn.neighbors import KNeighborsRegressor
    for column in data_columns:
        not_null = df[column].notnull()
        if (~not_null).sum() == 0:
            continue
        knr = KNeighborsRegressor(**knr_params)
        knr.fit(df.loc[not_null,coordinate_columns], df.loc[not_null,[column]])
        predicted = knr.predict(df.loc[~not_null,coordinate_columns])
        df.loc[ (~not_null),[column]] = predicted
Example #28
def compute_mse(regressor, horizon):
    # get wind park and corresponding target. forecast is for the target
    # turbine
    park_id = NREL.park_id['tehachapi']
    windpark = NREL().get_windpark(park_id, 3, 2004, 2005)
    target = windpark.get_target()

    # use power mapping for pattern-label mapping. Feature window length
    # is 3 time steps and time horizon (forecast) is 3 time steps.
    feature_window = 3
    mapping = PowerMapping()
    X = mapping.get_features_park(windpark, feature_window, horizon)
    Y = mapping.get_labels_turbine(target, feature_window, horizon)

    # train roughly for the year 2004.
    train_to = int(math.floor(len(X) * 0.5))

    # test roughly for the year 2005.
    test_to = len(X)

    # train and test only every fifth pattern, for performance.
    train_step, test_step = 5, 5

    if(regressor == 'linear'):
        # fitting the pattern-label pairs
        reg = linear_model.LinearRegression()
        reg = reg.fit(X[0:train_to:train_step], Y[0:train_to:train_step])
        y_hat = reg.predict(X[train_to:test_to:test_step])
    elif(regressor == 'knn'):
        k_neighbors = 10
        reg = KNeighborsRegressor(k_neighbors, 'uniform')
        # fitting the pattern-label pairs
        reg = reg.fit(X[0:train_to:train_step], Y[0:train_to:train_step])
        y_hat = reg.predict(X[train_to:test_to:test_step])
    else:
        raise Exception("No regressor set.")

    # naive is also known as the persistence model.
    naive_hat = zeros(len(y_hat), dtype = float32)
    for i in range(0, len(y_hat)):
        # naive label is the label as horizon time steps before.
        # we have to consider to use only the fifth label here, too.
        naive_hat[i] = Y[train_to + (i * test_step) - horizon]

    # computing the mean squared errors of Linear and naive prediction.
    mse_y_hat, mse_naive_hat = 0, 0
    for i in range(0, len(y_hat)):
        y = Y[train_to + (i * test_step)]
        mse_y_hat += (y_hat[i] - y) ** 2
        mse_naive_hat += (naive_hat[i] - y) ** 2

    mse_y_hat /= float(len(y_hat))
    mse_naive_hat /= float(len(y_hat))

    return mse_y_hat, mse_naive_hat
Example #29
 def knn(X, Y):
   neigh = KNeighborsRegressor()
   neigh.fit(X, Y)
   def explore(x):
     score = -1 * neigh.predict([x])
     return score
   minimized = differential_evolution(explore, ((0, 1), (0, 1), (0, 1), (0, 1), (0, 1)))
   return {
     'X_min': list(minimized.x),
     'score': neigh.score(X, Y)
   }
Example #30
def kNN(X, Y=[], k=2, algorithm="brute", radius=0.65, filename='graph-output.pdf', do_regression=True):
    graph = pydot.Dot(graph_type='digraph')
    knn_model = NearestNeighbors(n_neighbors=k, algorithm=algorithm)
    nbrs = knn_model.fit(X)
    print('-' * 80)
    indices = nbrs.kneighbors(X, 2, return_distance=False)
    radius_indices = nbrs.radius_neighbors(X, radius, return_distance=False)
    k_mapping = zip( X, indices, radius_indices )
    print('-' * 80)

    nodes, misses, hits = {}, [], []
    yp = None  # only populated when do_regression is True
    for i, kmap in enumerate(k_mapping):
        sample = kmap[0]
        nneigh = kmap[1]
        rneigh = kmap[2]
        ypred  = Y[nneigh[1]]
        ytrue  = Y[i]
        nodes[i] = pydot.Node(str(i))
        if ypred != ytrue:
            misses.append(i)
        else:
            hits.append(i)

        for j in nneigh:
            if j not in nodes:
                nodes[j] = pydot.Node(str(j))
            if i == j:
                color = "black"
                if ytrue != ypred: color = "red"
                label = "%s: %s" % ( ypred, ytrue )
                graph.add_edge(pydot.Edge(str(i), str(j), label=label, labelfontcolor="#009933", fontsize="7.0", color=color))
            if i != j:
                color = "black"
                if ypred != Y[j]: color = "red"
                label = "%s: %s" % ( ypred, Y[j] )
                graph.add_edge(pydot.Edge(str(i), str(j), label=label, labelfontcolor="#009933", fontsize="7.0", color=color))
        print("%s : %s... \n\t%s:%s \n\t%s \n\t%s" % (i, sample[:10], ypred, ytrue, nneigh, rneigh))
    print('-' * 80)
    graph.write_pdf(filename)
    print("[%s] misses: %s" % (len(misses), misses))
    print("[%s] hits:   %s" % (len(hits), hits))

    if do_regression:
        neigh = KNeighborsRegressor(n_neighbors=2)
        neigh.fit(X, Y) 
        yp = neigh.predict(X+X*random.random()*.1)
        m = len(yp)
        mse = sum([ (x-y)*(x-y) for (x,y) in zip( Y, yp ) if x-y ]) / float(m)
        print("REGRESSION mean squared error: ", mse)
        print("REGRESSION mse(i)!=0: ", [("%s:" % i, "%.5f" % ((x-y)*(x-y)/float(m))) for (i, x, y) in zip(range(m), Y, yp) if x-y])


    return nbrs, yp
Example #31
 def k_nearest_neigbor(self, input, output):
     model = KNeighborsRegressor(n_neighbors=2, p=1)
     model.fit(input, output)
     return model
Example #32
plt.ylabel('Actual Income')
plt.title('Multiple Linear Regression Results')
plt.show()

lr_coefs = pd.DataFrame({'col':list(features.columns), 'coef':lr.coef_, 'abs_coef':abs(lr.coef_)}).sort_values(by='abs_coef', ascending=False).reset_index(drop=True)
lr_coefs

from sklearn.neighbors import KNeighborsRegressor

k_opt = 0
max_score = 0
scores = []
print('Iteration: k = ', end='')
for k in range(1,21):
    print('{}, '.format(k), end='', flush=True)
    knr = KNeighborsRegressor(n_neighbors=k, weights='distance')
    knr.fit(train_features, train_targets)
    score = knr.score(test_features, test_targets)
    scores.append(score)
    if abs(score) > abs(max_score):
        k_opt = k
        max_score = score

fig, ax = plt.subplots(figsize=(10,7))
plt.plot(range(1,21), scores)
plt.xlabel('n_neighbors (k)')
plt.ylabel('R2 Score')
plt.title('K Neighbors Regressor')
ax.annotate('k={}, R2={}'.format(k_opt,max_score), (k_opt, scores[k_opt-1]))
plt.show()
Example #33
# In[11]:

#Decision Tree Regressor
reg = DecisionTreeRegressor().fit(X_train, y_train)
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
tr_err = mean_squared_error(y_train_pred, y_train)
ts_err = mean_squared_error(y_test_pred, y_test)
print(tr_err, ts_err)


# In[12]:

#KNN
reg = KNeighborsRegressor()
params = {'kneighborsregressor__n_neighbors':[1, 5, 10, 20, 25]}
pipe = make_pipeline(reg)
grid = GridSearchCV(pipe, param_grid = params, scoring='mean_squared_error', n_jobs=-1, iid=False, cv=5)
reg = grid.fit(X_train, y_train)
print('Best MSE: ', grid.best_score_)
print('Best Parameters: ', grid.best_estimator_)

y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
tr_err = mean_squared_error(y_train_pred, y_train)
ts_err = mean_squared_error(y_test_pred, y_test)
print(tr_err, ts_err)


# In[13]:
Example #34
# read in csv files
X_train = pd.read_csv('train.csv')
y_train = pd.read_csv('y_train.csv', names=['price'])

X_test = pd.read_csv('test.csv')
y_test = pd.read_csv('y_test.csv', names=['price'])

# some exploratory data analyses

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

#print(X_train.info())
#print(X_train.describe())
#print(X_train.head())

# preprocess data
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)

# fit Regressor to training data
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(X_train, y_train)

# Make predictions
y_pred = knn_reg.predict(X_test)

# compute metrics

regression_results(y_test, y_pred)
Example #35
def stack_NN_model(X,
                   y,
                   nn_obj,
                   n,
                   mod,
                   cb=cb,
                   verbose=True,
                   crit="mse",
                   **kwargs):
    X = np.copy(X)
    y = np.copy(y)

    permute_indices = np.random.permutation(np.arange(len(y)))
    X = X[permute_indices]
    y = y[permute_indices]

    X = nn_obj.predict(X)

    #    nn_obj.split_and_scale(X, y, scaler="Standard")
    xtr, xte, ytr, yte = train_test_split(X, y, random_state=0)

    if nn_obj.err_type == "AVL": crit = "mae"
    else: crit = "mse"

    if mod == "RF":
        try:
            model = RandomForestRegressor(n_estimators=n, max_depth=kwargs["max_depth"],\
                                          criterion=crit)
        except KeyError:
            model = RandomForestRegressor(n_estimators=n, criterion=crit)

    else:
        model = KNeighborsRegressor(n_neighbors=n)

    model = model.fit(xtr, ytr.ravel())
    print("{} with {}: Score = {}".format(mod, n,
                                          model.score(xte, yte.ravel())))

    dev_lab = "{}_augmented_Pred_lr_{}_{}_{}_{}_Maxepoch_{}"\
            .format(mod, nn_obj.lr, nn_obj.train_mod, nn_obj.activation,\
                    nn_obj.scaler, nn_obj.max_epoch)

    test_line = range(len(xte))
    mod_test_preds = model.predict(xte)

    deviation = np.array([abs(mod_test_preds[j] - yte[j]) for j in test_line])
    error_estimation = sum(deviation)

    if plt.fignum_exists("Stacking comparison on the test set"):
        plt.figure("Stacking comparison on the test set")
        plt.plot(test_line, mod_test_preds, label=dev_lab, marker='+',\
                    fillstyle='none', linestyle='none', c=nn_obj.kwargs["color"])

    else:
        plt.figure("Stacking comparison on the test set")
        plt.plot(test_line, yte, label="Expected value", marker='o', fillstyle='none',\
                    linestyle='none', c='k')
        plt.plot(test_line, mod_test_preds, label=dev_lab, marker='+',\
                    fillstyle='none', linestyle='none', c=nn_obj.kwargs["color"])

    plt.legend(loc="best", prop={'size': 7})

    if plt.fignum_exists("Stacking Deviation of the prediction"):
        plt.figure("Stacking Deviation of the prediction")
        plt.plot(yte, mod_test_preds, c=nn_obj.kwargs["color"], marker='o',\
                 linestyle='none', label=dev_lab, ms=3)

    else:
        plt.figure("Stacking Deviation of the prediction")
        plt.plot(yte, yte, c='navy', marker='+', label="wanted value")
        plt.plot(yte, mod_test_preds, c=nn_obj.kwargs["color"], marker='o',\
                      linestyle='none', label=dev_lab, ms=3)

    plt.legend(loc="best", prop={'size': 7})

    print("Result after stacking NN_%s with estimators or neighbors = %d" %
          (mod, n))
    #    print("Model for H_NL = {}, H_NN = {} \n".format(len(N_.keys())-2, N_["N1"]))
    print("Activation function: {}\n Loss function: {}\n\
    Optimization method: {}".format(nn_obj.activation, nn_obj.err_type,
                                         nn_obj.train_mod))
    print("Mean of the summed deviations on the test set = {}\n".format(
        error_estimation))

    plt.show()

    return model
Example #36
    best_radius = radii[np.argmin(mae_rnn)]

    fig, ax = plt.subplots()
    ax.set_title('Parameter evaluation for RNN')
    ax.set_xlabel('Radius')
    ax.set_ylabel('Mean absolute error')
    ax.set_xlim(low, high)
    ax.set_xticks(list(ax.get_xticks()) + [best_radius])
    ax.plot(radii, mae_rnn, c='orange', linewidth=2)
    fig.savefig('rnn_param.png')

    return best_radius


knn_regressor = KNeighborsRegressor(n_neighbors=get_best_knn_n_neighbors(
    1, 100),
                                    weights='distance')
knn_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

rnn_regressor = RadiusNeighborsRegressor(radius=get_best_rnn_radius(
    1.7, 3.0, 0.05),
                                         weights='distance')
rnn_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

lr_regressor = LinearRegression()
lr_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

energia_knn = knn_regressor.predict(test_df[['temperatura', 'vacuo']])
energia_rnn = rnn_regressor.predict(test_df[['temperatura', 'vacuo']])
energia_lr = lr_regressor.predict(test_df[['temperatura', 'vacuo']])
Example #37
# alg evaluation metrics
# Practice using the Accuracy and LogLoss metrics on a classification problem.
# Practice generating a confusion matrix and a classification report
# Practice using RMSE and RSquared metrics on a regression problem.
scoring = 'neg_log_loss'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))

# ˆ Spot-check linear algorithms on a dataset (e.g. linear regression, logistic regression and
# linear discriminate analysis).
# Spot-check some nonlinear algorithms on a dataset (e.g. KNN, SVM and CART).
# ˆ Spot-check some sophisticated ensemble algorithms on a dataset (e.g. random forest and
# stochastic gradient boosting).
# KNN Regression
model = KNeighborsRegressor()
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("nonlinear regression check " +
      str(results.mean()))  # Spot-Check a Nonlinear Regression Algorithm.

# how to check more models at once
# prepare models
models = []
models.append(('LR', LogisticRegression()))  # ??
models.append(('LDA', LinearDiscriminantAnalysis()))  # ??
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
Example #38
                           nan_flag=[],
                           zero=[
                               'crim', 'zn', 'nox', 'indus', 'rm', 'age',
                               'tax', 'ptratio', 'b', 'dis'
                           ])
     },
     'score': 5.5088106991425985,
     'std': 0.293662905734789
 },
 'KNeighborsRegressor': {
     'params': {
         'predictor':
         KNeighborsRegressor(algorithm='auto',
                             leaf_size=30,
                             metric='minkowski',
                             metric_params=None,
                             n_jobs=1,
                             n_neighbors=7,
                             p=2,
                             weights='uniform'),
         'scaler':
         RobustScaler(copy=True,
                      quantile_range=(25.0, 75.0),
                      with_centering=True,
                      with_scaling=True),
         'simple_imputer':
         FillNaTransformer(from_dict={},
                           mean=[
                               'crim', 'zn', 'nox', 'indus', 'rm', 'age',
                               'tax', 'ptratio', 'b', 'dis'
                           ],
                           median=[],
Example #39
    def train(self):
        if self.config['method'] == 'regression':
            print('Building regression model')
            print('Fetching data')
            self.get_df_reg()
            print('Data Fetched')
            print('Splitting data')
            df_x = self.df_reg.iloc[:, 3:]
            df_y = self.df_reg.iloc[:, 1]
            x_train, x_test, y_train, y_test = train_test_split(df_x,
                                                                df_y,
                                                                test_size=0.2,
                                                                random_state=1)
            print('Data split')
            print('Size of x_train', x_train.shape)
            print('Size of y_train', y_train.shape)
            print('Size of x_test', x_test.shape)
            print('Size of y_test', y_test.shape)

            if self.config['model'] == 'svr':
                print('Support vector regressor')
                model = SVR(kernel=self.config['svr_kernel'])
            if self.config['model'] == 'knr':
                print('K-nearest neighbors regressor')
                model = KNeighborsRegressor(n_jobs=12)
            if self.config['model'] == 'dtr':
                print('Decision tree regressor')
                model = DecisionTreeRegressor()
            if self.config['model'] == 'rf':
                print('Random forest regressor')
                model = RandomForestRegressor(n_jobs=12)
            if self.config['model'] == 'et':
                print('Extra trees regressor')
                model = ExtraTreesRegressor(n_jobs=12)
            if self.config['model'] == 'gbr':
                print('Gradient boosting regressor')
                model = GradientBoostingRegressor()

            try:
                model
            except BaseException:
                print('Invalid model configuration. Check config.ini')
                return

            model.fit(x_train, y_train)
            pred = pd.Series(model.predict(df_x))
            self.df_reg.insert(2, 'Predicted_current', pred)
            print('R^2 score', model.score(x_test, y_test))

            print('Converting to binary classification')
            y_test_list, y_pred_list, _, _ = self.to_bin_cl(
                x_test, y_test, model)
            _, _, bin_y, bin_y_pred = self.to_bin_cl(df_x, df_y, model)
            conf_mat = confusion_matrix(y_true=y_test_list, y_pred=y_pred_list)
            print('Converted to binary classification')

            self.df_reg.insert(3, 'Actual_class', bin_y)
            self.df_reg.insert(4, 'Predicted_class', bin_y_pred)

            print('Confusion matrix:\n', conf_mat)
            p = conf_mat[0][0] / (conf_mat[0][0] + conf_mat[1][0])
            r = conf_mat[0][0] / (conf_mat[0][0] + conf_mat[0][1])
            print(
                'Accuracy is',
                np.sum(np.array(y_test_list) == y_pred_list) /
                len(y_pred_list))
            print('Precision is', p)
            print('Recall is', r)
            print('F1-score is', self.get_f_score(p, r, 1))
            print('F0.5-score is', self.get_f_score(p, r, 0.5))
            print('F2-score is', self.get_f_score(p, r, 2))

            # joblib.dump(model,'models/'+self.config['model']+'.model')
            self.save_result()
Example #40
 def fit(self, X_train, y_train, X_val, y_val):
     model = KNeighborsRegressor(n_neighbors=35)
     model.fit(X_train, y_train)
     self.model = model
Example #41
corr_matrix = df.corr()
corr_matrix['MEDV']

sns.heatmap(corr_matrix);
plt.show()

print(boston['DESCR'])

dat1 = df.loc[:, ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']]

X_train, X_test, y_train, y_test = train_test_split(dat1, target, test_size = 0.2, random_state=42)
y_train = y_train.values.ravel()

models = []
models.append(('SVR', SVR()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('DT', DecisionTreeRegressor()))
models.append(('RF', RandomForestRegressor()))
models.append(('l', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('R', Ridge()))
models.append(('BR', BayesianRidge()))
models.append(('GBR', GradientBoostingRegressor()))
models.append(('AB', AdaBoostRegressor()))
models.append(('ET', ExtraTreesRegressor()))
models.append(('BgR', BaggingRegressor()))

scoring = 'neg_mean_squared_error'

results = []
names = []
Example #42
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.354881802745745
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=92),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001,
                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=100,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber",
                                             penalty="elasticnet",
Example #43
def _select_estimator(estimator, n_jobs, n_estimators, random_state=None):
    '''Select estimator and parameters from argument name.'''
    # Regressors
    if estimator == 'RandomForestRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = RandomForestRegressor(n_jobs=n_jobs,
                                          n_estimators=n_estimators,
                                          random_state=random_state)
    elif estimator == 'ExtraTreesRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = ExtraTreesRegressor(n_jobs=n_jobs,
                                        n_estimators=n_estimators,
                                        random_state=random_state)
    elif estimator == 'GradientBoostingRegressor':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingRegressor(n_estimators=n_estimators,
                                              random_state=random_state)
    elif estimator == 'SVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='rbf')
    elif estimator == 'LinearSVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='linear')  # note: SVR with a linear kernel, not sklearn's LinearSVR
    elif estimator == 'Ridge':
        param_dist = parameters['linear']
        estimator = Ridge(solver='auto', random_state=random_state)
    elif estimator == 'Lasso':
        param_dist = parameters['linear']
        estimator = Lasso(random_state=random_state)
    elif estimator == 'ElasticNet':
        param_dist = parameters['linear']
        estimator = ElasticNet(random_state=random_state)
    elif estimator == 'KNeighborsRegressor':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsRegressor(algorithm='auto')

    # Classifiers
    elif estimator == 'RandomForestClassifier':
        param_dist = {
            **parameters['ensemble'],
            **parameters['bootstrap'],
            **parameters['criterion']
        }
        estimator = RandomForestClassifier(n_jobs=n_jobs,
                                           n_estimators=n_estimators,
                                           random_state=random_state)
    elif estimator == 'ExtraTreesClassifier':
        param_dist = {
            **parameters['ensemble'],
            **parameters['bootstrap'],
            **parameters['criterion']
        }
        estimator = ExtraTreesClassifier(n_jobs=n_jobs,
                                         n_estimators=n_estimators,
                                         random_state=random_state)
    elif estimator == 'GradientBoostingClassifier':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingClassifier(n_estimators=n_estimators,
                                               random_state=random_state)
    elif estimator == 'LinearSVC':
        param_dist = parameters['linear_svm']
        estimator = LinearSVC(random_state=random_state)
    elif estimator == 'SVC':
        param_dist = parameters['svm']
        estimator = SVC(kernel='rbf', random_state=random_state)
    elif estimator == 'KNeighborsClassifier':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsClassifier(algorithm='auto')

    return param_dist, estimator
Example #44
for i in range(0, 32):
    data['file_name'] = data['file_name'].replace('File_' + str(i), i)

data.drop('time', 1, inplace=True)

#=================================================================
# using cross val predict function

Feature = data[['week', 'day_of_week', 'start_time', 'work_flow', 'file_name']]
Result = data['size']

#=========using the best parameter we found

# using best parameter, RMSE is: 0.0129735729554

knn = KNeighborsRegressor(n_neighbors=2, p=1, weights='distance')

knn.fit(Feature, Result)

predicted_target = cross_val_predict(knn, Feature, Result, cv=10)

print('using best parameter, RMSE is: ')
print(sp.sqrt(mean_squared_error(predicted_target, Result)))

fig, ax = plt.subplots()
ax.scatter(range(0, len(Result)), Result, c='b', s=8, label='true value')
ax.scatter(range(0, len(predicted_target)),
           predicted_target,
           c='r',
           s=8,
           label='fitted value')
Example #45
    row.DecisionTreeMSE = metrics.mean_squared_error(y_test, y_pred)

    ## Random Forest Tree
    regr = RandomForestRegressor(max_depth=2)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    row.RandomForestMSE = metrics.mean_squared_error(y_test, y_pred)

    ### Boosting
    params = {'n_estimators': 100, 'max_depth': 2}
    clf = GradientBoostingRegressor(**params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    row.BoostingMSE = metrics.mean_squared_error(y_test, y_pred)

    ### KNN
    neigh = KNeighborsRegressor(n_neighbors=3)
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    row.KNeighbourMSE = metrics.mean_squared_error(y_test, y_pred)

    ### SVR
    svr = SVR(gamma='auto')
    svr = svr.fit(X_train, y_train.values.ravel())
    y_pred = svr.predict(X_test)
    row.SVR_MSE = metrics.mean_squared_error(y_test, y_pred)

    result = result.append(row.toDict(), ignore_index=True)

result

# %% [markdown]
Example #46
# MAGIC %md Read the dataset using the `fetch_california_housing` function and then split it into train and test using the `train_test_split` function. 

# COMMAND ----------

dataset = fetch_california_housing()
X_full, y_full = dataset.data, dataset.target
X_train, X_test, y_train, y_test=train_test_split(X_full,y_full,test_size=0.2, random_state=20)

# COMMAND ----------

# MAGIC %md Here we use a k-nearest neighbors regressor as part of a pipeline that includes scaling, and for the purposes of comparison, a knn regressor trained on the unscaled data has been provided in the following code cell. 

# COMMAND ----------

steps=[('scaler', StandardScaler()),
       ('knn',    KNeighborsRegressor())]

pipeline=Pipeline(steps)

# COMMAND ----------

# MAGIC %md Fit the pipeline using `X_train` as training data and `y_train` as target values, and pass the computed parameters to an object `knn_scaled`. Also, fit a knn regressor using unscaled training data and pass the computed parameters to the object `knn_unscaled`.

# COMMAND ----------

knn_scaled = pipeline.fit(X_train, y_train)
knn_unscaled = KNeighborsRegressor().fit(X_train, y_train)

# COMMAND ----------

# MAGIC %md Compute and print metrics.
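
# COMMAND ----------

# MAGIC %md The metrics step itself is not shown in this excerpt; the cell below is a minimal sketch, assuming the `knn_scaled`, `knn_unscaled`, `X_test`, and `y_test` objects defined above.

# COMMAND ----------

print('R^2 with scaling:    {:.3f}'.format(knn_scaled.score(X_test, y_test)))
print('R^2 without scaling: {:.3f}'.format(knn_unscaled.score(X_test, y_test)))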
Example #47
n_faces = 5
rng = check_random_state(4)
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]

n_pixels = data.shape[1]
X_train = train[:, :int(np.ceil(0.5 * n_pixels))]  # Upper half of the faces
y_train = train[:, int(np.floor(0.5 * n_pixels)):]  # Lower half of the faces
X_test = test[:, :int(np.ceil(0.5 * n_pixels))]
y_test = test[:, int(np.floor(0.5 * n_pixels)):]

# Fit estimators
ESTIMATORS = {
    "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
                                       random_state=0),
    "K-nn": KNeighborsRegressor(),
    "Linear regression": LinearRegression(),
    "Ridge": RidgeCV(),
}

y_test_predict = dict()
for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    y_test_predict[name] = estimator.predict(X_test)

# Plot the completed faces
image_shape = (64, 64)

n_cols = 1 + len(ESTIMATORS)
plt.figure(figsize=(2. * n_cols, 2.26 * n_faces))
plt.suptitle("Face completion with multi-output estimators", size=16)
Example #48
def nnr(datapath):
    # load mat
    datafile = os.path.join(datapath, 'data_numpy.mat')
    if os.path.exists(datafile) is False:
        print('Data file %s not found.' % datafile)

    data_numpy = sio.loadmat(datafile)
    # get training and test data
    train_x_raw = data_numpy['trainX_raw']
    train_x_smooth = data_numpy['trainX_smooth']
    train_y = data_numpy['trainY']
    test_x_raw = data_numpy['testX_raw']
    test_x_smooth = data_numpy['testX_smooth']
    test_y = data_numpy['testY']
    base_y = data_numpy['baseY']

    train_y = train_y.ravel()

    t_start = time.perf_counter()
    x_fft = np.fft.fft(train_x_raw)
    raw_fft_time = time.perf_counter() - t_start
    train_x_raw_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)), axis=1)
    x_fft = np.fft.fft(test_x_raw)
    test_x_raw_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)), axis=1)

    t_start = time.perf_counter()
    x_fft = np.fft.fft(train_x_smooth)
    smooth_fft_time = time.perf_counter() - t_start
    train_x_smooth_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)),
                                        axis=1)
    x_fft = np.fft.fft(test_x_smooth)
    test_x_smooth_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)),
                                       axis=1)

    # NNR on raw data stream
    neighbor_num = 10
    nnr_raw = KNeighborsRegressor(n_neighbors=neighbor_num, weights='distance')
    t_start = time.perf_counter()
    nnr_raw.fit(train_x_raw, train_y)
    nnr_raw_time = time.perf_counter() - t_start
    pred_y = nnr_raw.predict(test_x_raw)
    np.savetxt(os.path.join(datapath, 'nnr_raw.txt'), pred_y)

    nnr_raw_fft = KNeighborsRegressor(n_neighbors=neighbor_num,
                                      weights='distance')
    t_start = time.perf_counter()
    nnr_raw_fft.fit(train_x_raw_fft, train_y)
    nnr_raw_fft_time = time.perf_counter() - t_start
    pred_y = nnr_raw_fft.predict(test_x_raw_fft)
    np.savetxt(os.path.join(datapath, 'nnr_raw_fft.txt'), pred_y)

    nnr_smooth = KNeighborsRegressor(n_neighbors=neighbor_num,
                                     weights='distance')
    t_start = time.perf_counter()
    nnr_smooth.fit(train_x_smooth, train_y)
    nnr_smooth_time = time.perf_counter() - t_start
    pred_y = nnr_smooth.predict(test_x_smooth)
    np.savetxt(os.path.join(datapath, 'nnr_smooth.txt'), pred_y)

    nnr_smooth_fft = KNeighborsRegressor(n_neighbors=neighbor_num,
                                         weights='distance')
    t_start = time.perf_counter()
    nnr_smooth_fft.fit(train_x_smooth_fft, train_y)
    nnr_smooth_fft_time = time.perf_counter() - t_start
    pred_y = nnr_smooth_fft.predict(test_x_smooth_fft)
    np.savetxt(os.path.join(datapath, 'nnr_smooth_fft.txt'), pred_y)

    f_time = open(os.path.join(datapath, 'nnr_time.txt'), 'w')
    f_time.write(str(raw_fft_time) + '\n')
    f_time.write(str(smooth_fft_time) + '\n')
    f_time.write(str(nnr_raw_time) + '\n')
    f_time.write(str(nnr_raw_fft_time) + '\n')
    f_time.write(str(nnr_smooth_time) + '\n')
    f_time.write(str(nnr_smooth_fft_time) + '\n')
    f_time.close()
Example #49
                #prepare ypred for writing out to a file
                #yprep_pd = pd.DataFrame(ypred)
                       
                #ypred_pd.columns = gg
                #ypred_pd.index = test_ids
                #ypred_frame_svr = pd.concat([ypred_frame_svr, ypred_pd], axis=1, sort=True)
                       
                       
                  
            #K Nearest Neighbour
            gnlist = list(knn_grid['Gene_Name'])
            f = gene_name in gnlist
            if f != False: #Just to be sure that the gene exist in the KNN best grid dataframe
                k = knn_grid[knn_grid['Gene_Name']==gene_name].iloc[0,3]
                weight = knn_grid[knn_grid['Gene_Name']==gene_name].iloc[0,4]
                knn = KNeighborsRegressor(n_neighbors=k, weights = weight)
                knn.fit(cis_gt, adj_exp.ravel())
                ypred = knn.predict(test_cis_gt)

                #write out ypred quickly
                open(output+trn_pop+"_2_"+tst_pop.upper()+"_chr"+chrom+"_chunk"+chunk+"_knn.txt", "a").write("\n")
                open(output+trn_pop+"_2_"+tst_pop.upper()+"_chr"+chrom+"_chunk"+chunk+"_knn.txt", "a").write(str(gene))
                for j in range(len(ypred)):
                     open(output+trn_pop+"_2_"+tst_pop.upper()+"_chr"+chrom+"_chunk"+chunk+
                          "_knn.txt", "a").write("\t"+str(ypred[j]))
                       
                #prepare ypred for writing out to a file
                #yprep_pd = pd.DataFrame(ypred)
                       
                #ypred_pd.columns = gg
                #ypred_pd.index = test_ids
Example #50
                                                  "Dataset3/4/(b)/Neural",
                                                  verbose=False)
            if RMSE_test_cur < RMSE_test_best:
                RMSE_test_best = RMSE_test_cur
                a_best = a
                h_best = h
    print()
    model_neural = MLPRegressor((h_best, ), activation=a_best)
    print_performance_log(model_neural, X_1b, y, "Dataset3/4/(b)/Neural")
    print("Best activation: {}".format(a_best))
    print("Best number of hidden layers: {}".format(h_best))

    neighbors = range(1, 51)
    n_best = None
    RMSE_test_best = float('inf')
    for i, n in enumerate(neighbors):
        progressBar(i + 1, len(neighbors))
        model_knn = KNeighborsRegressor(n_neighbors=n)
        RMSE_test_cur = print_performance(model_knn,
                                          X_1b,
                                          y,
                                          "Dataset3/4/(b)/KNN",
                                          verbose=False)
        if RMSE_test_cur < RMSE_test_best:
            RMSE_test_best = RMSE_test_cur
            n_best = n
    print()
    model_knn = KNeighborsRegressor(n_neighbors=n_best)
    print_performance(model_knn, X_1b, y, "Dataset3/4/(b)/KNN")
    print("Best number of neighbors: {}".format(n_best))
Example #51
0
        'neighbourhood_group', 'neighbourhood', 'room_type',
        'availability_365', 'vacancy'
    ],
                     axis=1)
    df = df[np.abs(df.price - df.price.mean()) <= (3 * df.price.std())]
    x = df.drop('price', axis=1)
    y = df.price
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=10)

    #%% applying regression models
    # define models
    lasso = Lasso(alpha=0.5)
    knn1 = KNeighborsRegressor()
    lr = LinearRegression()
    svr = SVR(kernel='linear', gamma='auto')
    # fitting models
    lasso.fit(x_train, y_train)
    svr.fit(x_train, y_train)
    lr.fit(x_train, y_train)
    knn1.fit(x_train, y_train)

    #%% model evaluation
    mae_lr = mean_absolute_error(y_train, lr.predict(x_train))
    mae_knn1 = mean_absolute_error(y_train, knn1.predict(x_train))
    mae_svr = mean_absolute_error(y_train, svr.predict(x_train))
    mae_lasso = mean_absolute_error(y_train, lasso.predict(x_train))
    print("training MAE =", mae_lr, mae_lasso, mae_svr, mae_knn1)
    #%% on testing data
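    # The example breaks off here; a minimal sketch of how the test-set
    # evaluation presumably continues (it assumes the four models fitted
    # above and the x_test/y_test split, and is not the original code):
    mae_lr_test = mean_absolute_error(y_test, lr.predict(x_test))
    mae_knn1_test = mean_absolute_error(y_test, knn1.predict(x_test))
    mae_svr_test = mean_absolute_error(y_test, svr.predict(x_test))
    mae_lasso_test = mean_absolute_error(y_test, lasso.predict(x_test))
    print("testing MAE =", mae_lr_test, mae_lasso_test, mae_svr_test,
          mae_knn1_test)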
Example #52
0
# z = np.polyfit(x, y, deg=3)
# p = np.poly1d(z)

# ## Plot
# xp = np.linspace(-2, 6, 100)
# plt.figure(figsize=(6.5,4))
# plt.plot(x,y,'o',label='data')
# plt.plot(xp, p(xp),label='polyfit')
# plt.show()

#X = [[0], [1], [2], [3]]
#y = [0, 0, 1, 1]
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt

x = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
y = np.array([0.0, 0.8, 0.9, 0.1, -0.8, -1.0])
neigh = KNeighborsRegressor(n_neighbors=3)
neigh.fit(x, y)

print(neigh.predict([[3.5]]))

## Plot
xp = np.linspace(-2, 6, 100).reshape(-1, 1)
plt.figure(figsize=(6.5, 4))
plt.plot(x, y, 'o', label='data')
plt.plot(xp, neigh.predict(xp), label='nearest neighbor')
#plt.plot(xp, p(xp),label='polyfit')
plt.show()
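# A small variation worth noting (not in the original example): with
# weights='distance' the regressor interpolates the training points exactly,
# which visibly changes the fitted curve.
neigh_dist = KNeighborsRegressor(n_neighbors=3, weights='distance')
neigh_dist.fit(x, y)
print(neigh_dist.predict([[3.5]]))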
    # Set the clf to the best combination of parameters
    SVM_model_best = Random_obj.best_estimator_
    
    # Fit the best algorithm to the data. 
    SVM_model_best.fit(X_train, Y_train)
    SVM_model_score = cross_val_score(estimator = SVM_model_best, X = X_train, y = Y_train, cv = 10,
                                        scoring='neg_mean_squared_error')
    SVM_model_score = (np.sqrt(np.abs(SVM_model_score)))
    
    SVM_model_score_mean = SVM_model_score.mean()
    SVM_model_score_std  = SVM_model_score.std()
  

  # 6. KNN
    KNN_model=KNeighborsRegressor() 
    KNN_model.fit(X_train, Y_train)
    KNN_model_score = cross_val_score(estimator = KNN_model, X = X_train, y = Y_train, cv = 10,
                                        scoring='neg_mean_squared_error')
    KNN_model_score = (np.sqrt(np.abs(KNN_model_score)))
    
    KNN_model_score_mean = KNN_model_score.mean()
    KNN_model_score_std  = KNN_model_score.std()



    # Choose some parameter combinations to try
    parameters = {'n_neighbors': np.arange(1, 31, 1),
                  'metric': ["minkowski"]}
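    # The example truncates before the search itself runs; a minimal sketch of
    # the likely continuation, reusing the grid above (GridSearchCV and the
    # scoring choice are assumptions mirroring the earlier cross-validation):
    from sklearn.model_selection import GridSearchCV
    KNN_search = GridSearchCV(KNeighborsRegressor(), parameters, cv=10,
                              scoring='neg_mean_squared_error')
    KNN_search.fit(X_train, Y_train)
    KNN_model_best = KNN_search.best_estimator_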
    
Example #54
0
    n_estimators=500,
    random_state=12,
    **{
        'max_depth': 5,
        'num_leaves': 60,
        'feature_fraction': 0.8,  # numeric values, not the strings '0.8'/'0.92'
        'bagging_fraction': 0.92
    })
# level0_models['KNN_rougher_a'] = make_pipeline(scaler,KNeighborsRegressor(n_jobs = -1,**{'n_neighbors': 254, 'weights': 'distance', 'leaf_size': 16}))
scaler = make_pipeline(QuantileTransformer(output_distribution='normal'),
                       PCA(whiten=True))
level0_models_rougher['KNN_rougher_b'] = make_pipeline(
    scaler,
    KNeighborsRegressor(n_jobs=-1,
                        **{
                            'n_neighbors': 50,
                            'weights': 'distance',
                            'leaf_size': 18
                        }))
level0_models_rougher['KNN_rougher_c'] = make_pipeline(
    scaler,
    KNeighborsRegressor(n_jobs=-1,
                        **{
                            'n_neighbors': 15,
                            'weights': 'distance',
                            'leaf_size': 30  # leaf_size must be an int
                        }))
level0_models_rougher['KNN_rougher_d'] = make_pipeline(
    scaler,
    KNeighborsRegressor(n_jobs=-1,
                        **{
                            'n_neighbors': 5,
Example #55
0
from sklearn.neighbors import KNeighborsRegressor

from src.config import PP_DICT, FULL_DATA_DICT
from src.model.utils import train_test_model

params = {'n_neighbors': 46, 'p': 1, 'weights': 'distance'}
pp_dict = PP_DICT
data_dict = FULL_DATA_DICT

pipeline, m_err, r2 = train_test_model(KNeighborsRegressor(),
                                       params,
                                       data_dict,
                                       pp_dict,
                                       save_model=True)
# same result with .iloc and .loc
dc_listings = dc_listings.iloc[numpy.random.permutation(len(dc_listings))]
split_one = dc_listings[0:1862]
split_two = dc_listings[1862:]

## 2. Holdout Validation ##

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

train_one = split_one
test_one = split_two
train_two = split_two
test_two = split_one

knn = KNeighborsRegressor(n_neighbors=5, algorithm='auto')

## here X must be of shape [n_samples, n_features]
## train_one['accommodates'] is a Series with shape (1862,), which sklearn rejects
## train_one[['accommodates']] is a DataFrame with the correct shape (1862, 1)
knn.fit(train_one[['accommodates']], train_one['price'])
prediction = knn.predict(test_one[['accommodates']])
iteration_one_mse = mean_squared_error(prediction, test_one['price'])
iteration_one_rmse = iteration_one_mse**(1 / 2)

knn.fit(train_two[['accommodates']], train_two['price'])
prediction = knn.predict(test_two[['accommodates']])
iteration_two_mse = mean_squared_error(prediction, test_two['price'])
iteration_two_rmse = iteration_two_mse**(1 / 2)

avg_rmse = numpy.mean([iteration_one_rmse, iteration_two_rmse])
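# Holdout validation generalizes naturally to k-fold cross-validation; a
# minimal sketch of the same evaluation with cross_val_score (the import and
# fold count are assumptions, not part of the original example):
from sklearn.model_selection import cross_val_score

knn = KNeighborsRegressor(n_neighbors=5)
mses = cross_val_score(knn, dc_listings[['accommodates']],
                       dc_listings['price'],
                       scoring='neg_mean_squared_error', cv=5)
avg_rmse_cv = numpy.mean(numpy.sqrt(numpy.abs(mses)))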
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'

# Spot - Check Algorithm
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # shuffle is required when random_state is set
    cv_results = cross_val_score(model,
                                 X_train,
                                 Y_train,
                                 cv=kfold,
                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
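    # A sketch of the customary continuation of this spot-check pattern
    # (the original listing truncates before reporting):
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))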
Example #58
0
best_leaf_size = gs.best_params_['leaf_size']
best_weights = gs.best_params_['weights']
best_p = gs.best_params_['p']

outF = open("output.txt", "w")
print('best_algorithm = ', best_algorithm, file=outF)
print('best_n_neighbors = ', best_n_neighbors, file=outF)
print('best_leaf_size = ', best_leaf_size, file=outF)
print('best_weights = ', best_weights, file=outF)
print('best_p = ', best_p, file=outF)
print('R2 score is {}'.format(test_score_r2), file=outF)
outF.close()

kn = KNeighborsRegressor(n_neighbors=best_n_neighbors,
                         algorithm=best_algorithm,
                         leaf_size=best_leaf_size,
                         weights=best_weights,
                         p=best_p)

t0 = time.time()
kn.fit(x_train, y_train.ravel())
kn_fit = time.time() - t0
print("kNN complexity and bandwidth selected and model fitted in %.6f s" %
      kn_fit)

t0 = time.time()
y_kn = kn.predict(x_test)
kn_predict = time.time() - t0
print("kNN prediction for %d inputs in %.6f s" % (x_test.shape[0], kn_predict))

# open a file to append
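# A minimal sketch of what the append step presumably does (the file name and
# format are assumptions; the original example truncates here):
with open("output.txt", "a") as outF:
    print('kn_fit = %.6f s' % kn_fit, file=outF)
    print('kn_predict = %.6f s' % kn_predict, file=outF)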
Example #59
0
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]

n_pixels = data.shape[1]
X_train = train[:, :int(np.ceil(0.5 * n_pixels))]  # Upper half of the faces
y_train = train[:, int(np.floor(0.5 * n_pixels)):]  # Lower half of the faces

X_test = test[:, :int(np.ceil(0.5 * n_pixels))]
y_test = test[:, int(np.floor(0.5 * n_pixels)):]

# Fit estimators
ESTIMATORS = {
    "Extra trees":
    ExtraTreesRegressor(n_estimators=10, max_features=32, random_state=0),
    "K-nn":
    KNeighborsRegressor(),
    "Linear regression":
    LinearRegression(),
    "Ridge":
    RidgeCV(),
    "Lasso":
    Lasso(),

    #   "ElasticNet_0.5": ElasticNet(alpha=100000, l1_ratio=0.001),

    #    "ElasticNet_0.1" : ElasticNet(alpha=0.0001, l1_ratio=0.01),
}

y_test_predict = dict()
r2_scores = dict()
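# The example truncates here; in the scikit-learn face-completion demo this
# snippet is patterned on, the dictionaries are filled roughly like this
# (a sketch, not the original code):
for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    y_test_predict[name] = estimator.predict(X_test)
    r2_scores[name] = estimator.score(X_test, y_test)  # R^2 on the held-out faces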
Example #60
0
from numpy import load, save, zeros, nan_to_num
from sklearn.neighbors import KNeighborsRegressor
# read_point_cloud is the legacy top-level open3d API; newer releases expose
# it as open3d.io.read_point_cloud
from open3d import read_point_cloud

vox1 = read_point_cloud('vox1.ply')
neigh = KNeighborsRegressor(1, n_jobs=-1)

pca = nan_to_num(load('pca_1.npy'))
pca_concat = zeros((len(pca), 9))
pca_concat[:, :3] = pca

vox = read_point_cloud('vox2.ply')
pca = nan_to_num(load('pca_2.npy'))
neigh.fit(vox.points, pca)
pca_concat[:, 3:6] = neigh.predict(vox1.points)

vox = read_point_cloud('vox4.ply')
pca = nan_to_num(load('pca_4.npy'))
neigh.fit(vox.points, pca)
pca_concat[:, 6:] = neigh.predict(vox1.points)

save('pca', pca_concat)