def _predict(self, Y_train):
    """
    Predict an output for every test instance from its stored neighbours.

    The neighbour indices in ``self.neighbors_idx`` and the matching
    distances in ``self.neighbors_dist`` are combined according to the
    averaging criterion ``self.__crit``:

    * "flat": plain mean of the neighbours' outputs;
    * "weighted": mean weighted by the inverse of the neighbour distance.

    The result is stored in ``self.prediction`` (shape
    [n_test, n_outputs]); nothing is returned. For any other criterion
    value ``self.prediction`` is left filled with zeros.

    Parameters
    ----------
    Y_train : numpy array, shape = [n_samples] or [n_samples, n_outputs]
        outputs of the training instances, addressed through the stored
        neighbour indices.

    Raises
    ------
    ValueError
        if no neighbours are stored yet, or if Y_train is neither 1-d
        nor 2-d.
    """
    sanitycheck(Y_train, np.ndarray)
    if (not len(self.neighbors_idx)) or (not len(self.neighbors_dist)):
        raise ValueError("You need to call the \"fit\" method before!\n")

    # check for single or multiple output value
    if Y_train.ndim == 1:
        length = 1
    elif Y_train.ndim == 2:
        length = Y_train.shape[1]
    else:
        raise ValueError(
            "Output vector must have the following shapes:\n [n_samples,], shape=(k,)\n or \n [n_samples, n_input_features] : shape=(k,l)\n"
        )

    self.prediction = np.zeros([len(self.neighbors_idx), length])
    if self.__crit == "flat":
        for row, idx in enumerate(self.neighbors_idx):
            self.prediction[row] = Y_train[idx].mean(axis=0)
    elif self.__crit == "weighted":
        pairs = zip(self.neighbors_idx, self.neighbors_dist)
        for row, (idx, dist) in enumerate(pairs):
            coincident = np.asarray(dist) == 0
            if np.any(coincident):
                # A neighbour at distance 0 would get an infinite weight
                # and np.average would return NaN (inf / inf). Such
                # neighbours fully determine the prediction instead.
                exact = np.asarray(idx)[coincident]
                self.prediction[row] = Y_train[exact].mean(axis=0)
            else:
                self.prediction[row] = np.average(
                    Y_train[idx], axis=0, weights=1. / dist)
def _fit(self, X_train, X_test):
    """
    Validate the feature arrays and dispatch to the selected search method.

    When both inputs are 1-d they are turned into two-column arrays by
    appending a column of zeros; otherwise they are passed on unchanged.
    The search routine is then chosen from ``self.method``:

    * "classic": ``self.fit_serial``;
    * "grid": ``self.fit_grid`` with ``self.__grid_size``;
    * "kd-tree": ``self.fit_parallel`` when ``self.__paral`` is set,
      otherwise ``self.fit_kdtree`` with ``self.__leafsize``.

    Parameters
    ----------
    X_train : numpy array, shape = [n_train] or [n_train, n_features]
        features of the training instances.
    X_test : numpy array, shape = [n_test] or [n_test, n_features]
        features of the instances to find neighbours for.

    Returns
    -------
    Whatever the selected fit routine returns; None when ``self.method``
    matches none of the known methods.

    Raises
    ------
    ValueError
        if the two arrays do not have the same number of features (this
        includes mixing a 1-d array with a 2-d one).
    """
    sanitycheck(X_train, np.ndarray)
    sanitycheck(X_test, np.ndarray)
    # TODO: add conversion for pd.Series and pd.DataFrame inputs
    if X_train.ndim == 1 and X_test.ndim == 1:
        X = np.column_stack((X_train, np.zeros(len(X_train))))
        Y = np.column_stack((X_test, np.zeros(len(X_test))))
    elif (X_train.ndim < 2 or X_test.ndim < 2
          or X_train.shape[1] != X_test.shape[1]):
        # The ndim guards turn the IndexError that shape[1] would raise on
        # a 1-d/2-d mix into the intended ValueError.
        raise ValueError(
            "X_train and X_test must have the same number of input features!\n"
        )
    else:
        X = X_train
        Y = X_test

    if self.method == "classic":
        return self.fit_serial(X, Y)
    if self.method == "grid":
        return self.fit_grid(X, Y, self.__grid_size)
    if self.method == "kd-tree":
        # The parallel flag must be tested before falling back to the
        # serial kd-tree; tested afterwards it could never be reached.
        if self.__paral:
            return self.fit_parallel(X, Y)
        return self.fit_kdtree(X, Y, self.__leafsize)
    return None
def L1(X):
    """
    Compute the L1 norm of every row of a 2-d array.

    Parameters
    ----------
    X : numpy array, shape = [n_data, n_features]

    Returns
    -------
    numpy array, shape = [n_data]
        sum of the absolute values of each row, as floating point numbers.
    """
    sanitycheck(X, np.ndarray)
    # Integer and boolean input is promoted to float so that the result is
    # floating point, as it was with the former sqrt(x**2) formulation.
    values = X if np.issubdtype(X.dtype, np.inexact) else X.astype(np.float64)
    # Take |x| directly: squaring first overflows for very large entries,
    # underflows to zero for very small ones and wraps around for integers.
    return np.abs(values).sum(axis=1)
def _cross_val(self, X, Y, learner):
    """
    Prepare the folds and run the cross validation suited to the learner.

    On the first call (``self.first_folding`` set) the sample index
    ``self.index`` is created, shuffled in place when ``self.shuf`` is set,
    and the flag is cleared so that later calls reuse the same ordering.

    Parameters
    ----------
    X : numpy array
        input features; its first axis gives the number of samples.
    Y : numpy array
        outputs associated to X.
    learner : object
        must expose ``learning_type`` ('instance_based' or
        'training_based') and ``learner_type`` ('regressor' or
        'classifier').

    Returns
    -------
    The result of ``self._cross_val_regress`` for a regressor, or of
    ``self._cross_val_class`` for a classifier.

    Raises
    ------
    ValueError
        if ``learning_type`` or ``learner_type`` has an unsupported value.
    """
    # Initialization: check correctness of data format
    sanitycheck(X, np.ndarray)
    sanitycheck(Y, np.ndarray)

    # The position in this tuple is the integer code handed to the
    # regression routine: 0 for instance based, 1 for training based.
    learning_kinds = ('instance_based', 'training_based')
    learning_kind = learner.learning_type
    if learning_kind not in learning_kinds:
        raise ValueError(
            "Only two possible learning types are admitted: instance_based and training_based"
        )
    idx = learning_kinds.index(learning_kind)

    # Preparing the folds
    n_samples = X.shape[0]
    if self.first_folding:
        self.index = np.arange(n_samples, dtype=int)
        if self.shuf:
            np.random.shuffle(self.index)
        # You do not want to reshuffle when a hyper parameter changes
        self.first_folding = False
    fold_size = n_samples // self.nfolds

    # Checking that the learner is either a regressor or a classifier
    role = learner.learner_type
    if role == 'regressor':
        return self._cross_val_regress(X, Y, learner, fold_size, idx)
    if role == 'classifier':
        return self._cross_val_class(X, Y, learner)
    raise ValueError(
        'The learner has to be either a regressor either a classifier!'
    )
def _fit_grid(self, X_train, X_test, extensions):
    """
    Find the ``self.__k`` neighbours of each test point by scanning shells.

    Training points are mapped to lexicographic indices with
    ``self._lexico``. For every test point, shells of candidate positions
    are requested from ``self._shell_neighbor`` (shell 1, 2, ...) and each
    candidate that matches a training point is recorded together with its
    distance, until k neighbours have been collected. Results are stored
    in ``self.neighbors_idx`` and ``self.neighbors_dist`` (both of shape
    [n_test, k]); nothing is returned.

    A warning is printed once at the end if, for any test point, the shell
    counter reached 4.

    NOTE(review): the shell loop only stops once k neighbours have been
    found; it presumably never terminates when fewer than k training
    points can be reached -- confirm against ``_shell_neighbor``.

    Parameters
    ----------
    X_train : numpy array, shape = [n_train, 2]
        lattice coordinates of the training instances.
    X_test : numpy array, shape = [n_test, 2]
        lattice coordinates of the instances to find neighbours for.
    extensions : numpy array
        passed through to ``self._lexico``.

    Raises
    ------
    ValueError
        if either coordinate array is not 2-d with exactly two columns.
    """
    sanitycheck(X_train, np.ndarray)
    sanitycheck(X_test, np.ndarray)
    sanitycheck(extensions, np.ndarray)

    # Either array being non 2-d must be rejected; checking the shape also
    # avoids indexing rows that may not exist in a short X_test.
    if (X_train.ndim != 2 or X_train.shape[1] != 2
            or X_test.ndim != 2 or X_test.shape[1] != 2):
        raise ValueError(
            "Grid methods actually works only for 2-d isotropic lattices!")

    X_train_lex = self._lexico(X_train, extensions)
    n_test = X_test.shape[0]
    self.neighbors_idx = np.zeros([n_test, self.__k], dtype=int)
    self.neighbors_dist = np.zeros([n_test, self.__k])
    # Scratch rows, completely overwritten for every test point.
    nk_temp = np.zeros(self.__k, int)
    nk_dist_temp = np.zeros(self.__k)

    warn = False
    for m, test_point in enumerate(X_test):
        missing = self.__k
        shell = 1
        found = 0
        while missing > 0:
            points, dist = self._shell_neighbor(shell, test_point)
            candidates = zip(self._lexico(points, extensions), dist)
            for lex, distance in candidates:
                if missing <= 0:
                    # k neighbours collected: the rest of the shell is
                    # irrelevant.
                    break
                matches = np.argwhere(X_train_lex == lex)
                if len(matches):
                    # Store one scalar index; with duplicated training
                    # sites the first occurrence is used.
                    nk_temp[found] = matches[0, 0]
                    nk_dist_temp[found] = distance
                    found += 1
                    missing -= 1
            shell += 1
            if shell == 4:
                warn = True
        self.neighbors_idx[m] = nk_temp
        self.neighbors_dist[m] = nk_dist_temp

    if warn:
        print(
            "WARNING: grid method becomes not exact: exceeding the 3rd shell results in approximate definition of neighbors!"
        )
def _my_kdtree(self, data, leafsize):
    """
    Build a kd-tree over the training instances.

    The data are recursively split in halves at the median row along one
    feature at a time (feature 0 at the root, then ``depth % ndim``),
    until a partition holds at most ``leafsize`` rows.

    Parameters
    ----------
    data : numpy array, shape = [ndata, ndim]
        array containing training instances features.

    leafsize : int
        max. number of data points to leave in a leaf; must be at least 1
        (advisable to use always at least 5*k_neighbors).

    Returns
    -------
    tree : list of tuples
        every node is a 6-tuple
        ``(indices, points, left_hrect, right_hrect, left, right)``.

        * inner node: ``indices`` and ``points`` are None; ``left_hrect``
          and ``right_hrect`` are [2, ndim] arrays holding the lower (row
          0) and upper (row 1) bounds of the two children; ``left`` and
          ``right`` are the positions of the children in ``tree``.
        * leaf: ``indices`` are the row numbers (into the ``data``
          argument) of the points it holds, ``points`` the corresponding
          rows; both hyper-rectangles are None and ``left``/``right``
          are 0.

        The root is ``tree[0]``.

    Raises
    ------
    ValueError
        if leafsize is smaller than 1.
    """
    sanitycheck(data, np.ndarray)
    if leafsize < 1:
        # A single-row partition could never become a leaf and would be
        # split into itself forever.
        raise ValueError("leafsize must be at least 1")

    ndim = data.shape[1]
    ndata = data.shape[0]

    # find upper and lower bound in feature space
    hrect = np.zeros((2, ndim))
    hrect[0, :] = data.min(axis=0)
    hrect[1, :] = data.max(axis=0)

    # create the root of kd-tree
    idx = np.argsort(data[:, 0])
    data = data[idx]
    half = ndata // 2
    splitval = data[half, 0]

    left_hrect = hrect.copy()
    right_hrect = hrect.copy()
    left_hrect[1, 0] = splitval
    right_hrect[0, 0] = splitval

    tree = [(None, None, left_hrect, right_hrect, None, None)]

    # Work items: (rows, original row numbers, depth, parent position,
    # whether this is the parent's left child).
    stack = [(data[:half, :], idx[:half], 1, 0, True),
             (data[half:, :], idx[half:], 1, 0, False)]

    # split data in halves using hyper-rectangles, depth first:
    while stack:

        # pop data off stack
        data, didx, depth, parent, leftbranch = stack.pop()
        ndata = data.shape[0]
        # position the node created below will take in the tree
        nodeptr = len(tree)

        # update parent node so that it points to the new child
        _didx, _data, _left_hrect, _right_hrect, left, right = tree[parent]
        if leftbranch:
            tree[parent] = (_didx, _data, _left_hrect, _right_hrect,
                            nodeptr, right)
        else:
            tree[parent] = (_didx, _data, _left_hrect, _right_hrect,
                            left, nodeptr)

        # insert node in kd-tree

        # leaf node?
        if ndata <= leafsize:
            tree.append((didx.copy(), data.copy(), None, None, 0, 0))

        # not a leaf, split the data in two
        else:
            splitdim = depth % ndim
            idx = np.argsort(data[:, splitdim])
            data = data[idx]
            # keep the original row numbers aligned with the sorted rows
            didx = didx[idx]
            half = ndata // 2
            stack.append((data[:half, :], didx[:half],
                          depth + 1, nodeptr, True))
            stack.append((data[half:, :], didx[half:],
                          depth + 1, nodeptr, False))
            splitval = data[half, splitdim]
            # Both children start from the bounds the parent assigned to
            # this node, then get cut at the split value.
            own_hrect = _left_hrect if leftbranch else _right_hrect
            left_hrect = own_hrect.copy()
            right_hrect = own_hrect.copy()
            left_hrect[1, splitdim] = splitval
            right_hrect[0, splitdim] = splitval
            # append node to tree
            tree.append((None, None, left_hrect, right_hrect, None, None))

    return tree