Ejemplo n.º 1
0
    def _predict(self, Y_train):
        """
        Predict one output row per query point from its stored neighbours.

        The neighbour tables ``self.neighbors_idx`` and ``self.neighbors_dist``
        must already be filled (the error message below refers to the "fit"
        method for that). The result is written to ``self.prediction`` with
        shape ``[len(self.neighbors_idx), n_outputs]``.

        Parameters
        ----------
        Y_train : numpy array, shape = [n_samples,] or [n_samples, n_outputs]
            Training targets, indexed with the rows of ``self.neighbors_idx``.

        Raises
        ------
        ValueError
            If the neighbour tables are empty, or if ``Y_train`` is neither
            1-d nor 2-d.
        """

        # sanitycheck is defined elsewhere; presumably it validates the type.
        sanitycheck(Y_train, np.ndarray)

        if (not len(self.neighbors_idx)) or (not len(self.neighbors_dist)):
            raise ValueError("You need to call the \"fit\" method before!\n")

        # check for single or multiple output value
        n_axes = len(Y_train.shape)
        if n_axes == 1:
            length = 1
        elif n_axes == 2:
            length = Y_train.shape[1]
        else:
            raise ValueError(
                "Output vector must have the following shapes:\n [n_samples,], shape=(k,)\n or \n [n_samples, n_input_features] : shape=(k,l)\n"
            )
        self.prediction = np.zeros([len(self.neighbors_idx), length])

        if self.__crit == "flat":
            # plain (unweighted) mean over the neighbours' targets
            for i, idx in enumerate(self.neighbors_idx):
                self.prediction[i] = Y_train[idx].mean(axis=0)
        elif self.__crit == "weighted":
            for i, (idx, dist) in enumerate(
                    zip(self.neighbors_idx, self.neighbors_dist)):
                dist = np.asarray(dist, dtype=float)
                exact = dist == 0
                if exact.any():
                    # A zero distance would give an infinite weight and a NaN
                    # average: let the exact matches alone decide the output.
                    weights = exact.astype(float)
                else:
                    # inverse-distance weighting
                    weights = 1. / dist
                self.prediction[i] = np.average(Y_train[idx],
                                                axis=0,
                                                weights=weights)
Ejemplo n.º 2
0
Archivo: myknn.py Proyecto: pretidav/ML
    def _fit(self, X_train, X_test):
        """
        Validate the inputs and dispatch to the neighbour search selected
        by ``self.method``.

        Two 1-d inputs are each padded with a column of zeros so that the
        search routines always receive 2-d arrays.

        Parameters
        ----------
        X_train : numpy array, shape = [n_train,] or [n_train, n_features]
            Features of the training instances.

        X_test : numpy array, shape = [n_test,] or [n_test, n_features]
            Features of the query instances; must have the same number of
            features as ``X_train``.

        Returns
        -------
        Whatever the selected fit routine returns. ``None`` if
        ``self.method`` is none of "classic", "grid" or "kd-tree".

        Raises
        ------
        ValueError
            If the two inputs do not have the same number of features
            (including the case where only one of them is 1-d).
        """

        # sanitycheck is defined elsewhere; presumably it validates the type.
        sanitycheck(X_train, np.ndarray)
        sanitycheck(X_test, np.ndarray)

        #TO_ADD: section for pd.Series and pd.DataFrame conversion

        train_axes = len(X_train.shape)
        test_axes = len(X_test.shape)

        if train_axes == 1 and test_axes == 1:
            X = np.column_stack((X_train, np.zeros(len(X_train))))
            Y = np.column_stack((X_test, np.zeros(len(X_test))))
        elif (train_axes < 2 or test_axes < 2
              or X_train.shape[1] != X_test.shape[1]):
            # Mixed 1-d / 2-d inputs used to fail with a bare IndexError on
            # shape[1]; report them as the feature mismatch they are.
            raise ValueError(
                "X_train and X_test must have the same number of input features!\n"
            )
        else:
            X = X_train
            Y = X_test

        if self.method == "classic":
            return self.fit_serial(X, Y)
        elif self.method == "grid":
            return self.fit_grid(X, Y, self.__grid_size)
        elif self.method == "kd-tree":
            # The parallel variant must be tested first: as a trailing
            # `elif` after the plain kd-tree test it could never be reached.
            if self.__paral:
                return self.fit_parallel(X, Y)
            return self.fit_kdtree(X, Y, self.__leafsize)
Ejemplo n.º 3
0
def L1(X):
    """
    Return the L1 norm of every row of a ndarray.

    Each entry is replaced by sqrt(x**2) (its absolute value for real
    input) and the entries are then summed along axis 1.

    Parameters
    ----------
    X : numpy-like, shape = [n_data,n_features]

    Returns
    -------
    numpy array with one value per row of X.
    """

    # sanitycheck is defined elsewhere; presumably it validates the type.
    sanitycheck(X, np.ndarray)

    magnitudes = np.sqrt(np.power(X, 2))
    return np.sum(magnitudes, axis=1)
Ejemplo n.º 4
0
    def _cross_val(self, X, Y, learner):
        """
        Prepare the folds (once) and run the cross validation that matches
        the learner.

        Parameters
        ----------
        X : numpy array
            Input features; its first axis gives the number of data points.

        Y : numpy array
            Targets.

        learner : object
            Must expose ``learning_type`` ('instance_based' or
            'training_based') and ``learner_type`` ('regressor' or
            'classifier').

        Returns
        -------
        The result of ``self._cross_val_regress`` for a regressor, or of
        ``self._cross_val_class`` for a classifier.

        Raises
        ------
        ValueError
            If ``learning_type`` or ``learner_type`` has an unknown value.
        """
        # Initialization: check correctness of data format
        # (sanitycheck is defined elsewhere; presumably a type check).
        sanitycheck(X, np.ndarray)
        sanitycheck(Y, np.ndarray)

        learning_mode = learner.learning_type
        if learning_mode not in ('instance_based', 'training_based'):
            raise ValueError(
                "Only two possible learning types are admitted: instance_based and training_based"
            )
        # 0 -> instance based, 1 -> training based
        idx = 0 if learning_mode == 'instance_based' else 1

        # Preparing the folds
        n_samples = X.shape[0]

        if self.first_folding:
            self.index = np.arange(n_samples, dtype=int)
            if self.shuf:
                np.random.shuffle(self.index)
            # Done only once: no reshuffle when a hyper parameter changes.
            self.first_folding = False

        fold_size = n_samples // self.nfolds

        # Checking that the learner is either a regressor or a classifier
        kind = learner.learner_type

        if kind == 'regressor':
            return self._cross_val_regress(X, Y, learner, fold_size, idx)
        if kind == 'classifier':
            return self._cross_val_class(X, Y, learner)
        raise ValueError(
            'The learner has to be either a regressor either a classifier!'
        )
Ejemplo n.º 5
0
Archivo: myknn.py Proyecto: pretidav/ML
    def _fit_grid(self, X_train, X_test, extensions):
        """
        Find the ``k`` neighbours of every query point by visiting lattice
        shells of increasing order around it.

        For each query point the shells returned by ``self._shell_neighbor``
        are scanned until ``k`` of their points are found among the training
        points. Points are compared through the code produced by
        ``self._lexico``. Results are stored in ``self.neighbors_idx``
        (indices into ``X_train``) and ``self.neighbors_dist`` (distances as
        returned by ``self._shell_neighbor``), both of shape [n_test, k].

        Parameters
        ----------
        X_train : numpy array, shape = [n_train, 2]
            Training points on the lattice.

        X_test : numpy array, shape = [n_test, 2]
            Query points.

        extensions : numpy array
            Forwarded to ``self._lexico``; presumably the lattice extents
            (TODO confirm against ``_lexico``).

        Raises
        ------
        ValueError
            If the inputs are not 2-d arrays with exactly two columns, or if
            ``X_train`` is empty.

        Notes
        -----
        The shell search does not stop until ``k`` training points have been
        found; NOTE(review): it looks like it cannot terminate when fewer
        than ``k`` training points are reachable -- confirm.
        """

        # sanitycheck is defined elsewhere; presumably it validates the type.
        sanitycheck(X_train, np.ndarray)
        sanitycheck(X_test, np.ndarray)
        sanitycheck(extensions, np.ndarray)
        WARN = False

        # Any wrong shape must be rejected. The previous test joined its
        # conditions with `and` (so it only fired when everything was wrong)
        # and indexed X_test[1], which crashed on a single query point.
        if (len(X_train.shape) != 2 or len(X_test.shape) != 2
                or X_train.shape[1] != 2 or X_test.shape[1] != 2):
            raise ValueError(
                "Grid methods actually works only for 2-d isotropic lattices!")
        if X_train.shape[0] == 0:
            raise ValueError("X_train must contain at least one point!")

        X_train_lex = self._lexico(X_train, extensions)

        self.neighbors_idx = np.zeros([X_test.shape[0], self.__k], dtype=int)
        self.neighbors_dist = np.zeros([X_test.shape[0], self.__k])
        # Scratch rows, fully overwritten for each query point and copied
        # into the result tables by the row assignments below.
        nk_temp = np.zeros(self.__k, int)
        nk_dist_temp = np.zeros(self.__k)

        for m, query in enumerate(X_test):

            neib = self.__k  # neighbours still to be found
            shell = 1
            found = 0

            while neib > 0:

                points, dist = self._shell_neighbor(shell, query)

                for code, shell_dist in zip(self._lexico(points, extensions),
                                            dist):
                    if neib <= 0:
                        # enough neighbours: the rest of the shell is ignored
                        break

                    where = np.argwhere(X_train_lex == code)

                    if len(where):
                        # Take the first matching index explicitly: assigning
                        # the whole argwhere result to a scalar slot is
                        # deprecated in recent NumPy and fails on duplicates.
                        nk_temp[found] = where.flat[0]
                        nk_dist_temp[found] = shell_dist
                        found += 1
                        neib -= 1

                shell += 1
            # `shell` is now one more than the number of shells visited.
            # `== 4` kept the warning silent for deeper searches; `>= 4`
            # keeps every case that warned before and adds those.
            if shell >= 4:
                WARN = True
            self.neighbors_idx[m] = nk_temp
            self.neighbors_dist[m] = nk_dist_temp
        if WARN:
            print(
                "WARNING: grid method becomes not exact: exceeding the 3rd shell results in approximate definition of neighbors!"
            )
Ejemplo n.º 6
0
Archivo: myknn.py Proyecto: pretidav/ML
    def _my_kdtree(self, data, leafsize):
        """
            Build a kd-tree for nearest neighbour search by repeatedly
            halving the data along one feature axis at a time.

            Parameters
            ----------

            data: numpy-array-like, shape = [ndata,ndim]
            array containing training instances features.

            leafsize: int type
            max. number of data points to leave in a leaf (advisable to use always at least 5*k_neighbors)

            Returns
            -------

            kd-tree:  list of tuples.
            every entry is
            (indices, points, left_hrect, right_hrect, left_child, right_child).
            Inner nodes carry None for indices/points and the positions of
            their children in the list; leaves carry copies of their points
            and original row indices, None for both hyperrectangles and 0
            for both children. Entry 0 is the root.
        """

        # sanitycheck is defined elsewhere; presumably it validates the type.
        sanitycheck(data, np.ndarray)

        n_dims = data.shape[1]
        n_points = data.shape[0]

        # bounding box of the data: row 0 = lower corner, row 1 = upper corner
        bounds = np.zeros((2, n_dims))
        bounds[0, :] = data.min(axis=0)
        bounds[1, :] = data.max(axis=0)

        # root node: split at the median position along the first axis
        order = np.argsort(data[:, 0])
        data = data[order]
        half = n_points // 2
        cut = data[half, 0]

        lower_box = bounds.copy()
        upper_box = bounds.copy()
        lower_box[1, 0] = cut
        upper_box[0, 0] = cut

        tree = [(None, None, lower_box, upper_box, None, None)]

        # work items: (points, original indices, depth, parent position,
        #              True when the item is the parent's left half)
        pending = [(data[:half, :], order[:half], 1, 0, True),
                   (data[half:, :], order[half:], 1, 0, False)]

        while pending:

            chunk, chunk_idx, depth, parent, is_left = pending.pop()
            chunk_size = chunk.shape[0]
            # position the node created for this item will take in the list
            node_id = len(tree)

            # register the new node as a child of its parent
            p_idx, p_data, p_left_box, p_right_box, p_left, p_right = tree[parent]
            if is_left:
                tree[parent] = (p_idx, p_data, p_left_box, p_right_box,
                                node_id, p_right)
            else:
                tree[parent] = (p_idx, p_data, p_left_box, p_right_box,
                                p_left, node_id)

            # small enough: store the points themselves and stop splitting
            if chunk_size <= leafsize:
                tree.append((chunk_idx.copy(), chunk.copy(), None, None, 0, 0))
                continue

            # otherwise split in two along the axis chosen by the depth
            axis = depth % n_dims
            order = np.argsort(chunk[:, axis])
            chunk = chunk[order]
            chunk_idx = chunk_idx[order]
            half = chunk_size // 2

            pending.append((chunk[:half, :], chunk_idx[:half],
                            depth + 1, node_id, True))
            pending.append((chunk[half:, :], chunk_idx[half:],
                            depth + 1, node_id, False))

            cut = chunk[half, axis]
            # this node lives inside the parent's box on its own side
            own_box = p_left_box if is_left else p_right_box
            lower_box = own_box.copy()
            upper_box = own_box.copy()
            lower_box[1, axis] = cut
            upper_box[0, axis] = cut

            tree.append((None, None, lower_box, upper_box, None, None))

        return tree