Esempio n. 1
0
    def compress(self, X, n_components, n_neighbours):
        """Embed the rows of X in ``self.k`` dimensions from graph distances.

        A nearest-neighbour graph is built from pairwise Euclidean
        distances, ``utils.dijkstra`` is queried for every ordered pair of
        distinct points, and ``self._fun_obj_z`` is minimised starting from
        an ``AlternativePCA`` embedding.

        Parameters
        ----------
        X : array with one example per row
        n_components : not read by this method (``self.k`` is used)
        n_neighbours : not read by this method (``self.numNeighbours`` is used)

        Returns
        -------
        Z : array reshaped to (number of rows of X, self.k)
        """
        num_points = X.shape[0]
        out_dim = self.k
        # One extra slot because every point selects itself at distance 0.
        keep = self.numNeighbours + 1

        pairwise = np.sqrt(utils.euclidean_dist_squared(X, X))

        # Directed neighbour graph: row r only stores the edges leaving r.
        graph = np.zeros((num_points, num_points))
        for r in range(num_points):
            for c in np.argsort(pairwise[r])[:keep]:
                graph[r, c] = pairwise[r, c]

        # One utils.dijkstra query per ordered pair (presumably the
        # shortest-path length through the graph); the diagonal stays zero.
        geodesic = np.zeros((num_points, num_points))
        for src in range(num_points):
            for dst in range(num_points):
                if src == dst:
                    continue
                geodesic[src, dst] = utils.dijkstra(graph, src, dst)

        start = AlternativePCA(out_dim).fit(X).compress(X)
        solution = find_min(self._fun_obj_z, start.flatten(), 500, False,
                            geodesic)
        return solution.reshape(num_points, out_dim)
Esempio n. 2
0
    def compress(self, X):
        """Embed the rows of X in ``self.k`` dimensions from graph distances.

        Builds a symmetric graph linking each point to ``self.K`` entries of
        its distance ranking, converts it with ``utils.dijkstra``, and
        minimises ``self._fun_obj_z`` from a PCA starting point.

        Returns
        -------
        Z : array reshaped to (number of rows of X, self.k)
        """
        num_points = X.shape[0]
        out_dim = self.k
        num_nbrs = self.K

        pairwise = np.sqrt(utils.euclidean_dist_squared(X, X))

        # Rank 0 of each row is skipped (presumably the point itself at
        # distance zero); ranks 1..K are kept as neighbours.
        neighbour_ids = np.argsort(pairwise, axis=1)[:, 1:num_nbrs + 1]

        graph = np.zeros((num_points, num_points))
        for row, ids in enumerate(neighbour_ids):
            for col in ids:
                # Write both directions so the graph is symmetric.
                graph[row, col] = pairwise[row, col]
                graph[col, row] = pairwise[col, row]

        geodesic = utils.dijkstra(graph)

        # Swap +inf entries for the largest remaining value: flip them to
        # -inf first so they cannot win the max, then overwrite them.
        geodesic[geodesic == np.inf] = -np.inf
        largest = np.max(geodesic)
        geodesic[geodesic == -np.inf] = largest

        # PCA supplies the starting point for the optimiser.
        start = PCA(out_dim).fit(X).compress(X)

        solution = find_min(self._fun_obj_z, start.flatten(), 500, False,
                            geodesic)
        return solution.reshape(num_points, out_dim)
Esempio n. 3
0
    def fit(self, X):
        """Cluster the rows of X into ``self.k`` groups and store the centres.

        Centres are seeded with randomly drawn rows of X, then assignments
        and centres are updated alternately until no assignment changes.
        The final centres are left in ``self.means``; nothing is returned.
        """
        num_examples, num_features = X.shape
        assignments = np.ones(num_examples)

        # Each centre starts as a random row of X; draws are independent.
        centres = np.zeros((self.k, num_features))
        for c in range(self.k):
            centres[c] = X[np.random.randint(num_examples)]

        self.means = centres

        converged = False
        while not converged:
            previous = assignments

            # Nearest centre per example; NaN distances can never win.
            sq_dist = euclidean_dist_squared(X, centres)
            sq_dist[np.isnan(sq_dist)] = np.inf
            assignments = np.argmin(sq_dist, axis=1)

            # A centre with no assigned examples keeps its previous value.
            for c in range(self.k):
                members = assignments == c
                if np.any(members):
                    centres[c] = X[members].mean(axis=0)

            # The return value of error() is not used here.
            self.error(X)

            # Stop once no example changed cluster.
            converged = np.sum(assignments != previous) == 0

        self.means = centres
Esempio n. 4
0
    def compress(self, X):
        """Embed the rows of X in ``self.k`` dimensions from graph distances.

        Each point is linked (in both directions) to the first
        ``self.nn + 1`` entries of its distance ranking, the graph is passed
        to ``utils.dijkstra``, and ``self._fun_obj_z`` is minimised from a
        PCA starting point.

        Returns
        -------
        Z : array reshaped to (number of rows of X, self.k)
        """
        num_points = X.shape[0]

        pairwise = np.sqrt(utils.euclidean_dist_squared(X, X))
        ranking = np.argsort(pairwise)

        graph = np.zeros((num_points, num_points))
        for i in range(pairwise.shape[0]):
            for rank in range(self.nn + 1):
                nb = ranking[i, rank]
                graph[i, nb] = pairwise[i, nb]
                graph[nb, i] = pairwise[nb, i]

        geodesic = utils.dijkstra(graph)

        # Infinite entries are replaced by the largest finite entry.
        unreachable = np.isinf(geodesic)
        geodesic[unreachable] = geodesic[~unreachable].max()

        # PCA supplies the starting point for the optimiser.
        reducer = PCA(self.k)
        reducer.fit(X)
        start = reducer.compress(X)

        solution, _ = findMin(self._fun_obj_z, start.flatten(), 500, geodesic)
        return solution.reshape(num_points, self.k)
Esempio n. 5
0
    def predict(self, Xtest):
        """Predict 0 or 1 for each row of Xtest by a vote of ``self.k`` neighbours.

        The integer labels of the ``self.k`` nearest training rows are
        summed and a row is predicted as 1 only when that sum is strictly
        greater than ``self.k / 2``, so ties resolve to 0.
        Assumes labels are 0/1 and that at least ``self.k`` training rows
        exist -- TODO confirm against callers.

        Returns
        -------
        out : float array with one entry per row of Xtest
        """
        # Test rows go first so each row of the result describes one test
        # point (ordering taken from the original author's comment).
        sq_dist = utils.euclidean_dist_squared(Xtest, self.X)
        ranking = np.argsort(sq_dist)

        predictions = np.zeros(Xtest.shape[0])
        for row, order in enumerate(ranking):
            nearest = order[:self.k]
            votes = np.sum(np.fromiter((self.y[j] for j in nearest), int))
            # Entries start at zero, so only the "1" outcome is written.
            if votes > self.k / 2:
                predictions[row] = 1

        return predictions
Esempio n. 6
0
    def fit(self, X, y):
        """Keep only the training examples the growing subset misclassifies.

        Parameters
        ----------
        X : an N by D numpy array
        y : an N by 1 numpy array of integers in {1,2,3,...,c}

        The subset starts with the first example. Every later example is
        classified with ``utils.mode`` over its nearest kept examples and is
        added to the subset only when that prediction differs from its
        label. The subset is stored in ``self.X`` / ``self.y`` and its size
        is printed.
        """
        kept_X = X[0:1, :]
        kept_y = y[0:1]

        for row in range(1, len(X)):
            candidate = X[row:row + 1, :]
            sq_dist = utils.euclidean_dist_squared(kept_X, candidate)
            nearest_first = np.argsort(sq_dist[:, 0])
            # Cannot vote with more neighbours than have been kept so far.
            num_votes = min(self.k, len(kept_X))
            predicted = utils.mode(kept_y[nearest_first[:num_votes]])

            if predicted != y[row]:
                kept_X = np.append(kept_X, candidate, 0)
                kept_y = np.append(kept_y, y[row])

        self.X = kept_X
        self.y = kept_y

        print(self.y.shape[0])
Esempio n. 7
0
    def compress(self, X):
        """Embed the rows of X in ``self.k`` dimensions from graph distances.

        Each point is linked (in both directions) to the entries among the
        first ``self.nn + 1`` of its distance ranking that are not the
        point itself; the graph is passed to ``utils.dijkstra`` and
        ``self._fun_obj_z`` is minimised from a PCA starting point.

        Returns
        -------
        Z : array reshaped to (number of rows of X, self.k)
        """
        num_points = X.shape[0]

        pairwise = np.sqrt(utils.euclidean_dist_squared(X, X))

        graph = np.zeros((num_points, num_points))
        for i in range(num_points):
            closest = np.argsort(pairwise[:, i])[0:self.nn + 1]
            # setdiff1d removes i from the candidates (and returns the
            # remaining indices sorted and de-duplicated).
            for t in np.setdiff1d(closest, i):
                graph[i, t] = pairwise[i, t]
                graph[t, i] = pairwise[t, i]

        geodesic = utils.dijkstra(graph)

        # Infinite entries are replaced by the largest finite entry so that
        # disconnected points are pushed far apart.
        unreachable = np.isinf(geodesic)
        geodesic[unreachable] = geodesic[~unreachable].max()

        # PCA supplies the starting point for the optimiser.
        reducer = PCA(self.k)
        reducer.fit(X)
        start = reducer.compress(X)

        solution, _ = findMin(self._fun_obj_z, start.flatten(), 500, geodesic)
        return solution.reshape(num_points, self.k)
Esempio n. 8
0
def fit(X, y, k):
    """Build a condensed k-NN model from the training data.

    Parameters
    ----------
    X : an N by D numpy array
    y : an N by 1 numpy array of integers in {1,2,3,...,c}
    k : the k in k-NN

    Returns
    -------
    model : dict with keys 'X' and 'y' (the retained subset), 'k', and
        'predict' (the module-level ``predict`` function).

    The subset starts with the first example. Each later example is
    classified by the subset collected so far and is added only when that
    prediction differs from its label.
    """
    N, D = X.shape

    Xcond = X[0, None]
    ycond = y[0, None]

    # Example 0 seeds the subset, so the scan starts at 1: it is at distance
    # zero from itself and would never be added a second time.
    for i in range(1, N):
        # Keep the example as a 1-by-D matrix so the distance helper always
        # receives two 2-D arguments.
        x_i = X[i:i + 1]
        dist = utils.euclidean_dist_squared(Xcond, x_i)
        nearest_first = np.argsort(dist[:, 0])
        neighbours = ycond[nearest_first[:min(k, len(ycond))]]

        # stats.mode returns an array or a scalar mode depending on the SciPy
        # version and input rank; ravel makes the extraction work for both.
        y_pred = np.ravel(stats.mode(neighbours)[0])[0]

        # Retain the example only if the current subset misclassifies it.
        if y_pred != y[i]:
            Xcond = np.append(Xcond, x_i, axis=0)
            ycond = np.append(ycond, y[i, None], axis=0)

    model = {
        'X': Xcond,
        'y': ycond,
        'k': k,
        'predict': predict,
    }
    return model
Esempio n. 9
0
    def compress(self, X):
        """Embed the rows of X in ``self.k`` dimensions from graph distances.

        Pairwise Euclidean distances are handed to
        ``self.construct_dist_graph``; its result is cleaned of infinite
        entries and used as the target of ``self._fun_obj_z``, minimised
        from a PCA starting point.

        Returns
        -------
        Z : array reshaped to (number of rows of X, self.k)
        """
        num_points = X.shape[0]

        euclid = np.sqrt(utils.euclidean_dist_squared(X, X))
        graph_dist = self.construct_dist_graph(X, euclid)

        # Disconnected pairs (infinite distance) get the largest finite
        # distance, which encourages them to end up far apart.
        unreachable = np.isinf(graph_dist)
        graph_dist[unreachable] = graph_dist[~unreachable].max()

        # PCA supplies the starting point for the optimiser.
        reducer = PCA(self.k)
        reducer.fit(X)
        start = reducer.compress(X)

        solution, _ = findMin(self._fun_obj_z, start.flatten(), 500,
                              graph_dist)
        return solution.reshape(num_points, self.k)
Esempio n. 10
0
    def compress(self, X):
        """Embed the rows of X in ``self.k`` dimensions from graph distances.

        A symmetric 0/1 mask marks, for each point, the first
        ``self.k + 1`` entries of its distance ranking; the masked distance
        matrix is passed to ``utils.dijkstra`` and ``self._fun_obj_z`` is
        minimised from a PCA starting point. Note that ``self.k`` is used
        both as the neighbour count and as the output dimension.

        Returns
        -------
        Z : array reshaped to (number of rows of X, self.k)
        """
        num_points = X.shape[0]

        pairwise = np.sqrt(utils.euclidean_dist_squared(X, X))

        nearest = np.argsort(pairwise, axis=1)[:, :self.k + 1]
        keep = np.zeros(pairwise.shape)
        for i in range(num_points):
            for j in nearest[i]:
                # Mark the edge in both directions.
                keep[i, j] = 1
                keep[j, i] = 1

        # Distances outside the mask become zero before the graph search.
        geodesic = utils.dijkstra(pairwise * keep)

        # Disconnected pairs (infinite distance) get the largest finite
        # distance, which encourages them to end up far apart.
        unreachable = np.isinf(geodesic)
        geodesic[unreachable] = geodesic[~unreachable].max()

        # PCA supplies the starting point for the optimiser.
        reducer = PCA(self.k)
        reducer.fit(X)
        start = reducer.transform(X)

        solution, _ = findMin(self._fun_obj_z, start.flatten(), 500, geodesic)
        return solution.reshape(num_points, self.k)
Esempio n. 11
0
    def compress(self, X):
        """Embed the rows of X in ``self.k`` dimensions from graph distances.

        Each point is linked (in both directions) to its nearest
        neighbours, the graph is passed to ``utils.dijkstra``, and
        ``self._fun_obj_z`` is minimised from a PCA starting point.

        Returns
        -------
        Z : array reshaped to (number of rows of X, self.k)
        """
        n = X.shape[0]

        # Compute Euclidean distances
        D = np.sqrt(utils.euclidean_dist_squared(X, X))

        # Build the symmetric nearest-neighbour graph. The ranking of a row
        # contains the point itself (distance zero), so nn + 1 entries are
        # taken to obtain nn genuine neighbours; taking only nn would link
        # each point to nn - 1 others. Slicing also copes with nn + 1 > n.
        order = np.argsort(D)
        G = np.zeros((n, n))
        for i in range(n):
            for j in order[i, :self.nn + 1]:
                G[i, j] = D[i, j]
                G[j, i] = D[j, i]
        D = utils.dijkstra(G)

        # If two points are disconnected (distance is Inf) then set their
        # distance to the maximum distance in the graph, to encourage them
        # to be far apart.
        unreachable = np.isinf(D)
        D[unreachable] = D[~unreachable].max()

        # Initialize low-dimensional representation with PCA
        pca = PCA(self.k)
        pca.fit(X)
        Z = pca.transform(X)

        # Solve for the minimizer
        z, _ = findMin(self._fun_obj_z, Z.flatten(), 500, D)
        return z.reshape(n, self.k)
Esempio n. 12
0
 def error(self, X, means=None):
     """Return the summed distance from each row of X to its closest mean.

     ``means`` defaults to ``self.means``. Distances are the square roots
     of the values returned by ``euclidean_dist_squared``.
     """
     centres = self.means if means is None else means
     nearest = np.amin(np.sqrt(euclidean_dist_squared(X, centres)), axis=1)
     return np.sum(nearest)
Esempio n. 13
0
    def fit(self, X):
        """Cluster the rows of X into ``self.k`` groups.

        Centres are seeded with randomly drawn rows of X, then assignments
        and centres are updated alternately until no assignment changes.

        Returns
        -------
        means : array of shape (self.k, number of columns of X); the same
            array is stored in ``self.means``.
        """
        N, D = X.shape
        y = np.ones(N)

        means = np.zeros((self.k, D))
        for kk in range(self.k):
            means[kk] = X[np.random.randint(N)]

        # The loop updates this array in place, so self.means always holds
        # the current centres.
        self.means = means

        while True:
            y_old = y

            # Compute squared distance to each mean; NaNs can never win.
            dist2 = euclidean_dist_squared(X, means)
            dist2[np.isnan(dist2)] = np.inf
            y = np.argmin(dist2, axis=1)

            # Update means. A cluster with no members keeps its previous
            # centre: averaging an empty selection would produce NaN (and a
            # RuntimeWarning) and leave that centre unusable.
            for kk in range(self.k):
                members = y == kk
                if np.any(members):
                    means[kk] = X[members].mean(axis=0)

            # Stop if no point changed cluster
            if np.sum(y != y_old) == 0:
                # The value is not used; the call is kept in case error()
                # has side effects such as logging -- TODO confirm.
                self.error(X)
                break

        return means
    def predict(self, Xtest):
        """Predict a 0/1 value for every label of every row of Xtest.

        For each test row the ``self.k`` training rows with the smallest
        score are selected, and a label is predicted as 1 when more than
        half of those rows have it set.

        Returns
        -------
        y_pred : array of shape (rows of Xtest, columns of self.y)

        Raises
        ------
        ValueError
            If ``self.method`` is not "L2", "cosine" or "pearson".
        """
        T, D = Xtest.shape

        if self.method == "L2":
            distance = utils.euclidean_dist_squared(self.X, Xtest)
        elif self.method == "cosine":
            distance = utils.cosine_similarity(self.X, Xtest)
        elif self.method == "pearson":
            distance = utils.pearson_corr(self.X, Xtest)
        else:
            # Without this branch an unknown method would surface later as
            # an unrelated "local variable referenced before assignment".
            raise ValueError(
                "unknown method %r; expected 'L2', 'cosine' or 'pearson'"
                % (self.method,))

        # NOTE(review): the smallest scores are selected for every method.
        # If the cosine/pearson helpers return similarities rather than
        # distances this picks the least similar rows -- confirm in utils.
        y_pred = np.zeros((T, self.y.shape[1]))
        for t in range(T):
            nearest = np.argsort(distance[:, t])[:self.k]
            for l in range(self.labels):
                # Fraction of the selected rows that have label l set.
                p = (1 / self.k) * np.sum(self.y[:, l][nearest])
                if p > 0.5:
                    y_pred[t, l] = 1

        return y_pred
Esempio n. 15
0
    def compress(self, X):
        """Embed the rows of X in ``self.k`` dimensions from graph distances.

        The neighbour lists returned by ``self.knn(X)`` define a symmetric
        graph weighted by Euclidean distance; the graph is passed to
        ``utils.dijkstra`` and ``self._fun_obj_z`` is minimised from a PCA
        starting point.

        Returns
        -------
        Z : array reshaped to (number of rows of X, self.k)
        """
        num_points = X.shape[0]

        pairwise = np.sqrt(utils.euclidean_dist_squared(X, X))

        graph = np.zeros((num_points, num_points))
        for i, neighbours in enumerate(self.knn(X)):
            for nb in neighbours:
                graph[i, nb] = pairwise[i, nb]
                graph[nb, i] = pairwise[nb, i]

        geodesic = utils.dijkstra(graph)

        # Disconnected pairs (infinite distance) get the largest finite
        # distance, which encourages them to end up far apart.
        unreachable = np.isinf(geodesic)
        geodesic[unreachable] = geodesic[~unreachable].max()

        # PCA supplies the starting point for the optimiser.
        start = PCA(self.k).fit(X).compress(X)

        solution = find_min(self._fun_obj_z, start.flatten(), 500, False,
                            geodesic)
        return solution.reshape(num_points, self.k)
Esempio n. 16
0
    def predict(self, Xtest):
        """Return the most common label among the ``self.k`` nearest rows.

        Distances come from ``utils.euclidean_dist_squared(self.X, Xtest)``
        and are ranked along axis 0, so each column of the ranking belongs
        to one test row. The first element of the ``stats.mode`` result is
        returned as-is.
        """
        sq_dist = utils.euclidean_dist_squared(self.X, Xtest)
        nearest = np.argsort(sq_dist, axis=0)[:self.k, :]
        neighbour_labels = self.y[nearest]
        return stats.mode(neighbour_labels)[0]
Esempio n. 17
0
File: knn.py Progetto: danre07/340a2
    def predict(self, Xtest):
        """Predict one label per row of Xtest by a vote of ``self.k`` neighbours.

        Returns
        -------
        y_pred : 1-D array with exactly one entry per row of Xtest, holding
            the most common label among that row's nearest training rows.
        """
        # Per the utils description: N by T array of pairwise squared
        # Euclidean distances between rows of self.X and rows of Xtest.
        dist_squared = utils.euclidean_dist_squared(self.X, Xtest)
        # Rank training rows by distance, separately for each test column.
        idx = np.argsort(dist_squared, axis=0)

        n, d = Xtest.shape
        labels = np.asarray(self.y)

        y_pred = []
        for i in range(n):
            # Labels of the k closest training rows for test row i.
            y_neighbors = np.ravel(labels[idx[:self.k, i]])
            # stats.mode returns a (mode, count) pair; only the mode belongs
            # in the prediction. Appending the whole result stored the count
            # as well, giving two entries per test row. ravel copes with the
            # mode being an array or a scalar depending on the SciPy version.
            y_pred.append(np.ravel(stats.mode(y_neighbors)[0])[0])

        return np.array(y_pred)
Esempio n. 18
0
    def compress(self, X):
        """Embed the rows of X in ``self.k`` dimensions from graph distances.

        Each point is linked (in both directions) to the first
        ``self.nn + 1`` entries of its distance ranking, the graph is passed
        to ``utils.dijkstra``, and ``self._fun_obj_z`` is minimised from a
        PCA starting point.

        Returns
        -------
        Z : array reshaped to (number of rows of X, self.k)
        """
        num_points = X.shape[0]

        pairwise = np.sqrt(utils.euclidean_dist_squared(X, X))

        # Nearest-neighbour graph, written symmetrically.
        graph = np.zeros([num_points, num_points])
        for i in range(num_points):
            for j in np.argsort(pairwise[i])[:self.nn + 1]:
                graph[i, j] = pairwise[i, j]
                graph[j, i] = pairwise[j, i]

        geodesic = utils.dijkstra(graph)

        # Disconnected pairs (infinite distance) get the largest finite
        # distance, which encourages them to end up far apart.
        unreachable = np.isinf(geodesic)
        geodesic[unreachable] = geodesic[~unreachable].max()

        # PCA supplies the starting point for the optimiser.
        reducer = PCA(self.k)
        reducer.fit(X)
        start = reducer.compress(X)

        solution, _ = findMin(self._fun_obj_z, start.flatten(), 500, geodesic)
        return solution.reshape(num_points, self.k)
Esempio n. 19
0
 def error(self, X):
     """Return the summed squared distance from each row of X to its mean.

     ``self.predict(X)`` supplies the index of the mean assigned to each
     row; distances come from ``euclidean_dist_squared``.
     """
     assigned = self.predict(X)
     return sum(
         np.sum(euclidean_dist_squared(X[assigned == c], self.means[[c]]))
         for c in range(self.means.shape[0])
     )
Esempio n. 20
0
    def error(self, X):
        """Return the sum over rows of X of the smallest distance to any mean.

        Distances are the values returned by ``euclidean_dist_squared`` for
        X against ``self.means``.
        """
        # The values are unused; the unpack only succeeds for a 2-D X.
        _num_rows, _num_cols = X.shape

        sq_dist = euclidean_dist_squared(X, self.means)
        nearest = sq_dist.min(1)
        return np.sum(nearest)
Esempio n. 21
0
 def error(self, X):
     """Return the summed distance from each row of X to its assigned mean.

     ``self.predict(X)`` gives the mean index per row; each term is the
     square root of the matching ``euclidean_dist_squared`` entry.
     """
     total = 0
     labels = self.predict(X)
     sq_dist = euclidean_dist_squared(X, self.means)
     for row, label in enumerate(labels):
         total += sq_dist[row][label] ** .5
     return total
Esempio n. 22
0
    def error(self, X):
        """Return the summed squared distance from each row of X to its mean.

        Rows are grouped by the index returned from ``self.predict(X)`` and
        each group is measured against its own row of ``self.means``.
        """
        centres = self.means
        assigned = self.predict(X)

        total = 0
        for c in range(centres.shape[0]):
            members = X[assigned == c]
            # Double brackets keep the centre as a 1-row matrix.
            centre = centres[[c]]
            total += np.sum(euclidean_dist_squared(members, centre))

        return total
Esempio n. 23
0
 def predict(self, X):
     """Return, for each row of X, the index of the closest median.

     Closeness is judged on ``utils.euclidean_dist_squared(X,
     self.medians)``; NaN entries are treated as infinitely far.
     """
     sq_dist = utils.euclidean_dist_squared(X, self.medians)
     undefined = np.isnan(sq_dist)
     sq_dist[undefined] = np.inf
     return np.argmin(sq_dist, axis=1)
Esempio n. 24
0
 def predict(self, Xtest):
     """Predict one label per row of Xtest by a vote of ``self.k`` neighbours.

     Training rows are ranked per test column on
     ``utils.euclidean_dist_squared(self.X, Xtest)`` and the mode of the
     first ``self.k`` labels is stored for each test row.
     """
     num_test, _test_cols = Xtest.shape
     # The values are unused; the unpack only succeeds for a 2-D self.X.
     _train_rows, _train_cols = self.X.shape

     ranking = np.argsort(utils.euclidean_dist_squared(self.X, Xtest), axis=0)
     nearest = ranking[:self.k]

     predictions = np.empty(num_test)
     for col in range(num_test):
         votes = self.y[nearest[:, col]]
         predictions[col] = stats.mode(votes)[0][0]
     return predictions
def predict(model, X):
    """Return, for each row of X, the index of the closest mean.

    The means are read from ``model['means']``; closeness is judged on
    ``utils.euclidean_dist_squared`` with NaN entries treated as
    infinitely far.
    """
    sq_dist = utils.euclidean_dist_squared(X, model['means'])
    undefined = np.isnan(sq_dist)
    sq_dist[undefined] = np.inf
    return np.argmin(sq_dist, axis=1)
Esempio n. 26
0
    def error(self, X):
        """Return the summed squared distance from each row of X to its mean.

        Rows are grouped by the index returned from ``self.predict(X)`` and
        each group is measured against its own row of ``self.means``.
        """
        centres = self.means
        labels = self.predict(X)

        per_cluster = (
            np.sum(utils.euclidean_dist_squared(X[labels == c], centres[[c]]))
            for c in range(centres.shape[0])
        )
        return sum(per_cluster)
 def error(self, X):
     """Return the summed squared distance from each row of X to its mean.

     ``self.predict(X)`` gives the mean index per row; the matching
     entries of ``euclidean_dist_squared(X, self.means)`` are added up,
     with NaN entries first replaced by infinity.
     """
     num_examples, _num_features = X.shape

     sq_dist = euclidean_dist_squared(X, self.means)
     sq_dist[np.isnan(sq_dist)] = np.inf

     labels = self.predict(X)

     total = 0
     for row in range(num_examples):
         total += sq_dist[row, labels[row]]
     return total
Esempio n. 28
0
    def predict(self, Xtest):
        """Predict one label per row of Xtest by a vote of ``self.k`` neighbours.

        Training rows are ranked per test row on
        ``utils.euclidean_dist_squared(Xtest, self.X)`` and ``utils.mode``
        of the first ``self.k`` labels is stored for each test row.
        """
        train_X = self.X
        num_votes = self.k
        train_y = self.y

        num_test = Xtest.shape[0]
        predictions = np.zeros(num_test)

        # Row r of the ranking lists training indices, nearest first.
        ranking = np.argsort(utils.euclidean_dist_squared(Xtest, train_X))
        for row in range(num_test):
            nearest = ranking[row, 0:num_votes]
            predictions[row] = utils.mode(train_y[nearest])
        return predictions
Esempio n. 29
0
File: cnn.py Progetto: jaysc96/CS340
def predict(model, Xtest):
    """Predict labels for Xtest by a vote of the k nearest stored examples.

    Parameters
    ----------
    model : dict with keys 'X' (stored examples), 'y' (their labels) and
        'k' (number of neighbours that vote)
    Xtest : array with one test example per row

    Returns
    -------
    yhat : array with one entry per row of Xtest holding the most common
        label among that row's neighbours; ties go to the smallest label.
    """
    X = model['X']
    y = model['y']
    k = model['k']

    # Rank stored examples per test column and keep the k closest.
    sq_dist = utils.euclidean_dist_squared(X, Xtest)
    nearest = np.argsort(sq_dist, axis=0)[0:k, :]
    Y = y[nearest]

    def _most_common(column):
        # np.unique sorts the labels, so argmax picks the smallest label
        # among those with the highest count.
        labels, counts = np.unique(column, return_counts=True)
        return labels[np.argmax(counts)]

    # apply_along_axis rejects empty iteration dimensions.
    if Y.size == 0:
        return np.empty(Y.shape[1:], dtype=Y.dtype)

    # Vote along the neighbour axis. Taking the maximum label here would
    # return the largest neighbour label, not the majority label.
    return np.apply_along_axis(_most_common, 0, Y)
Esempio n. 30
0
    def error(self, X):
        """Return the summed squared distance from each row of X to its median.

        Rows are grouped by the index returned from ``self.predict(X)`` and
        each group is measured against its own row of ``self.medians``.
        """
        # The values are unused; the unpack only succeeds for a 2-D X.
        _num_rows, _num_cols = X.shape
        centres = self.medians
        assigned = self.predict(X)

        total = 0
        for c in range(centres.shape[0]):
            members = X[assigned == c]
            # Double brackets keep the centre as a 1-row matrix.
            centre = centres[[c]]
            total += np.sum(utils.euclidean_dist_squared(members, centre))
        return total