Example #1
def compute_gradient(local_X, local_cluster_labels, local_centroids, lr):
    """
        Compute local gradient
        Input:  local_X, local_cluster_labels, local_centroids as above
                lr - the learning rate (float)

        Output: local_grad - local gradients as list of k many gradients
    """
    m, n = get_data_dims(local_X)
    # One m x n gradient accumulator per centroid
    local_grad = [np.zeros([m, n]) for _ in local_centroids]
    for x, label in zip(local_X, local_cluster_labels):
        # Accumulate the learning-rate-scaled pull of each point toward
        # its assigned centroid
        local_grad[label] += lr * (x - local_centroids[label])
    return local_grad
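None of these snippets show their imports or the get_data_dims helper they all call. A minimal sketch of what they appear to assume (NumPy, SciPy's cdist, and a list of equally shaped m x n arrays) might look like the following; the helper body is a guess from how it is used, not the project's actual code:

import copy

import numpy as np
from scipy.spatial.distance import cdist


def get_data_dims(X):
    # Hypothetical helper: the snippets treat X as a non-empty list of
    # equally shaped m x n NumPy arrays and ask for that shape
    m, n = X[0].shape
    return m, n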
Example #2
def pp_init(local_X, k):
    """Do a simplified version of k-means++ initialization.

    Note: unlike full k-means++, the k-1 remaining centroids are all
    sampled at once from the squared distances to the first centroid,
    rather than re-weighting after each pick.
    """
    ind = np.random.choice(len(local_X), 1)[0]
    m, n = get_data_dims(local_X)
    X_flat = [x.reshape(1, m*n) for x in local_X]
    first = X_flat[ind]
    xcopy = copy.deepcopy(local_X)
    del X_flat[ind]
    del xcopy[ind]
    # Squared correlation distances to the first centroid, normalized
    # into a sampling distribution over the remaining points
    D = [cdist(x, first, metric='correlation')**2 for x in X_flat]
    D = np.array(D).flatten()
    D = D / np.sum(D)
    # Sample without replacement so no point is chosen as a centroid twice
    remain = [xcopy[i]
              for i in np.random.choice(len(xcopy), k-1, replace=False, p=D)]
    return [local_X[ind]] + remain
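A hedged usage sketch with toy data, assuming the get_data_dims sketch above; toy_X and the seed are illustrative only:

np.random.seed(0)
toy_X = [np.random.rand(2, 3) for _ in range(20)]  # hypothetical toy data
centroids = pp_init(toy_X, k=3)
assert len(centroids) == 3 and centroids[0].shape == (2, 3)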
Example #3
def check_stopping(local_centroids, previous_centroids, epsilon):
    """
        Check if centroids have changed beyond some epsilon tolerance

        Input: local_centroids as above
                previous_centroids, the centroids from the prior iteration
                epsilon - the tolerance threshold (float)

        Output: (not_converged, delta) - True if delta is above the
                threshold (else False), plus the delta value itself
    """
    m, n = get_data_dims(local_centroids)
    flat_centroids = [w.reshape(1, m*n) for w in local_centroids]
    flat_previous = [w.reshape(1, m*n) for w in previous_centroids]
    # delta is the change in centroids, computed by distance metric
    delta = np.sum([cdist(w, flat_previous[k], metric='correlation')
                    for k, w in enumerate(flat_centroids)])
    return delta > epsilon, delta
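A toy sketch of how the (flag, delta) pair can drive an iteration loop; the geometric nudge toward fixed targets is purely illustrative:

np.random.seed(1)
targets = [np.random.rand(2, 3) for _ in range(3)]
centroids = [np.random.rand(2, 3) for _ in range(3)]
not_converged = True
while not_converged:
    previous = centroids
    # Hypothetical update: move each centroid halfway toward its target
    centroids = [c + 0.5 * (t - c) for c, t in zip(centroids, targets)]
    not_converged, delta = check_stopping(centroids, previous, 1e-5)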
Example #4
def compute_mean(local_X, local_cluster_labels, k):
    """
        Compute the local mean, which is broadcast back to the aggregator

        Input: local_X, local_cluster_labels, k as above

        Output: list of k many local mean matrices, shape m x n
    """
    m, n = get_data_dims(local_X)
    origin = np.zeros([m, n])
    local_means = [[] for i in range(k)]
    for i in range(len(local_cluster_labels)):
        local_means[local_cluster_labels[i]] += [local_X[i]]

    #  Return the origin if no instances have been assigned to a cluster
    #  !!! is this the way to handle this?
    return [np.mean(lmean, 0) if lmean else origin for lmean in local_means]
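A quick hedged usage check on toy data (labels and values are illustrative):

np.random.seed(2)
toy_X = [np.random.rand(2, 3) for _ in range(6)]
labels = [0, 1, 2, 0, 1, 2]
means = compute_mean(toy_X, labels, k=3)
assert np.allclose(means[0], np.mean([toy_X[0], toy_X[3]], 0))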
Example #5
def compute_clustering(local_X, local_centroids):
    """
        Compute local clustering by associating each data instance with the
        nearest centroid

        Input: local_X, local_centroids as above

        Output: cluster_labels - a list of N many integers,
                                 the labels for each instance
    """
    cluster_labels = []
    m, n = get_data_dims(local_X)
    X_flat = [x.reshape(1, m*n) for x in local_X]
    w_flat = [w.reshape(1, m*n) for w in local_centroids]
    for x in X_flat:
        # Assign each instance to the centroid at minimum correlation
        # distance
        distances = [cdist(x, w, metric='correlation') for w in w_flat]
        cluster_labels.append(int(np.argmin(distances)))
    return cluster_labels
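A hedged usage sketch, again on hypothetical toy data:

np.random.seed(3)
toy_X = [np.random.rand(2, 3) for _ in range(10)]
centroids = [np.random.rand(2, 3) for _ in range(3)]
labels = compute_clustering(toy_X, centroids)
assert len(labels) == 10 and all(0 <= lab < 3 for lab in labels)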
Example #6
def compute_mean(local_X, local_cluster_labels, k):
    """
        Compute the local mean, which is broadcast back to the aggregator

        Input: local_X, local_cluster_labels, k as above

        Output: list of k many local mean matrices, shape m x n
    """
    m, n = get_data_dims(local_X)
    origin = np.zeros([m, n])
    local_means = [np.zeros([m, n]) for i in range(k)]
    local_counts = [0]*k
    for i, label in enumerate(local_cluster_labels):
        local_means[label] += local_X[i]
        local_counts[label] += 1

    #  Return the origin if no instances have been assigned to a cluster
    #  !!! is this the way to handle this?
    return [lmean/lcount if lcount > 0 else origin
            for lmean, lcount in zip(local_means, local_counts)]
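This running-sum variant trades the per-cluster lists of Example #4 for constant-memory accumulators; given the same inputs the two should agree. A hypothetical equivalence check (compute_mean_lists is an assumed rename of Example #4's version, since both share a name):

np.random.seed(2)
toy_X = [np.random.rand(2, 3) for _ in range(6)]
labels = [0, 1, 2, 0, 1, 2]
means_sum = compute_mean(toy_X, labels, k=3)           # this version
means_lists = compute_mean_lists(toy_X, labels, k=3)   # Example #4's version
assert all(np.allclose(a, b) for a, b in zip(means_sum, means_lists))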
Example #7
def main(X,
         k,
         optimization='lloyd',
         s=2,
         epsilon=0.00001,
         shuffle=True,
         lr=0.001,
         verbose=True):
    m, n = get_data_dims(X)
    nodes, inds = split_over_nodes(X, s, shuffle=shuffle)
    X = [X[i] for i in inds]  # Reshuffle X to match the random split
    tracked_delta = []
    num_iter = 0
    not_converged = True

    # Have each site compute k initial clusters locally
    local_centroids = [
        cent for node in nodes
        for cent in local.initialize_own_centroids(node, k)
    ]
    # and select k random clusters from the s*k pool
    np.random.shuffle(local_centroids)
    remote_centroids = local_centroids[:k]

    # Remote Optimization Loop
    while not_converged:
        cluster_labels = [None for j in range(s)]  # the clusterings
        local_optimizer = [None for j in range(s)]  # the optimization entity

        # Local computation loop
        for i, node in enumerate(nodes):
            # Each site computes its local clustering
            cluster_labels[i] = \
                        local.compute_clustering(node, remote_centroids)
            if optimization == 'lloyd':
                # Lloyd has sites compute means locally
                local_optimizer[i] = local.compute_mean(
                    node, cluster_labels[i], k)
            elif optimization == 'gradient':
                # Gradient descent has sites compute gradients locally
                local_optimizer[i] = \
                    local.compute_gradient(node, cluster_labels[i],
                                               remote_centroids, lr)
        # End of Local Computations

        # Both objects can be aggregated by taking a sum
        remote_optimizer = remote.aggregate_sum(local_optimizer)
        if optimization == 'lloyd':
            # and for the mean, we further divide by the number of sites
            remote_optimizer = [r / s for r in remote_optimizer]

            # Then, update centroids as corresponding to the local mean
            [remote_centroids, previous] = \
                local.mean_step(remote_optimizer,
                                    remote_centroids)
        elif optimization == 'gradient':
            # Then, update centroids according to one step of gradient descent
            [remote_centroids, previous] = \
                local.gradient_step(remote_optimizer, remote_centroids)

        # Check the stopping condition "locally" at the aggregator
        # - returns false if converged
        remote_check, delta = local.check_stopping(remote_centroids, previous,
                                                   epsilon)
        if verbose:
            print("Multi-Shot %s ; iter : %d delta : %f" %
                  (optimization, num_iter, delta))
        not_converged = remote_check
        tracked_delta.append(delta)
        num_iter += 1

    # Compute the final clustering "locally" at the aggregator
    cluster_labels = [
        clusters for node in nodes
        for clusters in local.compute_clustering(node, remote_centroids)
    ]
    return {
        'centroids': remote_centroids,
        'cluster_labels': cluster_labels,
        'X': X,
        'delta': tracked_delta,
        'num_iter': num_iter,
        'name': 'multishot_%s' % optimization
    }
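main() leans on several local.* and remote.* helpers that are not shown. The sketches below are guesses reconstructed from the call sites (names match the code; bodies are assumptions, not the project's implementation):

def aggregate_sum(local_optimizer):
    # Element-wise sum across the s sites of each site's list of k
    # matrices, yielding one list of k matrices
    return [np.sum(site_vals, 0) for site_vals in zip(*local_optimizer)]


def mean_step(remote_means, remote_centroids):
    # Replace the centroids with the aggregated means;
    # return [new, previous] as the call sites expect
    return [list(remote_means), remote_centroids]


def gradient_step(remote_grads, remote_centroids):
    # compute_gradient already folds in the learning rate and sign, so
    # one step is plain addition; return [new, previous]
    return [[c + g for c, g in zip(remote_centroids, remote_grads)],
            remote_centroids]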
Example #8
def main(X, k, optimization='lloyd', s=2, epsilon=0.00001, shuffle=True,
         lr=0.01, verbose=True):
    """
        Local Variables - X: a list of N many m x n matrices storing data
                          k: number of clusters (int)
                          local_centroids : an s x k 2-d list of
                          m x n matrices storing cluster centroids
    """
    m, n = get_data_dims(X)
    nodes, inds = split_over_nodes(X, s, shuffle=shuffle)
    X = [X[i] for i in inds]  # Reshuffle X to match the random split
    tracked_delta = []
    num_iter = 0
    not_converged = True

    # Have each site compute k initial clusters locally
    local_centroids = [local.initialize_own_centroids(node, k)
                       for node in nodes]

    # Local Optimization Loop
    while not_converged:
        cluster_labels = [None for j in range(s)]  # the clusterings
        local_delta = [None for j in range(s)]  # Track all local delta
        local_stop = [False for j in range(s)]  # And all local stopping conds
        for i, node in enumerate(nodes):
            # Each local site computes its clustering
            cluster_labels[i] = \
                         local.compute_clustering(node, local_centroids[i])
            if optimization == 'lloyd':
                # Computes its local mean if doing Lloyd, and updates centroids
                local_means = local.compute_mean(node,
                                                     cluster_labels[i], k)
                [local_centroids[i], previous_centroids] = \
                    local.mean_step(local_means,
                                        local_centroids[i])
            elif optimization == 'gradient':
                # Computes the local gradient if doing GD, and takes a GD step
                local_grad = local.compute_gradient(node,
                                                        cluster_labels[i],
                                                        local_centroids[i],
                                                        lr)
                [local_centroids[i], previous_centroids] = \
                    local.gradient_step(local_grad, local_centroids[i])
            # Check local stopping conditions
            local_stop[i], local_delta[i] = \
                local.check_stopping(local_centroids[i],
                                         previous_centroids, epsilon)
        num_iter += 1
        tracked_delta.append(local_delta)
        if verbose:
            print("Single-Shot %s ; iter : %d delta : %f"
                  % (optimization, num_iter, max(local_delta)))

        # if any of the sites are still iterating, keep the global loop running
        # TODO: we can save computations by locally waiting if local
        #       conditions are met
        not_converged = any(local_stop)

    # Aggregate clusters remotely
    remote_centroids = remote.aggregate_clusters(local_centroids)
    # And compute the final global clustering
    cluster_labels = [clusters for node in nodes for clusters in
                      local.compute_clustering(node, remote_centroids)]
    return {'centroids': remote_centroids, 'cluster_labels': cluster_labels,
            'X': X, 'delta': tracked_delta, 'iter': num_iter,
            'name': 'singleshot_%s' % optimization}
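A hypothetical end-to-end invocation on synthetic data, assuming the helpers above are packaged as the local and remote modules that main() uses:

np.random.seed(4)
X = [np.random.rand(2, 3) for _ in range(100)]
result = main(X, k=3, optimization='lloyd', s=2, verbose=False)
print(result['name'], result['iter'], len(result['cluster_labels']))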