def fit(X, k, do_plot=False):
    """Cluster the rows of X into k groups with K-medians (L1 distance).

    Each median starts at a randomly drawn row of X. The loop then alternates
    between assigning every example to its nearest median and recomputing
    each median as the per-feature median of its members, and stops once an
    iteration changes no assignment.

    Parameters
    ----------
    X : array of shape (N, D)
        One example per row.
    k : int
        Number of clusters.
    do_plot : bool, optional
        If true and D == 2, display the final clustering through
        ``utils.plot_2dclustering`` and ``plt.show()``.

    Returns
    -------
    dict
        ``'medians'`` holds the (k, D) array of cluster medians; a cluster
        that ended up empty has a NaN row. ``'predict'`` and ``'error'``
        hold the module-level objects of the same names.
    """
    N, D = X.shape
    y = np.ones(N)

    medians = np.zeros((k, D))
    # Random initialization: every median starts at a randomly drawn example
    # (draws are independent, so two medians may start at the same point).
    for kk in range(k):
        medians[kk] = X[np.random.randint(N)]

    dist = np.zeros((N, k))
    while True:
        y_old = y

        # L1 distance from every example to every median, summed over all D
        # features so the routine is not limited to two-dimensional data.
        for kk in range(k):
            dist[:, kk] = np.abs(X - medians[kk]).sum(axis=1)

        # A NaN median marks an empty cluster; make it unreachable so that
        # argmin never selects it.
        dist[np.isnan(dist)] = np.inf
        y = np.argmin(dist, axis=1)

        # Update medians
        for kk in range(k):
            cluster = X[y == kk]
            if cluster.shape[0] == 0:
                # Same result np.median would give for an empty slice (NaN),
                # but without triggering its RuntimeWarning.
                medians[kk] = np.nan
            else:
                medians[kk] = np.median(cluster, axis=0)

        changes = np.sum(y != y_old)
        print('Running K-medians, changes in cluster assignment = {}'.format(
            changes))

        # Stop if no point changed cluster
        if changes == 0:
            break

    model = dict()
    model['medians'] = medians
    model['predict'] = predict
    model['error'] = error

    if do_plot and D == 2:
        utils.plot_2dclustering(X, y)
        print("Displaying figure...")
        plt.show()

    return model
        def closure_1_3_1():
            """Fit ``Kmeans`` with 4 clusters 50 times and plot the best run.

            Reads ``X`` from the enclosing scope. The run whose
            ``error(X)`` is strictly lowest is kept; its predicted labels are
            drawn with ``utils.plot_2dclustering`` and the figure is saved
            to ``../figs/kmeans_outliers_best_model.png``.
            """
            n_clusters = 4
            lowest_error = np.inf
            winner = None

            for _ in range(50):
                candidate = Kmeans(n_clusters)
                candidate.fit(X)
                candidate_error = candidate.error(X)
                # Only a strict improvement replaces the current best run.
                if not candidate_error < lowest_error:
                    continue
                lowest_error = candidate_error
                winner = candidate

            plt.figure()
            labels = winner.predict(X)
            utils.plot_2dclustering(X, labels)

            out_path = os.path.join(
                "..", "figs", "kmeans_outliers_best_model.png")
            plt.savefig(out_path)
            print("\nFigure saved as '%s'" % out_path)
# Example #3
# 0
def fit(X, k, do_plot=False):
    """Cluster the rows of X into k groups with K-medians (L1 distance).

    Each median starts at a randomly drawn row of X. The loop then alternates
    between assigning every example to its nearest median and recomputing
    each median as the per-feature median of its members, and stops once an
    iteration changes no assignment.

    Parameters
    ----------
    X : array of shape (N, D)
        One example per row.
    k : int
        Number of clusters.
    do_plot : bool, optional
        If true and D == 2, display the final clustering through
        ``utils.plot_2dclustering`` and ``plt.show()``.

    Returns
    -------
    dict
        ``'medians'`` holds the (k, D) array of cluster medians; a cluster
        that ended up empty has a NaN row. ``'predict'`` and ``'error'``
        hold the module-level objects of the same names.
    """
    N, D = X.shape
    y = np.ones(N)

    medians = np.zeros((k, D))
    # Random initialization: every median starts at a randomly drawn example.
    for kk in range(k):
        medians[kk] = X[np.random.randint(N)]

    dist = np.zeros((N, k))
    while True:
        # y is rebound to a fresh array below (never mutated in place), so
        # y_old really is the previous assignment and the change count is
        # meaningful.
        y_old = y

        # L1 distance to each median: sum over features of |x_d - m_d|.
        # (|sum(x) - sum(m)| is not a distance: it is 0 for any two points
        # whose coordinates merely add up to the same value.)
        for kk in range(k):
            dist[:, kk] = np.abs(X - medians[kk]).sum(axis=1)

        # A NaN median marks an empty cluster. np.argmin would return the
        # index of a NaN entry, so mask those distances out first.
        dist[np.isnan(dist)] = np.inf
        y = np.argmin(dist, axis=1)

        # Update medians
        for kk in range(k):
            members = X[y == kk]
            if members.shape[0] == 0:
                # Same result np.median would give for an empty slice (NaN),
                # but without triggering its RuntimeWarning.
                medians[kk] = np.nan
            else:
                medians[kk] = np.median(members, axis=0)

        changes = np.sum(y != y_old)
        print('Running K-medians, changes in cluster assignment = {}'.format(
            changes))

        # Stop if no point changed cluster
        if changes == 0:
            break

    if do_plot and D == 2:
        utils.plot_2dclustering(X, y)
        print("Displaying figure...")
        plt.show()

    model = dict()
    model['medians'] = medians
    model['predict'] = predict
    model['error'] = error

    return model
# Example #4
# 0
def fit(X, k, do_plot=False):
    """Run K-means on the rows of X until the assignments stop changing.

    Centres are seeded from randomly drawn rows of X. Every pass assigns each
    example to the centre with the smallest value returned by
    ``utils.euclidean_dist_squared`` (NaN entries are treated as infinitely
    far) and then resets each centre to the mean of its members.

    Parameters
    ----------
    X : array of shape (N, D)
        One example per row.
    k : int
        Number of clusters.
    do_plot : bool, optional
        If true and D == 2, display the final clustering through
        ``utils.plot_2dclustering`` and ``plt.show()``.

    Returns
    -------
    dict
        Keys ``'means'`` (the (k, D) centres from the last pass),
        ``'predict'`` and ``'error'`` (the module-level objects of the same
        names).
    """
    n_examples, n_features = X.shape
    labels = np.ones(n_examples)

    # Seed each centre with a randomly drawn example.
    means = np.zeros((k, n_features))
    for centre in range(k):
        row = np.random.randint(n_examples)
        means[centre] = X[row]

    converged = False
    while not converged:
        previous = labels

        # Assignment step: nearest centre by squared Euclidean distance.
        sq_dist = utils.euclidean_dist_squared(X, means)
        nan_mask = np.isnan(sq_dist)
        sq_dist[nan_mask] = np.inf
        labels = np.argmin(sq_dist, axis=1)

        # Update step: start from a zeroed array, then fill in each mean.
        means = np.zeros((k, n_features))
        for centre in range(k):
            means[centre] = X[labels == centre].mean(axis=0)

        changes = np.sum(labels != previous)
        print('Running K-means, changes in cluster assignment = {}'.format(changes))

        # Finished once a full pass moves no example to another cluster.
        converged = changes == 0

    if D_is_two := (do_plot and n_features == 2):
        utils.plot_2dclustering(X, labels)
        print("Displaying figure...")
        plt.show()

    return {'means': means, 'predict': predict, 'error': error}
                        '--question',
                        required=True,
                        choices=[
                            '1', '1.1', '1.2', '1.3', '1.4', '2', '2.2', '4',
                            '4.1', '4.3'
                        ])

    io_args = parser.parse_args()
    question = io_args.question

    # Question 1: fit a single `Kmeans` model with k=4 on the 'clusterData'
    # dataset and save a plot of the resulting cluster assignment.
    if question == '1':
        X = utils.load_dataset('clusterData')['X']

        model = Kmeans(k=4)
        model.fit(X)
        # Colour each example by the cluster the fitted model predicts for it.
        utils.plot_2dclustering(X, model.predict(X))

        # Written relative to the current working directory: ../figs/
        fname = os.path.join("..", "figs", "kmeans_basic.png")
        plt.savefig(fname)
        print("\nFigure saved as '%s'" % fname)

    if question == '1.1':
        X = utils.load_dataset('clusterData')['X']

        # part 1: implement kmeans.error
        # part 2: get clustering with lowest error out of 50 random initializations

        best_model = None
        min_error = np.inf
        for i in range(50):
            model = Kmeans(k=4)
# Example #6
# 0
        print("Testing error =", te_err)

    # Question 3.1: run kmeans.fit with k=4 a total of 50 times on
    # 'clusterData' (1 initial fit + 49 retries) and display the clustering
    # of the run with the lowest error.
    if question == '3.1':
        X = utils.load_dataset('clusterData')['X']

        # The first fit provides the baseline model and error to beat.
        model = kmeans.fit(X, k=4)
        # The model is a dict; its 'error' entry is called with the model
        # itself as first argument.
        low = model['error'](model, X)

        # 49 further fits; only a strictly lower error replaces the best.
        for i in range(49):
            new_model = kmeans.fit(X, k=4)
            err = new_model['error'](new_model, X)
            if err < low:
                model = new_model
                low = err

        # Plot the examples coloured by the best model's predictions.
        utils.plot_2dclustering(X, model['predict'](model, X))
        print("Displaying figure...")
        plt.title("K-Means on clusterData")
        plt.show()

        # part 1: implement kmeans.error
        # part 2: get clustering with lowest error out of 50 random initializations

    if question == '3.2':
        X = utils.load_dataset('clusterData')['X']

        # part 3: plot min error across 50 random inits, as k is varied from 1 to 10

        low = np.zeros(10)

        for k in range(1, 11):