コード例 #1
0
def main():
    """Cluster the CSV dataset given on the command line with k = 2..4.

    For each k the clusters and centroids are plotted, then the confusion
    matrix and purity are computed and saved via ``saveData``.
    """
    # Guard against a missing CLI argument (previously an IndexError).
    if len(sys.argv) < 2:
        print("Usage: python <script> <path-to-csv>")
        return
    path = sys.argv[1]

    csvManager = CSVManager()
    df = csvManager.read(path)

    # Replace NaNs and drop non-numeric (object) columns before clustering.
    df = csvManager.replaceNan(df)
    formattedCSV = csvManager.deleteObjectColumns(df)
    matrix = csvManager.convertCSVToMatrix(formattedCSV)

    try:
        for k in range(2, 5):
            kmeans = KMeans(k)
            kmeans.fit(matrix)

            # Centroids: black circles.
            for centroid in kmeans.centroids:
                plt.scatter(kmeans.centroids[centroid][0], kmeans.centroids[centroid][1],
                            marker="o", color="k", s=150, linewidths=5)

            # Cluster members: x-marks in a random per-cluster color.
            for classification in kmeans.classifications:
                color = randomColor()
                for featureset in kmeans.classifications[classification]:
                    plt.scatter(featureset[0], featureset[1],
                                marker="x", color=color, s=60, linewidths=2)

            plt.show()

            confusionMatrix, purity = kmeans.purity()
            saveData(confusionMatrix, purity, path, k)
    except Exception as e:
        # NOTE(review): previously every failure was reported as an empty
        # cluster; also print the actual exception so unrelated errors
        # are not silently mislabeled.
        print("An empty cluster was found, please run the program again. This program does not handle empty clusters")
        print("Underlying error: {!r}".format(e))
コード例 #2
0
ファイル: GaussianMixtureModel.py プロジェクト: M1F1/k_mean
    def __init__(self, cluster_number: int, data: np.ndarray):
        """Initialize GMM parameters from a converged K-means run.

        Means come from the K-means cluster centers, priors from the
        fraction of points assigned to each cluster, and covariances from
        the scatter of each cluster's members around its mean.
        """
        k_means = KMeans(data=data, cluster_number=cluster_number)
        k_means.fit()
        k_means.visualize()

        self.cluster_number = cluster_number
        self.data = data

        # Cluster means straight from K-means.  (The original pre-filled
        # this and the priors with np.zeros only to overwrite them on the
        # next line -- those dead stores are removed.)
        self.clusters_means = k_means.cluster_data_means

        # Prior of each cluster = share of points K-means assigned to it.
        self.clusters_priors = np.sum(k_means.cluster_assignment_matrix, axis=0) / data.shape[0]

        self.clusters_covariances = np.zeros((cluster_number, data.shape[1], data.shape[1]))
        # NOTE(review): this is the biased (1/N) covariance estimate; the
        # unbiased estimator would divide by (N - 1) -- confirm intent.
        for k in range(self.cluster_number):
            # Rows assigned to cluster k, centered on that cluster's mean.
            idx = np.nonzero(k_means.cluster_assignment_matrix[:, k])[0]
            cov_data = (data[idx] - self.clusters_means[k])

            self.clusters_covariances[k] = np.dot(cov_data.T, cov_data) / idx.shape[0]

        # Buffers filled later: data with cluster labels, and per-point
        # cluster membership probabilities.
        self.clustered_data = np.zeros((data.shape[0], data.shape[1] + 1))
        self.cluster_probability_matrix = np.zeros((self.data.shape[0], cluster_number))
コード例 #3
0
    def fit(self, data):
        """Refine K-means centroids with particle swarm optimization.

        K-means provides the initial candidate; PSO then optimizes the
        flattened centroid vector, and the optimum is reshaped back into
        ``self.centroids`` (dict: cluster index -> centroid array).
        """
        kmeans = KMeans(n_clusters=self.n_clusters)
        kmeans.fit(data)

        # Flatten the K-means centroids into a single candidate vector
        # (comprehension replaces the original append loop).
        candidate = np.array([kmeans.centroids[k] for k in kmeans.centroids]).ravel()

        self.dim = data.shape[1]
        self.pso = PSO(dim=self.dim * self.n_clusters,
                       minf=0,
                       maxf=1,
                       swarm_size=self.swarm_size,
                       n_iter=self.n_iter,
                       w=self.w,
                       lb_w=self.lb_w,
                       c1=self.c1,
                       c2=self.c2)
        self.pso.set_candidate(candidate)
        self.pso.optimize(self.__objective_function,
                          customizable=True,
                          dim=self.dim,
                          n_clusters=self.n_clusters,
                          data=data)

        # Unflatten the global optimum back into one centroid per cluster;
        # dict(enumerate(...)) replaces the original range(len()) loop.
        raw_centroids = self.pso.global_optimum.pos.reshape(
            (self.n_clusters, self.dim))
        self.centroids = dict(enumerate(raw_centroids))
コード例 #4
0
def main():
    """Parse CLI arguments, run K-means on the given CSV, and plot the result."""
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help="path to dataset")
    parser.add_argument('--k', type=int, default=3,
                        help="quantity of clusters (default 3)")
    parser.add_argument('--it', type=int, default=100,
                        help="max iterations (default 100)")
    parser.add_argument('--tol', type=float, default=0.001,
                        help="tolerance (default 0.001)")
    args = parser.parse_args()

    # Load the CSV and keep only numeric data.
    manager = CSVManager()
    frame = manager.read(args.path)
    frame = manager.replaceNan(frame)
    numeric = manager.deleteObjectColumns(frame)
    matrix = manager.convertCSVToMatrix(numeric)

    # Fit K-means with the user-supplied hyper-parameters.
    model = KMeans(args.k, args.it, args.tol)
    model.fit(matrix)

    # Centroids as black circles.
    for key in model.centroids:
        center = model.centroids[key]
        plt.scatter(center[0], center[1],
                    marker="o", color="k", s=150, linewidths=5)

    # Members of each cluster as x-marks in a random per-cluster color.
    for label in model.classifications:
        cluster_color = randomColor()
        for point in model.classifications[label]:
            plt.scatter(point[0], point[1],
                        marker="x", color=cluster_color, s=60, linewidths=2)

    plt.show()
コード例 #5
0
def main():
    """Run K-means for k = 2..4 and write the simplified silhouette (SSWC)
    for each k to ``result/result.txt``.
    """
    path = sys.argv[1]

    csvManager = CSVManager()
    df = csvManager.read(path)

    df = csvManager.replaceNan(df)

    # Drop non-numeric columns once (the original called this twice).
    formattedCSV = csvManager.deleteObjectColumns(df)
    matrix = csvManager.convertCSVToMatrix(formattedCSV)

    try:
        with open('result/result.txt', 'w') as file:
            res = ''
            for k in range(2, 5):
                kmeans = KMeans(k)
                kmeans.fit(matrix)

                # Simplified silhouette width criterion for this k.
                simplifiedSilhouette = SimplifiedSilhouette(
                    formattedCSV, kmeans)
                sswc = simplifiedSilhouette.calculate()
                res += 'K = ' + str(k) + '; ' + 'SSWC = ' + str(sswc) + '\n'
            file.write(res)

    except Exception:
        print("An empty cluster was found, please run the program again. This program does not handle empty clusters")
コード例 #6
0
def treeClassification(data):
    """Cluster the data into six groups and print a per-cluster report."""
    model = KMeans(n_clusters=6, max_iter=200)
    model.fit(data.values, True)

    cluster_report(data, model.prediction)
コード例 #7
0
def attr_analysis(data):
    """Box-plot every attribute of each K-means cluster, one figure per cluster."""
    model = KMeans(n_clusters=6, max_iter=200)
    model.fit(data.values, True)

    for cluster in model.clusters:
        attr_count = len(cluster.data[0])
        for idx in range(attr_count):
            values = _column(cluster.data, idx)
            axes = plt.subplot(3, 6, idx + 1)
            axes.set_title(data.columns[idx], {'fontsize': 6})
            plt.boxplot(values)

        # One figure per cluster: show before moving on to the next.
        plt.show()
コード例 #8
0
 def findClustering(self, cluster):
     """Split ``cluster`` into two sub-clusters with K-means.

     Returns
     -------
     tuple
         (SSE of the clustering, predicted class per element).
     """
     # Use KMeans to form 2 clusters
     kmeans = KMeans()
     kmeans.setK(2)
     kmeans.fit(cluster)
     predictedClass = kmeans.predict(cluster)
     centroids = kmeans.centroids
     # Compute SSE for the clustering.  Renamed the accumulator so it no
     # longer shadows the builtin ``sum``.
     # NOTE(review): every element is summed against EVERY centroid, not
     # just its assigned one -- confirm this is the intended criterion.
     total = 0
     for clusterIndex in range(len(centroids)):
         for element in cluster:
             total += (np.linalg.norm(
                 np.array(element) - np.array(centroids[clusterIndex])))**2
     return total, predictedClass
コード例 #9
0
def sse_plot(X, start=2, stop=20):
    """Plot SSE (inertia) against cluster count for the elbow method."""
    cluster_range = range(start, stop)
    sse_values = []
    for n in cluster_range:
        print("====ITERATION:", n)
        model = KMeans(n_clusters=n, max_iter=1000)
        model.fit(X, True)
        sse_values.append(model.sum_squared_error())

    plt.figure(figsize=(12, 6))
    plt.plot(cluster_range, sse_values, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('SSE')
    plt.title('Inertia plot with K')
    plt.xticks(list(cluster_range))
    plt.show()
コード例 #10
0
 def initialize_(self, X):
     """Initialize the conditional probability matrix (n x k).

     With 'kmeans' initialization each row is a one-hot vector for the
     point's K-means label; otherwise rows are random and normalized to
     sum to one.
     """
     n, p = X.shape
     # kmeans initialization
     if self.initialization_ == 'kmeans':
         kmeans_clstr = KMeans(nr_clusters=self.k_, n_init=1)
         kmeans_clstr.fit(X)
         labels = kmeans_clstr.labels_
         self.cond_prob_ = np.zeros((n, self.k_))
         # One-hot encode the labels in a single vectorized assignment
         # (replaces the original per-row Python loop).
         self.cond_prob_[np.arange(n), np.asarray(labels, dtype=int)] = 1
     # else randomly initialize them
     else:
         foo = np.random.rand(n, self.k_)
         self.cond_prob_ = foo / np.sum(foo, axis=1)[:, np.newaxis]
コード例 #11
0
def main():
    """Run K-means (k=5) on data2.txt and plot clusters, final centroids,
    and the initial random starting points.
    """
    # Load data.
    X = handle_data('data2.txt')
    model = KMeans(5)
    model.fit(X)

    # Large repeated palette so any cluster index maps to a color.
    colors = 10 * [
        'gold', 'mediumseagreen', 'orangered', 'lightpink', 'coral',
        'mediumslateblue', 'violet', 'magenta'
    ]
    plt.figure(figsize=(10, 10))

    # Plot every feature point in its cluster's color.
    for label in model.classes:
        cluster_color = colors[label]
        for point in model.classes[label]:
            plt.scatter(point[0], point[1], color=cluster_color, s=10)

    # Final centroid centers as black crosses.
    for key in model.centroids:
        center = model.centroids[key]
        plt.scatter(center[0], center[1], c='k', s=100, marker="x")

    # Initial random starting points as black stars.
    for idx in range(model.k):
        start = model.randoms[idx]
        plt.scatter(start[0], start[1], marker='*', c='k', s=100)

    # Plot attributes.
    plt.legend(['* = Initial random points', 'X = Final cluster centers'])
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.title('k-Means')
    plt.show()
    print('\t\t\tIteration:', model.iterations)
    print('\n\t\t\tk value: ', model.k)
コード例 #12
0
ファイル: main.py プロジェクト: bluesilence/python
def main():
    """Generate blob data, cluster it with k-means++ initialization, and
    plot the predicted labels together with the final centroids.
    """
    random_seed = 0
    iteration = 50
    init_method = 'kmeans++'

    # Synthetic dataset: four gaussian blobs.
    X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=random_seed)
    plt.scatter(X[:, 0], X[:, 1], s=4, c='blue')

    model = KMeans()
    model.fit(X, 4, random_seed=random_seed, iteration=iteration, init_method=init_method)
    y_pred = model.predict(X)

    # Points colored by predicted cluster; centroids as faint red dots.
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(X[:, 0], X[:, 1], c=y_pred, s=4, cmap='viridis')
    centers = model.centroids
    axes.scatter(centers[:, 0], centers[:, 1], c='red', s=15, alpha=0.5)
    plt.show()
コード例 #13
0
def visualization_2d(data):
    """Project the data to 2-D with PCA, cluster it, and scatter-plot the
    clusters in distinct colors.
    """
    # Reduce dimensions of the dataset based on data variance (PCA).
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(data)

    # K-means on the PCA projection (n_clusters: 6 or 7).
    km = KMeans(n_clusters=6, max_iter=200)
    km.fit(pca_data, True)

    colors = ['red', 'green', 'blue', 'purple', 'orange', 'yellow', 'gray']
    # enumerate + comprehensions replace the original range(len()) loop
    # with manual append calls.
    for i, cluster in enumerate(km.clusters):
        pc1 = [row[0] for row in cluster.data]
        pc2 = [row[1] for row in cluster.data]
        plt.scatter(pc1, pc2, c=colors[i], label='cluster ' + str(i))

    plt.show()
コード例 #14
0
ファイル: main.py プロジェクト: chachalaca/K-means
def main():
    """Cluster the iris data (k=3) and print a prediction per flower."""
    km = KMeans(3)

    iris = pd.read_csv("iris.csv")
    data = np.array(
        iris[["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]].values.tolist()
    )

    km.fit(data)

    print("cluster centers: %s" % km.cluster_centers)

    for d in iris.values:
        # BUG FIX: the original passed d[2] twice and skipped d[1], so the
        # vector was (Sepal.Width, Sepal.Width, Petal.Length, Petal.Width)
        # instead of the four iris features d[1]..d[4] (d[0] is the row
        # index column, d[5] the species -- consistent with the print below).
        prediction = km.predict([[
            d[1],
            d[2],
            d[3],
            d[4]
        ]])
        print(d[5]+" - "+str(prediction[0]))
コード例 #15
0
def main():
    """PCA-project the wine dataset, cluster it, evaluate, and save results."""
    dim = 2
    num_class = 3
    dataset_dir = '../input/wine.csv'

    # Load the dataset and project it down to `dim` components.
    train_x, train_y, raw_data = data_loader(dataset_dir)
    pca = PCA(first_k=dim, use_threshold=False, threshold=0.5)
    proj = pca.fit(train_x)

    # Cluster the projected data.
    kmeans = KMeans(K=num_class)
    center, predict_y = kmeans.fit(proj)

    # Evaluate, visualize, and persist the predictions.
    result = evaluate(proj, train_y, predict_y, k=num_class)
    visualization(center, proj, predict_y, dim)
    save_to_csv(raw_data, predict_y)
    print(result)
コード例 #16
0
def exploratory_analysis(data):
    """Cluster the most informative columns and draw pair plots of the
    resulting clusters.
    """
    best_columns = [
        "BALANCE", "PURCHASES", "CASH_ADVANCE", "CREDIT_LIMIT", "PAYMENTS",
        "MINIMUM_PAYMENTS", "PRC_FULL_PAYMENT"
    ]
    # Restrict the frame to the selected columns.
    best_data = pd.DataFrame(data[best_columns])

    model = KMeans(n_clusters=6, max_iter=200)
    model.fit(best_data.values, True)

    # Attach the cluster label as an extra column for plotting.
    best_data['cluster'] = model.prediction
    best_columns.append('cluster')

    # Full pair plot of every column against every other, colored by cluster.
    sb.pairplot(best_data[best_columns],
                hue='cluster',
                x_vars=best_columns,
                y_vars=best_columns,
                height=5,
                aspect=1)

    # Two partial plots: first four, then last three features vs. cluster.
    sb.pairplot(best_data[best_columns],
                hue='cluster',
                x_vars=best_columns[0:4],
                y_vars='cluster',
                height=5,
                aspect=1)
    sb.pairplot(best_data[best_columns],
                hue='cluster',
                x_vars=best_columns[4:7],
                y_vars='cluster',
                height=5,
                aspect=1)

    plt.show()
コード例 #17
0
    def fit_predict(self, X):
        """Fit a spectral clustering model on the dataset and cluster it.

        Parameters
        ----------
        X : class:`ndarray<numpy.ndarray>` of shape (N,M)
            Training set of N samples, each with M attributes.

        Returns
        -------
        Predicted cluster label for every sample.
        """

        # Build the affinity (similarity) matrix with the configured scheme.
        if self.affinity == "full_link":
            w = self.full_link(X, dist=self.rbf)
        elif self.affinity == "nearest_neighbors":
            w = self.knn_nearest(X)
        # Normalized graph Laplacian of the affinity matrix.
        norm_laplacians = self.laplacians_matrix(w)

        # Spectral embedding: eigenvectors of the n_clusters smallest
        # eigenvalues form the low-dimensional representation H.
        # NOTE(review): np.linalg.eig can return complex values; if the
        # Laplacian is symmetric, np.linalg.eigh would be the numerically
        # safer choice -- confirm.
        eigval, eigvec = np.linalg.eig(norm_laplacians)
        ix = np.argsort(eigval)[0:self.n_clusters]
        H = eigvec[:, ix]

        # Ordinary K-means in the embedded space gives the final labels.
        kmeans = KMeans(n_clusters=self.n_clusters)
        kmeans.fit(H)
        pred = kmeans.predict(H)
        return pred
コード例 #18
0
def main():
    """Generate 2-D gaussian data, cluster it with KMeans, and save plots."""
    # Log everything to result/log.txt.
    logging.basicConfig(filename="result/log.txt",
                        filemode='w',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.DEBUG)
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('-n_clusters', type=int, default=5)
    parser.add_argument('-n_points', type=int, default=100)
    opt = parser.parse_args()

    # Sample opt.n_points points from each of opt.n_clusters 2-D gaussians.
    tester = Tester(n_gaussian_clusters=opt.n_clusters)
    data, labels = tester.generate_2d_gaussian_points(
        how_many_per_each_gaussian=opt.n_points)
    logging.info(" Generated {} data points from {} different 2 dimensional "
                 "multivariate gaussian distributions. ({} data points for "
                 "each cluster.)".format(opt.n_clusters * opt.n_points,
                                         opt.n_clusters, opt.n_points))

    # Plot the raw data and the ground-truth gaussians.
    utils.draw(data, labels, without_label_color=True, means=None,
               title="Data", save="result/raw.png", show=False)
    utils.draw(data, labels, without_label_color=False, means=tester.means,
               title="Gaussian", save="result/gaussian.png", show=False)

    # KMeans prediction plot.
    kmeans = KMeans(n_cluster=opt.n_clusters)
    predicted_labels, predicted_centers = kmeans.fit(data)
    utils.draw(data, predicted_labels, without_label_color=False,
               means=predicted_centers, title="KMeans",
               save="result/kmeans.png", show=False)

    # Stitch the three figures into a single image.
    png_list = ["result/raw.png", "result/gaussian.png", "result/kmeans.png"]
    utils.concatenate_pngs(png_list, "result/final.png")
コード例 #19
0
    image_row = kmeans.BoVW(means, img_vec).reshape((1, 32))
    image = np.concatenate((image, image_row), axis = 0)
    print(f1)
    
np.savetxt('image_data.txt', image)
image = np.loadtxt('image_data.txt')
"""
# Load the image and split it into shifted patches for clustering.
img = cv2.imread('31.png')
print(img)
img_obj = ImageHandler(img)
x = img_obj.ToShiftedPatches()
x = np.array(x)
np.savetxt('cell_1.txt', x)

# Run K-means (k=3) to seed the mixture-model parameters.
kmeans_obj = KMeans(3, x)
kmeans_obj.fit(3, 0.002)

# K-means estimates: per-cluster means, covariances, mixture coefficients.
means = kmeans_obj.mean_vec
cov_mat_list = kmeans_obj.CovMatrix()
mixture_coeff = kmeans_obj.MixtureCoeff()

print(cov_mat_list)

"""from sklearn.cluster import KMeans
obj = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 100, n_init = 10, random_state = 0)
y_Kmeans = obj.fit_predict(x)
print(obj.cluster_centers_[:])"""

# Fit the GMM (EM) starting from the K-means initialization.
GMM_obj = GMM(3, x, means, cov_mat_list, mixture_coeff)
GMM_obj.fit(0.0002)
コード例 #20
0
import numpy as np
from scipy import io

############# FILE STUFF #############
trainFileMNIST = "./mnist_data/images.mat"

trainMatrix = io.loadmat(trainFileMNIST)  # Dictionary

############# GET DATA #############
# NOTE: this script uses Python 2 print statements.
print 20 * "#", "Getting Data", 20 * "#"
imageData = np.array(trainMatrix['images'])
imageData = np.rollaxis(imageData, 2, 0)  # move the index axis to be the first

dataShape = np.shape(imageData)
print "Image Data Shape", dataShape

# Flatten each 2-D image into a 1-D feature vector.
imageDataFlat = []
for elem in imageData:
    imageDataFlat.append(elem.flatten())

dataShape = np.shape(imageDataFlat)
print "Image Data Flat Shape", dataShape

# Cluster the flattened images for several cluster counts and visualize
# the resulting cluster centers.
num_clusters = [5, 10, 20]

for cluster in num_clusters:
    print 20 * "#", "Num Clusters:", cluster, 20 * "#"
    KM = KMeans(cluster, max_iter=10)
    KM.fit(imageDataFlat)
    visualize(KM.cluster_centers_, cluster)
コード例 #21
0
ファイル: main.py プロジェクト: olezhko9/ITMO-ML-8-sem
display_clusters(y, "Настоящие метки")


def display_metrics(n_clusters, metrics, title):
    """Draw a metric curve (red line, dot markers) against cluster count."""
    figure_size = (8, 6)
    plt.figure(figsize=figure_size)
    plt.grid(linestyle='--')
    plt.plot(n_clusters, metrics, linestyle='-', marker='.', color='r')
    plt.title(title)
    plt.xlabel("Количество кластеров")
    plt.ylabel("Значение метрики")
    plt.show()


# Run K-means for k = 1..10, collecting an external metric (adjusted Rand
# index against the true labels) and an internal one (silhouette).
external_metrics = []
internal_metrics = []
for i in range(1, 11):
    kMean = KMeans(k=i)
    centroids = kMean.fit(X_norm)
    y_pred = kMean.predict(X_norm)
    # Silhouette is undefined for a single cluster; store 0.0 for k = 1.
    if i == 1:
        internal_metrics.append(0.0)
    else:
        internal_metrics.append(silhouette(X_norm, y_pred, centroids))

    external_metrics.append(adjusted_rand_index(y, y_pred))
    display_clusters(y_pred, str(i) + ' кластеров')

display_metrics(range(1, 11), external_metrics, 'Внешняя метрика')
display_metrics(range(1, 11), internal_metrics, 'Внутренняя метрика')
コード例 #22
0
from KMeans import KMeans
from sklearn.cluster import KMeans as km
import numpy as np
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import time
# Three gaussian blobs with heavy overlap (std = 7).
reduced_data, check = make_blobs(n_samples=1000,
                                 n_features=2,
                                 centers=3,
                                 cluster_std=7)

# Time the custom KMeans implementation.
start_time = time.time()
kmeans = KMeans(n_cluster=3, total_iter=300)
kmeans.fit(reduced_data)
pred = kmeans.predict(reduced_data)
print(time.time() - start_time)

# Color each point by its predicted cluster (0=blue, 1=green, 2=yellow);
# a dispatch dict replaces the original chain of three ifs.
plt.figure(1)
cluster_colors = {0: "b", 1: "g", 2: "y"}
for idx, label in enumerate(pred):
    if label in cluster_colors:
        plt.scatter(reduced_data[idx, 0], reduced_data[idx, 1],
                    c=cluster_colors[label], alpha=.5)

# Centroids as red stars.
for row in range(3):
    plt.scatter(kmeans.centroids[row, 0], kmeans.centroids[row, 1],
                marker="*", c="r")

# Time sklearn's KMeans on the same data for comparison.
start_time = time.time()
kmeans = km(n_clusters=3)
kmeans.fit(reduced_data)
pred = kmeans.predict(reduced_data)
コード例 #23
0
    # Set this to what attribute list you want to use
    if preset == "Regular":
        attributeList = bigAttributes
    elif preset == "Wings":
        attributeList = wingAttributes

    X = dataBase.makePlayersList(attributeList)
    print("Working with", len(X), "players with", len(attributeList),
          "attributes each.")
    kX = X.copy()
    bkX = X.copy()

    # Clustering with KMeans
    kmeans = KMeans()
    kmeans.setK(clusters)
    kmeans.fit(kX)
    pred = kmeans.predict(kX)

    # Make plot for KMeans

    # Convert data points to 2D points for plotting
    pca = PCA(n_components=2)
    kX = pca.fit_transform(kX)

    # Make labels based on player position
    """
    labels for regular:
        0 - goalkeepers
        1 - defenders
        2 - midfielders
        3 - forwards
コード例 #24
0
ファイル: main.py プロジェクト: andrew950468/CS289A
from scipy import io

############# FILE STUFF ############# 
trainFileMNIST = "./mnist_data/images.mat"
    
trainMatrix = io.loadmat(trainFileMNIST)  # Dictionary

############# GET DATA ############# 
# NOTE: this script uses Python 2 print statements.
print 20 * "#", "Getting Data", 20 * "#"
imageData = np.array(trainMatrix['images'])
imageData = np.rollaxis(imageData, 2, 0)  # move the index axis to be the first 

dataShape = np.shape(imageData)
print "Image Data Shape", dataShape

# Flatten each 2-D image into a 1-D feature vector.
imageDataFlat = []
for elem in imageData:
    imageDataFlat.append(elem.flatten())

dataShape = np.shape(imageDataFlat)
print "Image Data Flat Shape", dataShape

# Cluster for several k values and visualize the cluster centers.
num_clusters = [5, 10, 20]

for cluster in num_clusters:
    print 20 * "#", "Num Clusters:", cluster, 20 * "#"
    KM = KMeans(cluster, max_iter=10)
    KM.fit(imageDataFlat)
    visualize(KM.cluster_centers_, cluster)
    
    
コード例 #25
0
 def k_means(self, data, k=3):
     """Run the k-means algorithm on ``data`` and return the clustered data.

     ``k`` is the number of clusters (default 3).
     """
     model = KMeans(k)
     model.fit(data)
     return model.classification_names
コード例 #26
0
from KMeans import KMeans
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import numpy as np

# Synthetic training data: three blobs inside a [-15, 15] box.
X, y = make_blobs(n_samples=1000,
                  n_features=2,
                  centers=3,
                  center_box=(-15, 15))

kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
prediction = kmeans.predict(X)
loss = kmeans.loss
# Figure 1: training loss per iteration.
plt.figure(1)
plt.plot(range(len(loss)), loss)
# Figure 2: training points colored by predicted cluster.
plt.figure(2)
plt.scatter(X[:, 0], X[:, 1], c=prediction)
# Figure 3: 5000 uniform random points colored by their predicted cluster,
# which visualizes the decision regions.
plt.figure(3)
test = np.random.uniform(-15, 15, size=(5000, 2))
test_prediction = kmeans.predict(test)
plt.scatter(test[:, 0], test[:, 1], c=test_prediction)
plt.show()
コード例 #27
0
ファイル: assignment3.py プロジェクト: remziorak/K-means
# Name of the txt files
file1 = 'data1.txt'
file2 = 'data2.txt'
file3 = 'data3.txt'


# load datasets from files
dataset1 = pd.read_csv(file1, sep=',', header=None)
dataset2 = pd.read_csv(file2, sep=',', header=None)
dataset3 = pd.read_csv(file3, sep=',', header=None)


"=====================  K-Means for Dataset1: k=3, k=7 ====================="
# Run K-means twice on dataset1 (k=3 and k=7); each run saves per-iteration
# figures and then builds an animated GIF from them.
kmeans1 = KMeans(n_cluster=3, random_state=721)
kmeans1.fit(dataset1)
kmeans1.save_figures(outpaths.outpath1)
kmeans1.create_gif(outpaths.outpath1)

kmeans2 = KMeans(n_cluster=7, random_state=721)
kmeans2.fit(dataset1)
kmeans2.save_figures(outpaths.outpath2)
kmeans2.create_gif(outpaths.outpath2)

"=====================  K-Means for Dataset2: k=2, k=5 ====================="
# Same procedure for dataset2 with k=2 and k=5.
kmeans3 = KMeans(n_cluster=2, random_state=721)
kmeans3.fit(dataset2)
kmeans3.save_figures(outpaths.outpath3)
kmeans3.create_gif(outpaths.outpath3)

kmeans4 = KMeans(n_cluster=5, random_state=721)
コード例 #28
0
ファイル: Phase_3.py プロジェクト: Surya97/MWDB-project
    label_features = LabelFeatures(
        labelled_dataset_path=labelled_dataset_path,
        unlabelled_dataset_path=unlabelled_dataset_path,
        feature_name='SIFT',
        decomposition_name='')
    label_features.set_features()
    dorsal_features = label_features.get_label_features('dorsal')
    palmar_features = label_features.get_label_features('palmar')
    unlabelled_features = label_features.get_unlabelled_images_decomposed_features(
    )
    print('Computing clusters associated with dorsal-hand images...')
    temp_dictionary = list(dorsal_features.items())
    np.random.seed(23)
    np.random.shuffle(temp_dictionary)
    dorsal_features = dict(temp_dictionary)
    kmeans.fit(dorsal_features)

    # Visualizing dorsal image clusters
    dorsal_image_cluster_map = kmeans.get_image_cluster_map()
    dorsal_cluster_visualization = VisualizeClusters(dorsal_features,
                                                     dorsal_image_cluster_map,
                                                     'dorsal')
    dorsal_cluster_visualization.plot()

    similarity_val1 = kmeans.get_similarity_val(
        labelled_dataset_features=dorsal_features,
        unlabelled_dataset_features=unlabelled_features)

    print('Computing clusters associated with palmar-hand images...')
    temp_dictionary = list(palmar_features.items())
    np.random.shuffle(temp_dictionary)
コード例 #29
0
import pandas as pd
from KMeans import KMeans
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat

# Load the example dataset from the MATLAB file.
mat = loadmat('DataSets/KMeans_PCA/ex7data2.mat')
data = mat['X']

model = KMeans()
model.fit(data)

colors = ["r", "g", "c"]

# Each cluster's samples in that cluster's color.
for classification in model.classes:
    cluster_color = colors[classification]
    for sample in model.classes[classification]:
        plt.scatter(sample[0], sample[1], color=cluster_color, s=30)

# Centroids as black x-marks.
for centroid in model.centroids:
    plt.scatter(centroid[0], centroid[1], s=130, marker="x", color='black')
plt.show()

コード例 #30
0
ファイル: main.py プロジェクト: deborrrrrah/Clustering
# Accumulators for per-fold accuracies of each clustering algorithm.
sk_agglo_accuracy_complete = 0
sk_agglo_accuracy_average = 0
dbscan_accuracy = 0
sk_dbscan_accuracy = 0

print ('=== ACCURACY FROM PREDICT ===')
print ()

# Fold counter for the cross-validation loop below.
k = 0
for train_index, test_index in kf.split(X, y):
    print (str(k) + '-fold')
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # KMeans 
    kmeans.fit(np.asarray(X_train))
    result = kmeans.predict(np.asarray(X_test))
    accuracy, dict = clustering_accuracy_score(np.asarray(y_test), np.asarray(result))
    kmeans_accuracy += accuracy
    print ('KMeans')
    print ('Accuracy\t', accuracy)
    print ('Format {Real class : cluster}')
    print ('Dict\t\t', str(dict))
    print ()

    sk_kmeans.fit(X_train)
    sk_result = sk_kmeans.predict(X_test)
    accuracy, dict = clustering_accuracy_score(np.asarray(y_test), np.asarray(sk_result))
    sk_kmeans_accuracy += accuracy
    print ('Sklearn KMeans')
    print ('Accuracy\t', accuracy)
コード例 #31
0
    data /= std
    return data.values


if __name__ == "__main__":
    #######################################
    # KMEANS
    #######################################
    X = load_kmeans_data()
    #    instantiate KMeans class
    k_means = KMeans(K_clusters=4,
                     threshold=0.001,
                     n_iters=1000,
                     initialization="forgy")
    #    kmeans training
    k_means.fit(X)
    k_means.plot_training_history(X)
    #######################################
    # Simple Linear Regression
    #######################################
    # 1 - chargement du dataset
    data = pd.read_table(os.path.join(DATA_PATH, "data.txt"),
                         sep="\t",
                         header=None)
    x = np.array(data[0])
    y = np.array(data[1])
    # 2 - regression descente de gradient
    lin_reg_grad = LinReg(method="gradient_descent")
    # train liner regression model gradient descent
    lin_reg_grad.fit(x, y)
    print("linear_regression", lin_reg_grad.coefs)
コード例 #32
0
class GMM():
    """Gaussian Mixture Model fitted with EM, initialized from K-means.

    Mixture weights can be seeded from cluster support sizes ('support')
    or uniformly ('uniform'); covariances may be 'full', 'tied', 'diag',
    or 'spherical'.  NOTE(review): 'tied' covariance initialization is not
    implemented (see get_cov_from_init).
    """

    def __init__(self, initializer='support', cov_type='full'):
        """Validate options and create empty parameter slots.

        Parameters are filled in by :meth:`initialize` during :meth:`fit`.
        """
        assert initializer in [
            'support', 'uniform'
        ], 'Please select initialization scheme as support or uniform'
        assert cov_type in [
            'full', 'tied', 'diag', 'spherical'
        ], 'Please select covariance type as full, tied, diag, or spherical'
        self.kmeans_cls_ = KMeans()
        self.means_ = None
        self.cov_ = None
        self.mixture_weights_ = None
        self.membership_weights_ = None
        self.k_ = None
        # Log-likelihood history, one entry per EM iteration.
        self.ll_graph_ = []
        self.initializer_ = initializer
        self.cov_type_ = cov_type

    def fit(self, X, k, tol_=1e-6):
        """Run EM on X with k components until the log-likelihood gain
        drops below tol_.
        """
        self.k_ = k
        self.initialize(X)
        new_ll = self.get_log_likelihood(X)
        # Seed old_ll below new_ll so the first loop test passes.
        old_ll = new_ll - tol_ * 10
        # Equivalent to: while (new_ll - old_ll) > tol_ (still improving).
        while old_ll - new_ll < -tol_:
            self.ll_graph_.append(new_ll)
            # E-step: recompute membership weights.
            self.gaussian_probabilities_multiple(X, normalized=True)
            # M-step: update mixture weights, means, and covariances.
            self.update_mixture_weights()
            self.update_means(X)
            self.update_var(X)
            old_ll = new_ll
            new_ll = self.get_log_likelihood(X)

    def get_cov_from_init(self, X, predictions):
        """Initial covariance per component from the K-means partition.

        ``predictions`` holds the K-means cluster label of each row of X.
        NOTE(review): the 'tied' branch is an empty stub and falls through
        to return None -- confirm whether this is intentional.
        """
        if self.cov_type_ == 'full':
            # Full sample covariance of each cluster's members.
            return np.array(
                [np.cov(X[predictions == k].T) for k in range(self.k_)])
        if self.cov_type_ == 'tied':
            pass
        if self.cov_type_ == 'diag':
            # Per-dimension unbiased variance (1/(N-1)) of each cluster.
            return np.array([
                (1 / (len(X[predictions == k]) - 1)) *
                np.einsum('ij->j', (X[predictions == k] - self.means_[k])**2)
                for k in range(self.k_)
            ])
        if self.cov_type_ == 'spherical':
            # Single variance per component, broadcast to every dimension.
            return np.repeat(np.mean(np.array([
                (1 / (len(X[predictions == k]) - 1)) *
                np.einsum('ij->j', (X[predictions == k] - self.means_[k])**2)
                for k in range(self.k_)
            ]),
                                     axis=-1)[:, np.newaxis],
                             X.shape[-1],
                             axis=1)

    def initialize(self, X):
        """Seed means, covariances, and mixture weights from K-means."""
        self.kmeans_cls_.fit(X, self.k_)
        predictions_ = self.kmeans_cls_.predict(X)
        self.means_ = self.kmeans_cls_.means_
        self.cov_ = self.get_cov_from_init(X, predictions_)
        if self.initializer_ == 'support':
            # Weight of each component = fraction of points assigned to it.
            self.mixture_weights_ = np.array([
                sum(predictions_ == k) / len(predictions_)
                for k in range(self.k_)
            ])
        if self.initializer_ == 'uniform':
            self.mixture_weights_ = (np.array([1] * self.k_)) / self.k_

    def gaussian_probabilities_multiple(self, X, normalized=True):
        """Evaluate component gaussian densities for every sample.

        With ``normalized=True`` the densities are weighted by the mixture
        weights and normalized per sample into membership weights (stored
        on ``self.membership_weights_``, nothing returned); otherwise the
        raw density matrix (n_samples x k) is returned.
        """
        d = X.shape[-1]

        # Shape (n, 1, d) so broadcasting pairs every sample with every
        # component mean.
        input_ = X[:, None, :]
        if self.cov_type_ in ['full', 'tied']:
            # Mahalanobis quadratic form per (sample, component).
            exp_part = -0.5 * np.einsum(
                'ijk,jkl,ijl->ij', input_ - self.means_,
                np.array(list(map(np.linalg.inv, self.cov_))),
                input_ - self.means_)
            output = (1 / ((2 * np.pi)**(d / 2) * np.array(
                list(map(lambda x: np.linalg.det(x)**
                         (1 / 2), self.cov_)))))[None, :] * np.exp(exp_part)
        elif self.cov_type_ in ['diag', 'spherical']:
            # Diagonal covariance: product of independent 1-D gaussians.
            exp_part = -0.5 * np.einsum('ijk,jk->ij',
                                        (input_ - self.means_[None, :, :])**2,
                                        1 / self.cov_)
            output = (1 / ((2 * np.pi)**(d / 2) * np.prod(self.cov_, axis=1)**
                           (1 / 2)))[None, :] * np.exp(exp_part)

        if normalized:
            # Bayes responsibility of each component for each sample.
            output = np.einsum('ij,j->ij', output, self.mixture_weights_)
            output = output / np.sum(output, axis=1, keepdims=True)
            self.membership_weights_ = output
        else:
            return output

    def update_mixture_weights(self):
        """M-step: mixture weight = mean responsibility per component."""
        self.mixture_weights_ = np.einsum(
            'ij->j',
            self.membership_weights_) / self.membership_weights_.shape[0]

    def update_means(self, X):
        """M-step: means = responsibility-weighted average of the samples."""
        self.means_ = np.einsum('id,ik->kd', X,
                                self.membership_weights_) / np.einsum(
                                    'ik->k', self.membership_weights_)[:, None]

    def update_var(self, X):
        """M-step: update covariances under the configured constraint."""
        input_ = X[:, None, :]
        if self.cov_type_ in ['full', 'tied']:
            # Responsibility-weighted outer products around the means.
            self.cov_ = np.einsum(
                'ij,ijk,ijl->jlk', self.membership_weights_,
                input_ - self.means_, input_ - self.means_) / np.einsum(
                    'ik->k', self.membership_weights_)[:, None, None]
        elif self.cov_type_ in ['diag', 'spherical']:
            # Diagonal variant expanded as E[x^2] - 2*E[x]*mu + mu^2.
            self.cov_ = (np.einsum('ij,ilk,ilk->jk', self.membership_weights_,
                                   input_, input_) - 2 *
                         np.einsum('ij,ilk,jk->jk', self.membership_weights_,
                                   input_, self.means_)) / np.einsum(
                                       'ik->k', self.membership_weights_
                                   )[:, None] + self.means_**2

    def get_log_likelihood(self, X):
        """Total log-likelihood of X under the current mixture parameters."""
        output = self.gaussian_probabilities_multiple(X, normalized=False)
        output = np.log(np.einsum('ij,j->i', output, self.mixture_weights_))
        return np.einsum('i->', output)
コード例 #33
0
        [1.9, 3],
        [1, 2.7],
        [1.9, 3],
        [1, 2.7],
        [1.9, 2.4],
        [0.8, 2],
        [1.6, 1.8],
        [1, 1]
        ]


    print("\n** Exercise 1 Dataset**")

    km = KMeans(3, data_t)

    km.fit()

    print("Centroids: \n")
    km.print_centroids()

    plt.scatter(x=[v[0] for v in km.centroids], y=[v[1] for v in km.centroids], c=['red', 'blue', 'green'])
    plt.scatter(x=[d[0] for d in data_t], y=[d[1] for d in data_t])
    plt.title("K-Means Exercise 1")

#######

    print("\n ** Iris Dataset **")

    data_iris = datasets.load_iris(return_X_y=True)[0]

    print("\n K=3")