import numpy as np

import helpers


def new_centroids(data, centroids, clusters, clusters_id,
                  num_of_iterations, num_of_clusters, iteration_count):
    new_centroid = []

    # Recompute each centroid as the mean of the points assigned to it.
    for i in range(len(clusters)):
        val = np.array(clusters[i])
        new_centroid.append(np.sum(val, 0) / len(clusters[i]))

    new_centroid = np.array(new_centroid)
    # Summed absolute centroid shift; abs() keeps opposite-signed moves
    # from cancelling out and faking convergence.
    sums = np.sum(np.abs(np.array(centroids) - np.array(new_centroid)))
    d = dict()
    if sums == 0 or iteration_count == num_of_iterations:
        print("Converged")
        # Map each data-point id to its 1-based cluster label.
        for x in range(len(clusters_id)):
            for y in range(len(clusters_id[x])):
                d[clusters_id[x][y]] = x + 1

        vals = [d[x] for x in sorted(d.keys())]
        ground_truth = list(map(int, data[:, 1]))

        print("Jaccard")
        ja = helpers.jaccard(ground_truth, vals)
        print(ja)

        print("Rand index")
        rd = helpers.rand(ground_truth, vals)
        print(rd)

        unique_predicted = list(set(vals))
        new_x = helpers.pca(data[:, 2:])
        helpers.scatter(new_x[:, 0], new_x[:, 1], vals, unique_predicted,
                        "K-means Algorithm", "iyer.txt")
    else:
        # Not converged: recurse via kmeans (defined alongside this
        # function), which is assumed to advance iteration_count.
        kmeans(data, new_centroid, num_of_iterations, num_of_clusters,
               iteration_count)
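Both k-means snippets score their labelling with helpers.jaccard and helpers.rand, called on two equal-length label vectors, but the helpers themselves are not shown on this page. A minimal pair-counting sketch of what they might look like; the bodies below are assumptions, not the repo's actual code:

from itertools import combinations

def jaccard(truth, pred):
    # Pair-counting Jaccard: pairs co-clustered in both labellings,
    # divided by pairs co-clustered in at least one (assumed definition).
    ss = sd = ds = 0
    for i, j in combinations(range(len(truth)), 2):
        same_t = truth[i] == truth[j]
        same_p = pred[i] == pred[j]
        if same_t and same_p:
            ss += 1
        elif same_t:
            sd += 1
        elif same_p:
            ds += 1
    return ss / (ss + sd + ds)

def rand(truth, pred):
    # Rand index: fraction of pairs on which the two labellings agree.
    agree = total = 0
    for i, j in combinations(range(len(truth)), 2):
        agree += (truth[i] == truth[j]) == (pred[i] == pred[j])
        total += 1
    return agree / total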
Example 2
def bruteforce(X):
    """Exhaustively compare every pair of items in X."""
    distances = {}
    for i, x in enumerate(X):
        for j, y in enumerate(X):
            # Self-pairs are kept too; they score a similarity of 1.0.
            distances.setdefault(i, []).append((j, jaccard(x, y)))

    # Sort each item's neighbour list by decreasing Jaccard similarity.
    for k in distances:
        distances[k].sort(key=lambda pair: -pair[1])

    return distances
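A short usage sketch, assuming a set-based jaccard helper with the signature the snippet calls (the helper itself is not shown on this page):

def jaccard(a, b):
    # Hypothetical helper: |a ∩ b| / |a ∪ b| on two sets.
    return len(a & b) / len(a | b)

X = [{1, 2, 3}, {2, 3, 4}, {1, 2, 3, 4}]
neighbours = bruteforce(X)
# Item 0's list starts with itself, then the closer of the other two:
# [(0, 1.0), (2, 0.75), (1, 0.5)]
print(neighbours[0])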
Example 3
def greene(step_communities, similarity=0.5, death=3):
    dynamic = []

    # Every community in the first step seeds a dynamic community.
    for community in step_communities[0]:
        dynamic.append(DynamicCommunity(community))

    # The first step has been consumed (note: this mutates the caller's list).
    del step_communities[0]

    for i, step in enumerate(step_communities):
        to_add = []

        # Match this step's communities against the fronts of live
        # dynamic communities.
        for d_idx, d in enumerate(dynamic):
            if d.is_dead() is None:

                for c in step:
                    if helpers.jaccard(d.get_front()[0], c) > similarity:
                        if d.get_front()[1] < i + 1:  # update the front
                            d.add_community(c, i + 1)
                        else:  # second match this step: record a split
                            split = DynamicCommunity(c, i + 1,
                                                     d.get_timeline())
                            split.define_split(d_idx, i + 1)
                            to_add.append(split)
                        d.observed()

                # Kill communities left unmatched for more than `death` steps.
                if d.get_front()[1] < i + 1 and d.unobserved() > death:
                    d.kill(i + 1)

        # A step community counts as matched only if it is now the front of
        # some live dynamic community (Jaccard of exactly 1); the rest start
        # new dynamic communities.
        for c in step:
            matched = False
            for d in dynamic:
                if d.is_dead() is None and helpers.jaccard(d.get_front()[0], c) == 1:
                    matched = True
                    break
            if not matched:
                to_add.append(DynamicCommunity(c, i + 1))

        dynamic.extend(to_add)

    return dynamic
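The DynamicCommunity class used by greene is not shown on this page. The stub below is a hypothetical reconstruction inferred purely from the calls above: the method names come from the snippet, but every body is an assumption, meant only to document the expected interface.

class DynamicCommunity:
    """Hypothetical interface inferred from greene(); bodies are assumptions."""

    def __init__(self, community, step=0, timeline=None):
        # Timeline of (community, step) pairs; the last entry is the front.
        self.timeline = list(timeline or []) + [(community, step)]
        self.missed = 0          # consecutive unmatched steps (assumed)
        self.dead_at = None
        self.split_from = None

    def get_front(self):
        return self.timeline[-1]

    def get_timeline(self):
        return self.timeline

    def add_community(self, community, step):
        self.timeline.append((community, step))

    def define_split(self, parent_idx, step):
        self.split_from = (parent_idx, step)

    def observed(self):
        self.missed = 0

    def unobserved(self):
        # Assumed to both advance and report the unmatched-step counter,
        # matching how greene uses it in the kill check.
        self.missed += 1
        return self.missed

    def is_dead(self):
        return self.dead_at

    def kill(self, step):
        self.dead_at = step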
Example 4

def new_centroids(reducedSpace, centroids, clusters, clusters_id,
                  num_of_iterations, num_clusters, iteration_count):
    new_centroid = []

    # Recompute each centroid as the mean of its assigned points.
    for i in range(len(clusters)):
        val = np.array(clusters[i])
        new_centroid.append(np.sum(val, 0) / len(clusters[i]))

    new_centroid = np.array(new_centroid)
    # Summed absolute centroid shift; zero means convergence.
    sums = np.sum(np.abs(np.array(centroids) - np.array(new_centroid)))
    d = dict()
    if sums == 0 or iteration_count == num_of_iterations:
        print("Converged")
        for x in range(len(clusters_id)):
            for y in range(len(clusters_id[x])):
                d[clusters_id[x][y]] = x + 1

        vals = [d[x] for x in sorted(d.keys())]
        vals = np.array(vals)
        print(vals)
        print(set(vals))

        # Groundtruth and GeneExpressions are module-level globals here.
        print("Jaccard")
        ja = helpers.jaccard(Groundtruth, vals)
        print(ja)

        print("Rand index")
        rd = helpers.rand(Groundtruth, vals)
        print(rd)

        unique_predicted = list(set(vals))
        new_x = helpers.pca(GeneExpressions)
        helpers.scatter(new_x[:, 0], new_x[:, 1], vals, unique_predicted)

    else:
        kmeans(reducedSpace, new_centroid, num_of_iterations, num_clusters,
               iteration_count)
Example 5
def experiment(hashes, estimation, X, other_data):
    """Yield (true Jaccard, hash-based estimate) for X against each Y."""
    X_hashes = [h.hash(X) for h in hashes]
    for Y in other_data:
        Y_hashes = [h.hash(Y) for h in hashes]
        yield (jaccard(X, Y), estimation(X_hashes, Y_hashes))
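A usage sketch, assuming a MinHash-style setup: the MinHasher class, the collision-fraction estimator, and the set-based jaccard below are illustrative assumptions, not part of the snippet.

def jaccard(a, b):
    # Set-based Jaccard similarity (assumed helper).
    return len(a & b) / len(a | b)

class MinHasher:
    """Toy min-hash; salting by seed simulates independent hash functions."""
    def __init__(self, seed):
        self.seed = seed

    def hash(self, s):
        return min(hash((self.seed, e)) for e in s)

def estimate(x_sig, y_sig):
    # Fraction of hash functions on which the two signatures collide,
    # which approximates the Jaccard similarity.
    return sum(a == b for a, b in zip(x_sig, y_sig)) / len(x_sig)

hashes = [MinHasher(seed) for seed in range(256)]
X = {1, 2, 3, 4}
others = [{1, 2, 3, 4, 5}, {3, 4, 5, 6}, {7, 8}]
for true_j, est_j in experiment(hashes, estimate, X, others):
    print("true={:.2f} estimated={:.2f}".format(true_j, est_j))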
Example 6
# (X, y and file_name are assumed to come from the loading steps 1-2,
#  which are not part of this snippet.)
epsilon = float(input("Enter epsilon value: "))

min_pts = int(input("Enter min_pts value: "))  # a point count, so an integer

# 3. Perform DBSCAN
model = __dbscan.DBSCAN(X, epsilon, min_pts)
predicted = model.fit()
unique, counts = np.unique(predicted, return_counts=True)
print("Counts by cluster:")
for key, value in zip(unique, counts):
    print("{}: {}".format(key, value))

# 4. Find Rand index and Jaccard
rand_score = helpers.rand(y, predicted)
jaccard_score = helpers.jaccard(y, predicted)
unique_predicted = list(set(predicted))
print(predicted)
print(rand_score)
print(jaccard_score)

# Cross-checks against scikit-learn's implementations:
# print(adjusted_rand_score(y, predicted))
# print(jaccard_similarity_score(y, predicted))

# 5. Visualize using PCA, projecting to 2-D only when needed
new_X = X
if X.shape[1] > 2:
    new_X = helpers.pca(X)

helpers.scatter(new_X[:, 0], new_X[:, 1], predicted, unique_predicted,
                "DBSCAN", file_name)
# distMatrix, rowsnumbers, k, Groundtruth and GeneExpressions are built
# earlier in this script.
# Testing data:
# distMatrix = [[0.00, 0.71, 5.66, 3.61, 4.24, 3.20],
#               [0.71, 0.00, 4.95, 2.92, 3.54, 2.50],
#               [5.66, 4.95, 0.00, 2.24, 1.41, 2.50],
#               [3.61, 2.92, 2.24, 0.00, 1.00, 0.50],
#               [4.24, 3.54, 1.41, 1.00, 0.00, 1.12],
#               [3.20, 2.50, 2.50, 0.50, 1.12, 0.00]]
# rowsnumbers = [[0], [1], [2], [3], [4], [5]]
distMatrix = np.array(distMatrix)

# Merge the two closest clusters until only k clusters remain.
while len(distMatrix) >= 2 and len(rowsnumbers) > k:
    distMatrix, rowsnumbers = updateDistMatrix(distMatrix, rowsnumbers)
print(rowsnumbers)

# Turn the merged row groups into 1-based cluster assignments per point.
clusterassignments = {}
cluster = 1
for i in rowsnumbers:
    for j in i:
        clusterassignments[j + 1] = cluster
    cluster += 1

# Order both labellings by point id before scoring (a bare sorted() call
# would discard its result).
predicted = [clusterassignments[key] for key in sorted(clusterassignments)]
unique_predicted = list(set(predicted))
Ground = [Groundtruth[key] for key in sorted(Groundtruth)]
print(h.jaccard(Ground, predicted))
print(h.rand(Ground, predicted))

new_X = h.pca(GeneExpressions)
h.scatter(new_X[:, 0], new_X[:, 1], predicted, unique_predicted)
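updateDistMatrix is defined elsewhere in this script; from its use above it must merge the two closest clusters and return the shrunken matrix plus the merged row groups. A hypothetical single-linkage version (the linkage rule is an assumption):

import numpy as np

def updateDistMatrix(distMatrix, rowsnumbers):
    """Merge the two closest clusters; hypothetical single-linkage sketch."""
    n = len(distMatrix)
    # Find the closest pair, ignoring the zero diagonal.
    masked = distMatrix + np.diag([np.inf] * n)
    a, b = np.unravel_index(np.argmin(masked), masked.shape)
    a, b = min(a, b), max(a, b)

    # Single linkage: distance to the merged cluster is the minimum
    # of the distances to its two parts.
    merged_row = np.minimum(distMatrix[a], distMatrix[b])
    distMatrix = np.delete(np.delete(distMatrix, b, axis=0), b, axis=1)
    merged_row = np.delete(merged_row, b)
    distMatrix[a, :] = merged_row
    distMatrix[:, a] = merged_row
    distMatrix[a, a] = 0.0

    # Merge the row groups to mirror the matrix merge.
    rowsnumbers = [list(r) for r in rowsnumbers]
    rowsnumbers[a] = rowsnumbers[a] + rowsnumbers[b]
    del rowsnumbers[b]
    return distMatrix, rowsnumbers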