def template_cluster_allocation(input_data, cluster_sizes, number_cluster, number_represent_points = 5, compression = 0.5, ccore_flag = False, **kwargs):
    """Run CURE on ``input_data`` and assert the expected cluster allocation.

    :param input_data: path to a sample file (loaded via ``read_sample``) or
        the sample itself as a list of points.
    :param cluster_sizes: expected cluster sizes (order-insensitive).
    :param number_cluster: amount of clusters that must be allocated.
    :param number_represent_points: amount of representative points per cluster.
    :param compression: compression coefficient of representative points.
    :param ccore_flag: if True the C-core implementation is used.
    :param kwargs: ``numpy_usage`` (bool) - feed the algorithm a numpy array.
    """
    if isinstance(input_data, str):
        sample = read_sample(input_data)
    else:
        sample = input_data

    if kwargs.get('numpy_usage', False) is True:
        sample = numpy.array(sample)

    cure_instance = cure(sample, number_cluster, number_represent_points, compression, ccore=ccore_flag)
    cure_instance.process()

    clusters = cure_instance.get_clusters()
    representors = cure_instance.get_representors()
    means = cure_instance.get_means()

    assertion.eq(len(clusters), number_cluster)
    assertion.eq(len(representors), number_cluster)
    assertion.eq(len(means), number_cluster)

    # Every point has to be assigned to exactly one cluster.
    obtained_cluster_sizes = [len(cluster) for cluster in clusters]
    assertion.eq(sum(obtained_cluster_sizes), len(sample))

    # Fix: compare sorted copies instead of sorting in place - the original
    # implementation mutated the caller's `cluster_sizes` list.
    assertion.eq(sorted(cluster_sizes), sorted(obtained_cluster_sizes))
def template_cluster_allocation(path, cluster_sizes, number_cluster, number_represent_points=5, compression=0.5, ccore_flag=False):
    """Cluster the sample stored at ``path`` with CURE and verify the result."""
    sample = read_sample(path)

    instance = cure(sample, number_cluster, number_represent_points, compression, ccore=ccore_flag)
    instance.process()

    clusters = instance.get_clusters()

    # Amount of clusters, representative point groups and means must all
    # match the requested amount of clusters.
    assert len(clusters) == number_cluster
    assert len(instance.get_representors()) == number_cluster
    assert len(instance.get_means()) == number_cluster

    # Every point of the sample has to be assigned to exactly one cluster.
    obtained_cluster_sizes = [len(allocated) for allocated in clusters]
    assert sum(obtained_cluster_sizes) == len(sample)

    # Order-insensitive size comparison (in-place sort kept to mirror the
    # original behaviour, which mutates the caller's list).
    cluster_sizes.sort()
    obtained_cluster_sizes.sort()
    assert cluster_sizes == obtained_cluster_sizes
def cureAlgo(filename, col_name):
    """Read one numeric CSV column, pair consecutive values into 2D points,
    cluster them with CURE and save the visualization to disk.

    :param filename: path of the CSV file to read.
    :param col_name: name of the column holding the values.
    """
    df = pd.read_csv(filename, usecols=[col_name])
    data = df[col_name]

    # Pair consecutive values into 2D points.  The reshape needs an even
    # amount of values, so a trailing odd value is dropped.  (The original
    # code *grew* the count instead, which made the reshape fail for odd
    # column lengths; a redundant `df[col_name] = df[col_name]` was removed.)
    input_data = np.asarray(data)
    usable = len(input_data) - (len(input_data) % 2)
    input_data = input_data[:usable].reshape(usable // 2, 2)

    print(input_data)
    print(input_data.shape)
    print(
        "----------------------------------------------------------------------------------------------------------------------"
    )

    # Allocate ten clusters.
    cure_instance = cure(input_data.tolist(), 10)
    cure_instance.process()
    clusters = cure_instance.get_clusters()
    print(clusters)
    print(timeit.timeit('"-".join(str(n) for n in range(100))', number=10000))

    # Visualize clusters and persist the figure for the web app.
    visualizer = cluster_visualizer()
    visualizer.append_clusters(clusters, None)
    visualizer.show(display=False)
    plt.savefig(
        "C:/Users/Nupura Hajare/Desktop/flask_app/web/static/img/CURE.png")
def template_clustering(number_clusters, path, number_represent_points=5, compression=0.5, draw=True, ccore_flag=False):
    """Time CURE over the sample at ``path`` and optionally visualize the
    clusters together with their representative points and means."""
    sample = read_sample(path)

    instance = cure(sample, number_clusters, number_represent_points, compression, ccore_flag)
    elapsed, _ = timedcall(instance.process)

    clusters = instance.get_clusters()
    representors = instance.get_representors()
    means = instance.get_means()

    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")

    if draw is not True:
        return

    visualizer = cluster_visualizer()
    # The sample is passed only when ccore was used, mirroring the original
    # branching behaviour.
    visualizer.append_clusters(clusters, sample if ccore_flag is True else None)
    visualizer.append_clusters(representors, marker='*', markersize=10)
    visualizer.append_clusters([means], None, marker='o')
    visualizer.show()
def get_cure_clusters(data, count_clusters=3):
    """Cluster ``data`` rows with CURE and wrap each result in a Cluster object.

    Also accumulates the RS index (SSB / SST) of this run into the global
    ``RS_RESULT`` list and prints it.

    :param data: project data container exposing ``getRows()``; each row is
        assumed to expose ``getDataArray()`` - TODO confirm against caller.
    :param count_clusters: amount of clusters to allocate.
    :return: list of Cluster objects, one per allocated CURE cluster, each
        named/coloured with a randomly chosen colour.
    """
    rows = data.getRows()
    input_data = list()
    result_clusters = list()
    # Extract the raw feature vectors used for clustering.
    for row in rows:
        input_data.append(row.getDataArray())
    # Total sum of squares over the whole data set (for the RS index).
    SST = calculate_sst(input_data)
    cure_instance = cure(input_data, count_clusters)
    cure_instance.process()
    clusters = cure_instance.get_clusters()
    colorRange = Constants.DEFAULT_COLOR_SET
    SSB = 0
    SSW = 0
    for i, cluster in enumerate(clusters):
        # Accumulate the within-cluster sum of squares.
        SSW = SSW + calculate_ssw(cluster)
        result_cluster = Cluster(CureWindow.get_rows(data, cluster))
        # The colour is picked at random and doubles as the cluster name.
        colour = random.choice(colorRange)
        result_cluster.setName(colour)
        result_cluster.setColor(colour)
        result_clusters.append(result_cluster)
        # NOTE(review): recomputed every iteration although only the last
        # value is used below - could be hoisted if calculate_ssb is pure.
        SSB = calculate_ssb(SST, SSW)
    RS_RESULT.append(SSB / SST)
    print(RS_RESULT)
    return result_clusters
def template_clustering(number_clusters, path, number_represent_points=5, compression=0.5, draw=True, ccore_flag=False):
    """Time CURE over the sample at ``path``; optionally draw every cluster
    with its representative points ('*') and mean ('o') as attributes."""
    sample = read_sample(path)

    instance = cure(sample, number_clusters, number_represent_points, compression, ccore_flag)
    elapsed, _ = timedcall(instance.process)

    clusters = instance.get_clusters()
    representors = instance.get_representors()
    means = instance.get_means()

    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")

    if draw is True:
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        for index, _ in enumerate(clusters):
            visualizer.append_cluster_attribute(0, index, representors[index], '*', 10)
            visualizer.append_cluster_attribute(0, index, [means[index]], 'o')
        visualizer.show()
def cure_func(data, k):
    """Cluster ``data`` into ``k`` clusters with CURE and return the clusters."""
    # Coerce the input to an all-numeric matrix before clustering.
    frame = DataFrame(data).apply(pd.to_numeric)
    matrix = frame.to_numpy()

    instance = cure(matrix, int(k))
    instance.process()
    return instance.get_clusters()
def templateClusterAllocationOneDimensionData(ccore_flag):
    """CURE has to allocate four equally sized clusters for four well
    separated groups of one-dimensional points."""
    # Four groups of ten random points, shifted apart by the offsets.
    input_data = [[random() + shift] for shift in (0, 3, 5, 8) for _ in range(10)]

    instance = cure(input_data, 4, ccore=ccore_flag)
    instance.process()
    clusters = instance.get_clusters()

    assertion.eq(4, len(clusters))
    for allocated in clusters:
        assertion.eq(10, len(allocated))
def templateClusterAllocationOneDimensionData(self, ccore_flag):
    """Check that CURE allocates four clusters of ten points each for four
    separated one-dimensional groups."""
    input_data = [[random() + shift] for shift in (0, 3, 5, 8) for _ in range(10)]

    instance = cure(input_data, 4, ccore=ccore_flag)
    instance.process()

    clusters = instance.get_clusters()
    assert len(clusters) == 4
    assert all(len(cluster) == 10 for cluster in clusters)
def get_modelo(self, algoritmo, eps, neig):
    """Build, run and post-process the clustering algorithm named by ``algoritmo``.

    :param algoritmo: one of 'AGNES', 'BIRCH', 'CLARANS', 'CURE', 'DBSCAN',
        'FCM', 'KMEANS', 'KMEDOIDS', 'OPTICS', 'ROCK'.
    :param eps: radius parameter, forwarded to DBSCAN/OPTICS/ROCK.
    :param neig: neighbour/minpts parameter for DBSCAN/OPTICS; also drives
        the recursive retry at the bottom.
    :return: numpy array with one cluster label per sample.

    NOTE(review): an unknown ``algoritmo`` leaves ``instance`` as None and the
    later ``instance.process()`` raises AttributeError.
    """
    print(algoritmo + ' ' + str(eps) + ' - ' + str(neig))
    instance = None
    if algoritmo == 'AGNES':
        instance = agglomerative(self.amostras, self.numero_clusters, link=None)
    elif algoritmo == 'BIRCH':
        instance = birch(self.amostras, self.numero_clusters, entry_size_limit=10000)
    elif algoritmo == 'CLARANS':
        instance = clarans(self.amostras, self.numero_clusters, numlocal=100, maxneighbor=1)
    elif algoritmo == 'CURE':
        instance = cure(self.amostras, self.numero_clusters, number_represent_points=5, compression=0.5)
    elif algoritmo == 'DBSCAN':
        instance = dbscan(self.amostras, eps=eps, neighbors=neig)
    elif algoritmo == 'FCM':
        initial_centers = kmeans_plusplus_initializer(
            self.amostras, self.numero_clusters).initialize()
        instance = fcm(self.amostras, initial_centers)
    elif algoritmo == 'KMEANS':
        initial_centers = kmeans_plusplus_initializer(
            self.amostras, self.numero_clusters).initialize()
        instance = kmeans(self.amostras, initial_centers, tolerance=0.001)
    elif algoritmo == 'KMEDOIDS':
        # Original note (translated): adjust the amount of clusters.
        # NOTE(review): all seven initial medoids are index 0 - verify.
        instance = kmedoids(self.amostras, initial_index_medoids=[0, 0, 0, 0, 0, 0, 0], tolerance=0.0001)
    elif algoritmo == 'OPTICS':
        instance = optics(self.amostras, eps=eps, minpts=neig)
    elif algoritmo == 'ROCK':
        instance = rock(self.amostras, eps=eps, number_clusters=self.numero_clusters, threshold=0.5)
    else:
        pass
    instance.process()
    lista_agrupada = self.get_lista_agrupada(instance.get_clusters())
    lista_agrupada = np.array(lista_agrupada)
    # If more groups than requested were produced, retry recursively with a
    # larger neighbourhood until the amount of groups fits.
    if (neig != 0):
        n_grupos = len(np.unique(lista_agrupada))
        if n_grupos > self.numero_clusters:
            lista_agrupada = self.get_modelo(algoritmo, eps, neig + 1)
    return lista_agrupada
def template_clustering(number_clusters, path, number_represent_points = 5, compression = 0.5, draw = True, ccore_flag = False):
    """Time CURE clustering of the sample stored at ``path`` and optionally
    draw the allocated clusters."""
    sample = read_sample(path)

    instance = cure(sample, number_clusters, number_represent_points, compression, ccore_flag)
    elapsed, _ = timedcall(instance.process)
    clusters = instance.get_clusters()

    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")

    if draw is True:
        # Pass the sample only when ccore was used - mirrors the original
        # branching behaviour.
        draw_clusters(sample if ccore_flag is True else None, clusters)
def template_cluster_allocation(self, path, cluster_sizes, number_cluster, number_represent_points = 5, compression = 0.5, ccore_flag = False):
    """Cluster the sample at ``path`` with CURE and verify the allocation.

    Fix: ``number_represent_points`` and ``compression`` are now actually
    forwarded to the algorithm - the original accepted but silently ignored
    them, so CURE always ran with its defaults.

    :param path: path to the sample file.
    :param cluster_sizes: expected cluster sizes (order-insensitive;
        sorted in place, preserving the original side effect).
    :param number_cluster: amount of clusters to allocate.
    """
    sample = read_sample(path)

    cure_instance = cure(sample, number_cluster, number_represent_points, compression, ccore = ccore_flag)
    cure_instance.process()
    clusters = cure_instance.get_clusters()

    # Every point has to belong to exactly one cluster.
    obtained_cluster_sizes = [len(cluster) for cluster in clusters]
    assert sum(obtained_cluster_sizes) == len(sample)

    cluster_sizes.sort()
    obtained_cluster_sizes.sort()
    assert cluster_sizes == obtained_cluster_sizes
def templateEncoderProcedures(ccore_flag):
    """Smoke-check every cluster-encoding conversion on a CURE result."""
    sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

    instance = cure(sample, 4, 5, 0.5, ccore=ccore_flag)
    instance.process()
    clusters = instance.get_clusters()

    encoder = cluster_encoder(instance.get_cluster_encoding(), clusters, sample)
    # Each conversion between representations must succeed.
    for representation in (type_encoding.CLUSTER_INDEX_LABELING,
                           type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
                           type_encoding.CLUSTER_INDEX_LIST_SEPARATION):
        encoder.set_encoding(representation)

    assert 4 == len(clusters)
def templateEncoderProcedures(ccore_flag):
    """Verify that a CURE result can be converted between all supported
    cluster representations."""
    sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

    instance = cure(sample, 4, 5, 0.5, ccore=ccore_flag)
    instance.process()

    clusters = instance.get_clusters()
    encoder = cluster_encoder(instance.get_cluster_encoding(), clusters, sample)

    # Walk through every representation - each conversion must succeed.
    conversions = [type_encoding.CLUSTER_INDEX_LABELING,
                   type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
                   type_encoding.CLUSTER_INDEX_LIST_SEPARATION]
    for target in conversions:
        encoder.set_encoding(target)

    assertion.eq(4, len(clusters))
def testVisualizeClusterWithAttributes(self):
    """Visualization of CURE clusters together with their representative
    points ('*') and means ('o') as cluster attributes."""
    sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)

    instance = cure(sample, 2, 5, 0.5, False)
    instance.process()

    clusters = instance.get_clusters()
    representors = instance.get_representors()
    means = instance.get_means()

    visualizer = cluster_visualizer()
    visualizer.append_clusters(clusters, sample)
    for index in range(len(clusters)):
        visualizer.append_cluster_attribute(0, index, representors[index], '*', 10)
        visualizer.append_cluster_attribute(0, index, [means[index]], 'o')
    visualizer.show()
def exception(type, input_data, number_cluster, number_represent_points, compression, ccore_flag):
    """Assert that running CURE raises the exception class ``type``.

    :param type: expected exception class (NOTE: shadows the builtin on
        purpose to keep the caller-facing signature intact).
    :param input_data: path to a sample file or the sample itself.
    :raises AssertionError: if a different exception - or none - is raised.
    """
    try:
        if isinstance(input_data, str):
            sample = read_sample(input_data)
        else:
            sample = input_data

        cure_instance = cure(sample, number_cluster, number_represent_points, compression, ccore=ccore_flag)
        cure_instance.process()

    except type:
        return  # the expected exception was observed

    except Exception as ex:
        # Fix: the parameter `type` shadows the builtin, so the original
        # `type(ex).__name__` instantiated the *expected* exception with `ex`
        # and then crashed; `ex.__class__` retrieves the actual class.
        raise AssertionError("Expected: '%s', Actual: '%s'" % (type, ex.__class__.__name__))

    raise AssertionError("Expected: '%s', Actual: 'None'" % type)
def runCURE(self, k, X):
    """Cluster ``X`` into ``k`` clusters with CURE and return a mapping from
    cluster id to the list of points, in point-index order.

    Perf fix: the per-cluster point-id lists are converted to sets once,
    instead of being rebuilt (O(cluster size)) for every single point.
    Iteration order over points and clusters is identical to the original,
    so the produced mapping is unchanged.

    :param k: amount of clusters to allocate.
    :param X: sequence of points.
    :return: dict cluster_id -> list of points of ``X`` in that cluster.
    """
    cluster_points = {q: list() for q in range(k)}

    cure_instance = cure(data=X, number_cluster=k)
    cure_instance.process()
    clusters = cure_instance.get_clusters()

    # Pre-compute the membership sets once.
    membership = [set(int(point_id) for point_id in cluster) for cluster in clusters]

    for id_point in range(len(X)):
        for cluster_id in range(len(clusters)):
            if id_point in membership[cluster_id]:
                cluster_points[cluster_id].append(X[id_point])

    return cluster_points
def template_clustering(number_clusters, path, number_represent_points=5, compression=0.5, draw=True, ccore_flag=False):
    """Time CURE clustering of the sample at ``path``; optionally draw it."""
    sample = read_sample(path)

    cure_instance = cure(sample, number_clusters, number_represent_points, compression, ccore_flag)
    ticks, result = timedcall(cure_instance.process)
    clusters = cure_instance.get_clusters()

    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    if draw is not True:
        return

    if ccore_flag is True:
        # ccore result needs the sample to resolve the drawn points.
        draw_clusters(sample, clusters)
    else:
        draw_clusters(None, clusters)
def r_python_cure_iterface(filepath):
    """Read a CSV of numeric rows (first row is a header) and cluster it with
    CURE (8 clusters, 8 representative points, compression 0.25).

    :param filepath: path of the CSV file.
    :return: clusters as produced by ``cure.get_clusters()``.
    """
    samples = []
    with open(filepath, 'r') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        # Idiom fix: skip the header row directly instead of tracking a
        # manual row counter.
        next(csvreader, None)
        for row in csvreader:
            samples.append([float(x) for x in row])

    cure_instance = cure(samples, 8, number_represent_points=8, compression=0.25)
    cure_instance.process()
    return cure_instance.get_clusters()
def testVisualizeClusterWithAttributesNumpy(self):
    """Same attribute-visualization check as the list-based test, but with the
    sample and the attributes supplied as numpy arrays."""
    sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, return_type='numpy')

    instance = cure(sample, 2, 5, 0.5, False)
    instance.process()

    clusters = instance.get_clusters()
    representors = instance.get_representors()
    means = instance.get_means()

    visualizer = cluster_visualizer()
    visualizer.append_clusters(clusters, sample)
    for index in range(len(clusters)):
        visualizer.append_cluster_attribute(0, index, numpy.array(representors[index]), '*', 10)
        visualizer.append_cluster_attribute(0, index, numpy.array([means[index]]), 'o')
    visualizer.show()
def cure_clustering(k=-1):
    """Perform CURE clustering algorithm.

    When ``k`` is -1, sweep k = 5, 10, ..., 1000 over the PCA result and
    pickle each clustering to 'results/cure_<k>.pickle'; otherwise run only
    for the given ``k``.

    :param k: amount of clusters, or -1 to sweep.
    """
    if k == -1:
        begin, end = 5, 1001
    else:
        begin, end = k, k + 1

    input_data = read_sample("results/pca_result.txt")
    for k in range(begin, end, 5):
        print(str(k) + " clusters started for CURE")
        cure_instance = cure(input_data, k)
        cure_instance.process()
        cure_clusters = cure_instance.get_clusters()
        print(str(k) + " clusters completed for CURE")

        # Fix: context manager guarantees the file is closed even if
        # pickling fails (the original leaked the handle on error).
        with open('results/cure_' + str(k) + '.pickle', 'wb') as results_file:
            pickle.dump(cure_clusters, results_file)
def template_clustering(number_clusters, path, number_represent_points=5, compression=0.5, draw=True, ccore_flag=True):
    """Run CURE (ccore enabled by default) over the sample at ``path``, print
    the execution time and optionally render the clusters with their
    representative points and means."""
    sample = read_sample(path)

    instance = cure(sample, number_clusters, number_represent_points, compression, ccore_flag)
    elapsed, _ = timedcall(instance.process)

    clusters = instance.get_clusters()
    representors = instance.get_representors()
    means = instance.get_means()

    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")

    if draw is not True:
        return

    visualizer = cluster_visualizer()
    visualizer.append_clusters(clusters, sample)
    for index in range(len(clusters)):
        # '*' markers for representative points, 'o' for the cluster mean.
        visualizer.append_cluster_attribute(0, index, representors[index], '*', 10)
        visualizer.append_cluster_attribute(0, index, [means[index]], 'o')
    visualizer.show()
# NOTE(review): fragment - `i`, Doc2Vec, plt, np, Parameter and metrics come
# from the surrounding module / an enclosing loop that is not visible here.
model = Doc2Vec.load('models/%s.d2v' % i)
plt.figure()
# convert sequence to array
docvecs = []
for num in range(len(model.docvecs)):
    # print(num)
    # print(model.docvecs[num])
    docvecs.append(np.array(model.docvecs[num]))
# Sweep the amount of representative points and the amount of clusters,
# scoring each CURE run with silhouette and Calinski-Harabasz metrics.
for Rpoint in Parameter.represent_point:
    silhouette_scores = []
    calinski_scores = []
    for index in Parameter.K:
        cure_model = cure(docvecs, index, number_represent_points=Rpoint)
        cure_model.process()
        clusters = cure_model.get_clusters()
        # Translate index-based clusters into one label per document.
        labels = [1] * len(docvecs)
        for ind in range(len(clusters)):
            for element in clusters[ind]:
                labels[element] = ind
        print("Performance with threshold %d:" % i)
        silhouette_scores.append(metrics.silhouette_score(docvecs, labels))
        calinski_scores.append(metrics.calinski_harabaz_score(docvecs, labels))
    # Plot the silhouette curve for this amount of representative points.
    plt.subplot(1, 2, 1)
    plt.plot(Parameter.K, silhouette_scores, label=str(Rpoint))
    plt.legend()
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.cure import cure
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES

# Load the "t4.8k" data set: one "x y" pair per line, malformed lines are
# skipped.  (The original also read SAMPLE_CHAINLINK into an unused
# `input_data` variable - that dead read was removed.)
inp = []
# Fix: context manager closes the file (the original handle was never closed).
with open("t4.8k", "r") as lines:
    for line in lines:
        cords = line.split()
        if len(cords) != 2:
            continue
        inp.append([float(cords[0]), float(cords[1])])

# Allocate clusters.
cure_instance = cure(inp, 6)
cure_instance.process()
clusters = cure_instance.get_clusters()

# Visualize allocated clusters.
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, inp)
visualizer.show()
# NOTE(review): fragment - `args`, np, cluster (scikit-learn), read_sample
# and cure are defined/imported elsewhere.  The hard-coded `if True:`
# permanently selects the scikit-learn branch; the CURE branch is dead code.
if True:
    data = np.loadtxt(args.i)
    data = np.delete(data, 0, 1)  # drop the first column - presumably an id, confirm
    # data = preprocessing.normalize(data, norm = 'max', axis = 0);
    # clus = cluster.AgglomerativeClustering(n_clusters = args.n)
    clusK = cluster.KMeans(n_clusters = args.n, init = 'k-means++', n_init = 1, verbose = args.d, tol = args.t, copy_x = False, algorithm = 'elkan')
    # clusK = cluster.BisectingKMeans(n_clusters = args.n, init = 'k-means++', n_init = 1, verbose = args.d, tol = args.t, copy_x = False, algorithm = 'elkan')
    # Birch with the KMeans estimator as its global clustering step.
    clus = cluster.Birch(n_clusters = clusK, copy = False, threshold = args.t, branching_factor = 50)
    # clus = cluster.SpectralClustering(n_clusters = args.n, assign_labels = 'discretize', affinity = 'nearest_neighbors', n_neighbors = 15, random_state = 42, eigen_tol = args.t)
    clus.fit(data)
    np.savetxt(args.i + '.membership', clus.labels_, fmt = '%d')
else:
    data = read_sample(args.i)
    pyc = cure(data = data, number_cluster = args.n);
    pyc.process();
    clusters = pyc.get_clusters();
    print(clusters)
    # Flatten index-based clusters into one membership label per point.
    points_clusters = [0] * len(data)
    for i, clus in enumerate(clusters):
        for c in clus:
            points_clusters[c] = i
    np.savetxt(args.i + '.membership', points_clusters, fmt = '%d')
# In[23]: import pyclustering from pyclustering.cluster import cluster_visualizer from pyclustering.cluster.cure import cure from pyclustering.utils import read_sample from pyclustering.samples.definitions import FCPS_SAMPLES # Input data in following format [ [0.1, 0.5], [0.3, 0.1], ... ]. #input_data = read_sample(FCPS_SAMPLES.SAMPLE_LSUN); # Allocate three clusters. X=finalDataFrame.iloc[:,[0,1]].to_numpy() cure_instance = cure(X, 5) cure_instance.process() clusters = cure_instance.get_clusters() clusters # In[24]: # Visualize allocated clusters. visualizer = cluster_visualizer() visualizer.append_clusters(clusters, X) visualizer.show()
# NOTE(review): fragment - the list below is the tail of a data-set literal
# (e.g. X = np.array([ ... ])) whose beginning is not visible here.
    [15, 12], [43, 67], [45, 56], [63, 54], [49, 50], [24, 10],
    [30, 30], [85, 70], [71, 80], [60, 78], [70, 55], [80, 91],
])

# cure_instance = cure(sample, number_clusters, number_represent_points, compression, ccore_flag)
cure_instance = cure(X, 3)  # allocate three clusters
cure_instance.process()
clusters = cure_instance.get_clusters()
print(clusters)

representors = cure_instance.get_representors()
means = cure_instance.get_means()
print("Sample: ", X)

# Draw the clusters together with their representative points ('*').
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, X)
for cluster_index in range(len(clusters)):
    visualizer.append_cluster_attribute(0, cluster_index, representors[cluster_index], '*', 10)
def process_cure(sample):
    """Run CURE on ``sample`` and return the measured processing time."""
    algorithm = cure(sample, NUMBER_CLUSTERS)
    ticks, _ = timedcall(algorithm.process)
    return ticks
plt.show() # Gaussian Mixture y_pred = GaussianMixture(n_components=k).fit(X).predict(X) plt.scatter(X[:, 0], X[:, 1], c=y_pred) plt.title("Gaussian Mixture") plt.show() # Spectral Clustering y_pred = SpectralClustering(n_clusters=k).fit_predict(X) plt.scatter(X[:, 0], X[:, 1], c=y_pred) plt.title("Spectral Clustering") plt.show() # CURE cure_instance = cure(data=X, number_cluster=k); cure_instance.process(); clusters = cure_instance.get_clusters(); visualizer = cluster_visualizer(titles=["Cure"]); visualizer.append_clusters(clusters, X); visualizer.show(); # CLARANS clarans_instance = clarans(data=X, number_clusters=k, numlocal=5, maxneighbor=5); clarans_instance.process(); clusters = clarans_instance.get_clusters(); visualizer = cluster_visualizer(titles=["Clarans"]); visualizer.append_clusters(clusters, X); visualizer.show(); # Agglomerative
Created on Mar 22, 2017

@author: arno

Experiment the cure clustering algorithm on the song 2D vector (diversity, size)

*** Requires wordcount.py from wordcount package to be run beforehand ***
'''
from pyclustering.cluster.cure import cure
from pyclustering.cluster import cluster_visualizer
from pyclustering.utils import read_sample

# Song vectors produced beforehand by the wordcount package.
SONG_VECTORS_FILE = "../wordcount/output/song_vectors_pyclustering_regular.txt"

# read data for clustering from some file
input_data = read_sample(SONG_VECTORS_FILE)

# create instance of cure algorithm for cluster analysis:
# 5 clusters, 8 representative points, compression 0.7, Python implementation
cure_instance = cure(input_data, 5, 8, 0.7, False)

# run cluster analysis
cure_instance.process()

# get results of clustering
clusters = cure_instance.get_clusters()

visualizer = cluster_visualizer()
visualizer.append_clusters(clusters)
visualizer.show()
# NOTE(review): fragment - df, rds, ks, silhouette_avgs, cluster (sklearn),
# pickle, np, metrics and cure come from earlier parts of the script.
ww = silhouette_avgs.argmax()
k = ks[ww]
km = cluster.KMeans(n_clusters=k, random_state=42).fit(df)
tags = km.labels_
# Persist the best k-means labelling and its silhouette score in redis.
rds.set('tags_k-mean', pickle.dumps(tags))
rds.set('score_k-mean', silhouette_avgs[ww])
###################################################################################
#############################           CURE             ##########################
###################################################################################
data = df.as_matrix()
silhouette_avg = []
ks = range(5, 51)
# Score CURE for k = 5..50 with the silhouette coefficient.
for k in ks:
    print('k =', k)
    cure_instance = cure(data, k, number_represent_points=5, compression=0.5)
    cure_instance.process()
    tags_index = cure_instance.get_clusters()
    # Convert index-based clusters to one label per sample.
    tags = np.arange(len(data))
    for i, index in enumerate(tags_index):
        tags[index] = i
    silhouette_avg.append(metrics.silhouette_score(data, tags))
silhouette_avg = np.array(silhouette_avg)
ww = silhouette_avg.argmax()
k = ks[ww]
# Re-run CURE with the best scoring k.
cure_instance = cure(data, k, number_represent_points=5, compression=0.5)
cure_instance.process()
tags_index = cure_instance.get_clusters()
tags = np.arange(len(data))
def testCoreInterfaceIntInputData(self):
    """The ccore interface has to accept plain integer input data."""
    points = [[1], [2], [3], [20], [21], [22]]
    instance = cure(points, 2, ccore=True)
    instance.process()
    assert len(instance.get_clusters()) == 2
def testCoreInterfaceIntInputData(self):
    """CURE with ccore enabled must handle integer coordinates."""
    instance = cure([[1], [2], [3], [20], [21], [22]], 2, ccore=True)
    instance.process()
    clusters = instance.get_clusters()
    assert len(clusters) == 2
# NOTE(review): fragment - `model`, np, plt, metrics and cure are
# defined/imported in parts of the script not visible here.
docvecs = []
for num in range(len(model.docvecs)):
    # print(num)
    # print(model.docvecs[num])
    docvecs.append(np.array(model.docvecs[num]))
# Sweep the compression coefficient and the amount of clusters, scoring each
# CURE run with silhouette and Calinski-Harabasz metrics.
index = [i for i in range(3, 50)]
compression_index = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
for compression in compression_index:
    # NOTE(review): the `all_*` accumulators are reset but never filled here.
    all_silhouette_scores = []
    all_calinski_scores = []
    silhouette_scores = []
    calinski_scores = []
    for i in index:
        cure_model = cure(docvecs, i, compression=compression)
        cure_model.process()
        clusters = cure_model.get_clusters()
        # Translate index-based clusters into one label per document.
        labels = [1] * len(docvecs)
        for ind in range(len(clusters)):
            for element in clusters[ind]:
                labels[element] = ind
        print("Performance with threshold %d:" % i)
        silhouette_scores.append(metrics.silhouette_score(docvecs, labels))
        calinski_scores.append(metrics.calinski_harabaz_score(docvecs, labels))
    # Plot the silhouette curve for this compression value.
    plt.subplot(1, 2, 1)
    plt.plot(index, silhouette_scores, label=str(compression))
    plt.legend()
    plt.title("silhouette_scores")