def clustermachine(matrix, distance_metric, clusters=4):
    """Cluster a word-frequency matrix and return a list of ct.Clustering results.

    Parameters
    ----------
    matrix : array-like
        Rows are items (documents), columns are word frequencies.
        NOTE(review): exact shape/dtype expectations not visible here -- confirm
        against the caller.
    distance_metric : str
        Affinity/metric passed to the clustering algorithm.
    clusters : int, optional
        Pre-determined cluster count for algorithms that require one.
        Not every algorithm uses the last two arguments.

    Returns
    -------
    list of ct.Clustering
        One entry per clustering run performed.
    """
    result = []
    t = time.time()  # wall-clock start, used for elapsed-minutes reporting

    # NOTE(review): earlier experiments (KMeans sweep, MeanShift, AffinityPropagation,
    # SpectralClustering, DBSCAN, DPGMM/GMM, Birch) were disabled here as commented-out
    # code; they were removed -- recover them from version control if needed.
    # Observations recorded at the time: MeanShift/DBSCAN "take forever" and
    # AffinityPropagation "breaks" at a 12600x42 matrix.

    # 7: Agglomerative clustering (complete linkage) for each candidate count.
    for n_clust in [4]:
        model = sklearn.cluster.AgglomerativeClustering(
            affinity=distance_metric, n_clusters=n_clust, linkage='complete')
        clustering = model.fit(matrix)
        # Agglomerative has no centroids, so only the labels are stored.
        ward = ct.Clustering(model, clustering.labels_)
        result.append(ward)
        # Fixed 2to3 artifact: `print[...]` subscripted the print builtin and
        # `print(u-t)/60` divided its None return -- both raised TypeError.
        print(result[-1].name, result[-1].no_of_clusters)
        print((time.time() - t) / 60)  # elapsed minutes

    return result
def clustermachine(matrix, algorithm, clusters=3):
    """Cluster a feature matrix with k-means and return a list of ct.Clustering results.

    Parameters
    ----------
    matrix : array-like
        Feature matrix (rows = items, columns = features).
    algorithm : object
        Unused in the current body; kept for interface compatibility with callers.
    clusters : int, optional
        Number of clusters for algorithms that need one up front.

    Returns
    -------
    list of ct.Clustering
    """
    # Some algorithms (AffinityPropagation, SpectralClustering) are fed a
    # precomputed distance matrix; MeanShift and KMeans take raw features.
    # Kept even though only the disabled branches consumed it -- TODO confirm
    # callers don't rely on it being computed for its side effects (there are none).
    similarity_matrix = metrics.pairwise.euclidean_distances(matrix)
    result = []

    # 1: k-means on the raw feature matrix.
    model = sklearn.cluster.KMeans(clusters)
    clustering = model.fit(matrix)
    kmeans = ct.Clustering(matrix, model, clustering.labels_,
                           clustering.cluster_centers_)
    result.append(kmeans)

    # 2-5, 7-8 (MeanShift, AffinityPropagation, Spectral, DBSCAN, Agglomerative,
    # Birch) were disabled here as commented-out code and have been removed.
    # 6: Gaussian mixture "does not really fit our schema" per the original note;
    # its fit results (weights_, means_, covars_, converged_) were computed and
    # immediately discarded, so that dead computation was removed as well.

    return result
def clustermachine(matrix, clusters=4):
    """Run several clustering algorithms on ``matrix``; return a list of ct.Clustering.

    Parameters
    ----------
    matrix : array-like
        Dataset to cluster (rows = items, columns = features).
    clusters : int, optional
        Cluster count for the k-means runs.

    Returns
    -------
    list of ct.Clustering
        One entry per clustering run, in execution order.
    """
    # Precomputed distance matrix for algorithms that need one. None of the
    # currently-active branches use it -- kept for parity with the disabled ones.
    similarity_matrix = metrics.pairwise.euclidean_distances(matrix)
    result = []
    t = time.time()  # wall-clock start for elapsed-minutes reporting

    def _report():
        # Print the name of the clustering just appended, then elapsed minutes.
        # Fixes the 2to3 artifacts `print[...]` (subscripted the print builtin)
        # and `print(u - t) / 60` (divided print's None return) -- both TypeErrors.
        print(result[-1].name)
        print((time.time() - t) / 60)

    def _kmeans(k, **kwargs):
        # One k-means run; store model, labels and centroids.
        model = sklearn.cluster.KMeans(k, **kwargs)
        clustering = model.fit(matrix)
        result.append(ct.Clustering(model, clustering.labels_,
                                    clustering.cluster_centers_))
        _report()

    # 1: k-means with tol=0 (iterate to strict convergence).
    _kmeans(clusters, tol=0)
    # 1b: k-means with the default tolerance, same cluster count.
    _kmeans(clusters)
    # 1c: k-means with a fixed 8 clusters for comparison.
    _kmeans(8)

    # 2, 4, 5, 6, 7, 8 (MeanShift, Spectral, DBSCAN, GMM, Agglomerative, Birch)
    # were disabled here as commented-out code and have been removed; original
    # notes said MeanShift/DBSCAN "take forever @ 12600, 42".

    # 3: Affinity propagation. NOTE(review): original comment says this
    # "breaks @ 12600, 42" -- confirm matrix size before relying on it.
    model = sklearn.cluster.AffinityPropagation()
    clustering = model.fit(matrix)
    result.append(ct.Clustering(model, clustering.labels_,
                                clustering.cluster_centers_))
    _report()

    return result