from collections import defaultdict
from time import time

import numpy as np
from sklearn.cluster import MiniBatchKMeans


def compute_bench_2(chunks):
    results = defaultdict(list)
    n_features = 50000  # number of points drawn around each of the 8 means
    means = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1],
                      [0.5, 0.5], [0.75, -0.5], [-1, 0.75], [1, 0]])
    X = np.empty((0, 2))
    for i in range(8):
        X = np.r_[X, means[i] + 0.8 * np.random.randn(n_features, 2)]

    max_it = len(chunks)
    it = 0
    for chunk in chunks:
        it += 1
        print('==============================')
        print('Iteration %03d of %03d' % (it, max_it))
        print('==============================')
        print()

        print('Fast K-Means')
        tstart = time()
        mbkmeans = MiniBatchKMeans(init='k-means++', n_clusters=8,
                                   batch_size=chunk)
        mbkmeans.fit(X)
        delta = time() - tstart
        print("Speed: %0.3fs" % delta)
        print("Inertia: %0.3f" % mbkmeans.inertia_)  # inertia is unitless, not seconds
        print()

        results['minibatchkmeans_speed'].append(delta)
        results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
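# A minimal, hypothetical driver for compute_bench_2; the batch sizes are
# illustrative, not taken from the original benchmark.
chunks = [100, 500, 1000, 5000]
results = compute_bench_2(chunks)
for name, values in results.items():
    print(name, values)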
from sklearn.cluster import MiniBatchKMeans as SKLModel  # assumed import: SKLModel is undefined in the snippet


class MiniBatchKMeansImpl:
    """Thin wrapper that records the hyperparameters and delegates to scikit-learn."""

    def __init__(self, n_clusters=8, init='k-means++', max_iter=100,
                 batch_size=100, verbose=0, compute_labels=True,
                 random_state=None, tol=0.0, max_no_improvement=10,
                 init_size=None, n_init=3, reassignment_ratio=0.01):
        self._hyperparams = {
            'n_clusters': n_clusters,
            'init': init,
            'max_iter': max_iter,
            'batch_size': batch_size,
            'verbose': verbose,
            'compute_labels': compute_labels,
            'random_state': random_state,
            'tol': tol,
            'max_no_improvement': max_no_improvement,
            'init_size': init_size,
            'n_init': n_init,
            'reassignment_ratio': reassignment_ratio,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)

    def predict(self, X):
        return self._wrapped_model.predict(X)
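# Hypothetical usage of the wrapper on synthetic blob data:
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=1000, centers=8, random_state=0)
model = MiniBatchKMeansImpl(n_clusters=8, random_state=0).fit(X)
labels = model.predict(X)       # cluster index per sample
distances = model.transform(X)  # distance from each sample to each centroid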
from collections import defaultdict
from time import time

import numpy.random as nr
from sklearn.cluster import KMeans, MiniBatchKMeans


def compute_bench(samples_range, features_range):
    it = 0
    results = defaultdict(list)
    chunk = 100
    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('==============================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('==============================')
            print()
            data = nr.randint(-50, 51, (n_samples, n_features))

            print('K-Means')
            tstart = time()
            kmeans = KMeans(init='k-means++', n_clusters=10).fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %0.5f" % kmeans.inertia_)
            print()
            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print('Fast K-Means')
            # fit the same data in small mini-batches
            mbkmeans = MiniBatchKMeans(init='k-means++', n_clusters=10,
                                       batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %f" % mbkmeans.inertia_)
            print()
            print()
            results['minibatchkmeans_speed'].append(delta)
            results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
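# Hypothetical driver for compute_bench; the ranges are illustrative.
samples_range = [1000, 5000, 10000]
features_range = [10, 50, 100]
results = compute_bench(samples_range, features_range)
print(results['kmeans_speed'])
print(results['minibatchkmeans_speed'])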
import numpy as np
from sklearn.cluster import MiniBatchKMeans


def get_centroids(w2v_model, aspects_count):
    """
    Cluster all word vectors with mini-batch K-means and return the
    L2-normalized cluster centroids; used to initialize the ABAE
    aspect matrix.
    """
    km = MiniBatchKMeans(n_clusters=aspects_count, verbose=0, n_init=100)
    # w2v_model.wv.vocab is the gensim < 4.0 vocabulary API
    m = np.array([w2v_model.wv[word] for word in w2v_model.wv.vocab])
    km.fit(m)
    clusters = km.cluster_centers_
    # L2-normalize each centroid row
    norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1,
                                                   keepdims=True)
    return norm_aspect_matrix
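# Hypothetical usage with a toy gensim (< 4.0) Word2Vec model; the corpus
# and parameters are made up for illustration.
from gensim.models import Word2Vec

sentences = [['the', 'screen', 'is', 'sharp'],
             ['battery', 'life', 'is', 'long'],
             ['great', 'battery', 'and', 'screen']]
w2v = Word2Vec(sentences, size=50, min_count=1)  # `size` became `vector_size` in gensim 4.0
aspect_matrix = get_centroids(w2v, aspects_count=2)
print(aspect_matrix.shape)  # (2, 50)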
import sys
import pickle

import numpy as np
from sklearn.cluster import MiniBatchKMeans

# Performs K-means clustering and saves the model to a local file
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: {0} mfcc_csv_file cluster_num output_file".format(sys.argv[0]))
        print("mfcc_csv_file -- path to the mfcc csv file")
        print("cluster_num -- number of clusters")
        print("output_file -- path to save the k-means model")
        exit(1)

    mfcc_csv_file = sys.argv[1]
    cluster_num = int(sys.argv[2])
    output_file = sys.argv[3]

    X = np.loadtxt(mfcc_csv_file, delimiter=';')
    print(np.shape(X))

    kmeans = MiniBatchKMeans(n_clusters=cluster_num, random_state=0)
    kmeans.fit(X)
    with open(output_file, "wb") as f:
        pickle.dump(kmeans, f)
    print('K-means trained successfully!')
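# Hypothetical follow-up: load the saved model back and quantize new MFCC
# frames. The file names are illustrative.
import pickle

import numpy as np

with open('kmeans_model.pkl', 'rb') as f:  # whatever was passed as output_file
    kmeans = pickle.load(f)
frames = np.loadtxt('new_mfcc.csv', delimiter=';')
codes = kmeans.predict(frames)  # one cluster id ("codeword") per frame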
import pickle

from sklearn.cluster import MiniBatchKMeans

# `posts`, `num_posts`, `clust_size`, `a`, and `make_vector` are defined
# elsewhere in the surrounding module.
clustval = int(round(float(num_posts) / 100.0)) * clust_size

posts_vectors = []
for post in posts:
    try:
        posts_vectors.append(pickle.loads(post.vector).toarray()[0])
    except Exception:
        # vector missing or stale: rebuild it, then retry
        make_vector(post, a)
        posts_vectors.append(pickle.loads(post.vector).toarray()[0])

num_clusters = num_posts - clustval
km = MiniBatchKMeans(n_clusters=num_clusters)
km.fit(posts_vectors)

clusters = km.labels_.tolist()
cluster_centers = km.cluster_centers_

# count how many posts landed in each cluster
cluster_dict = {}
for cluster in clusters:
    if cluster not in cluster_dict:
        cluster_dict[cluster] = 0
    cluster_dict[cluster] += 1

count_dict = {}
for cluster, num in cluster_dict.items():
    if num != 1:
        # assumed continuation of the truncated snippet: keep only
        # clusters that grouped more than one post
        count_dict[cluster] = num
import csv

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# `vectorizer` and `dataset` (an id -> text mapping) are defined earlier.
tfidf_X = vectorizer.fit_transform(dataset.values())

# SVD
n_components = 80
print("Reducing dimensions..")
svd = TruncatedSVD(n_components=n_components, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
tfidf_X = lsa.fit_transform(tfidf_X)

# Clustering TF-IDF (MiniBatchKMeans with n_clusters=4 works best so far)
model = MiniBatchKMeans(n_clusters=4, init_size=1024, batch_size=2048,
                        random_state=20)
model.fit(tfidf_X)
assignments = model.predict(
    lsa.transform(vectorizer.transform(dataset.values())))
clusters = MiniBatchKMeans(n_clusters=4, init_size=1024, batch_size=2048,
                           random_state=20).fit_predict(tfidf_X)


def dump_to_file(filename, assignments, dataset):
    with open(filename, mode="w", newline="") as csvfile:
        # headers
        fieldnames = ['Id', 'Predicted']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # assumed continuation of the truncated snippet: one row per
        # document id and its predicted cluster
        for doc_id, label in zip(dataset.keys(), assignments):
            writer.writerow({'Id': doc_id, 'Predicted': label})
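# Hypothetical end-to-end usage with a toy corpus; TfidfVectorizer stands in
# for the `vectorizer` defined outside the snippet, and n_clusters is reduced
# so the example runs on four documents.
from sklearn.feature_extraction.text import TfidfVectorizer

dataset = {
    'doc1': 'mini batch k means scales to large corpora',
    'doc2': 'tf idf weighting for text clustering',
    'doc3': 'truncated svd reduces tf idf dimensionality',
    'doc4': 'k means assigns each document to a cluster',
}
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset.values())
assignments = MiniBatchKMeans(n_clusters=2, random_state=20).fit_predict(X)
dump_to_file('predictions.csv', assignments, dataset)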