from collections import defaultdict
from time import time

import numpy as np
from sklearn.cluster import MiniBatchKMeans


def compute_bench_2(chunks):
    results = defaultdict(list)
    n_samples = 50000  # samples drawn around each of the 8 cluster centers
    means = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1], [0.5, 0.5],
                      [0.75, -0.5], [-1, 0.75], [1, 0]])
    X = np.empty((0, 2))
    for i in range(8):
        X = np.r_[X, means[i] + 0.8 * np.random.randn(n_samples, 2)]
    max_it = len(chunks)
    it = 0
    for chunk in chunks:
        it += 1
        print('==============================')
        print('Iteration %03d of %03d' % (it, max_it))
        print('==============================')
        print()

        print('Fast K-Means')
        tstart = time()
        mbkmeans = MiniBatchKMeans(init='k-means++',
                                   n_clusters=8,
                                   batch_size=chunk)

        mbkmeans.fit(X)
        delta = time() - tstart
        print("Speed: %0.3fs" % delta)
        print("Inertia: %0.3fs" % mbkmeans.inertia_)
        print()

        results['MiniBatchKMeans Speed'].append(delta)
        results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_)

    return results
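
A minimal way to drive this benchmark, assuming the imports above; the batch sizes are illustrative:

# Hypothetical batch sizes; each entry triggers one MiniBatchKMeans fit.
results = compute_bench_2([100, 500, 1000, 5000])
print(results['MiniBatchKMeans Speed'])
print(results['MiniBatchKMeans Quality'])
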
def compute_bench_2(chunks):
    results = defaultdict(list)
    n_samples = 50000  # samples drawn around each of the 8 cluster centers
    means = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1],
                      [0.5, 0.5], [0.75, -0.5], [-1, 0.75], [1, 0]])
    X = np.empty((0, 2))
    for i in range(8):
        X = np.r_[X, means[i] + 0.8 * np.random.randn(n_samples, 2)]
    max_it = len(chunks)
    it = 0
    for chunk in chunks:
        it += 1
        print('==============================')
        print('Iteration %03d of %03d' % (it, max_it))
        print('==============================')
        print()

        print('Fast K-Means')
        tstart = time()
        mbkmeans = MiniBatchKMeans(init='k-means++',
                                   n_clusters=8,
                                   batch_size=chunk)

        mbkmeans.fit(X)
        delta = time() - tstart
        print("Speed: %0.3fs" % delta)
        print("Inertia: %0.3fs" % mbkmeans.inertia_)
        print()

        results['minibatchkmeans_speed'].append(delta)
        results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
Example #3
from sklearn.cluster import MiniBatchKMeans as SKLModel  # assumed alias for the wrapped estimator


class MiniBatchKMeansImpl:
    """Thin wrapper that collects MiniBatchKMeans hyperparameters in a dict
    and delegates fit/transform/predict to the wrapped scikit-learn model."""

    def __init__(self, n_clusters=8, init='k-means++', max_iter=100,
                 batch_size=100, verbose=0, compute_labels=True,
                 random_state=None, tol=0.0, max_no_improvement=10,
                 init_size=None, n_init=3, reassignment_ratio=0.01):
        self._hyperparams = {
            'n_clusters': n_clusters,
            'init': init,
            'max_iter': max_iter,
            'batch_size': batch_size,
            'verbose': verbose,
            'compute_labels': compute_labels,
            'random_state': random_state,
            'tol': tol,
            'max_no_improvement': max_no_improvement,
            'init_size': init_size,
            'n_init': n_init,
            'reassignment_ratio': reassignment_ratio}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)

    def predict(self, X):
        return self._wrapped_model.predict(X)
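
A short usage sketch on synthetic data, assuming the SKLModel alias above:

import numpy as np

X = np.random.randn(1000, 2)  # hypothetical data: 1000 points in 2-D
impl = MiniBatchKMeansImpl(n_clusters=4, random_state=0).fit(X)
labels = impl.predict(X)       # cluster index per point
distances = impl.transform(X)  # distance from each point to each center
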
from collections import defaultdict
from time import time

import numpy.random as nr
from sklearn.cluster import KMeans, MiniBatchKMeans


def compute_bench(samples_range, features_range):

    it = 0
    results = defaultdict(list)
    chunk = 100

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('==============================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('==============================')
            print()
            data = nr.randint(-50, 51, (n_samples, n_features))

            print('K-Means')
            tstart = time()
            kmeans = KMeans(init='k-means++', n_clusters=10).fit(data)

            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %0.5f" % kmeans.inertia_)
            print()

            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print('Fast K-Means')
            # fit MiniBatchKMeans on the same data in small batches
            mbkmeans = MiniBatchKMeans(init='k-means++',
                                       n_clusters=10,
                                       batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %f" % mbkmeans.inertia_)
            print()
            print()

            results['minibatchkmeans_speed'].append(delta)
            results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
Example #5
def compute_bench(samples_range, features_range):

    it = 0
    results = defaultdict(list)
    chunk = 100

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('==============================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('==============================')
            print()
            data = nr.randint(-50, 51, (n_samples, n_features))

            print('K-Means')
            tstart = time()
            kmeans = KMeans(init='k-means++', n_clusters=10).fit(data)

            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %0.5f" % kmeans.inertia_)
            print()

            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print('Fast K-Means')
            # fit MiniBatchKMeans on the same data in small batches
            mbkmeans = MiniBatchKMeans(init='k-means++',
                                       n_clusters=10,
                                       batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %f" % mbkmeans.inertia_)
            print()
            print()

            results['minibatchkmeans_speed'].append(delta)
            results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
def compute_bench(samples_range, features_range):

    it = 0
    results = defaultdict(list)
    chunk = 100

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('==============================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('==============================')
            print()
            data = nr.randint(-50, 51, (n_samples, n_features))

            print('K-Means')
            tstart = time()
            kmeans = KMeans(init='k-means++', n_clusters=10).fit(data)

            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %0.5f" % kmeans.inertia_)
            print()

            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print('Fast K-Means')
            # fit MiniBatchKMeans on the same data in small batches
            mbkmeans = MiniBatchKMeans(init='k-means++',
                                       n_clusters=10,
                                       batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %f" % mbkmeans.inertia_)
            print()
            print()

            results['MiniBatchKMeans Speed'].append(delta)
            results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_)

    return results
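
An illustrative way to call the benchmark; the grid values are hypothetical:

import numpy as np

samples_range = np.linspace(50, 150, 5).astype(int)      # hypothetical grid
features_range = np.linspace(150, 50000, 5).astype(int)
results = compute_bench(samples_range, features_range)
print(results['kmeans_speed'])
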
Example #8
import numpy as np
from sklearn.cluster import MiniBatchKMeans


def get_centroids(w2v_model, aspects_count):
    """
        Clusters all word vectors with K-means and returns L2-normalized
        cluster centroids; used for ABAE aspects matrix initialization
    """

    km = MiniBatchKMeans(n_clusters=aspects_count, verbose=0, n_init=100)
    m = []

    # w2v_model.wv.vocab is the gensim < 4.0 vocabulary API
    for k in w2v_model.wv.vocab:
        m.append(w2v_model.wv[k])

    m = np.array(m)  # np.matrix is deprecated; a 2-D array works the same here

    km.fit(m)
    clusters = km.cluster_centers_

    # L2 normalization
    norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1, keepdims=True)

    return norm_aspect_matrix
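
A usage sketch, assuming gensim < 4.0 (whose Word2Vec exposes wv.vocab as the function expects); the toy corpus and aspect count are illustrative:

from gensim.models import Word2Vec

sentences = [["good", "battery", "life"], ["poor", "screen", "quality"]]
w2v = Word2Vec(sentences, size=50, min_count=1)  # size= is the gensim < 4.0 name
aspect_matrix = get_centroids(w2v, aspects_count=2)
print(aspect_matrix.shape)  # (2, 50): one unit-norm row per aspect
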
Example #9
import pickle
import sys

import numpy as np
from sklearn.cluster import MiniBatchKMeans

# Performs K-means clustering and saves the model to a local file

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: {0} mfcc_csv_file cluster_num output_file".format(sys.argv[0]))
        print("mfcc_csv_file -- path to the mfcc csv file")
        print("cluster_num -- number of clusters")
        print("output_file -- path to save the k-means model")
        exit(1)

    mfcc_csv_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])

    X = np.loadtxt(mfcc_csv_file, delimiter=';')
    print(np.shape(X))
    kmeans = MiniBatchKMeans(n_clusters=cluster_num, random_state=0)
    kmeans.fit(X)
    with open(output_file, "wb") as f:
        pickle.dump(kmeans, f)

    print('K-means trained successfully!')
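
Loading the pickled model back is symmetric; the file names below are hypothetical:

import pickle

import numpy as np

# e.g. after running: python kmeans_train.py mfcc.csv 64 kmeans.pkl
with open("kmeans.pkl", "rb") as f:
    kmeans = pickle.load(f)
frames = np.loadtxt("mfcc.csv", delimiter=';')
print(kmeans.predict(frames)[:10])  # cluster index per MFCC frame
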
Example #10
    clustval = int(round(num_posts / 100.0)) * clust_size

    posts_vectors = []
    for post in posts:
        try:
            posts_vectors.append(pickle.loads(post.vector).toarray()[0])
        except Exception:
            # vector missing or stale: rebuild it, then retry
            make_vector(post, a)
            posts_vectors.append(pickle.loads(post.vector).toarray()[0])
    
    num_clusters = num_posts - clustval
     
    km = MiniBatchKMeans(n_clusters=num_clusters)
     
    km.fit(posts_vectors)
     
    clusters = km.labels_.tolist()
    
    cluster_centers = km.cluster_centers_
    
    cluster_dict = {}
    for cluster in clusters:
        if cluster not in cluster_dict:
            cluster_dict[cluster] = 0
        cluster_dict[cluster] += 1
        
    count_dict = {}
    
    for cluster, num in cluster_dict.items():
        if num != 1:
import csv

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# `vectorizer` (a fitted TfidfVectorizer) and `dataset` (a mapping of document
# ids to texts) are defined earlier, outside this excerpt.
tfidf_X = vectorizer.fit_transform(dataset.values())

# SVD: reduce the TF-IDF matrix to 80 latent dimensions (LSA), then
# L2-normalize so K-means works on comparable vectors
n_components = 80
print("Reducing dimensions..")
svd = TruncatedSVD(n_components=n_components, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
tfidf_X = lsa.fit_transform(tfidf_X)

# Cluster the TF-IDF/LSA vectors (MiniBatchKMeans with n_clusters=4 works best so far)
model = MiniBatchKMeans(n_clusters=4,
                        init_size=1024,
                        batch_size=2048,
                        random_state=20)
model.fit(tfidf_X)
# Re-vectorize the same documents and predict their clusters; equivalent to
# model.predict(tfidf_X) since the inputs are identical.
assignments = model.predict(
    lsa.transform(vectorizer.transform(dataset.values())))
clusters = MiniBatchKMeans(n_clusters=4,
                           init_size=1024,
                           batch_size=2048,
                           random_state=20).fit_predict(tfidf_X)


def dump_to_file(filename, assignments, dataset):
    with open(filename, mode="w", newline="") as csvfile:

        # Headers
        fieldnames = ['Id', 'Predicted']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()