from collections import defaultdict
from time import time

import numpy as np
from sklearn.cluster import MiniBatchKMeans


def compute_bench_2(chunks):
    results = defaultdict(lambda: [])
    # 8 Gaussian blobs of 50,000 two-dimensional points each
    n_samples_per_center = 50000
    means = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1],
                      [0.5, 0.5], [0.75, -0.5], [-1, 0.75], [1, 0]])
    X = np.empty((0, 2))
    for i in range(8):
        X = np.r_[X, means[i] + 0.8 * np.random.randn(n_samples_per_center, 2)]
    max_it = len(chunks)
    it = 0
    for chunk in chunks:
        it += 1
        print('==============================')
        print('Iteration %03d of %03d' % (it, max_it))
        print('==============================')
        print()

        print('Fast K-Means')
        tstart = time()
        mbkmeans = MiniBatchKMeans(init='k-means++',
                                   n_clusters=8,
                                   batch_size=chunk)

        mbkmeans.fit(X)
        delta = time() - tstart
        print("Speed: %0.3fs" % delta)
        print("Inertia: %0.3fs" % mbkmeans.inertia_)
        print()

        results['minibatchkmeans_speed'].append(delta)
        results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
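A minimal driver sketch (not part of the original example) showing how the benchmark above might be invoked; the batch sizes are hypothetical:

if __name__ == '__main__':
    # Hypothetical batch sizes; the original source does not specify them.
    batch_sizes = [100, 1000, 10000]
    bench_results = compute_bench_2(batch_sizes)
    for name, values in bench_results.items():
        print(name, values)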
Example #2
def cluster_kmeans(n_samples):
    rng = np.random.RandomState(9)
    kmeans = MiniBatchKMeans(n_clusters=2, random_state=rng, verbose=True, compute_labels=True)
    i = 0
    batch_size = 10

    while i < n_samples:
        # partial fit 100 authors and their subsequent comparisons
        print("k_means partial fitting, i =", i)
        data = extract_data(start=i, end=i + batch_size)
        # standardize each batch before the incremental update
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0)
        kmeans.partial_fit(data)
        i += batch_size
    print("fitting of one-third data finished.")
    return kmeans
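A hedged usage sketch (not from the original project): extract_data is an external helper of that project, so the call below is only illustrative of how the fitted model might label a new standardized batch:

# Hypothetical usage; extract_data is assumed to come from the original project.
model = cluster_kmeans(n_samples=300)
new_batch = extract_data(start=300, end=310)
new_batch = (new_batch - np.mean(new_batch, axis=0)) / np.std(new_batch, axis=0)
print(model.predict(new_batch))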
def compute_bench(samples_range, features_range):

    it = 0
    results = defaultdict(lambda: [])
    chunk = 100

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('==============================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('==============================')
            print()
            data = nr.randint(-50, 51, (n_samples, n_features))

            print('K-Means')
            tstart = time()
            kmeans = KMeans(init='k-means++',
                            n_clusters=10).fit(data)

            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %0.5f" % kmeans.inertia_)
            print()

            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print('Fast K-Means')
            # let's prepare the data in small chunks
            mbkmeans = MiniBatchKMeans(init='k-means++',
                                       n_clusters=10,
                                       batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %f" % mbkmeans.inertia_)
            print()
            print()

            results['minibatchkmeans_speed'].append(delta)
            results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
Example #5
def compute_bench(samples_range, features_range):

    it = 0
    results = defaultdict(lambda: [])
    chunk = 100

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('==============================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('==============================')
            print()
            data = nr.randint(-50, 51, (n_samples, n_features))

            print('K-Means')
            tstart = time()
            kmeans = KMeans(init='k-means++', n_clusters=10).fit(data)

            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %0.5f" % kmeans.inertia_)
            print()

            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print('Fast K-Means')
            # let's prepare the data in small chunks
            mbkmeans = MiniBatchKMeans(init='k-means++',
                                       n_clusters=10,
                                       batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %f" % mbkmeans.inertia_)
            print()
            print()

            results['minibatchkmeans_speed'].append(delta)
            results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
def compute_bench(samples_range, features_range):

    it = 0
    results = defaultdict(lambda: [])
    chunk = 100

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('==============================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('==============================')
            print()
            data = nr.randint(-50, 51, (n_samples, n_features))

            print('K-Means')
            tstart = time()
            kmeans = KMeans(init='k-means++', n_clusters=10).fit(data)

            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %0.5f" % kmeans.inertia_)
            print()

            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print('Fast K-Means')
            # let's prepare the data in small chunks
            mbkmeans = MiniBatchKMeans(init='k-means++',
                                       n_clusters=10,
                                       batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %f" % mbkmeans.inertia_)
            print()
            print()

            results['minibatchkmeans_speed'].append(delta)
            results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
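A minimal driver sketch (not in the original); the sample and feature ranges are hypothetical, and the usual imports (collections.defaultdict, time.time, numpy.random as nr, and scikit-learn's KMeans/MiniBatchKMeans) are assumed:

if __name__ == '__main__':
    # Hypothetical ranges; the original benchmark scripts choose their own.
    samples_range = [1000, 5000, 10000]
    features_range = [10, 50, 100]
    bench = compute_bench(samples_range, features_range)
    for key, values in bench.items():
        print(key, values)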
Example #7
def get_centroids(w2v_model, aspects_count):
    """
        Clustering all word vectors with K-means and returning L2-normalized
        cluster centroids; used for ABAE aspects matrix initialization
    """

    km = MiniBatchKMeans(n_clusters=aspects_count, verbose=0, n_init=100)
    m = []

    for k in w2v_model.wv.vocab:
        m.append(w2v_model.wv[k])

    # Stack the word vectors into a single 2-D array before clustering.
    m = np.array(m)

    km.fit(m)
    clusters = km.cluster_centers_

    # L2 normalization
    norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1, keepdims=True)

    return norm_aspect_matrix
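A hedged usage sketch (not in the original): train a small Word2Vec model on a toy corpus and derive an aspect matrix from it. It assumes numpy, scikit-learn's MiniBatchKMeans, and a pre-4.0 gensim (matching the wv.vocab access above):

from gensim.models import Word2Vec

# Hypothetical toy corpus; real ABAE initialization would use a full review corpus.
sentences = [["good", "food", "bad", "service"],
             ["great", "staff", "terrible", "food"]]
w2v = Word2Vec(sentences, size=50, min_count=1, iter=20)
aspect_matrix = get_centroids(w2v, aspects_count=2)
print(aspect_matrix.shape)  # (2, 50)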
from gensim.models import Word2Vec
from scipy.sparse import lil_matrix
from sklearn.base import BaseEstimator
from sklearn.cluster import MiniBatchKMeans


class ClusteredEmbeddingsVectorizer(BaseEstimator):
    def __init__(self, embedding_dim=300, n_clusters=500, vocab_cutoff=5, iter=5):
        self._w2v_model = None
        self._kmeans_model = None
        self._id2cluster = None

        self.embedding_dim = embedding_dim
        self.n_clusters = n_clusters
        self.vocab_cutoff = vocab_cutoff
        self.iter = iter

    def fit(self, sent_docs, y=None):
        self._w2v_model = Word2Vec(sentences=sent_docs, size=self.embedding_dim,
                                   min_count=self.vocab_cutoff, iter=self.iter)
        self._kmeans_model = MiniBatchKMeans(n_clusters=self.n_clusters).fit(self._w2v_model.syn0)
        self._id2cluster = self._kmeans_model.predict(self._w2v_model.syn0)

        return self

    def transform(self, sent_docs):
        v = lil_matrix((len(sent_docs), self._kmeans_model.n_clusters))

        for i, sent in enumerate(sent_docs):
            for token in sent:
                idx = self._word2cluster(token)

                # Cluster 0 is a valid id, so test against None explicitly.
                if idx is not None:
                    v[i, idx] += 1.

        return v

    def _word2cluster(self, token):
        try:
            return self._id2cluster[self._w2v_model.vocab[token].index]
        except KeyError:
            return None
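A minimal usage sketch (hypothetical, not from the original source): fit the vectorizer on a tiny tokenized corpus and inspect the cluster-count features. It assumes a pre-4.0 gensim, matching the size/iter/syn0 attributes used by the class:

# Hypothetical toy corpus of tokenized documents.
docs = [["cheap", "fast", "shipping"],
        ["slow", "shipping", "bad", "support"],
        ["great", "support", "cheap", "price"]]

vec = ClusteredEmbeddingsVectorizer(embedding_dim=20, n_clusters=3, vocab_cutoff=1, iter=20)
features = vec.fit(docs).transform(docs)
print(features.toarray())  # one row per document, one column per word cluster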
Example #9
			'LinearDiscriminantAnalysis':LinearDiscriminantAnalysis(),
			'LinearRegression':LinearRegression(),
			'LinearSVC':LinearSVC(),
			'LinearSVR':LinearSVR(),
			'LocallyLinearEmbedding':LocallyLinearEmbedding(),
			'LogisticRegression':LogisticRegression(),
			'LogisticRegressionCV':LogisticRegressionCV(),
			'MDS':MDS(),
			'MLPClassifier':MLPClassifier(),
			'MLPRegressor':MLPRegressor(),
			'MaxAbsScaler':MaxAbsScaler(),
			'MeanShift':MeanShift(),
			'MinCovDet':MinCovDet(),
			'MinMaxScaler':MinMaxScaler(),
			'MiniBatchDictionaryLearning':MiniBatchDictionaryLearning(),
			'MiniBatchKMeans':MiniBatchKMeans(),
			'MiniBatchSparsePCA':MiniBatchSparsePCA(),
			'MultiTaskElasticNet':MultiTaskElasticNet(),
			'MultiTaskElasticNetCV':MultiTaskElasticNetCV(),
			'MultiTaskLasso':MultiTaskLasso(),
			'MultiTaskLassoCV':MultiTaskLassoCV(),
			'MultinomialNB':MultinomialNB(),
			'NMF':NMF(),
			'NearestCentroid':NearestCentroid(),
			'NearestNeighbors':NearestNeighbors(),
			'Normalizer':Normalizer(),
			'NuSVC':NuSVC(),
			'NuSVR':NuSVR(),
			'Nystroem':Nystroem(),
			'OAS':OAS(),
			'OneClassSVM':OneClassSVM(),
Example #10
    # Scale cluster_size (presumably a 0-100 setting) onto the 60-90 range.
    clust_size = int(round(((float(cluster_size) * (90 - 60)) / 100.0) + 60.0))

    clustval = int(round(float(num_posts) / 100.0)) * clust_size
    
    posts_vectors = []
    for post in posts:
        try:
            posts_vectors.append(pickle.loads(post.vector).toarray()[0])
        except Exception:
            # Rebuild the vector if the stored pickle cannot be loaded.
            make_vector(post, a)
            posts_vectors.append(pickle.loads(post.vector).toarray()[0])
    
    num_clusters = num_posts - clustval
     
    km = MiniBatchKMeans(n_clusters=num_clusters)
     
    km.fit(posts_vectors)
     
    clusters = km.labels_.tolist()
    
    cluster_centers = km.cluster_centers_
    
    cluster_dict = {}
    for cluster in clusters:
        if cluster not in cluster_dict:
            cluster_dict[cluster] = 0
        cluster_dict[cluster] += 1
        
    count_dict = {}
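The snippet stops before count_dict is used; a hedged sketch of how the labels might be grouped back to posts (assuming the posts list and fitted km from above):

# Hypothetical continuation, not part of the original snippet.
from collections import defaultdict

posts_by_cluster = defaultdict(list)
for post, label in zip(posts, km.labels_):
    posts_by_cluster[label].append(post)

for label, members in sorted(posts_by_cluster.items()):
    print("cluster %d: %d posts" % (label, len(members)))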
    
# Reconstructed opening of the truncated TfidfVectorizer call shown in the source.
vectorizer = TfidfVectorizer(min_df=0.01,
                             max_df=0.08,
                             stop_words=stop_words)
tfidf_X = vectorizer.fit_transform(dataset.values())

n_components = 80
# SVD
print("Reducing dimensions..")
svd = TruncatedSVD(n_components=n_components, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
tfidf_X = lsa.fit_transform(tfidf_X)

# Clustering the TF-IDF features (MiniBatchKMeans with n_clusters=4 works best for now)
model = MiniBatchKMeans(n_clusters=4,
                        init_size=1024,
                        batch_size=2048,
                        random_state=20)
model.fit(tfidf_X)
assignments = model.predict(
    lsa.transform(vectorizer.transform(dataset.values())))
clusters = MiniBatchKMeans(n_clusters=4,
                           init_size=1024,
                           batch_size=2048,
                           random_state=20).fit_predict(tfidf_X)
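A hedged follow-up sketch (not in the original script): project the cluster centers back into the TF-IDF vocabulary to inspect the top terms per cluster, assuming the svd, vectorizer, and model objects defined above:

# Approximate, since the pipeline also L2-normalizes after the SVD step.
original_space_centroids = svd.inverse_transform(model.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()  # get_feature_names() on older scikit-learn
for i in range(4):
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    print("Cluster %d: %s" % (i, ", ".join(top_terms)))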


def dump_to_file(filename, assignments, dataset):
    with open(filename, mode="w", newline="") as csvfile:

        # Headers
        fieldnames = ['Id', 'Predicted']