from collections import defaultdict
from time import time

import numpy as np
from sklearn.cluster import MiniBatchKMeans


def compute_bench_2(chunks):
    results = defaultdict(lambda: [])
    # number of samples generated around each of the 8 centres
    n_features = 50000
    means = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1],
                      [0.5, 0.5], [0.75, -0.5], [-1, 0.75], [1, 0]])
    X = np.empty((0, 2))
    for i in range(8):
        X = np.r_[X, means[i] + 0.8 * np.random.randn(n_features, 2)]
    max_it = len(chunks)
    it = 0
    for chunk in chunks:
        it += 1
        print('==============================')
        print('Iteration %03d of %03d' % (it, max_it))
        print('==============================')
        print()

        print('Fast K-Means')
        tstart = time()
        mbkmeans = MiniBatchKMeans(init='k-means++',
                                   n_clusters=8,
                                   batch_size=chunk)
        mbkmeans.fit(X)
        delta = time() - tstart
        print("Speed: %0.3fs" % delta)
        print("Inertia: %0.3f" % mbkmeans.inertia_)
        print()

        results['minibatchkmeans_speed'].append(delta)
        results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
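A minimal driver for the benchmark above might look like the following; the chunk sizes are illustrative assumptions, not values taken from the original script.

chunks = [100, 500, 1000, 5000]   # illustrative batch sizes (assumption)
results = compute_bench_2(chunks)
print(results['minibatchkmeans_speed'])
print(results['minibatchkmeans_quality'])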
def cluster_kmeans(n_samples):
    rng = np.random.RandomState(9)
    kmeans = MiniBatchKMeans(n_clusters=2, random_state=rng,
                             verbose=True, compute_labels=True)
    i = 0
    batch_size = 10
    while i < n_samples:
        # partial fit 100 authors and their subsequent comparisons
        print("k_means partial fitting, i =", i)
        data = extract_data(start=i, end=i + batch_size)
        # standardize each batch before the incremental update
        data -= np.mean(data, axis=0)
        data /= np.std(data, axis=0)
        kmeans.partial_fit(data)
        i += batch_size
    print("fitting of one-third of the data finished.")
    return kmeans
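The extract_data function above is project-specific and not shown here. As a sketch, the streaming partial_fit loop can be exercised with a hypothetical random-data stand-in:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

# Hypothetical stand-in for the project's extract_data(); it only returns a
# random batch of 2-D feature rows so the partial_fit loop above can run.
def extract_data(start, end):
    rng = np.random.RandomState(start)
    return rng.randn(end - start, 2)

model = cluster_kmeans(n_samples=100)
print(model.cluster_centers_)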
# Legacy variant of this benchmark, written for Python 2 and an old
# scikit-learn release in which KMeans/MiniBatchKMeans still took `k`
# instead of `n_clusters`; a current-API version follows below.
def compute_bench(samples_range, features_range):
    it = 0
    iterations = 200
    results = defaultdict(lambda: [])
    chunk = 100
    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print '=============================='
            print 'Iteration %03d of %03d' % (it, max_it)
            print '=============================='
            print ''
            data = nr.random_integers(-50, 50, (n_samples, n_features))

            print 'K-Means'
            tstart = time()
            kmeans = KMeans(init='k-means++', k=10).fit(data)
            delta = time() - tstart
            print "Speed: %0.3fs" % delta
            print "Inertia: %0.5f" % kmeans.inertia_
            print ''
            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print 'Fast K-Means'
            # let's prepare the data in small chunks
            mbkmeans = MiniBatchKMeans(init='k-means++',
                                       k=10,
                                       batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print "Speed: %0.3fs" % delta
            print "Inertia: %f" % mbkmeans.inertia_
            print ''
            print ''
            results['minibatchkmeans_speed'].append(delta)
            results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
from collections import defaultdict
from time import time

import numpy.random as nr
from sklearn.cluster import KMeans, MiniBatchKMeans


def compute_bench(samples_range, features_range):
    it = 0
    results = defaultdict(lambda: [])
    chunk = 100
    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('==============================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('==============================')
            print()
            data = nr.randint(-50, 51, (n_samples, n_features))

            print('K-Means')
            tstart = time()
            kmeans = KMeans(init='k-means++', n_clusters=10).fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %0.5f" % kmeans.inertia_)
            print()
            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print('Fast K-Means')
            # let's prepare the data in small chunks
            mbkmeans = MiniBatchKMeans(init='k-means++',
                                       n_clusters=10,
                                       batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %f" % mbkmeans.inertia_)
            print()
            print()
            results['MiniBatchKMeans Speed'].append(delta)
            results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_)

    return results
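A plausible driver for this benchmark; the sample and feature ranges below are illustrative assumptions, not values from the original script.

samples_range = [500, 1000, 2000]   # illustrative values (assumption)
features_range = [50, 100, 200]     # illustrative values (assumption)
results = compute_bench(samples_range, features_range)
for name, values in results.items():
    print(name, values)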
import numpy as np
from sklearn.cluster import MiniBatchKMeans


def get_centroids(w2v_model, aspects_count):
    """
    Clustering all word vectors with K-means and returning L2-normalized
    cluster centroids; used for ABAE aspects matrix initialization
    """
    km = MiniBatchKMeans(n_clusters=aspects_count, verbose=0, n_init=100)
    m = []

    for k in w2v_model.wv.vocab:
        m.append(w2v_model.wv[k])

    m = np.matrix(m)

    km.fit(m)
    clusters = km.cluster_centers_

    # L2 normalization
    norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1, keepdims=True)

    return norm_aspect_matrix
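A small usage sketch for get_centroids, assuming the gensim 3.x Word2Vec constructor (size=/iter=) that matches the wv.vocab access above; the toy sentences and dimensions are made up.

from gensim.models import Word2Vec   # gensim < 4 API assumed, as in the snippet above

toy_sentences = [["good", "battery", "life"],
                 ["poor", "screen", "quality"],
                 ["fast", "delivery", "service"]] * 50
w2v = Word2Vec(sentences=toy_sentences, size=32, min_count=1, iter=5)

aspect_matrix = get_centroids(w2v, aspects_count=3)
print(aspect_matrix.shape)   # (3, 32); each row has unit L2 norm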
from gensim.models import Word2Vec
from scipy.sparse import lil_matrix
from sklearn.base import BaseEstimator
from sklearn.cluster import MiniBatchKMeans


class ClusteredEmbeddingsVectorizer(BaseEstimator):

    def __init__(self, embedding_dim=300, n_clusters=500, vocab_cutoff=5, iter=5):
        self._w2v_model = None
        self._kmeans_model = None
        self._id2cluster = None
        self.embedding_dim = embedding_dim
        self.n_clusters = n_clusters
        self.vocab_cutoff = vocab_cutoff
        self.iter = iter

    def fit(self, sent_docs, y=None):
        # train word embeddings (gensim < 4 API), then cluster the word vectors
        self._w2v_model = Word2Vec(sentences=sent_docs,
                                   size=self.embedding_dim,
                                   min_count=self.vocab_cutoff,
                                   iter=self.iter)
        self._kmeans_model = MiniBatchKMeans(
            n_clusters=self.n_clusters).fit(self._w2v_model.wv.syn0)
        self._id2cluster = self._kmeans_model.predict(self._w2v_model.wv.syn0)
        return self

    def transform(self, sent_docs):
        # bag-of-clusters representation: one count per cluster id
        v = lil_matrix((len(sent_docs), self._kmeans_model.n_clusters))
        for i, sent in enumerate(sent_docs):
            for token in sent:
                idx = self._word2cluster(token)
                if idx is not None:   # `if idx:` would silently drop cluster 0
                    v[i, idx] += 1.
        return v

    def _word2cluster(self, token):
        try:
            return self._id2cluster[self._w2v_model.wv.vocab[token].index]
        except KeyError:
            return None
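A toy end-to-end use of the vectorizer, again assuming the gensim 3.x API (size=/iter=/syn0) that the class itself relies on; the documents and dimensions are made up.

docs = [["the", "battery", "lasts", "long"],
        ["screen", "is", "too", "dim"],
        ["battery", "drains", "fast"]] * 100

vec = ClusteredEmbeddingsVectorizer(embedding_dim=16, n_clusters=5,
                                    vocab_cutoff=1, iter=5)
X = vec.fit(docs).transform(docs)
print(X.shape)   # (300, 5): one bag-of-clusters row per document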
'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
'LinearRegression': LinearRegression(),
'LinearSVC': LinearSVC(),
'LinearSVR': LinearSVR(),
'LocallyLinearEmbedding': LocallyLinearEmbedding(),
'LogisticRegression': LogisticRegression(),
'LogisticRegressionCV': LogisticRegressionCV(),
'MDS': MDS(),
'MLPClassifier': MLPClassifier(),
'MLPRegressor': MLPRegressor(),
'MaxAbsScaler': MaxAbsScaler(),
'MeanShift': MeanShift(),
'MinCovDet': MinCovDet(),
'MinMaxScaler': MinMaxScaler(),
'MiniBatchDictionaryLearning': MiniBatchDictionaryLearning(),
'MiniBatchKMeans': MiniBatchKMeans(),
'MiniBatchSparsePCA': MiniBatchSparsePCA(),
'MultiTaskElasticNet': MultiTaskElasticNet(),
'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
'MultiTaskLasso': MultiTaskLasso(),
'MultiTaskLassoCV': MultiTaskLassoCV(),
'MultinomialNB': MultinomialNB(),
'NMF': NMF(),
'NearestCentroid': NearestCentroid(),
'NearestNeighbors': NearestNeighbors(),
'Normalizer': Normalizer(),
'NuSVC': NuSVC(),
'NuSVR': NuSVR(),
'Nystroem': Nystroem(),
'OAS': OAS(),
'OneClassSVM': OneClassSVM(),
clust_size = int(round(((float(cluster_size) * (90 - 60)) / 100.0) + 60.0))
clustval = int(round(float(num_posts) / 100.0)) * clust_size

posts_vectors = []
for post in posts:
    try:
        posts_vectors.append(pickle.loads(post.vector).toarray()[0])
    except Exception:
        # (re)build the missing vector, then retry
        make_vector(post, a)
        posts_vectors.append(pickle.loads(post.vector).toarray()[0])

num_clusters = num_posts - clustval
km = MiniBatchKMeans(n_clusters=num_clusters)
km.fit(posts_vectors)

clusters = km.labels_.tolist()
cluster_centers = km.cluster_centers_

cluster_dict = {}
for cluster in clusters:
    if cluster not in cluster_dict:
        cluster_dict[cluster] = 0
    cluster_dict[cluster] += 1

count_dict = {}
    min_df=0.01, max_df=0.08, stop_words=stop_words)
tfidf_X = vectorizer.fit_transform(dataset.values())

n_components = 80

# SVD
print("Reducing dimensions..")
svd = TruncatedSVD(n_components=n_components, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
tfidf_X = lsa.fit_transform(tfidf_X)

# Clustering TF-IDF (MiniBatchKMeans, n=4 best for now)
model = MiniBatchKMeans(n_clusters=4, init_size=1024, batch_size=2048,
                        random_state=20)
model.fit(tfidf_X)
assignments = model.predict(
    lsa.transform(vectorizer.transform(dataset.values())))

clusters = MiniBatchKMeans(n_clusters=4, init_size=1024, batch_size=2048,
                           random_state=20).fit_predict(tfidf_X)


def dump_to_file(filename, assignments, dataset):
    with open(filename, mode="w", newline="") as csvfile:
        # Headers
        fieldnames = ['Id', 'Predicted']