from sklearn.cluster import MiniBatchKMeans as SKLModel


class MiniBatchKMeansImpl:
    # Thin wrapper around sklearn's MiniBatchKMeans: stores the hyperparameters
    # in a dict and delegates fit/transform/predict to the wrapped estimator.
    def __init__(self, n_clusters=8, init='k-means++', max_iter=100,
                 batch_size=100, verbose=0, compute_labels=True,
                 random_state=None, tol=0.0, max_no_improvement=10,
                 init_size=None, n_init=3, reassignment_ratio=0.01):
        self._hyperparams = {
            'n_clusters': n_clusters,
            'init': init,
            'max_iter': max_iter,
            'batch_size': batch_size,
            'verbose': verbose,
            'compute_labels': compute_labels,
            'random_state': random_state,
            'tol': tol,
            'max_no_improvement': max_no_improvement,
            'init_size': init_size,
            'n_init': n_init,
            'reassignment_ratio': reassignment_ratio}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)

    def predict(self, X):
        return self._wrapped_model.predict(X)
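
# Minimal usage sketch (my addition, not from the original source): since the
# wrapper delegates everything, it behaves like the underlying sklearn
# estimator on any 2-D array. `X_demo` is hypothetical toy data.
import numpy as np

X_demo = np.random.RandomState(0).rand(200, 5)
impl = MiniBatchKMeansImpl(n_clusters=3, random_state=0)
labels = impl.fit(X_demo).predict(X_demo)  # one cluster id per sample
dists = impl.transform(X_demo)             # distances to the 3 centroids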
from gensim.models import Word2Vec
from scipy.sparse import lil_matrix
from sklearn.base import BaseEstimator
from sklearn.cluster import MiniBatchKMeans


class ClusteredEmbeddingsVectorizer(BaseEstimator):
    # Bag-of-clusters vectorizer: trains word embeddings, clusters the
    # embedding matrix, and represents each document by cluster counts.
    def __init__(self, embedding_dim=300, n_clusters=500, vocab_cutoff=5, iter=5):
        self._w2v_model = None
        self._kmeans_model = None
        self._id2cluster = None
        self.embedding_dim = embedding_dim
        self.n_clusters = n_clusters
        self.vocab_cutoff = vocab_cutoff
        self.iter = iter

    def fit(self, sent_docs, y=None):
        # gensim >= 4.0 API: size -> vector_size, iter -> epochs, and the
        # vectors live on model.wv.vectors rather than the removed model.syn0.
        self._w2v_model = Word2Vec(sentences=sent_docs,
                                   vector_size=self.embedding_dim,
                                   min_count=self.vocab_cutoff,
                                   epochs=self.iter)
        vectors = self._w2v_model.wv.vectors
        # Cluster the vocabulary embeddings; each word id maps to a cluster id.
        self._kmeans_model = MiniBatchKMeans(n_clusters=self.n_clusters).fit(vectors)
        self._id2cluster = self._kmeans_model.predict(vectors)
        return self

    def transform(self, sent_docs):
        v = lil_matrix((len(sent_docs), self._kmeans_model.n_clusters))
        for i, sent in enumerate(sent_docs):
            for token in sent:
                idx = self._word2cluster(token)
                if idx is not None:  # a bare `if idx:` would wrongly skip cluster 0
                    v[i, idx] += 1.
        return v

    def _word2cluster(self, token):
        # Out-of-vocabulary tokens (below vocab_cutoff) are ignored.
        try:
            return self._id2cluster[self._w2v_model.wv.key_to_index[token]]
        except KeyError:
            return None
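
# Usage sketch (illustrative, not from the source): Word2Vec expects
# pre-tokenized sentences, so the input is a list of token lists. The tiny
# corpus below forces the defaults down to match its 6-word vocabulary.
toy_docs = [["deep", "learning", "models"],
            ["clustering", "word", "vectors"],
            ["deep", "clustering", "vectors"]] * 20
vec = ClusteredEmbeddingsVectorizer(embedding_dim=20, n_clusters=5,
                                    vocab_cutoff=1, iter=5)
bag_of_clusters = vec.fit(toy_docs).transform(toy_docs)  # sparse (n_docs, n_clusters)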
import csv

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# SVD: reduce the TF-IDF matrix to 80 latent dimensions (LSA), then
# L2-normalize so k-means operates on comparable vectors.
# (tfidf_X, vectorizer, and dataset are assumed to be defined upstream.)
n_components = 80
print("Reducing dimensions..")
svd = TruncatedSVD(n_components=n_components, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
tfidf_X = lsa.fit_transform(tfidf_X)

# Clustering TF-IDF (MiniBatchKMeans, n=4 best for now)
model = MiniBatchKMeans(n_clusters=4, init_size=1024, batch_size=2048, random_state=20)
model.fit(tfidf_X)
assignments = model.predict(lsa.transform(vectorizer.transform(dataset.values())))
# Equivalent one-shot fit+predict on the already-reduced matrix
clusters = MiniBatchKMeans(n_clusters=4, init_size=1024, batch_size=2048,
                           random_state=20).fit_predict(tfidf_X)


def dump_to_file(filename, assignments, dataset):
    with open(filename, mode="w", newline="") as csvfile:
        # Headers
        fieldnames = ['Id', 'Predicted']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for ids, cluster in zip(dataset.keys(), assignments):
            writer.writerow({'Id': ids, 'Predicted': cluster})
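
# Optional follow-up sketch (my addition, not in the original): a silhouette
# score on the reduced matrix gives a rough quality signal for the n=4 choice,
# and dump_to_file persists the id/cluster pairs. "predictions.csv" is a
# hypothetical output path.
from sklearn.metrics import silhouette_score

print("silhouette:", silhouette_score(tfidf_X, clusters))
dump_to_file("predictions.csv", assignments, dataset)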