Example #1
from sklearn.cluster import MiniBatchKMeans as SKLModel


class MiniBatchKMeansImpl:
    """Thin wrapper that stores the hyperparameters and delegates fitting,
    transforming, and predicting to scikit-learn's MiniBatchKMeans."""

    def __init__(self, n_clusters=8, init='k-means++', max_iter=100,
                 batch_size=100, verbose=0, compute_labels=True,
                 random_state=None, tol=0.0, max_no_improvement=10,
                 init_size=None, n_init=3, reassignment_ratio=0.01):
        self._hyperparams = {
            'n_clusters': n_clusters,
            'init': init,
            'max_iter': max_iter,
            'batch_size': batch_size,
            'verbose': verbose,
            'compute_labels': compute_labels,
            'random_state': random_state,
            'tol': tol,
            'max_no_improvement': max_no_improvement,
            'init_size': init_size,
            'n_init': n_init,
            'reassignment_ratio': reassignment_ratio}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        # y is accepted only for scikit-learn API compatibility; KMeans ignores it.
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)

    def predict(self, X):
        return self._wrapped_model.predict(X)
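
A minimal usage sketch for the wrapper above; the toy data and the chosen n_clusters are illustrative assumptions, not part of the original snippet:

import numpy as np

X_demo = np.random.RandomState(0).rand(200, 5)  # 200 samples, 5 features
km = MiniBatchKMeansImpl(n_clusters=3, random_state=0).fit(X_demo)
demo_labels = km.predict(X_demo)  # one cluster id per sample
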
from gensim.models import Word2Vec
from scipy.sparse import lil_matrix
from sklearn.base import BaseEstimator
from sklearn.cluster import MiniBatchKMeans


class ClusteredEmbeddingsVectorizer(BaseEstimator):
    """Bag-of-clusters vectorizer: trains word2vec embeddings, clusters them
    with MiniBatchKMeans, and represents each document as a histogram of
    word-cluster counts."""

    def __init__(self, embedding_dim=300, n_clusters=500, vocab_cutoff=5, iter=5):
        self._w2v_model = None
        self._kmeans_model = None
        self._id2cluster = None

        self.embedding_dim = embedding_dim
        self.n_clusters = n_clusters
        self.vocab_cutoff = vocab_cutoff
        self.iter = iter

    def fit(self, sent_docs, y=None):
        # gensim >= 4.0 renamed size/iter to vector_size/epochs and moved the
        # vocabulary and the embedding matrix under the .wv attribute.
        self._w2v_model = Word2Vec(sentences=sent_docs, vector_size=self.embedding_dim,
                                   min_count=self.vocab_cutoff, epochs=self.iter)
        self._kmeans_model = MiniBatchKMeans(n_clusters=self.n_clusters).fit(
            self._w2v_model.wv.vectors)
        self._id2cluster = self._kmeans_model.predict(self._w2v_model.wv.vectors)

        return self

    def transform(self, sent_docs):
        v = lil_matrix((len(sent_docs), self._kmeans_model.n_clusters))

        for i, sent in enumerate(sent_docs):
            for token in sent:
                idx = self._word2cluster(token)

                if idx is not None:  # cluster id 0 is falsy, so test against None
                    v[i, idx] += 1.

        return v

    def _word2cluster(self, token):
        try:
            # key_to_index maps a token to its row in the embedding matrix.
            return self._id2cluster[self._w2v_model.wv.key_to_index[token]]
        except KeyError:
            return None
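
A minimal usage sketch for the vectorizer above; the toy corpus and the small hyperparameters are illustrative assumptions:

toy_docs = [["cats", "purr"], ["dogs", "bark"], ["cats", "and", "dogs"]] * 10
vec = ClusteredEmbeddingsVectorizer(embedding_dim=20, n_clusters=3,
                                    vocab_cutoff=1, iter=5)
doc_term = vec.fit(toy_docs).transform(toy_docs)  # sparse (n_docs, n_clusters) counts
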
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# tfidf_X, vectorizer and dataset are assumed to be defined upstream
# (a fitted TF-IDF vectorizer and the document matrix it produced).
n_components = 80

# SVD followed by L2 normalization, i.e. latent semantic analysis
print("Reducing dimensions..")
svd = TruncatedSVD(n_components=n_components, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
tfidf_X = lsa.fit_transform(tfidf_X)

# Clustering the TF-IDF features (MiniBatchKMeans with n_clusters=4 works best so far)
model = MiniBatchKMeans(n_clusters=4,
                        init_size=1024,
                        batch_size=2048,
                        random_state=20)
model.fit(tfidf_X)
assignments = model.predict(
    lsa.transform(vectorizer.transform(dataset.values())))
# fit_predict trains a second, identically configured model from scratch;
# with the same random_state it reproduces the training-set assignments.
clusters = MiniBatchKMeans(n_clusters=4,
                           init_size=1024,
                           batch_size=2048,
                           random_state=20).fit_predict(tfidf_X)
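
One way to back the "n=4 best for now" note with numbers is a silhouette sweep over candidate cluster counts; the range below is an illustrative assumption:

from sklearn.metrics import silhouette_score

for k in range(2, 8):
    labels_k = MiniBatchKMeans(n_clusters=k, init_size=1024, batch_size=2048,
                               random_state=20).fit_predict(tfidf_X)
    print(k, silhouette_score(tfidf_X, labels_k))  # higher is better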


import csv


def dump_to_file(filename, assignments, dataset):
    """Write one (document id, predicted cluster) row per document to a CSV file."""
    with open(filename, mode="w", newline="") as csvfile:

        # Headers
        fieldnames = ['Id', 'Predicted']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for ids, cluster in zip(dataset.keys(), assignments):
            writer.writerow({'Id': ids, 'Predicted': cluster})
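
A hypothetical invocation of the helper above; the output filename is an illustrative assumption:

dump_to_file("predictions.csv", assignments, dataset)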