def decompose_and_cluster(tasks, word2vec, output_file, method='KMeans', option=10):
    """Reduce task vectors with PCA, cluster them, and dump the models.

    The tasks are turned into vectors by ``tasks_to_vectors``, projected
    onto 5 principal components, and clustered with KMeans or DBSCAN.
    The PCA and clustering objects are then wrapped in a ``Pipeline`` and
    written to ``output_file`` with joblib (``compress=3``); an existing
    file at that path is removed first.

    Args:
        tasks: Forwarded unchanged to ``tasks_to_vectors``.
        word2vec: Forwarded unchanged to ``tasks_to_vectors``.
        output_file: Path the pipeline is dumped to.
        method: ``'KMeans'`` selects KMeans; any other value (documented
            as ``'DBSCAN'``) selects DBSCAN.
        option: ``n_clusters`` when KMeans is used, ``eps`` when DBSCAN is
            used.

    Returns:
        A ``(pipe, labels)`` tuple: the ``Pipeline`` holding the fitted
        PCA and clustering objects, and the cluster label of every
        reduced task vector.

    NOTE(review): another function with this same name is defined later
    in this file; if both live at module level, the later definition
    replaces this one -- confirm which is intended.
    """
    print('Get task vector')
    whole_vector = tasks_to_vectors(tasks, word2vec)

    print('Down dimension...')
    pca = PCA(5)
    d_vector = pca.fit_transform(whole_vector)

    if method == 'KMeans':
        print('Training K-means...')
        # NOTE(review): n_jobs was deprecated for KMeans in scikit-learn
        # 0.23 and later removed; kept because this file targets an older
        # release -- confirm the pinned version.
        cluster = KMeans(n_clusters=option, n_jobs=3)
        cluster.fit(d_vector)
        labels = cluster.predict(d_vector)
    else:
        print('Training DBSCAN ... ')
        cluster = DBSCAN(eps=option)
        cluster.fit(d_vector)
        # DBSCAN does not implement predict(); the labels of the training
        # samples are exposed through the fitted ``labels_`` attribute.
        labels = cluster.labels_

    # With DBSCAN as the last step the pipeline can be stored and its
    # steps inspected, but pipe.predict() is unavailable for the same
    # reason as above.
    pipe = Pipeline(steps=[
        ('w2v_200_to_5_PCA', pca),
        ('clustering', cluster),
    ])

    if os.path.exists(output_file):
        os.remove(output_file)
    joblib.dump(pipe, output_file, compress=3)
    print('Complete dumping')

    return pipe, labels
def decompose_and_cluster(tasks, word2vec, output_file, n_clusters):
    """Vectorise tasks, reduce them with PCA, cluster with KMeans, and dump.

    Args:
        tasks: Forwarded unchanged to ``tasks_to_vectors``.
        word2vec: Forwarded unchanged to ``tasks_to_vectors``.
        output_file: Path the pipeline is written to with joblib
            (``compress=3``). An existing file there is removed first.
        n_clusters: Number of clusters passed to ``KMeans``.

    Returns:
        A ``(pipe, labels)`` tuple: a ``Pipeline`` wrapping the
        already-fitted PCA and KMeans objects, and the KMeans label
        predicted for every reduced task vector.

    NOTE(review): a function with this same name is also defined earlier
    in this file; if both live at module level, this later definition is
    the one callers get -- confirm which is intended.
    """
    print("Get task vector")
    task_matrix = tasks_to_vectors(tasks, word2vec)

    print("Down dimension...")
    reducer = PCA(5)
    reduced = reducer.fit_transform(task_matrix)

    print("Training K-means...")
    km_model = KMeans(n_clusters=n_clusters, n_jobs=3)
    # fit() returns the estimator itself, so the calls can be chained.
    labels = km_model.fit(reduced).predict(reduced)

    # Step names are fixed labels; "Kmeans_1000" does not follow n_clusters.
    stages = [
        ("w2v_200_to_5_PCA", reducer),
        ("Kmeans_1000", km_model),
    ]
    pipe = Pipeline(steps=stages)

    # Remove any previous dump before writing the new one.
    if os.path.exists(output_file):
        os.remove(output_file)
    joblib.dump(pipe, output_file, compress=3)
    print("Complete dumping")

    return pipe, labels