def cluster(self, vectors):
    """Cluster ``vectors``, label ``self.dataset`` and persist the results.

    The algorithm is chosen by ``self.clustering_params['method']``:

    * ``"KMeans_NLTK"`` -- NLTK ``KMeansClusterer`` using cosine distance.
    * ``"KMeans"`` -- scikit-learn ``KMeans`` with
      ``self.clustering_params['n_clusters']`` clusters.

    The assigned labels are stored in ``self.dataset['cluster']``, and
    ``self.clustered_filename`` is set.  Three files sharing one timestamped
    stem are written under ``<self.disaster_dir>/kmeans/``: ``.csv`` (the
    labelled dataset), ``.pkl`` (the pickled model) and ``.vec`` (the pickled
    vectors the model was built from).

    Args:
        vectors: Feature vectors to cluster.  Presumably one vector per row
            of ``self.dataset``, in the same order -- verify against caller.

    Returns:
        A tuple ``(self.dataset, model_path)`` where ``model_path`` is the
        path of the written ``.pkl`` file.

    Raises:
        ValueError: If ``self.clustering_params['method']`` is not one of
            the supported methods.
    """
    # Local import: only needed to make sure the output directory exists.
    import os

    params = self.clustering_params
    method = params['method']
    if method not in ("KMeans_NLTK", "KMeans"):
        # Fail before touching the dataset or the disk; previously an
        # unknown method fell through and crashed later with an unrelated
        # KeyError/NameError.
        raise ValueError(f"Unsupported clustering method: {method!r}")
    n_clusters = params['n_clusters']

    if method == "KMeans_NLTK":
        # NOTE(review): the cluster count is fixed at 20 here and
        # params['n_clusters'] is only used for naming the output files --
        # confirm this is intended.
        kmeans = KMeansClusterer(
            num_means=20,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25,
            avoid_empty_clusters=True,
        )
        self.dataset['cluster'] = kmeans.cluster(vectors, assign_clusters=True)
    else:
        # SECURITY: eval() executes arbitrary code. This is only acceptable
        # while clustering_params comes from a trusted source; do not feed
        # it user-controlled input.
        cluster_count = eval(n_clusters) if isinstance(n_clusters, str) else n_clusters
        kmeans = KMeans(n_clusters=cluster_count)
        kmeans.fit(vectors)
        # Label the same vectors the model was fit on (not self.vectors), so
        # the labels always correspond to the data passed in.
        self.dataset['cluster'] = kmeans.predict(vectors)

    print(self.dataset[['cluster', 'id', 'text']])

    # Only the "_<n_clusters>" part has its spaces replaced (method-call
    # precedence); kept as-is so the attribute value does not change.
    self.clustered_filename = f'{self.disaster_name}_{method}' + \
        f'_{n_clusters}'.replace(" ", "_")

    current_time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    output_dir = f"{self.disaster_dir}/kmeans"
    os.makedirs(output_dir, exist_ok=True)
    # One stem shared by the .csv, .pkl and .vec outputs of this run.
    filename = f"{output_dir}/{method}_{n_clusters}_{current_time}"

    self.dataset.to_csv(filename + '.csv', index=False)
    with open(filename + '.pkl', 'wb') as model_file:
        pickle.dump(kmeans, model_file)
    with open(filename + '.vec', 'wb') as vectors_file:
        # Store the vectors the pickled model was actually built from.
        pickle.dump(vectors, vectors_file)

    return self.dataset, filename + '.pkl'
quoting=csv.QUOTE_ALL) #nltk GAAClusterer model = GAAClusterer(num_clusters=cluster_number) model.cluster(vectors, assign_clusters=True) clusters = [model.classify_vectorspace(vector.tolist()) for vector in vectors] data['cluster'] = pd.DataFrame(clusters) data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_gaac.csv', index=True, quoting=csv.QUOTE_ALL) #sklearn means model = KMeans(n_clusters=cluster_number, max_iter=epochs, n_jobs=8) model.fit(vectors) dump(model, '../data/advanced_sklearn_kmeans.joblib') data['cluster'] = pd.DataFrame(model.labels_) data[['text', 'cluster']].to_csv('../data/text_clustered_sklearn_kmeans.csv', index=True, quoting=csv.QUOTE_ALL) #sklearn agglomerative model = AgglomerativeClustering(n_clusters=cluster_number) clusters = model.fit_predict(vectors) data['cluster'] = pd.DataFrame(clusters) data[['text', 'cluster']].to_csv('../data/text_clustered_sklearn_agglomerative.csv', index=True, quoting=csv.QUOTE_ALL)