import codecs
import subprocess
import time
import warnings

import h5py
import numpy as np
import skfuzzy as fuzz
from sklearn.cluster import KMeans
from sklearn.metrics import (adjusted_rand_score, calinski_harabasz_score,
                             normalized_mutual_info_score, silhouette_score)
from sklearn.preprocessing import StandardScaler

# Project-local helpers used below are defined elsewhere in this codebase:
# davies_bouldin_score (a three-argument variant taking explicit centroids,
# unlike sklearn's two-argument version), calculate_centroids_doc_mean,
# k_means_clustering, dbscan_clustering, birch_clustering,
# meanshift_clustering, split_data_in_clusters, wb_index, dunn_index, and the
# globals parallelism, min_clusters, metric_decimals.


def run_fkmeans(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf,
                labels_true, dataset_name, kk, ll):
    params = {
        'newsgroup': {
            'k': [20],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'ig': {
            'k': [13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'igtoy': {
            'k': [3],
            'l': [2, 3, 4, 5, 6],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'nips': {
            'k': [9],
            'l': [5, 7, 9, 11, 13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        }
    }
    # Map matrix names to the actual matrices instead of using eval().
    data_map = {'X_train': X_train, 'X_train_norm': X_train_norm,
                'X_train_tfidf': X_train_tfidf,
                'X_train_norm_tfidf': X_train_norm_tfidf}
    output_file = codecs.open(dataset_name + '_fuzzy_cmeans_news_results.csv', 'w', 'utf-8')
    output_file.write('X,K,NMI,RAND,DAVIES\n')
    output_file.flush()
    for k in params[dataset_name]['k']:
        for data_str in params[dataset_name]['X']:
            data = data_map[data_str].toarray().astype(np.float64)
            for _ in range(10):  # 10 random restarts
                tick1 = time.time()
                # skfuzzy expects features on rows and samples on columns,
                # hence the transpose.
                centroids, U, _, _, errors, _, _ = fuzz.cluster.cmeans(
                    data.T, k, 2, error=1e-11, maxiter=10000)
                tick2 = time.time()
                print('Took {} secs to train the {} model...'.format(tick2 - tick1, 'fkmeans'))
                # Hard labels: the cluster with the highest membership per sample.
                labels_pred = np.argmax(U, axis=0)
                error = errors[-1]  # final objective value
                nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                rand_score = adjusted_rand_score(labels_true, labels_pred)
                davies_score = davies_bouldin_score(data, labels_pred, centroids)
                tick3 = time.time()
                print('Took {} secs to calculate {} metrics...'.format(tick3 - tick2, 'fkmeans'))
                output_file.write('{},{},{},{},{}\n'.format(
                    data_str, k, nmi_score, rand_score, davies_score))
                output_file.flush()
                print('Execution: X: {}, k: {}'.format(data_str, k))
                print('NMI score: {}'.format(nmi_score))
                print('Rand score: {}'.format(rand_score))
                print('Davies score: {}'.format(davies_score))
                print('-----------------------------------------------\n')
    output_file.close()
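# The run_* drivers call a three-argument davies_bouldin_score(data, labels,
# centroids) defined elsewhere in this project; note that sklearn's
# davies_bouldin_score takes only (X, labels). Below is a minimal sketch of
# what such a helper could look like, assuming the textbook Davies-Bouldin
# definition with Euclidean distances. The name _davies_bouldin_sketch and
# this implementation are illustrative only, not the project's actual code.
def _davies_bouldin_sketch(data, labels, centroids):
    k = centroids.shape[0]
    # Mean intra-cluster distance to the centroid, per cluster.
    s = np.array([
        np.mean(np.linalg.norm(data[labels == i] - centroids[i], axis=1))
        if np.any(labels == i) else 0.0
        for i in range(k)
    ])
    db = 0.0
    for i in range(k):
        ratios = []
        for j in range(k):
            if j == i:
                continue
            m_ij = np.linalg.norm(centroids[i] - centroids[j])
            if m_ij > 0:  # skip coincident centroids
                ratios.append((s[i] + s[j]) / m_ij)
        if ratios:
            db += max(ratios)
    return db / k  # lower is better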
def run_kmeans(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf,
               labels_true, dataset_name, kk, ll):
    params = {
        'newsgroup': {
            'k': [10, 15, 20, 25, 30],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'ig': {
            'k': [13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'igtoy': {
            'k': [3],
            'l': [2, 3, 4, 5, 6],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'nips': {
            'k': [9],
            'l': [5, 7, 9, 11, 13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        }
    }
    # Map matrix names to the actual matrices instead of using eval().
    data_map = {'X_train': X_train, 'X_train_norm': X_train_norm,
                'X_train_tfidf': X_train_tfidf,
                'X_train_norm_tfidf': X_train_norm_tfidf}
    output_file = codecs.open(dataset_name + '_kmeans_news_results.csv', 'w', 'utf-8')
    output_file.write('X,K,NMI,RAND,DAVIES\n')
    for k in params[dataset_name]['k']:
        for data_str in params[dataset_name]['X']:
            data = data_map[data_str].toarray().astype(np.float64)
            for _ in range(10):  # 10 random restarts
                tick1 = time.time()
                # datat = data.T
                # n, _ = data.shape
                # temp = np.diag(np.squeeze(np.asarray((data.dot(datat).dot(np.ones(n).reshape(n, 1))))))
                # d = datat.dot(np.sqrt(temp))
                estimator = KMeans(n_clusters=k, max_iter=10000)
                estimator.fit(data)
                tick2 = time.time()
                print('Took {} secs to train the {} model...'.format(tick2 - tick1, 'kmeans'))
                labels_pred = estimator.labels_
                centroids = estimator.cluster_centers_
                error = estimator.inertia_  # within-cluster sum of squares
                nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                rand_score = adjusted_rand_score(labels_true, labels_pred)
                davies_score = davies_bouldin_score(data, labels_pred, centroids)
                tick3 = time.time()
                print('Took {} secs to calculate {} metrics...'.format(tick3 - tick2, 'kmeans'))
                output_file.write('{},{},{},{},{}\n'.format(
                    data_str, k, nmi_score, rand_score, davies_score))
                print('Execution: X: {}, k: {}'.format(data_str, k))
                print('NMI score: {}'.format(nmi_score))
                print('Rand score: {}'.format(rand_score))
                print('Davies score: {}'.format(davies_score))
                print('-----------------------------------------------\n')
    output_file.close()
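# The four matrices handed to the run_* drivers are sparse document-term
# matrices (each is densified with .toarray() inside the drivers). One
# plausible way to build them with sklearn is sketched below, assuming raw
# counts, L2-normalized counts, tf-idf, and L2-normalized tf-idf; the
# function name build_feature_matrices is hypothetical and the project's
# actual preprocessing may differ.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize


def build_feature_matrices(documents):
    # Raw term counts (sparse, documents x terms).
    X_train = CountVectorizer().fit_transform(documents)
    # L2-normalized counts: each document vector scaled to unit length.
    X_train_norm = normalize(X_train, norm='l2')
    # Unnormalized tf-idf reweighting (norm=None keeps it distinct from the
    # normalized variant below; sklearn's default already applies 'l2').
    X_train_tfidf = TfidfTransformer(norm=None).fit_transform(X_train)
    # L2-normalized tf-idf.
    X_train_norm_tfidf = normalize(X_train_tfidf, norm='l2')
    return X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf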
def run_bin_ovnmtf(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf,
                   labels_true, dataset_name, kk, ll):
    params = {
        'newsgroup': {
            'k': [20],
            'l': [15, 20, 25, 30],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'igtoy': {
            'k': [3],
            'l': [2, 3, 4, 5, 6],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'ig': {
            'k': [7, 10, 13, 16, 19],
            'l': [7, 10, 13, 16, 19],
            'X': ['X_train_norm_tfidf']
            # 'X': ['X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'nips': {
            'k': [9],
            'l': [6, 9, 12, 15, 18],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        }
    }
    if kk:
        filename = (dataset_name + '_kk=' + str(kk) + '_ll=' + str(ll) +
                    '_X=' + params[dataset_name]['X'][0] +
                    '_bin_ovnmtf_news_results.csv')
        params[dataset_name]['k'] = [int(kk)]
        params[dataset_name]['l'] = [int(ll)]
    else:
        filename = dataset_name + '_bin_ovnmtf_news_results.csv'
    out_f = codecs.open(filename, 'w', 'utf-8')
    out_f.write('X,K,L,NMI,RAND,DAVIES\n')
    # Map matrix names to the actual matrices instead of using eval().
    data_map = {'X_train': X_train, 'X_train_norm': X_train_norm,
                'X_train_tfidf': X_train_tfidf,
                'X_train_norm_tfidf': X_train_norm_tfidf}
    for k in params[dataset_name]['k']:
        for l in params[dataset_name]['l']:
            for data_str in params[dataset_name]['X']:
                data = data_map[data_str].toarray().astype(np.float64)
                # Hand the term-document matrix to the external solver via HDF5.
                h5f = h5py.File('data.h5', 'w')
                h5f.create_dataset('X', data=data.T)
                h5f.close()
                for _ in range(10):  # 10 random restarts
                    tick1 = time.time()
                    # U, S, V, labels_pred, _, error = fnmtf(data, k, l)
                    proc = subprocess.Popen(
                        ['./algos', 'bin_ovnmtf', str(k), str(l), '10000'],
                        stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
                    (out, err) = proc.communicate()
                    print('out: {}'.format(out))
                    # The external binary writes its factors and final error to disk.
                    U = np.genfromtxt('U.csv', delimiter=',')
                    S = np.genfromtxt('S.csv', delimiter=',')
                    # V = np.genfromtxt('V.csv', delimiter=',')
                    with open('error.csv') as f:
                        error = float(f.read())
                    # Row-cluster assignment: the largest entry per row of U.
                    labels_pred = np.argmax(U, axis=1)
                    tick2 = time.time()
                    print('Took {} secs to train the {} model...'.format(tick2 - tick1, 'bin_ovnmtf'))
                    nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                    rand_score = adjusted_rand_score(labels_true, labels_pred)
                    davies_score = davies_bouldin_score(
                        data, labels_pred,
                        calculate_centroids_doc_mean(data, labels_pred, k))
                    out_f.write('{},{},{},{},{},{}\n'.format(
                        data_str, k, l, nmi_score, rand_score, davies_score))
                    print('Execution: X: {}, k: {}, l: {}'.format(data_str, k, l))
                    print('Algo error: {}'.format(error))
                    print('NMI score: {}'.format(nmi_score))
                    print('Rand score: {}'.format(rand_score))
                    print('Davies score: {}'.format(davies_score))
                    print('-----------------------------------------------\n')
    out_f.close()
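# calculate_centroids_doc_mean is another project-local helper used above.
# Judging by its name and call site, it presumably returns one centroid per
# cluster as the mean of the documents assigned to that cluster. The sketch
# below is illustrative only (hypothetical name, not the project's code):
def _calculate_centroids_doc_mean_sketch(data, labels, k):
    centroids = np.zeros((k, data.shape[1]))
    for i in range(k):
        members = data[labels == i]
        if members.shape[0] > 0:
            centroids[i] = members.mean(axis=0)  # mean document of cluster i
    return centroids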
def clustering_and_metrics(dataset, clustering_alg):
    samples_to_delete = np.array([])
    cleanlabels = np.array([])
    clusters = {}
    # Supported clustering algorithms.
    l_clustering_alg = [
        'kmeans_++',
        'kmeans_random',
        'kmeans_pca',
        'dbscan',
        'birch',
        'meanshift',
    ]
    # Scale data
    scaleddata = StandardScaler().fit_transform(dataset)
    # Clustering phase
    if clustering_alg == 'kmeans_++':
        estimator, c_elap_time = k_means_clustering(
            data=scaleddata, plot=0, p_init='k-means++', p_n_init=10,
            p_n_jobs=parallelism)
    elif clustering_alg == 'kmeans_random':
        estimator, c_elap_time = k_means_clustering(
            data=scaleddata, plot=0, p_init='random', p_n_init=10,
            p_n_jobs=parallelism)
    elif clustering_alg == 'kmeans_pca':
        estimator, c_elap_time = k_means_clustering(
            data=scaleddata, plot=0, p_init='PCA-based', p_n_init=10,
            p_n_jobs=parallelism)
    elif clustering_alg == 'dbscan':
        estimator, c_elap_time = dbscan_clustering(
            data=scaleddata, plot=0, p_n_jobs=parallelism)
    elif clustering_alg == 'birch':
        estimator, c_elap_time = birch_clustering(
            data=scaleddata, plot=0, p_n_jobs=parallelism)
    elif clustering_alg == 'meanshift':
        estimator, c_elap_time = meanshift_clustering(
            data=scaleddata, plot=0, p_n_jobs=parallelism)
    else:
        print('Clustering algorithm not found')
        return {}, samples_to_delete, cleanlabels, {}
    # Split data in clusters
    clusters, sin_ele_clus, cleanscaleddata, cleanlabels, samples_to_delete, \
        cluster_cnt, ignored_samples = split_data_in_clusters(estimator, scaleddata)
    for singleclus in clusters:
        print('Cluster {}:'.format(singleclus), len(clusters[singleclus]))
    # Compute clustering metrics
    clus_metrics = {}
    clus_metrics['name'] = clustering_alg
    clus_metrics['sin_ele_clus'] = sin_ele_clus
    clus_metrics['cluster_cnt'] = cluster_cnt
    clus_metrics['ignored_samples'] = ignored_samples
    # Check that more than one cluster was found
    if cluster_cnt <= 1:
        print('Fewer than', min_clusters, 'clusters found. Skipping metrics calculation')
        clus_metrics['dunn_index'] = None
        clus_metrics['calinski_harabaz_score'] = None
        clus_metrics['silhouette_score'] = None
        clus_metrics['time'] = 0
        clus_metrics['wb_index'] = None
        clus_metrics['davies_bouldin_score'] = None
    else:
        clus_metrics['time'] = round(c_elap_time, metric_decimals)
        clus_metrics['wb_index'] = float(
            round(wb_index(clusters, cleanscaleddata), metric_decimals))
        clus_metrics['dunn_index'] = float(
            round(dunn_index(clusters), metric_decimals))
        # sklearn >= 0.20 spells this metric calinski_harabasz_score.
        clus_metrics['calinski_harabaz_score'] = float(
            round(calinski_harabasz_score(cleanscaleddata, cleanlabels),
                  metric_decimals))
        # Forcing data type due to insert error.
        clus_metrics['silhouette_score'] = float(
            round(silhouette_score(cleanscaleddata, cleanlabels,
                                   metric='euclidean', sample_size=None),
                  metric_decimals))
        # Suppress the expected runtime "divide by zero" warning.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            clus_metrics['davies_bouldin_score'] = float(
                round(davies_bouldin_score(cleanscaleddata, cleanlabels),
                      metric_decimals))
    return clus_metrics, samples_to_delete, cleanlabels, clusters
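# A hypothetical driver showing how clustering_and_metrics might be invoked.
# The make_blobs dataset and the algorithm list here are illustrative only;
# the project-local helpers (k_means_clustering, split_data_in_clusters, ...)
# and globals (parallelism, min_clusters, metric_decimals) must be defined
# for this to run.
if __name__ == '__main__':
    from sklearn.datasets import make_blobs

    # Small synthetic dataset: 300 samples, 5 features, 4 true clusters.
    X, _ = make_blobs(n_samples=300, centers=4, n_features=5, random_state=0)
    for alg in ['kmeans_++', 'kmeans_random', 'dbscan', 'birch', 'meanshift']:
        metrics, to_delete, labels, clusters = clustering_and_metrics(X, alg)
        print(alg, '->', metrics)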