def plot_consistency(labels, pos, mlset, nlset, savepath, consistency_type='both'):
    """
    plot the consistency distribution of a given library

    Parameters
    ----------
    :param labels: label matrix of the library (one clustering per row)
    :param pos: 2-d positions of the clusterings in the scatter plot
    :param mlset: must-link constraint set
    :param nlset: cannot-link constraint set
    :param savepath: path to save the plot to
    :param consistency_type: type of consistency ('must', 'cannot' or 'both')
    """
    texts = []
    colors = []
    plot_labels = [None] * (len(labels) - _ADDITIONAL_RANGE)
    markers = ['o'] * (len(labels) - _ADDITIONAL_RANGE)
    for label in labels[0:-_ADDITIONAL_RANGE]:
        cons = Metrics.consistency(label, mlset, nlset, cons_type=consistency_type)
        texts.append(cons)
    cNorm = colors2.Normalize(vmin=min(texts), vmax=max(texts))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
    plot_labels.extend(_ADDITIONAL_NAMES)
    for text in texts:
        colors.append(scalarMap.to_rgba(text))
    texts = list(map(_round_digits, texts))
    texts.append('')
    texts.extend(_ADDITIONAL_NAMES[1:])
    colors.extend(_ADDITIONAL_COLORS)
    markers.extend(_ADDITIONAL_MARKERS)
    title = consistency_type + ' Consistency, Max val = ' + str(max(texts[0:-_ADDITIONAL_RANGE])) + \
            ', Min val = ' + str(min(texts[0:-_ADDITIONAL_RANGE]))
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title)
    return
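# Example usage (a sketch only; the library file, constraint file and the 2-d
# embedding 'pos' below are hypothetical placeholders, not names from this repo):
#
#   labels = np.loadtxt('Results/some_library.res', delimiter=',').astype(int)
#   mlset, nlset = read_constraints('Constraints/some_constraints.txt')
#   pos = ...  # precomputed 2-d coordinates, one row per entry of the library
#   plot_consistency(labels, pos, mlset, nlset, 'Figures/consistency.png',
#                    consistency_type='must')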
def plot_normalized_consistency(labels, mlset, nlset, savepath, additional_values):
    """
    plot the correlation between must and cannot consistency of a given library

    Parameters
    ----------
    :param labels: label matrix of the library (one clustering per row)
    :param mlset: must-link constraint set
    :param nlset: cannot-link constraint set
    :param savepath: path to save the plot to
    :param additional_values: values used to color the points (one per clustering)
    """
    texts = additional_values
    colors = []
    plot_labels = [None] * (len(labels) - _ADDITIONAL_RANGE)
    markers = ['o'] * (len(labels) - _ADDITIONAL_RANGE)
    cNorm = colors2.Normalize(vmin=min(texts), vmax=max(texts))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
    plot_labels.extend(_ADDITIONAL_NAMES)
    for text in texts:
        colors.append(scalarMap.to_rgba(text))
    title = 'Must-Cannot Correlation'
    must_consistencies = []
    cannot_consistencies = []
    for label in labels[0:-_ADDITIONAL_RANGE]:
        must_cons = Metrics.consistency(label, mlset, nlset, cons_type='must')
        cannot_cons = Metrics.consistency(label, mlset, nlset, cons_type='cannot')
        must_consistencies.append(must_cons)
        cannot_consistencies.append(cannot_cons)
    # rescale both consistencies to [0, 1] so that they are directly comparable
    scaler = preprocessing.MinMaxScaler()
    must_consistencies = scaler.fit_transform(np.array(must_consistencies).reshape(-1, 1))
    cannot_consistencies = scaler.fit_transform(np.array(cannot_consistencies).reshape(-1, 1))
    pos = np.hstack((np.array(must_consistencies), np.array(cannot_consistencies)))
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title,
                              xlabel='Must consistency', ylabel='Cannot consistency',
                              legend_need=False)
    return
def _expected_consistency_selection(labels, mlset, nlset, cons_type='', ease_factor=1):
    """
    select those clusterings whose consistency is no less than ease_factor times
    the average consistency of all clusterings with the same number of clusters k
    """
    n_solutions = labels.shape[0]
    k_values = []
    cons = []
    final_idx = np.array([False] * n_solutions)
    for label in labels:
        cons.append(Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        k_values.append(len(np.unique(label)))
    cons = np.array(cons)
    k_values = np.array(k_values, dtype=int)
    possible_k = np.unique(k_values)
    for k in possible_k:
        # expected (average) consistency among clusterings with exactly k clusters
        mean_value = np.mean(cons[k_values == k])
        idx = np.logical_and(cons >= mean_value * ease_factor, k_values == k)
        final_idx = np.logical_or(final_idx, idx)
    return labels[final_idx]
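# Example (a sketch; the inputs are assumed to be a pure label matrix and the
# usual constraint sets): with ease_factor=1, a clustering is kept only if its
# consistency reaches the average of all clusterings sharing its k, so roughly
# the better-scoring part of each k-group survives; lowering ease_factor keeps more.
#
#   selected = _expected_consistency_selection(labels, mlset, nlset,
#                                              cons_type='both', ease_factor=0.9)
#   print selected.shape   # (n_selected, n_instances)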
def plot_k_consistency_distribution(labels, mlset, nlset, savepath, pure=True, cons_type='must'):
    """
    plot the correlation between the number of clusters k and the consistency
    of each clustering in a given library
    """
    k_values = []
    if not pure:
        # drop the last 5 rows (single k-means, CSPA, HGPA, MCLA and real labels)
        labels = labels[0:-5]
    for label in labels:
        k_values.append(len(np.unique(label)))
    texts = [''] * len(labels)
    plot_labels = [None] * len(labels)
    markers = ['x'] * len(labels)
    colors = ['blue'] * len(labels)
    title = 'k-' + cons_type + ' consistency Correlation'
    consistencies = []
    for label in labels:
        consistencies.append(Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
    pos = np.hstack((np.array(k_values).reshape(-1, 1), np.array(consistencies).reshape(-1, 1)))
    print (pos.shape)
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title,
                              xlabel='k', ylabel='consistency', legend_need=False)
    return
def k_selection_ensemble(labels, k_threshold, logger, weighted=False, alpha=0,
                         mlset=None, nlset=None, ctype='both'):
    """
    do selection ensemble using k as the criterion
    clusterings with k smaller than k_threshold will be removed

    :param labels: label matrix of the library (one clustering per row)
    :param k_threshold: minimum number of clusters a clustering must have to be kept
    :param logger:
    :param weighted: weighted version or not
    :param alpha: balance factor that controls the importance of clustering/cluster
                  consistency in the weights (weighted version only)
    :param mlset: must-link set (weighted version only)
    :param nlset: cannot-link set (weighted version only)
    :param ctype: type of consistency (weighted version only)
    :return:
    """
    k_value = []
    class_num = len(np.unique(labels[-1]))
    # select those clusterings whose k is not smaller than the threshold
    # (the last 5 rows are single k-means, CSPA, HGPA, MCLA and the real labels)
    for label in labels[0:-5]:
        k_value.append(len(np.unique(label)))
    k_value = np.array(k_value)
    idx = k_value.ravel() >= k_threshold
    selected_labels = labels[0:-5][idx]
    # weights
    con_per_cluster = []
    con_clustering = []
    if weighted:
        for label in selected_labels:
            con_per_cluster.append(Metrics.consistency_per_cluster(label, mlset, nlset, cons_type=ctype))
        for label in selected_labels:
            con_clustering.append(Metrics.consistency(label, mlset, nlset, cons_type=ctype))
    logger.debug('[K] Start consensus...shape=' + str(selected_labels.shape))
    logger.debug('[K] Average k is ' + str(np.mean(k_value[idx])))
    if weighted:
        logger.debug('[K] weighted consensus, alpha=' + str(alpha))
    label_CSPA = ce.cluster_ensembles_CSPAONLY(selected_labels, N_clusters_max=class_num,
                                               weighted=weighted, clustering_weights=con_clustering,
                                               cluster_level_weights=con_per_cluster, alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(selected_labels, N_clusters_max=class_num,
                                               weighted=weighted, clustering_weights=con_clustering,
                                               cluster_level_weights=con_per_cluster, alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(selected_labels, N_clusters_max=class_num,
                                               weighted=weighted, clustering_weights=con_clustering,
                                               cluster_level_weights=con_per_cluster, alpha=alpha)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    logger.debug('--------------------------------------------')
    return
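# Example usage (a sketch; the library file and constraint file names are
# hypothetical, and setting up 'logger' is the caller's responsibility):
#
#   logger = logging.getLogger('ensemble')
#   labels = np.loadtxt('Results/some_library.res', delimiter=',').astype(int)
#   mlset, nlset = read_constraints('Constraints/some_constraints.txt')
#   # keep only clusterings with at least 10 clusters, weighted consensus
#   k_selection_ensemble(labels, 10, logger, weighted=True, alpha=0.5,
#                        mlset=mlset, nlset=nlset, ctype='both')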
def consistency_selection_ensemble(labels, mlset, nlset, logger, must_threshold, cannot_threshold,
                                   normalized=True, weighted=False, weighted_type='both', alpha=1):
    """
    do selection ensemble using must/cannot consistency as the criterion
    clusterings whose must or cannot consistency falls below the corresponding
    threshold will be removed

    :param labels:
    :param mlset:
    :param nlset:
    :param logger:
    :param must_threshold:
    :param cannot_threshold:
    :param normalized:
    :param weighted:
    :param weighted_type:
    :param alpha:
    :return:
    """
    class_num = len(np.unique(labels[-1]))
    must_consistencies = []
    cannot_consistencies = []
    clustering_weights = []
    cluster_level_weights = []
    k_value = []
    for label in labels[0:-5]:
        must_cons = Metrics.consistency(label, mlset, nlset, cons_type='must')
        cannot_cons = Metrics.consistency(label, mlset, nlset, cons_type='cannot')
        if weighted:
            clustering_weights.append(Metrics.consistency(label, mlset, nlset, cons_type=weighted_type))
            cluster_level_weights.append(Metrics.consistency_per_cluster(label, mlset, nlset,
                                                                         cons_type=weighted_type))
        must_consistencies.append(must_cons)
        cannot_consistencies.append(cannot_cons)
        k_value.append(len(np.unique(label)))
    # convert to arrays so that the thresholding below also works when normalized=False
    must_consistencies = np.array(must_consistencies)
    cannot_consistencies = np.array(cannot_consistencies)
    if normalized:
        scaler = preprocessing.MinMaxScaler()
        must_consistencies = scaler.fit_transform(must_consistencies.reshape(-1, 1)).ravel()
        cannot_consistencies = scaler.fit_transform(cannot_consistencies.reshape(-1, 1)).ravel()
    idx = np.logical_and(must_consistencies >= must_threshold, cannot_consistencies >= cannot_threshold)
    selected_labels = labels[0:-5][idx]
    k_value = np.array(k_value)[idx]
    logger.debug('[Consistency] Start consensus...shape=' + str(selected_labels.shape))
    if selected_labels.shape[0] == 0:
        logger.debug('[Consistency] No clusterings are selected. Out.')
        return
    logger.debug('[Consistency] Average k is ' + str(np.mean(k_value)))
    label_CSPA = ce.cluster_ensembles_CSPAONLY(selected_labels, N_clusters_max=class_num,
                                               weighted=weighted, clustering_weights=clustering_weights,
                                               cluster_level_weights=cluster_level_weights, alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(selected_labels, N_clusters_max=class_num,
                                               weighted=weighted, clustering_weights=clustering_weights,
                                               cluster_level_weights=cluster_level_weights, alpha=alpha)
    # note: MCLA is run without the consistency weights here
    label_MCLA = ce.cluster_ensembles_MCLAONLY(selected_labels, N_clusters_max=class_num)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    return
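# Example usage (a sketch; file names are hypothetical and the thresholds are
# illustrative values, not tuned ones):
#
#   labels = np.loadtxt('Results/some_library.res', delimiter=',').astype(int)
#   mlset, nlset = read_constraints('Constraints/some_constraints.txt')
#   # keep clusterings whose normalized must/cannot consistency is at least 0.5
#   consistency_selection_ensemble(labels, mlset, nlset, logger,
#                                  must_threshold=0.5, cannot_threshold=0.5,
#                                  normalized=True, weighted=False)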
# subd = sub.feature_sampling(d, 2000)
# print d.shape
# print subd.shape
# data_selected, data_unselected, \
#     target_selected, target_unselected = train_test_split(d, t, train_size=500, random_state=154)
# print data_selected
# print data_unselected
# print target_selected
# print target_unselected
# print d
# ml, cl = io.read_constraints('Constraints/Wap_constraints_2n.txt')
# ml, cl = io.read_constraints('Constraints/k1b_constraints_2n.txt')
ml, cl = io.read_constraints('Constraints/waveform_constraints_half_n.txt')
print metrics.consistency(t, ml, cl)
# e2cp = cc.E2CP(data=d, ml=ml, cl=cl, n_clusters=6)
# t1 = time.clock()
# e2cp.fit_constrained()
# t2 = time.clock()
# print t
# print np.unique(t)
# print metrics.normalized_max_mutual_info_score(t, e2cp.labels)
# print (t2 - t1)
# t1 = time.clock()
label = eck.cop_kmeans_wrapper(d, 3, ml, cl)
# t2 = time.clock()
# km = cluster.KMeans(n_clusters=20)
# km.fit(d)
print metrics.normalized_max_mutual_info_score(t, label)
# print (t2 - t1)
def do_new_weighted_ensemble_for_library(library_folder, library_name, class_num, target,
                                         constraint_file, logger, gammas, internals=None,
                                         cons_type='both', ensemble_method=_default_ensemble_method,
                                         scale=False):
    """
    :param library_folder:
    :param library_name:
    :param class_num:
    :param target:
    :param constraint_file:
    :param logger:
    :param gammas:
    :param internals:
    :param cons_type:
    :param ensemble_method:
    :param scale:
    :return:
    """
    logger.debug('===========================================================================================')
    logger.debug('-----------------New ver Weighted Ensemble for library:' + str(library_name) + '---------------')
    logger.debug('-----------------Weight type = ' + cons_type + '-------------------------------------------')
    logger.debug('-----------------Scale type = ' + str(scale) + '-------------------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file + '----------------------------')
    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)

    # if the library is not pure, i.e. ensemble results and targets are also included,
    # then the last 5 rows should be removed (single k-means, CSPA, HGPA, MCLA, real labels)
    if 'pure' not in library_name:
        labels = labels[0:-5]

    mlset, nlset = io_func.read_constraints(constraint_file)
    n_instances = labels.shape[1]
    if cons_type == 'both':
        n_constraints = len(mlset) + len(nlset)
    else:
        n_constraints = len(mlset)
    if internals is None:
        internals = _build_pesudo_internal(labels)

    # get cluster/clustering level weights
    # the number of constraints falling into each cluster of every clustering is
    # also collected in order to compute g_gamma
    con_per_cluster = []
    constraints_num = []
    con_clustering = []
    cluster_time_sum = 0.0
    clustering_time_sum = 0.0
    for label in labels:
        t1 = time.clock()
        weight, cluster_cons_num = Metrics.consistency_per_cluster_efficient(label, mlset, nlset,
                                                                             cons_type=cons_type)
        con_per_cluster.append(weight)
        constraints_num.append(cluster_cons_num)
        t2 = time.clock()
        cluster_time_sum += (t2 - t1)
    for label in labels:
        t1 = time.clock()
        con_clustering.append(Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        t2 = time.clock()
        clustering_time_sum += (t2 - t1)
    print 'library size=' + str(labels.shape[0])
    print 'cluster avg=' + str(cluster_time_sum / labels.shape[0])
    print 'clustering avg=' + str(clustering_time_sum / labels.shape[0])

    if scale:
        scaler = preprocessing.MinMaxScaler()
        con_clustering = scaler.fit_transform(np.array(con_clustering))

    nmis = []
    for gamma in gammas:
        logger.debug('-------------------------->>>>>> PARAM START <<<<<<<---------------------------------')
        cur_g_gamma = get_g_gamma(constraints_num, labels, n_constraints, n_instances, gamma)
        cur_nmis = []
        for method in ensemble_method:
            ensemble_labels = _ensemble_method[method](labels, N_clusters_max=class_num, weighted=True,
                                                       clustering_weights=con_clustering,
                                                       cluster_level_weights=con_per_cluster,
                                                       alpha=cur_g_gamma, new_formula=True,
                                                       internal=internals)
            # ensemble_labels = _ensemble_method[method](labels, N_clusters_max=class_num,
            #                                            weighted=True, clustering_weights=con_clustering,
            #                                            cluster_level_weights=con_per_cluster,
            #                                            alpha=cur_g_gamma, new_formula=True,
            #                                            internal=internals, ml=mlset, cl=nlset)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(ensemble_labels, target)
            logger.debug(method + ' gamma=' + str(gamma) + ', NMI=' + str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug('------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------')
    logger.debug('===========================================================================================')
    return nmis
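# Example usage (a sketch; the folder, library and constraint names are
# hypothetical placeholders, 'target' is the ground-truth label vector of the
# data set, and the gamma grid is only illustrative):
#
#   nmis = do_new_weighted_ensemble_for_library('Results/', 'some_library_pure', 3, target,
#                                               'Constraints/some_constraints.txt', logger,
#                                               gammas=[0.1, 0.5, 1.0], cons_type='both')
#   # nmis[i][j] is the NMI obtained with ensemble_method[j] at gammas[i]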
def do_7th_weighted_ensemble_for_library(library_folder, library_name, class_num, target,
                                         constraint_file, logger, alphas, internals,
                                         cons_type='both', ensemble_method=_default_ensemble_method,
                                         scale=False):
    """
    :param library_folder:
    :param library_name:
    :param class_num:
    :param target:
    :param constraint_file:
    :param logger:
    :param alphas:
    :param internals:
    :param cons_type:
    :param ensemble_method:
    :param scale:
    :return:
    """
    logger.debug('===========================================================================================')
    logger.debug('-----------------New Weighted Ensemble for library:' + str(library_name) + '-------------------')
    logger.debug('-----------------Weight type = ' + cons_type + '-------------------------------------------')
    logger.debug('-----------------Scale type = ' + str(scale) + '-------------------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file + '----------------------------')
    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)
    k_values = []
    expected_cons = {}

    # if the library is not pure, i.e. ensemble results and targets are also included,
    # then the last 5 rows should be removed (single k-means, CSPA, HGPA, MCLA, real labels)
    if 'pure' not in library_name:
        labels = labels[0:-5]

    mlset, nlset = io_func.read_constraints(constraint_file)

    # get cluster/clustering level weights
    con_per_cluster = []
    con_clustering = []
    for label in labels:
        con_per_cluster.append(Metrics.consistency_per_cluster(label, mlset, nlset, cons_type=cons_type))
    for label in labels:
        con_clustering.append(Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        k_values.append(len(np.unique(label)))

    # expected (average) consistency for every possible k in the library
    k_values = np.array(k_values, dtype=int)
    possible_k = np.unique(k_values)
    cons = np.array(con_clustering)
    for k in possible_k:
        mean_value = np.mean(cons[k_values == k])
        if mean_value == 0:
            mean_value = 1
        expected_cons[k] = mean_value

    # normalize every clustering weight by the expected consistency of its k
    # and multiply it by the clustering's internal validity value
    for i in range(0, labels.shape[0]):
        con_clustering[i] /= expected_cons[k_values[i]]
        con_clustering[i] *= internals[i]

    if scale:
        scaler = preprocessing.MinMaxScaler()
        con_clustering = scaler.fit_transform(np.array(con_clustering))

    nmis = []
    for alpha in alphas:
        logger.debug('-------------------------->>>>>> PARAM START <<<<<<<---------------------------------')
        cur_nmis = []
        for method in ensemble_method:
            ensemble_labels = _ensemble_method[method](labels, N_clusters_max=class_num, weighted=True,
                                                       clustering_weights=con_clustering,
                                                       cluster_level_weights=con_per_cluster, alpha=alpha)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(ensemble_labels, target)
            logger.debug(method + ' alpha=' + str(alpha) + ', NMI=' + str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug('------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------')
    logger.debug('===========================================================================================')
    return nmis
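# Example usage (a sketch; 'internals' is assumed to hold one internal-validity
# value per clustering in the library, and the paths and alpha grid below are
# hypothetical placeholders):
#
#   internals = np.ones(n_clusterings)   # neutral internal weights, one per clustering
#   nmis = do_7th_weighted_ensemble_for_library('Results/', 'some_library_pure', 3, target,
#                                               'Constraints/some_constraints.txt', logger,
#                                               alphas=[0.0, 0.5, 1.0], internals=internals)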