def do_propagation_ensemble(library_folder,
                            library_name,
                            class_num,
                            target,
                            constraint_file,
                            logger,
                            alphas,
                            have_zero=True,
                            ensemble_method=_default_ensemble_method):
    """
    Propagate pairwise constraints over the co-association matrix of a
    library and score the consensus clusterings obtained from it.

    Parameters
    ----------
    :param library_folder: folder prefix of the library file (concatenated
                           directly with `library_name`)
    :param library_name: library file name without the '.res' extension
    :param class_num: number of clusters passed to each ensemble method
    :param target: reference labels the consensus labels are scored against
    :param constraint_file: path of the constraints file
    :param logger: logger receiving the progress / result messages
    :param alphas: iterable of propagation parameters to evaluate
    :param have_zero: only written to the log by this function
    :param ensemble_method: iterable of keys into `_ensemble_method`

    Returns
    -------
    :return: list with one entry per alpha; each entry is the list of NMI
             scores of the ensemble methods, in iteration order
    """
    banner = '==========================================================================================='
    logger.debug(banner)
    logger.debug('-----------------Propagation Ensemble for library:' +
                 str(library_name) + '----------------')
    logger.debug('-----------------Have zero type = ' + str(have_zero) +
                 '-----------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file +
                 '----------------------------')

    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)

    # if the library is not pure, i.e, ensemble results and targets are also included,
    # the last 5 rows (single kmeans, cspa, hgpa, mcla, real labels) must be removed,
    # as the other *_ensemble_for_library routines do; otherwise the real labels
    # would take part in the co-association matrix that is evaluated against them.
    if 'pure' not in library_name:
        labels = labels[0:-5]

    ml, cl = io_func.read_constraints(constraint_file)

    hyperedges = ce.build_hypergraph_adjacency(labels)
    hyperedges = hyperedges.transpose()

    # co-association counts, scaled so that the largest entry becomes 1
    coas_matrix = hyperedges.dot(hyperedges.transpose())
    coas_matrix = np.squeeze(np.asarray(coas_matrix.todense()))
    coas_matrix = coas_matrix.astype(np.float32)
    coas_matrix /= np.max(coas_matrix)

    # single-argument call form prints the same under Python 2 and 3
    print(coas_matrix)

    nmis = []
    for alpha in alphas:
        logger.debug(
            '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------'
        )
        propagated_coas_matrix = propagation_on_coassociation_matrix(
            coas_matrix, ml, cl, alpha)
        cur_nmis = []
        for method in ensemble_method:
            ensemble_label = _ensemble_method[method](propagated_coas_matrix,
                                                      labels.shape[0],
                                                      class_num)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(
                ensemble_label, target)
            logger.debug(method + ' alpha=' + str(alpha) + ', NMI=' +
                         str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug(
            '------------------------->>>>>> END OF THIS PARAM <<<<<<------------------------------'
        )
    logger.debug(banner)
    return nmis
def batch_do_consistency_selection_for_library(library_folder,
                                               library_name,
                                               constraint_file,
                                               logger,
                                               threshold_tuples,
                                               normalized=True,
                                               weighted=False,
                                               weighted_type='both',
                                               alpha=1):
    """
    Run the consistency-selection ensemble once for every threshold pair.

    The constraints file is read a single time and the resulting sets are
    reused for each call to `consistency_selection_ensemble_for_library`.

    Parameters
    ----------
    :param library_folder: forwarded to the selection routine
    :param library_name: forwarded to the selection routine (also logged)
    :param constraint_file: path of the constraints file
    :param logger: logger receiving the progress messages
    :param threshold_tuples: iterable of indexable items; element 0 is used
                             as the must threshold, element 1 as the cannot
                             threshold
    :param normalized: forwarded to the selection routine
    :param weighted: forwarded to the selection routine
    :param weighted_type: forwarded to the selection routine
    :param alpha: forwarded to the selection routine
    """
    banner = '==========================================================================================='
    param_end = '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------'

    logger.debug(banner)
    logger.debug(''.join(['-----------------Consistency Selection Ensemble:',
                          str(library_name),
                          '----------------------']))
    logger.debug(''.join(['-----------------Constraint File name = ',
                          constraint_file,
                          '----------------------------']))

    mlset, nlset = io_func.read_constraints(constraint_file)

    # keyword options shared by every run of the selection routine
    options = dict(normalized=normalized,
                   weighted=weighted,
                   weighted_type=weighted_type,
                   alpha=alpha)

    for thresholds in threshold_tuples:
        must_threshold = thresholds[0]
        cannot_threshold = thresholds[1]
        logger.debug(''.join(['------->>>>Results for Must>',
                              str(must_threshold),
                              ',Cannot>',
                              str(cannot_threshold),
                              ' <<<<-------']))
        consistency_selection_ensemble_for_library(library_folder,
                                                   library_name,
                                                   mlset,
                                                   nlset,
                                                   logger,
                                                   must_threshold,
                                                   cannot_threshold,
                                                   **options)
        logger.debug(param_end)

    logger.debug(banner)
# Example #3
import constrained_methods.constrained_clustering as cc
import utils.load_dataset as ld
import utils.io_func as io
import time
import evaluation.Metrics as Metrics

# Cluster the data returned by ld.load_mnist_4000() with E2CP under the
# pairwise constraints read from file, then print the labels, the NMI against
# the true labels and the elapsed time.

# time.clock no longer exists on Python 3.8+; prefer perf_counter when the
# interpreter provides it and fall back to time.clock otherwise.
_timer = getattr(time, 'perf_counter', None) or time.clock

data, target = ld.load_mnist_4000()
print(data.shape)
data = data.astype(float)
ml, cl = io.read_constraints('Constraints/MNIST4000_diff_n_1.txt')
t1 = _timer()
e2cp = cc.E2CP(data=data, ml=ml, cl=cl, n_clusters=10)
e2cp.fit_constrained()
# stop the timer only after fitting, so that the reported time covers the
# clustering itself and not just the construction of the E2CP object
t2 = _timer()
print(e2cp.labels)
print(Metrics.normalized_max_mutual_info_score(target, e2cp.labels))
print(t2 - t1)
# Example #4
# Scratch experiment: print the consistency of the true labels `t` with a
# constraint set, then score COP-KMeans (3 clusters) against those labels.
# NOTE(review): `d`, `t`, `io`, `metrics` and `eck` are not defined in this
# snippet; presumably they are set up earlier in the original script.

# --- disabled: feature sampling / train-test split experiments ---
# import member_generation.subspace as sub
# subd = sub.feature_sampling(d, 2000)
# print d.shape
# print subd.shape
# data_selected, data_unselected, \
# target_selected, target_unselected = train_test_split(d, t,
#                                                       train_size=500,
#                                                       random_state=154)
# print data_selected
# print data_unselected
# print target_selected
# print target_unselected
# print d

# --- constraint set in use (alternatives kept for reference) ---
# ml, cl = io.read_constraints('Constraints/Wap_constraints_2n.txt')
# ml, cl = io.read_constraints('Constraints/k1b_constraints_2n.txt')
ml, cl = io.read_constraints('Constraints/waveform_constraints_half_n.txt')
print(metrics.consistency(t, ml, cl))

# --- disabled: timed E2CP run ---
# e2cp = cc.E2CP(data=d, ml=ml, cl=cl, n_clusters=6)
# t1 = time.clock()
# e2cp.fit_constrained()
# t2 = time.clock()
# print t
# print np.unique(t)
# print metrics.normalized_max_mutual_info_score(t, e2cp.labels)
# print (t2 - t1)

# --- COP-KMeans run (timing and plain KMeans baseline disabled) ---
# t1 = time.clock()
label = eck.cop_kmeans_wrapper(d, 3, ml, cl)
# t2 = time.clock()
# km = cluster.KMeans(n_clusters=20)
# km.fit(d)
print(metrics.normalized_max_mutual_info_score(t, label))
# Example #5
def comparison_methods(dataset_name,
                       constraints_files=None,
                       additional_postfix='',
                       eval_method=None):
    """
    Evaluate the comparison methods on a dataset and store the scores as csv.

    A plain KMeans baseline is written first; afterwards every constraints
    file is combined with every requested method ('Cop_KMeans' and 'E2CP'
    are handled, other names are skipped) and one row per combination is
    written. Each row holds a label and the NMI against the real labels.

    Parameters
    ----------
    :param dataset_name: key into `exd.dataset`
    :param constraints_files: passed to `_get_default_constraints_files`;
                              `_default_constraints_postfix` is used if None
    :param additional_postfix: passed to `_get_default_constraints_files`
    :param eval_method: iterable of method names; `_default_eval_methods`
                        is used if None
    """
    timestamp = time.strftime('%Y-%m-%d_%H_%M_%S',
                              time.localtime(time.time()))
    output_path = _default_eval_path + dataset_name + '_' + timestamp + '.csv'

    with open(output_path, 'wb') as f:
        writer = csv.writer(f)

        data, targets = exd.dataset[dataset_name]['data']()
        data = data.astype(np.double)
        k = exd.dataset[dataset_name]['k']

        # unconstrained baseline
        km = cluster.KMeans(n_clusters=k)
        km.fit(data)
        baseline_nmi = metrics.normalized_max_mutual_info_score(
            targets, km.labels_)
        writer.writerow(['KMeans', str(baseline_nmi)])

        eval_methods = eval_method
        if eval_methods is None:
            eval_methods = _default_eval_methods

        postfixes = constraints_files
        if postfixes is None:
            postfixes = _default_constraints_postfix
        constraint_names = _get_default_constraints_files(
            dataset_name, postfixes, additional_postfix)

        for constraint_name in constraint_names:
            ml, cl = io_func.read_constraints(_default_constraints_folder +
                                              constraint_name + '.txt')
            for method in eval_methods:
                if method == 'Cop_KMeans':
                    result = _constrained_methods[method](data, k, ml, cl)
                    row_label = constraint_name + '_Cop_KMeans'
                elif method == 'E2CP':
                    e2cp = _constrained_methods[method](data=data,
                                                        ml=ml,
                                                        cl=cl,
                                                        n_clusters=k)
                    e2cp.fit_constrained()
                    result = e2cp.labels
                    row_label = constraint_name + '_E2CP'
                else:
                    # methods without a dedicated branch produce no row
                    continue
                score = metrics.normalized_max_mutual_info_score(
                    targets, result)
                writer.writerow([row_label, str(score)])
def generate_closure_constraints_with_portion(dataset_name,
                                              must_count=0,
                                              cannot_count=0,
                                              informative=False):
    """
    generate must-link / cannot-link constraints and propagate every new
    constraint to the must-link neighbours of its two samples.
    the number of must-link constraints generated in
    different classes is decided by their portion to all samples

    Parameters
    ----------
    :param dataset_name: key into `exd.dataset`; its 'data' loader provides
                         the real labels the constraints are derived from
    :param must_count: number of must-link constraints to generate
                       (distributed over the classes, rounded down per class)
    :param cannot_count: number of cannot-link constraints to generate
    :param informative: if True, cannot-link candidates are drawn from the
                        '<dataset>_informative_constraints.txt' file (which is
                        generated first if missing) instead of uniformly

    Returns
    -------
    :return: (must_link, cannot_link, ml_graph, cl_graph): the two constraint
             lists holding (smaller_index, larger_index) tuples, and the
             adjacency dicts (sample index -> set of linked sample indices)

    Raises
    ------
    :raise ValueError: if cannot-link constraints are requested but the
                       labels contain fewer than two classes
    """
    data, targets = exd.dataset[dataset_name]['data']()
    # the boolean masks below need an ndarray, a plain list would not work
    targets = np.asarray(targets)
    data_len = len(targets)
    clusters = np.unique(targets)
    must_link = []
    cannot_link = []
    # sets mirroring the two lists, so that duplicates are detected in O(1)
    must_seen = set()
    cannot_seen = set()

    if informative:
        informative_path = (_default_constraints_folder + dataset_name +
                            '_informative_constraints.txt')
        if os.path.isfile(informative_path):
            print('informative constraints already existed.')
        else:
            print('informative constraints not exist, generating...')
            ics.generate_informative_cl_set(dataset_name)
        _, informative_cl = io_func.read_constraints(informative_path)
        informative_len = len(informative_cl)

    ml_graph = {x: set() for x in range(data_len)}
    cl_graph = {x: set() for x in range(data_len)}

    # per-class quota, keyed by the label itself: labels are not required to
    # be the consecutive integers 0..k-1
    n_must_link = {}
    for cluster in clusters:
        n_must_link[cluster] = int(
            len(targets[targets == cluster]) / float(data_len) * must_count)

    def add_both(graph, i, j, pairs, seen):
        graph[i].add(j)
        graph[j].add(i)
        # make the first sample to be the smaller one in order to filter the duplicates
        pair = (i, j) if i <= j else (j, i)
        pairs.append(pair)
        seen.add(pair)

    # NOTE(review): a new link is only propagated to the direct neighbours of
    # its two end points (neighbour-to-neighbour pairs are not added), so the
    # result is not necessarily a full transitive closure - confirm intent.
    for cluster in clusters:
        # np.where(...)[0] stays 1-d even when the class has a single member
        all_samples = np.where(targets == cluster)[0]
        n_samples = len(all_samples)
        if n_samples < 2:
            # no pair can be formed inside this class
            continue
        # never ask for more pairs than the class can provide, otherwise the
        # sampling loop below could not terminate
        quota = min(n_must_link[cluster], n_samples * (n_samples - 1) // 2)
        cur_count = 0
        while cur_count < quota:
            drawn = np.random.choice(all_samples, 2)
            first, second = int(drawn[0]), int(drawn[1])
            # we don't accept same sample to be the constraint
            if first == second:
                continue
            if first > second:
                first, second = second, first
            if (first, second) in must_seen:
                continue
            add_both(ml_graph, first, second, must_link, must_seen)
            cur_count += 1
            for x in ml_graph[first]:
                if x != second and x not in ml_graph[second]:
                    add_both(ml_graph, x, second, must_link, must_seen)
                    cur_count += 1
            for y in ml_graph[second]:
                if y != first and y not in ml_graph[first]:
                    add_both(ml_graph, y, first, must_link, must_seen)
                    cur_count += 1

    if cannot_count > 0 and len(clusters) < 2:
        # every candidate pair would be rejected and the loop below would
        # never finish
        raise ValueError('cannot-link constraints need at least 2 classes.')

    cur_count = 0
    while cur_count < cannot_count:
        # choose sample randomly
        if informative:
            cl_tuple = informative_cl[rand.randint(0, informative_len - 1)]
            samp1 = cl_tuple[0]
            samp2 = cl_tuple[1]
        else:
            samp1 = rand.randint(0, data_len - 1)
            samp2 = rand.randint(0, data_len - 1)

        # we don't accept same sample (or same class) to be the constraint
        if samp1 == samp2 or targets[samp1] == targets[samp2]:
            continue

        if samp1 > samp2:
            samp1, samp2 = samp2, samp1

        # filter the duplicates
        if (samp1, samp2) in must_seen or (samp1, samp2) in cannot_seen:
            continue

        add_both(cl_graph, samp1, samp2, cannot_link, cannot_seen)
        cur_count += 1
        # samples that must be linked with one end point cannot be linked
        # with the other one either
        for x in ml_graph[samp1]:
            if x not in cl_graph[samp2]:
                add_both(cl_graph, x, samp2, cannot_link, cannot_seen)
                cur_count += 1
        for y in ml_graph[samp2]:
            if y not in cl_graph[samp1]:
                add_both(cl_graph, y, samp1, cannot_link, cannot_seen)
                cur_count += 1

    return must_link, cannot_link, ml_graph, cl_graph
# Example #7
def do_new_weighted_ensemble_for_library(
        library_folder,
        library_name,
        class_num,
        target,
        constraint_file,
        logger,
        gammas,
        internals=None,
        cons_type='both',
        ensemble_method=_default_ensemble_method,
        scale=False):
    """
    Weighted ensemble (new formula) of a library, evaluated for several
    gamma values.

    :param library_folder: folder prefix of the library file (concatenated
                           directly with `library_name`)
    :param library_name: library file name without the '.res' extension
    :param class_num: passed to the ensemble methods as `N_clusters_max`
    :param target: reference labels the consensus labels are scored against
    :param constraint_file: path of the constraints file
    :param logger: logger receiving the progress / result messages
    :param gammas: iterable of gamma values; each is turned into the alpha
                   of the ensemble methods through `get_g_gamma`
    :param internals: passed to the ensemble methods as `internal`; built by
                      `_build_pesudo_internal(labels)` when None
    :param cons_type: constraint type used for the consistency weights;
                      with 'both' must- and cannot-link constraints are
                      counted, otherwise only the must-link ones
    :param ensemble_method: iterable of keys into `_ensemble_method`
    :param scale: if True, clustering-level weights are min-max scaled
    :return: list with one entry per gamma; each entry is the list of NMI
             scores of the ensemble methods, in iteration order
    """
    banner = '==========================================================================================='
    logger.debug(banner)
    logger.debug('-----------------New ver Weighted Ensemble for library:' +
                 str(library_name) + '---------------')
    logger.debug('-----------------Weight type = ' + cons_type +
                 '-------------------------------------------')
    logger.debug('-----------------Scale type = ' + str(scale) +
                 '-------------------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file +
                 '----------------------------')

    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)

    # if the library is not pure, i.e, ensemble results and targets are also included.
    # then, last 5 rows should be removed (single kmeans, cspa, hgpa, mcla, real labels)
    if 'pure' not in library_name:
        labels = labels[0:-5]
    mlset, nlset = io_func.read_constraints(constraint_file)
    n_instances = labels.shape[1]
    if cons_type == 'both':
        n_constraints = len(mlset) + len(nlset)
    else:
        n_constraints = len(mlset)
    if internals is None:
        internals = _build_pesudo_internal(labels)

    # time.clock no longer exists on Python 3.8+; prefer perf_counter when the
    # interpreter provides it and fall back to time.clock otherwise.
    timer = getattr(time, 'perf_counter', None) or time.clock

    # get cluster/clustering level weights
    # constraints in each cluster of all clusterings are also obtained to get g_gamma
    # (the two passes are kept separate so that each can be timed on its own)
    con_per_cluster = []
    constraints_num = []
    con_clustering = []
    cluster_time_sum = 0.0
    clustering_time_sum = 0.0
    for label in labels:
        t1 = timer()
        weight, cluster_cons_num = Metrics.consistency_per_cluster_efficient(
            label, mlset, nlset, cons_type=cons_type)
        con_per_cluster.append(weight)
        constraints_num.append(cluster_cons_num)
        t2 = timer()
        cluster_time_sum += (t2 - t1)
    for label in labels:
        t1 = timer()
        con_clustering.append(
            Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        t2 = timer()
        clustering_time_sum += (t2 - t1)

    print('library size=' + str(labels.shape[0]))
    print('cluster avg=' + str(cluster_time_sum / labels.shape[0]))
    print('clustering avg=' + str(clustering_time_sum / labels.shape[0]))

    if scale:
        scaler = preprocessing.MinMaxScaler()
        # MinMaxScaler works on 2-d input (samples x features): scale the
        # weights as one column and flatten the result back to a vector
        weight_column = np.array(con_clustering, dtype=float).reshape(-1, 1)
        con_clustering = scaler.fit_transform(weight_column).ravel()

    nmis = []
    for gamma in gammas:
        logger.debug(
            '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------'
        )
        cur_g_gamma = get_g_gamma(constraints_num, labels, n_constraints,
                                  n_instances, gamma)
        cur_nmis = []
        for method in ensemble_method:
            ensemble_labels = _ensemble_method[method](
                labels,
                N_clusters_max=class_num,
                weighted=True,
                clustering_weights=con_clustering,
                cluster_level_weights=con_per_cluster,
                alpha=cur_g_gamma,
                new_formula=True,
                internal=internals)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(
                ensemble_labels, target)
            logger.debug(method + ' gamma=' + str(gamma) + ', NMI=' +
                         str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug(
            '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------'
        )
    logger.debug(banner)
    return nmis
# Example #8
def do_7th_weighted_ensemble_for_library(
        library_folder,
        library_name,
        class_num,
        target,
        constraint_file,
        logger,
        alphas,
        internals,
        cons_type='both',
        ensemble_method=_default_ensemble_method,
        scale=False):
    """
    Weighted ensemble of a library in which the consistency of every
    clustering is divided by the mean consistency of the clusterings with the
    same number of clusters and multiplied by its internal weight.

    :param library_folder: folder prefix of the library file (concatenated
                           directly with `library_name`)
    :param library_name: library file name without the '.res' extension
    :param class_num: passed to the ensemble methods as `N_clusters_max`
    :param target: reference labels the consensus labels are scored against
    :param constraint_file: path of the constraints file
    :param logger: logger receiving the progress / result messages
    :param alphas: iterable of alpha values passed to the ensemble methods
    :param internals: indexable, one multiplicative weight per clustering
                      (row) of the library
    :param cons_type: constraint type used for the consistency weights
    :param ensemble_method: iterable of keys into `_ensemble_method`
    :param scale: if True, clustering-level weights are min-max scaled
    :return: list with one entry per alpha; each entry is the list of NMI
             scores of the ensemble methods, in iteration order
    """
    banner = '==========================================================================================='
    logger.debug(banner)
    logger.debug('-----------------New Weighted Ensemble for library:' +
                 str(library_name) + '-------------------')
    logger.debug('-----------------Weight type = ' + cons_type +
                 '-------------------------------------------')
    logger.debug('-----------------Scale type = ' + str(scale) +
                 '-------------------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file +
                 '----------------------------')

    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)

    # if the library is not pure, i.e, ensemble results and targets are also included.
    # then, last 5 rows should be removed (single kmeans, cspa, hgpa, mcla, real labels)
    if 'pure' not in library_name:
        labels = labels[0:-5]
    mlset, nlset = io_func.read_constraints(constraint_file)

    # get cluster/clustering level weights
    con_per_cluster = [
        Metrics.consistency_per_cluster(label,
                                        mlset,
                                        nlset,
                                        cons_type=cons_type)
        for label in labels
    ]
    con_clustering = []
    k_values = []
    for label in labels:
        con_clustering.append(
            Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        k_values.append(len(np.unique(label)))
    k_values = np.array(k_values, dtype=int)

    # mean consistency of all clusterings sharing the same number of clusters
    cons = np.array(con_clustering)
    expected_cons = {}
    for k in np.unique(k_values):
        mean_value = np.mean(cons[k_values == k])
        # a zero mean would make the division below fail; use 1 instead
        expected_cons[k] = 1 if mean_value == 0 else mean_value

    for i, k in enumerate(k_values):
        con_clustering[i] /= expected_cons[k]
        con_clustering[i] *= internals[i]

    if scale:
        scaler = preprocessing.MinMaxScaler()
        # MinMaxScaler works on 2-d input (samples x features): scale the
        # weights as one column and flatten the result back to a vector
        weight_column = np.array(con_clustering, dtype=float).reshape(-1, 1)
        con_clustering = scaler.fit_transform(weight_column).ravel()

    nmis = []
    for alpha in alphas:
        logger.debug(
            '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------'
        )
        cur_nmis = []
        for method in ensemble_method:
            ensemble_labels = _ensemble_method[method](
                labels,
                N_clusters_max=class_num,
                weighted=True,
                clustering_weights=con_clustering,
                cluster_level_weights=con_per_cluster,
                alpha=alpha)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(
                ensemble_labels, target)
            logger.debug(method + ' alpha=' + str(alpha) + ', NMI=' +
                         str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug(
            '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------'
        )
    logger.debug(banner)
    return nmis
def generate_library(data, target, dataset_name, n_members, class_num,
                     n_cluster_lower_bound=0, n_cluster_upper_bound=0,
                     feature_sampling=1.0, sample_sampling=0.7,
                     feature_sampling_lower_bound=0.05, sample_sampling_lower_bound=0.1,
                     f_stable_sample=True, s_stable_sample=True,
                     constraints_file=None, sampling_method='FSRSNC', verbose=True, path=_default_result_path,
                     metric='nid', manifold_type='MDS', subfolder=True,
                     generate_only=True):
    """
    generate a single library of ensemble member.

    Parameters
    ----------
    :param data: dataset in a ndarray
    :param target: target in a ndarray or list
    :param dataset_name: name of dataset
    :param n_members: number of ensemble members (base clusterings) to generate
    :param class_num: #real_class
    :param n_cluster_lower_bound: lower bound of k
    :param n_cluster_upper_bound: upper bound of k
    :param feature_sampling: fixed sampling rate of feature, or upper bound if not stable
    :param sample_sampling:  fixed sampling rate of instances, or upper bound if not stable
    :param feature_sampling_lower_bound: lower bound of sampling rate of feature, only available if not stable
    :param sample_sampling_lower_bound: lower bound of sampling rate of instance, only available if not stable
    :param f_stable_sample: stable feature sampling or not
    :param s_stable_sample: stable instance sampling or not
    :param constraints_file: name of constraint file; required when `sampling_method` is a
                             constrained method, otherwise only used for the consistency plots
    :param sampling_method: key of `_sampling_methods` or of `_constrained_methods`
    :param verbose: print debug info.
    :param path: path to store the library
    :param metric: used for visualization only
    :param manifold_type: used for visualization only
    :param subfolder: save library in a separated sub-folder or not.
    :param generate_only: if True (or if the method is a constrained one) only the members are
                          stored, as a '_pure' library, without consensus results / visualization

    Return
    ------
    :return: file name of the library generated or found (the library itself is stored as a file)

    Raise
    -----
    :raise ValueError: unknown sampling method, constrained method without a constraints file,
                       or fewer than one member requested
    """
    print('start generating library for dataset:' + dataset_name)

    # make sure that path to store the library existing
    # (makedirs also creates missing parent folders)
    if not os.path.isdir(path):
        os.makedirs(path)
    if subfolder:
        savepath = path + dataset_name + '/'
        if not os.path.isdir(savepath):
            os.makedirs(savepath)
    else:
        savepath = path

    # we set the range of cluster number to [k, 10k] if not defined
    if n_cluster_lower_bound == 0 or n_cluster_upper_bound == 0:
        n_cluster_lower_bound = class_num
        n_cluster_upper_bound = class_num * 10

    # get sampling method, if not exist, it will raise a exception
    if sampling_method in _sampling_methods:
        is_constrained = False
    elif sampling_method in _constrained_methods:
        is_constrained = True
    else:
        raise ValueError('ensemble generation : Method should be set properly.')

    # read constraints file if existing
    if constraints_file is not None:
        mlset, nlset = io_func.read_constraints(constraints_file)
    else:
        if is_constrained:
            raise ValueError('ensemble generation : Constrained Member must be with a constraints file.')
        constraints_file = ''
        mlset = []
        nlset = []

    # lower bound of sampling rate (use only if 'stable' set to be false)
    if feature_sampling_lower_bound > feature_sampling:
        feature_sampling_lower_bound = feature_sampling / 2
    if sample_sampling_lower_bound > sample_sampling:
        sample_sampling_lower_bound = sample_sampling / 2

    # there should be at least 2 clusters in the clustering
    if n_cluster_lower_bound < 2:
        n_cluster_lower_bound = 2
    if n_cluster_upper_bound < n_cluster_lower_bound:
        n_cluster_upper_bound = n_cluster_lower_bound

    # path and filename to write the file
    filename = _get_file_name(dataset_name, n_cluster_lower_bound, n_cluster_upper_bound, feature_sampling,
                              feature_sampling_lower_bound, sample_sampling, sample_sampling_lower_bound, n_members,
                              f_stable_sample, s_stable_sample, sampling_method, is_constraint_method=is_constrained,
                              constraint_file=constraints_file)

    # we won't generate the library with same sampling rate and size if existing
    if os.path.isfile(savepath + filename + '.res'):
        print('[Library Generation] : library already exists.')
        return filename + '.res'
    elif os.path.isfile(savepath + filename + '_pure.res'):
        print('[Library Generation] : corresponding pure library already exists.')
        return filename + '_pure.res'

    # without members there is nothing to stack, store or combine
    if n_members < 1:
        raise ValueError('ensemble generation : at least one member is required.')

    n_samples = data.shape[0]

    # clustering results, one (1, n_samples) row per member
    members = []

    # generate ensemble members
    for i in range(0, n_members):
        # determine k randomly
        cluster_num = np.random.randint(n_cluster_lower_bound, n_cluster_upper_bound + 1)
        random_state = np.random.randint(0, _INT_MAX - 1)

        cur_feature_sampling = feature_sampling
        cur_sample_sampling = sample_sampling
        if not f_stable_sample:
            cur_feature_sampling = rand.uniform(feature_sampling_lower_bound, feature_sampling)
        if not s_stable_sample:
            cur_sample_sampling = rand.uniform(sample_sampling_lower_bound, sample_sampling)

        print('For this base clustering, cluster number is ' + str(cluster_num))
        # generate ensemble member by given method
        if sampling_method == 'Cop_KMeans':
            result = _constrained_methods[sampling_method](data, cluster_num, mlset, nlset)
        elif sampling_method == 'E2CP':
            e2cp = _constrained_methods[sampling_method](data=data, ml=mlset, cl=nlset, n_clusters=cluster_num)
            e2cp.fit_constrained()
            result = e2cp.labels
        else:
            result = _sampling_methods[sampling_method](data, target, r_clusters=cluster_num,
                                                        r_state=random_state, fsr=cur_feature_sampling,
                                                        ssr=cur_sample_sampling)
        # print diversity (only computed when it is going to be shown)
        if verbose:
            diver = Metrics.normalized_max_mutual_info_score(result, target)
            print('Base clustering' + str(i) + ' nmi_max between real labels = ' + str(diver))
        # keep the result as one row of the library
        members.append(np.reshape(np.array(result), (1, n_samples)))

    # stack the rows and change element type to int for consensus
    mat = np.vstack(members).astype(int)

    if generate_only or is_constrained:
        np.savetxt(savepath + filename + '_pure' + '.res', mat, fmt='%d', delimiter=',')
        return filename + '_pure.res'

    # single k-means model, for comparison
    clf = cluster.KMeans(n_clusters=class_num)
    clf.fit(data)
    kmlabels = clf.labels_

    # do consensus
    labels_CSPA = ce.cluster_ensembles_CSPAONLY(mat, N_clusters_max=class_num)
    labels_HGPA = ce.cluster_ensembles_HGPAONLY(mat, N_clusters_max=class_num)
    labels_MCLA = ce.cluster_ensembles_MCLAONLY(mat, N_clusters_max=class_num)

    # put consensus results and, as the last row, real labels into the matrix
    for extra_row in (kmlabels, labels_CSPA, labels_HGPA, labels_MCLA, target):
        mat = np.vstack([mat, np.reshape(extra_row, (1, n_samples))])

    print('Dataset ' + dataset_name + ', consensus finished, saving...')

    # write results to external file, use %d to keep integer part only
    np.savetxt(savepath + filename + '.res', mat, fmt='%d', delimiter=',')

    # print labels and diversities (between the real labels)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(labels_CSPA, target)
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(labels_HGPA, target)
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(labels_MCLA, target)
    print('consensus NMI (CSPA) =' + str(nmi_CSPA))
    print('consensus NMI (HGPA) =' + str(nmi_HGPA))
    print('consensus NMI (MCLA) =' + str(nmi_MCLA))

    kmnmi = Metrics.normalized_max_mutual_info_score(kmlabels, target)
    print('single-model diversity (K-means) =' + str(kmnmi))
    # save performances
    perf = np.array([nmi_CSPA, nmi_HGPA, nmi_MCLA, kmnmi])
    np.savetxt(savepath + filename + '_performance.txt', perf, fmt='%.6f', delimiter=',')

    if metric == 'diversity':
        distance_matrix = Metrics.diversityMatrix(mat)
        np.savetxt(savepath + filename + '_diversity.txt', distance_matrix, delimiter=',')
    else:
        distance_matrix = Metrics.NIDMatrix(mat)
        np.savetxt(savepath + filename + '_nid.txt', distance_matrix, delimiter=',')

    if manifold_type == 'MDS':
        # transform distance matrix into 2-d or 3-d coordinates to visualize
        mds2d = manifold.MDS(n_components=2, max_iter=10000, eps=1e-12, dissimilarity='precomputed')
        mds3d = manifold.MDS(n_components=3, max_iter=10000, eps=1e-12, dissimilarity='precomputed')
        pos2d = mds2d.fit(distance_matrix).embedding_
        pos3d = mds3d.fit(distance_matrix).embedding_
        np.savetxt(savepath + filename + '_mds2d.txt', pos2d, fmt="%.6f", delimiter=',')
        np.savetxt(savepath + filename + '_mds3d.txt', pos3d, fmt="%.6f", delimiter=',')

        # draw odm, k distribution and nmi distribution
        cv.plot_ordered_distance_matrix(distance_matrix, savepath + filename + '_original_distance.png',
                                        savepath + filename + '_odm.png')
        cv.plot_k_distribution(mat, pos2d, savepath + filename + '_k_distribution.png')
        cv.plot_nmi_max(mat, pos2d, savepath + filename + '_nmimax_distribution.png')

        # consistencies are calculated while constraints file exists.
        if constraints_file != '':
            cv.plot_consistency(mat, pos2d, mlset, nlset, savepath + filename + '_consistency_both.png',
                                consistency_type='both')
            cv.plot_consistency(mat, pos2d, mlset, nlset, savepath + filename + '_consistency_must.png',
                                consistency_type='must')
            cv.plot_consistency(mat, pos2d, mlset, nlset, savepath + filename + '_consistency_cannot.png',
                                consistency_type='cannot')
            cv.plt_consistency_corelation_with_k(mat, mlset, nlset, savepath + filename + '_normalized.png')

    # same contract as the early exits above: hand back the library file name
    return filename + '.res'