def consistency_selection_ensemble(labels,
                                   mlset,
                                   nlset,
                                   logger,
                                   must_threshold,
                                   cannot_threshold,
                                   normalized=True,
                                   weighted=False,
                                   weighted_type='both',
                                   alpha=1):
    """
    do selection ensemble using must/cannot consistency as criteria
    clusteing with k smaller than k_threshold will be removed

    :param labels:
    :param mlset:
    :param nlset:
    :param logger:
    :param must_threshold:
    :param cannot_threshold:
    :param normalized:
    :param weighted:
    :param weighted_type:
    :param alpha:
    :return:
    """
    class_num = len(np.unique(labels[-1]))
    must_consistencies = []
    cannot_consistencies = []
    clustering_weights = []
    cluster_level_weights = []
    k_value = []
    for label in labels[0:-5]:
        must_cons = Metrics.consistency(label, mlset, nlset, cons_type='must')
        cannot_cons = Metrics.consistency(label,
                                          mlset,
                                          nlset,
                                          cons_type='cannot')
        if weighted:
            clustering_weights.append(
                Metrics.consistency(label,
                                    mlset,
                                    nlset,
                                    cons_type=weighted_type))
            cluster_level_weights.append(
                Metrics.consistency_per_cluster(label,
                                                mlset,
                                                nlset,
                                                cons_type=weighted_type))
        must_consistencies.append(must_cons)
        cannot_consistencies.append(cannot_cons)
        k_value.append(len(np.unique(label)))
    if normalized:
        scaler = preprocessing.MinMaxScaler()
        must_consistencies = scaler.fit_transform(
            np.array(must_consistencies).reshape(-1, 1)).ravel()
        cannot_consistencies = scaler.fit_transform(
            np.array(cannot_consistencies).reshape(-1, 1)).ravel()
    idx = np.logical_and(must_consistencies >= must_threshold,
                         cannot_consistencies >= cannot_threshold)
    selected_labels = labels[0:-5][idx]
    k_value = np.array(k_value)[idx]
    logger.debug('[Consistency] Start consensus...shape=' +
                 str(selected_labels.shape))
    if selected_labels.shape[0] == 0:
        logger.debug('[Consistency] No clusterings are selected. Out.')
        return
    logger.debug('[Consistency] Average k is ' + str(np.mean(k_value)))
    label_CSPA = ce.cluster_ensembles_CSPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=clustering_weights,
        cluster_level_weights=cluster_level_weights,
        alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=clustering_weights,
        cluster_level_weights=cluster_level_weights,
        alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(selected_labels,
                                               N_clusters_max=class_num)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    return
def k_selection_ensemble(labels,
                         k_threshold,
                         logger,
                         weighted=False,
                         alpha=0,
                         mlset=None,
                         nlset=None,
                         ctype='both'):
    """
    do selection ensemble using k as criteria
    clusteing with k smaller than k_threshold will be removed

    :param labels:
    :param k_threshold:
    :param logger:
    :param weighted: weighted version or not
    :param alpha: balance factor that control the importance of clustering/cluster
                  consistency in weights (weighted version only)
    :param mlset: cannot-link set (weighted version only)
    :param nlset: must-link set (weighted version only)
    :param ctype: type of consistency (weighted version only)
    :return:
    """
    k_value = []
    class_num = len(np.unique(labels[-1]))
    # select those clusterings that k larger than the threshold.
    for label in labels[0:-5]:
        k_value.append(len(np.unique(label)))
    k_value = np.array(k_value)
    idx = k_value.ravel() >= k_threshold
    selected_labels = labels[0:-5][idx]

    # weights
    con_per_cluster = []
    con_clustering = []
    if weighted:
        for label in selected_labels:
            con_per_cluster.append(
                Metrics.consistency_per_cluster(label,
                                                mlset,
                                                nlset,
                                                cons_type=ctype))
        for label in selected_labels:
            con_clustering.append(
                Metrics.consistency(label, mlset, nlset, cons_type=ctype))

    logger.debug('[K] Start consensus...shape=' + str(selected_labels.shape))
    logger.debug('[K] Average k is ' + str(np.mean(k_value[idx])))
    if weighted:
        logger.debug('[K] weighted consensus, alpha=' + str(alpha))

    label_CSPA = ce.cluster_ensembles_CSPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster,
        alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster,
        alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster,
        alpha=alpha)

    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    logger.debug('--------------------------------------------')
    return
Beispiel #3
0
def do_7th_weighted_ensemble_for_library(
        library_folder,
        library_name,
        class_num,
        target,
        constraint_file,
        logger,
        alphas,
        internals,
        cons_type='both',
        ensemble_method=_default_ensemble_method,
        scale=False):
    """

    :param library_folder:
    :param library_name:
    :param class_num:
    :param target:
    :param constraint_file:
    :param logger:
    :param alphas:
    :param cons_type:
    :param ensemble_method
    :return:
    """
    logger.debug(
        '==========================================================================================='
    )
    logger.debug('-----------------New Weighted Ensemble for library:' +
                 str(library_name) + '-------------------')
    logger.debug('-----------------Weight type = ' + cons_type +
                 '-------------------------------------------')
    logger.debug('-----------------Scale type = ' + str(scale) +
                 '-------------------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file +
                 '----------------------------')

    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)
    k_values = []
    expected_cons = {}

    # if the library is not pure, i.e, ensemble results and targets are also included.
    # then, last 5 rows should be removed (single kmeans, cspa, hgpa, mcla, real labels)
    if 'pure' not in library_name:
        labels = labels[0:-5]
    mlset, nlset = io_func.read_constraints(constraint_file)

    # get cluster/clustering level weights
    con_per_cluster = []
    con_clustering = []
    for label in labels:
        con_per_cluster.append(
            Metrics.consistency_per_cluster(label,
                                            mlset,
                                            nlset,
                                            cons_type=cons_type))
    for label in labels:
        con_clustering.append(
            Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        k_values.append(len(np.unique(label)))
    k_values = np.array(k_values, dtype=int)
    possible_k = np.unique(k_values)
    cons = np.array(con_clustering)
    for k in possible_k:
        mean_value = np.mean(cons[k_values == k])
        if mean_value == 0:
            mean_value = 1
        expected_cons[k] = mean_value
    for i in range(0, labels.shape[0]):
        con_clustering[i] /= expected_cons[k_values[i]]
        con_clustering[i] *= internals[i]
    if scale:
        scaler = preprocessing.MinMaxScaler()
        con_clustering = scaler.fit_transform(np.array(con_clustering))

    nmis = []
    for alpha in alphas:
        logger.debug(
            '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------'
        )
        cur_nmis = []
        for method in ensemble_method:
            ensemble_labels = _ensemble_method[method](
                labels,
                N_clusters_max=class_num,
                weighted=True,
                clustering_weights=con_clustering,
                cluster_level_weights=con_per_cluster,
                alpha=alpha)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(
                ensemble_labels, target)
            logger.debug(method + ' alpha=' + str(alpha) + ', NMI=' +
                         str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug(
            '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------'
        )
    logger.debug(
        '==========================================================================================='
    )
    return nmis