Example #1
# Shared imports for these snippets; loadDataset, data_sample, rsnn and other
# unqualified helpers are project-local, the rest ships with the named packages.
import random
import numpy as np
from numpy import tile, inf
import tables
import Cluster_Ensembles as CE
from Cluster_Ensembles import build_hypergraph_adjacency, store_hypergraph_adjacency
from deap import creator, tools
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, pairwise_distances

def main():
    datamat, datalabels = loadDataset("../dataset/lung-cancer.data")
    print('data ready')

    sampledData, remainedData, sampledIndex, remainedIndex = data_sample(datamat, 1, 10)
    print('sampledData ready')
    pop_kmeans = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'kmeans')
    print('kmeans end')
    pop_ward = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'ward')
    print('ward end')
    pop_complete = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'complete')
    print('complete end')
    pop_average = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'average')
    print('average end')
    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)
    hdf5_file_name = './Cluster_Ensembles.h5'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()

    pop = np.array(pop)
    hypergraph_adjacency = build_hypergraph_adjacency(pop)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    consensus_clustering_labels = CE.MCLA(hdf5_file_name, pop, verbose=True, N_clusters_max=10)
    nmi = normalized_mutual_info_score(datalabels, consensus_clustering_labels)
    ari = adjusted_rand_score(datalabels, consensus_clustering_labels)
    print('NMI: %s' % nmi)
    print('ARI: %s' % ari)
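
Every example repeats the same HDF5 scaffolding (open_file, create_group, close) before building and storing the hypergraph adjacency. A minimal sketch that factors the pattern into one hypothetical helper, assuming the Cluster_Ensembles package (imported as CE) provides the functions exactly as they are used above:

import numpy as np
import tables
import Cluster_Ensembles as CE

def mcla_consensus(cluster_runs, n_clusters_max,
                   hdf5_path='./Cluster_Ensembles.h5'):
    # Recreate the HDF5 store that Cluster_Ensembles expects.
    fileh = tables.open_file(hdf5_path, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()
    cluster_runs = np.asarray(cluster_runs)
    adjacency = CE.build_hypergraph_adjacency(cluster_runs)
    CE.store_hypergraph_adjacency(adjacency, hdf5_path)
    return CE.MCLA(hdf5_path, cluster_runs, verbose=True,
                   N_clusters_max=n_clusters_max)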
Example #2
def ensemble_crossover(population, index_arr):
    hdf5_file_name = './Cluster_Ensembles.h5'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()
    individuals = []  # parent individuals selected for crossover
    clusters_num = []
    # print(int(round(len(population) * 0.25)))
    for i in range(20):
        individuals.append(tournament(population, index_arr))  # binary tournament selection of parents
    individuals = np.array(individuals)
    for j in range(len(individuals)):  # collect the cluster counts of the parents
        individual = individuals[j]
        clusters_num.append(len(set(individual)))
    sort_clustersNum = sorted(clusters_num)  # sorted() returns a new list; list.sort() would sort in place
    # random.randint is inclusive on both ends, so no +1 is needed
    clusters_max = random.randint(sort_clustersNum[0], sort_clustersNum[-1])
    hypergraph_adjacency = build_hypergraph_adjacency(individuals)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    consensus_clustering_labels = CE.MCLA(hdf5_file_name,
                                          individuals,
                                          verbose=True,
                                          N_clusters_max=clusters_max)
    ind_ensemble = creator.Individual(consensus_clustering_labels)
    print('crossover result: %s' % ind_ensemble)
    return ind_ensemble
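
ensemble_crossover (and every snippet below) presupposes DEAP scaffolding, creator.Individual and a configured toolbox, that the listing never shows. A minimal sketch of a setup compatible with how these objects are used; every registration here is an assumption, not the authors' actual code:

from deap import base, creator, tools

# Two minimized objectives (e.g. deviation and connectivity) on a
# list-backed Individual, matching how the snippets index individuals.
creator.create('FitnessMin', base.Fitness, weights=(-1.0, -1.0))
creator.create('Individual', list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register('select', tools.selNSGA2)                # elitist selection
toolbox.register('nondominated', tools.sortNondominated)  # Pareto fronts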
Example #3
def cluster_ensembles(cluster_runs, verbose, N_clusters_max, method):
    hdf5_file_name = 'tmp_graph'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()
    hypergraph_adjacency = build_hypergraph_adjacency(cluster_runs)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    cluster_ensemble = method(hdf5_file_name, cluster_runs, verbose,
                              N_clusters_max)
    score = ceEvalMutual(cluster_runs, cluster_ensemble, verbose)
    return score, cluster_ensemble
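
Because the consensus function arrives as the method argument, the same wrapper works for CE.CSPA, CE.HGPA or CE.MCLA. A toy usage sketch with made-up labelings (ceEvalMutual is assumed importable alongside the other helpers):

import numpy as np
import Cluster_Ensembles as CE

runs = np.array([[0, 0, 1, 1, 2, 2],
                 [0, 0, 0, 1, 1, 1],
                 [1, 1, 0, 0, 2, 2]])
score, labels = cluster_ensembles(runs, verbose=True,
                                  N_clusters_max=3, method=CE.MCLA)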
Example #4
def all_ensemble(population, k):
    hdf5_file_name = './Cluster_Ensembles.h5'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()
    pop = []
    for i in range(len(population)):
        ind = []
        ind.extend(population[i])
        pop.append(ind)
    pop = np.array(pop)
    hypergraph_adjacency = build_hypergraph_adjacency(pop)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    consensus_clustering_labels = CE.MCLA(hdf5_file_name,
                                          pop,
                                          verbose=True,
                                          N_clusters_max=k + 2)
    return consensus_clustering_labels
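
A hypothetical call, assuming population is a list of label vectors and k the target cluster count (all_ensemble lets the consensus use at most k + 2 clusters):

labels = all_ensemble(population, k=3)  # consensus labeling with at most k + 2 clusters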
Example #5
def moclenew(datamat):
    # datamat, datalabels = loadDataset("../dataset/glass.data")
    print('data ready')
    pop_kmeans = ini_population(datamat, 'kmeans', 10)
    print('kmeans end')
    pop_ward = ini_population(datamat, 'ward', 10)
    print('ward end')
    pop_complete = ini_population(datamat, 'complete', 10)
    print('complete end')
    pop_average = ini_population(datamat, 'average', 10)
    print('average end')
    # pop_spc = ini_population(datamat, 'spc', 1)
    # print('spc end')
    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)  # pop_ward was computed above but never collected; include it
    pop.extend(pop_complete)
    pop.extend(pop_average)
    # pop.extend(pop_spc)
    init_population = []
    for indiv1 in pop:
        ind1 = creator.Individual(indiv1)
        init_population.append(ind1)

    filter_pop = list(filter(lambda x: len(x) > 0, init_population))  # drop individuals whose initial clustering failed
    population = filter_pop  # the full population; crossover offspring are added to it later

    # Distance matrix for the second objective: pairwise distances between all
    # data points (only the upper triangle would strictly be needed).
    # dataLen = len(datamat)
    # distances_matrix = zeros((dataLen, dataLen))
    # for datai in range(dataLen):
    #     for dataj in range(datai+1, dataLen):
    #         distances_matrix[datai][dataj] = Euclidean_dist(datamat[datai], datamat[dataj])
    distances_matrix = pairwise_distances(datamat, metric='euclidean')
    print("distance matrix ready")
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    for ind in invalid_ind:
        euDistance, eu_connect = mocle_index(datamat, distances_matrix, ind)
        fitnesses = (euDistance, eu_connect)
        ind.fitness.values = fitnesses
    # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat,(len(invalid_ind),1,1)),tile(distances_matrix,(len(invalid_ind),1,1)),invalid_ind)
    #
    # for ind, fit in zip(invalid_ind, fitnesses):
    #     ind.fitness.values = fit

    # population = toolbox.select(population, len(population))
    popeliteLen = len(population)
    for i in range(generation):
        print('generation %s' % i)
        popElite = toolbox.select(population, popeliteLen)
        # Vary the population
        # parentSpring = tools.selTournamentDCD(popElite, popeliteLen)
        # parentSpring = [toolbox.clone(ind) for ind in parentSpring]
        newoffspring = []
        # applying crossover
        popcrossover = toolbox.select(population, 2)

        k1 = len(set(popcrossover[0]))
        k2 = len(set(popcrossover[1]))
        # random.randint is inclusive on both ends, so no +1 is needed
        k = random.randint(min(k1, k2), max(k1, k2))
        # cluster-ensemble crossover operator
        hdf5_file_name = './Cluster_Ensembles.h5'
        fileh = tables.open_file(hdf5_file_name, 'w')
        fileh.create_group(fileh.root, 'consensus_group')
        fileh.close()
        popcrossover = np.array(popcrossover)
        hypergraph_adjacency = build_hypergraph_adjacency(popcrossover)
        store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
        resultList = CE.MCLA(hdf5_file_name,
                             popcrossover,
                             verbose=True,
                             N_clusters_max=k)
        ind_ensemble = creator.Individual(resultList)
        newoffspring.append(ind_ensemble)

        # evaluating fitness of individuals with invalid fitnesses
        invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid]
        for ind1 in invalid_ind:
            euDistance1, eu_connect1 = mocle_index(datamat, distances_matrix,
                                                   ind1)
            fitnesses1 = (euDistance1, eu_connect1)
            ind1.fitness.values = fitnesses1

        # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat, (len(invalid_ind), 1, 1)), tile(distances_matrix, (len(invalid_ind), 1, 1)), invalid_ind)  # only raw data is used here, not the true labels
        #
        # for ind, fit in zip(invalid_ind, fitnesses):
        #     ind.fitness.values = fit

        # Choosing a population for the next generation
        # population = toolbox.select(popElite + newoffspring, popeliteLen)
        population = popElite + newoffspring
    result1 = toolbox.nondominated(population, len(population))
    nondominated_result = result1[0]
    final_result, pbmValue = computePBM(datamat, nondominated_result)
    return final_result, pbmValue
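
mocle_index is project-local and not shown; judging by the names euDistance and eu_connect, it plausibly computes the two MOCK/MOCLE-style objectives, overall deviation and connectivity. A hedged stand-in under that assumption:

import numpy as np

def deviation_and_connectivity(data, distances_matrix, labels, L=10):
    data = np.asarray(data)
    labels = np.asarray(labels)
    # Overall deviation: summed distance of each point to its cluster centroid.
    deviation = 0.0
    for c in np.unique(labels):
        members = data[labels == c]
        deviation += np.linalg.norm(members - members.mean(axis=0), axis=1).sum()
    # Connectivity: penalize points whose L nearest neighbors fall in other
    # clusters (both objectives are to be minimized).
    neighbors = np.argsort(distances_matrix, axis=1)[:, 1:L + 1]
    penalties = 1.0 / np.arange(1, L + 1)
    connectivity = sum(penalties[j]
                       for i in range(len(labels))
                       for j in range(L)
                       if labels[neighbors[i, j]] != labels[i])
    return deviation, connectivity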
Example #6
def main():
    # init_population, init_ari, datamat, datalabels = ini_Cluster(kNumber=6)  # initial population from several clustering algorithms
    datamat, datalabels = loadDataset("../dataset/soybean-small.data")
    print('data ready')

    pop_kmeans = initialMultiRun(datamat, 10, 'kmeans')
    print('kmeans end')
    pop_ward = initialMultiRun(datamat, 10, 'ward')
    print('ward end')
    pop_complete = initialMultiRun(datamat, 10, 'complete')
    print('complete end')
    pop_average = initialMultiRun(datamat, 10, 'average')
    print('average end')
    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)

    init_population = []
    for indiv1 in pop:
        ind1 = creator.Individual(indiv1)
        init_population.append(ind1)

    filter_pop = list(filter(lambda x: len(x) > 0, init_population))  # drop individuals whose initial clustering failed
    population = filter_pop  # the full population; crossover offspring are added to it later

    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate,
                            tile(datamat, (len(invalid_ind), 1, 1)),
                            tile(population, (len(invalid_ind), 1, 1)),
                            invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    population = toolbox.select(population, len(population))

    for i in range(generation):
        print('generation %s' % i)
        popElite = toolbox.select(population, int(round(
            len(population) * 0.5)))  #top half from population

        # Vary the population
        parentSpring = tools.selTournamentDCD(population, len(population))
        parentSpring = [toolbox.clone(ind) for ind in parentSpring]
        newoffspring = []
        # applying crossover
        for indiv1, indiv2 in zip(parentSpring[::2], parentSpring[1::2]):
            randNum = random.random()  # generate a random number from 0 to 1
            if randNum < 0.8:
                toolbox.mate(indiv1, indiv2)
                toolbox.mutate(indiv1)
                toolbox.mutate(indiv2)
                del indiv1.fitness.values, indiv2.fitness.values
                newoffspring.append(indiv1)
                newoffspring.append(indiv2)
            else:
                hdf5_file_name = './Cluster_Ensembles.h5'
                fileh = tables.open_file(hdf5_file_name, 'w')
                fileh.create_group(fileh.root, 'consensus_group')
                fileh.close()
                individuals = []
                individuals.append(indiv1)
                individuals.append(indiv2)
                individuals = np.array(individuals)
                hypergraph_adjacency = build_hypergraph_adjacency(individuals)
                store_hypergraph_adjacency(hypergraph_adjacency,
                                           hdf5_file_name)
                consensus_clustering_labels = CE.MCLA(hdf5_file_name,
                                                      individuals,
                                                      verbose=True,
                                                      N_clusters_max=10)
                ind_ensemble = creator.Individual(consensus_clustering_labels)
                newoffspring.append(ind_ensemble)

        # evaluating fitness of individuals with invalid fitnesses
        invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate,
                                tile(datamat, (len(invalid_ind), 1, 1)),
                                tile(newoffspring, (len(invalid_ind), 1, 1)),
                                invalid_ind)  # only raw data is used here, not the true labels
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        # Choosing a population for the next generation
        population = toolbox.select(popElite + newoffspring, len(population))
    result1 = toolbox.nondominated(population, len(population))
    print(len(result1))
    print(result1)
    print(len(result1[0]))
    print(result1[0])
    print('ARI values')
    ari_arr = []
    max_ari = -inf
    for ind in result1[0]:
        ari = adjusted_rand_score(datalabels, ind)
        ari_arr.append(ari)
        if ari > max_ari:
            max_ari = ari
    print(ari_arr)
    print(max_ari)
    nmi_arr = []
    max_nmi = -inf
    print('NMI values')
    for ind in result1[0]:
        nmi = normalized_mutual_info_score(datalabels, ind)
        nmi_arr.append(nmi)
        if nmi > max_nmi:
            max_nmi = nmi
    print(nmi_arr)
    print(max_nmi)
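
The two scoring loops above differ only in the metric; a small helper (names are mine, not from the source) that factors them:

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

def score_front(front, true_labels):
    # Score every individual on a nondominated front against the true
    # labels and return the best ARI and NMI found.
    aris = [adjusted_rand_score(true_labels, ind) for ind in front]
    nmis = [normalized_mutual_info_score(true_labels, ind) for ind in front]
    return max(aris), max(nmis)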
Example #7
def multirun(datasetName):
    # datamat,datalabels = loadDataset("../dataset/glass.data")
    path = '../dataset/' + datasetName
    datamat, datalabels = loadDataset(path)
    print('data ready')
    sampledData, remainedData, sampledIndex, remainedIndex = data_sample(
        datamat, 1, 2)
    print('sampledData ready')

    pop_kmeans = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,
                      'kmeans')
    print('kmeans end')
    max_nmi1 = -inf
    for ind1 in pop_kmeans:
        nmi1 = normalized_mutual_info_score(datalabels, ind1)
        if nmi1 > max_nmi1:
            max_nmi1 = nmi1
    print('initial kmeans best NMI: %s' % max_nmi1)
    pop_ward = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,
                    'ward')
    print('ward end')
    max_nmi2 = -inf
    for ind2 in pop_ward:
        nmi2 = normalized_mutual_info_score(datalabels, ind2)
        if nmi2 > max_nmi2:
            max_nmi2 = nmi2
    print('initial ward best NMI: %s' % max_nmi2)
    pop_complete = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,
                        'complete')
    print('complete end')
    max_nmi3 = -inf
    for ind3 in pop_complete:
        nmi3 = normalized_mutual_info_score(datalabels, ind3)
        if nmi3 > max_nmi3:
            max_nmi3 = nmi3
    print('initial complete best NMI: %s' % max_nmi3)
    pop_average = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,
                       'average')
    print('average end')
    max_nmi4 = -inf
    for ind4 in pop_average:
        nmi4 = normalized_mutual_info_score(datalabels, ind4)
        if nmi4 > max_nmi4:
            max_nmi4 = nmi4
    print('initial average best NMI: %s' % max_nmi4)
    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)

    init_population = []
    for indiv1 in pop:
        ind1 = creator.Individual(indiv1)
        init_population.append(ind1)

    filter_pop = list(filter(lambda x: len(x) > 0, init_population))  # drop individuals whose initial clustering failed
    population = filter_pop  # the full population; crossover offspring are added to it later

    # Distance matrix for the second objective: pairwise distances between all
    # data points (only the upper triangle would strictly be needed).
    # dataLen = len(datamat)
    # eudataPointMatrix = zeros((dataLen, dataLen))
    # for datai in range(dataLen):
    #     for dataj in range(datai+1, dataLen):
    #         eudataPointMatrix[datai][dataj] = Euclidean_dist(datamat[datai], datamat[dataj])
    distances_matrix = pairwise_distances(datamat, metric='euclidean')
    print("distance matrix ready")
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat,(len(invalid_ind),1,1)),tile(distances_matrix,(len(invalid_ind),1,1)),invalid_ind)
    # for ind, fit in zip(invalid_ind, fitnesses):
    #     ind.fitness.values = fit
    for ind1 in invalid_ind:
        euDistance1, eu_connect1 = mocle_index(datamat, distances_matrix, ind1)
        fitnesses1 = (euDistance1, eu_connect1)
        ind1.fitness.values = fitnesses1
    # population = toolbox.select(population, len(population))
    popeliteLen = len(population)
    for i in range(generation):
        print('generation %s' % i)
        popElite = toolbox.select(population, popeliteLen)
        # Vary the population
        # parentSpring = tools.selTournamentDCD(popElite, popeliteLen)
        # parentSpring = [toolbox.clone(ind) for ind in parentSpring]
        newoffspring = []
        # applying crossover

        subpopArr = getSubPop(popElite)
        count = 0  # counts how many new individuals are added this generation
        for subpop in subpopArr:
            # DSCE crossover operator (disabled):
            # a1 = 0.6
            # a2 = 0.5
            # transMatrix, popClusterArr_3, popClusterArr_2, clusterNumArr = transformation(datamat, subpop)
            # similiarMatrix = measureSimilarity(transMatrix, popClusterArr_3, popClusterArr_2,
            #                                    clusterNumArr, datamat, a1=a1)
            # dictCownP = assign(similiarMatrix, a2)
            # resultList = resultTransform(dictCownP, datamat)
            # cluster-ensemble crossover operator:
            hdf5_file_name = './Cluster_Ensembles.h5'
            fileh = tables.open_file(hdf5_file_name, 'w')
            fileh.create_group(fileh.root, 'consensus_group')
            fileh.close()
            subpop = np.array(subpop)
            hypergraph_adjacency = build_hypergraph_adjacency(subpop)
            store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
            resultList = CE.CSPA(hdf5_file_name,
                                 subpop,
                                 verbose=True,
                                 N_clusters_max=3)
            resultList = list(resultList)

            clu = list(set(resultList))
            clulen = len(clu)
            actual_resultList = []

            if clulen > 1:
                ind_ensemble = creator.Individual(resultList)
                newoffspring.append(ind_ensemble)
                actual_resultList = resultList  # offspring are kept only when more than one cluster was found
                count += 1
            if actual_resultList:
                predicted_clusternum = len(set(actual_resultList))
                ind_new = KMeans(
                    n_clusters=predicted_clusternum).fit_predict(datamat)
                ind_new_tran = creator.Individual(ind_new)
                newoffspring.append(ind_new_tran)
                count += 1
        print "这一代增加里%s个个体" % count
        # evaluating fitness of individuals with invalid fitnesses
        invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid]
        # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat, (len(invalid_ind), 1, 1)), tile(distances_matrix, (len(invalid_ind), 1, 1)), invalid_ind)  # only raw data is used here, not the true labels
        # for ind, fit in zip(invalid_ind, fitnesses):
        #     ind.fitness.values = fit

        for ind1 in invalid_ind:
            euDistance1, eu_connect1 = mocle_index(datamat, distances_matrix,
                                                   ind1)
            fitnesses1 = (euDistance1, eu_connect1)
            ind1.fitness.values = fitnesses1
        # Choosing a population for the next generation
        # population = toolbox.select(popElite + newoffspring, popeliteLen)
        population = popElite + newoffspring
    result1 = toolbox.nondominated(population, len(population))
    ari_arr = []
    max_ari = -inf
    for ind in result1[0]:
        ari = adjusted_rand_score(datalabels, ind)
        ari_arr.append(ari)
        if ari > max_ari:
            max_ari = ari
    nmi_arr = []
    max_nmi = -inf
    print('NMI values')
    for ind in result1[0]:
        nmi = normalized_mutual_info_score(datalabels, ind)
        nmi_arr.append(nmi)
        if nmi > max_nmi:
            max_nmi = nmi
    print('best NMI: %s' % max_nmi)
    print(nmi_arr)
    return max_nmi, max_ari
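
A minimal driver sketch, reusing the dataset layout from the other examples:

if __name__ == '__main__':
    max_nmi, max_ari = multirun('soybean-small.data')
    print('best NMI: %s, best ARI: %s' % (max_nmi, max_ari))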