def rsnn(sampledData, remainedData, sampledIndex, remainedIndex, singleName):
    """Cluster each sampled subset, label the remaining points by nearest
    sampled neighbour, then restore the original row order.

    Parameters
    ----------
    sampledData : list of sampled datasets (one per ensemble member)
    remainedData : list of the complementary, non-sampled datasets
    sampledIndex, remainedIndex : original row indices of the two parts;
        together they form a permutation of 0..n-1 per member
    singleName : base clusterer — 'kmeans' or an agglomerative linkage
        name ('ward', 'complete', 'average')

    Returns
    -------
    list of label lists, one per ensemble member, in original row order.

    Raises
    ------
    ValueError
        If singleName is not a supported clusterer name (previously this
        fell through both branches and raised NameError).
    """
    predicted_labelAll = []  # predicted labels for the sampled datasets
    for i in range(len(sampledData)):
        # Random cluster count; randint is inclusive, so this is [2, 11].
        # NOTE(review): the commented-out original claimed [2, 10] — confirm
        # which range is intended.
        clusters = random.randint(2, 11)
        if singleName == 'kmeans':
            predicted_label = KMeans(n_clusters=clusters).fit_predict(
                sampledData[i])
        elif singleName in ('ward', 'complete', 'average'):
            predicted_label = AgglomerativeClustering(
                linkage=singleName,
                n_clusters=clusters).fit_predict(sampledData[i])
        else:
            raise ValueError('unsupported clusterer name: %s' % singleName)
        predicted_labelAll.append(predicted_label.tolist())

    # Assign every non-sampled point the label of its nearest sampled point.
    # remainedData and sampledData have the same number of members, so j can
    # iterate over either.
    assinALLNnLabels = []  # all nearest-neighbour-assigned labels
    for j in range(len(remainedData)):
        assinNnLabels = []
        for m in range(len(remainedData[j])):
            minDist = inf
            minindex = -1
            for k in range(len(sampledData[j])):
                distJI = distEclud(remainedData[j][m], sampledData[j][k])
                if distJI < minDist:
                    minDist = distJI
                    minindex = k
            assinNnLabels.append(predicted_labelAll[j][minindex])
        assinALLNnLabels.append(assinNnLabels)

    # Concatenate sampled + remained labels and their original indices.
    combineIndex = []
    combinedLables = []
    for column in range(len(predicted_labelAll)):
        combineIndex.append(sampledIndex[column] + remainedIndex[column])
        combinedLables.append(
            predicted_labelAll[column] + assinALLNnLabels[column])

    # Restore ascending original order. The combined indices are assumed to
    # be a permutation of 0..total-1, so a position map replaces the
    # original O(n^2) scan per sequence value with O(n) total.
    total = len(sampledData[0]) + len(remainedData[0])
    seqIndexAll = []
    for combineIndex1 in combineIndex:
        position = {value: pos for pos, value in enumerate(combineIndex1)}
        seqIndexAll.append([position[seq] for seq in range(total)])

    # Final labels for the recombined sampledData + remainedData rows.
    finalLabel = []
    for finalIndex in range(len(combinedLables)):
        finallabelone = [combinedLables[finalIndex][index]
                         for index in seqIndexAll[finalIndex]]
        finalLabel.append(finallabelone)
    # Final ensemble clustering result.
    return finalLabel
def fsrsnn(sampledData, remainedData, sampledIndex, remainedIndex,
           sampledDataFs, k):
    """Feature-selected variant of rsnn: run KMeans on the feature-selected
    sampled data, label non-sampled points by nearest neighbour in the
    original feature space, and restore the original row order.

    Parameters
    ----------
    sampledData : list of sampled datasets (original features, used for
        nearest-neighbour distances)
    remainedData : list of the complementary, non-sampled datasets
    sampledIndex, remainedIndex : original row indices of the two parts;
        together they form a permutation of 0..n-1 per member
    sampledDataFs : feature-selected versions of sampledData, fed to KMeans
    k : number of true class labels; passed to k_range to derive the
        cluster-count range for the experiment

    Returns
    -------
    list of label lists, one per ensemble member, in original row order.
    """
    # Derive the cluster-count range from the true number of classes.
    min_clusters, max_clusters = k_range(k)

    predicted_labelAll = []  # predicted labels for the sampled datasets
    for i in range(len(sampledData)):
        clusters = random.randint(min_clusters, max_clusters)
        predicted_label = KMeans(n_clusters=clusters).fit_predict(
            sampledDataFs[i])
        predicted_labelAll.append(predicted_label.tolist())

    # Assign every non-sampled point the label of its nearest sampled point.
    # remainedData and sampledData have the same number of members, so j can
    # iterate over either.
    assinALLNnLabels = []  # all nearest-neighbour-assigned labels
    for j in range(len(remainedData)):
        assinNnLabels = []
        for m in range(len(remainedData[j])):
            minDist = inf
            minindex = -1
            # Renamed from 'k' — the original shadowed the 'k' parameter.
            for neighbour in range(len(sampledData[j])):
                # Distance between the point and each sampled point.
                distJI = distEclud(remainedData[j][m],
                                   sampledData[j][neighbour])
                if distJI < minDist:
                    minDist = distJI
                    minindex = neighbour
            assinNnLabels.append(predicted_labelAll[j][minindex])
        assinALLNnLabels.append(assinNnLabels)

    # Concatenate sampled + remained labels and their original indices.
    combineIndex = []
    combinedLables = []
    for column in range(len(predicted_labelAll)):
        combineIndex.append(sampledIndex[column] + remainedIndex[column])
        combinedLables.append(
            predicted_labelAll[column] + assinALLNnLabels[column])

    # Restore ascending original order. The combined indices are assumed to
    # be a permutation of 0..total-1, so a position map replaces the
    # original O(n^2) scan per sequence value with O(n) total.
    total = len(sampledData[0]) + len(remainedData[0])
    seqIndexAll = []
    for combineIndex1 in combineIndex:
        position = {value: pos for pos, value in enumerate(combineIndex1)}
        seqIndexAll.append([position[seq] for seq in range(total)])

    # Final labels for the recombined sampledData + remainedData rows.
    finalLabel = []
    for finalIndex in range(len(combinedLables)):
        finallabelone = [combinedLables[finalIndex][index]
                         for index in seqIndexAll[finalIndex]]
        finalLabel.append(finallabelone)
    # Final ensemble clustering result.
    return finalLabel
def initialMultiRun(data, times, singleName):
    """Run the base clusterer `times` times on `data` with a random cluster
    count in [2, 11] per run, collecting one label list per run.

    Parameters
    ----------
    data : the dataset to cluster (anything sklearn's fit_predict accepts)
    times : number of independent runs
    singleName : 'kmeans' or an agglomerative linkage name
        ('ward', 'average', 'complete')

    Returns
    -------
    list of `times` label lists.

    Raises
    ------
    ValueError
        If singleName is not a supported clusterer name (previously an
        unmatched name left predicted_label unbound and raised NameError).
    """
    predicted_labelAll = []
    for _ in range(times):
        # randint is inclusive, so the cluster count is drawn from [2, 11].
        clusters = random.randint(2, 11)
        if singleName == "kmeans":
            predicted_label = KMeans(n_clusters=clusters).fit_predict(data)
        elif singleName in ('ward', 'average', 'complete'):
            predicted_label = AgglomerativeClustering(
                linkage=singleName, n_clusters=clusters).fit_predict(data)
        else:
            raise ValueError('unsupported clusterer name: %s' % singleName)
        predicted_labelAll.append(predicted_label.tolist())
    return predicted_labelAll
def main():
    """Baseline experiment: run KMeans 10 times on the iris dataset with a
    random cluster count and report the best NMI and ARI against the true
    labels.

    Fix: 'print' was used as a Python-2 statement for the ready message
    while every other print in the file uses call syntax — a SyntaxError
    under Python 3.
    """
    datamat, datalabels = loadDataset("../dataset/iris.data")
    print('data ready')

    nmi_max = -inf  # best normalized mutual information seen so far
    ari_max = -inf  # best adjusted Rand index seen so far
    for _ in range(10):
        # randint is inclusive, so the cluster count is drawn from [2, 11].
        clusters = random.randint(2, 11)
        predicted_label = KMeans(n_clusters=clusters).fit_predict(datamat)
        predicted_label = predicted_label.tolist()
        nmi = normalized_mutual_info_score(datalabels, predicted_label)
        ari = adjusted_rand_score(datalabels, predicted_label)
        if nmi > nmi_max:
            nmi_max = nmi
        if ari > ari_max:
            ari_max = ari

    # Runtime strings kept byte-identical to the original output.
    print('nmi值为:')
    print(nmi_max)
    print('ari值为:')
    print(ari_max)