def kMeans(data, nrOFCluster, nrOfGivenIterations, givenError=0.5):
    """Cluster ``data`` with at most ``nrOfGivenIterations`` passes and print the result.

    Each pass assigns the vectors to the current centroids, recomputes the
    centroids and the per-cluster error, and stops early once the error of
    every surviving cluster moved by less than ``givenError`` between two
    consecutive passes. Nothing is returned; progress and the final
    assignment (one line per entry, paired with the files of ``../input``)
    are printed.

    Args:
        data: Dataset handed unchanged to the clustering helpers.
        nrOFCluster: Number of clusters to start with.
        nrOfGivenIterations: Upper bound on the number of passes.
        givenError: Convergence threshold compared against ``countDiff`` of
            the previous and latest error of a cluster (defaults to 0.5, the
            value that used to be hard-coded).

    Note:
        The code mutates the list returned by ``getListOfVectors()`` and
        relies on that being the centroid object's live list.
    """
    centroids = findCentroids(nrOFCluster, data)
    # listD[0] holds the previous pass' per-cluster errors, listD[1] the latest.
    listD = [[0] * nrOFCluster, [0] * nrOFCluster]
    # Pre-bound so the final report works even when no pass is executed.
    assignment = []
    loopCounter = 0

    while loopCounter < nrOfGivenIterations:
        assignment = assignToCluster(nrOFCluster, data, centroids)

        # Clusters no vector was assigned to: remember their current centroid
        # and error history so they can be put back after the convergence test.
        empty = [e for e in range(nrOFCluster) if assignment.count(e) == 0]
        oldVectors = centroids.getListOfVectors()
        emptyVectors = [oldVectors[e] for e in empty]
        emptyListD = [
            [listD[0][e] for e in empty],
            [listD[1][e] for e in empty],
        ]

        # Error must be measured against the centroids used for the assignment,
        # so it is computed before the centroids are replaced.
        D = meanQuantizationError(nrOFCluster, data, assignment, centroids)
        centroids = calculateNewCentroids(nrOFCluster, data, assignment, centroids)

        # Shift the two-slot error history.
        if loopCounter == 0:
            listD[0] = D
        elif loopCounter == 1:
            listD[1] = D
        else:
            listD[0] = listD[1]
            listD[1] = D

        # A recomputed centroid whose components are all zero is treated as a
        # cluster without vectors. The first centroid's length is used as the
        # dimension for every centroid, as before.
        vectors = centroids.getListOfVectors()
        dimension = len(vectors[0]) if vectors else 0
        zeroIndexes = [
            z for z in range(len(vectors))
            if not any(vectors[z][q] != 0 for q in range(dimension))
        ]

        # Drop those clusters by index (back to front so indexes stay valid).
        # Removing by value could delete the error entry of a different
        # cluster that happens to have the same error.
        for z in reversed(zeroIndexes):
            del vectors[z]
            del listD[0][z]
            del listD[1][z]
            nrOFCluster -= 1

        # Check whether every surviving cluster changed by less than givenError.
        QoS = sum(
            1 for q in range(len(vectors))
            if countDiff(listD[0][q], listD[1][q]) < givenError
        )
        if QoS == len(vectors):
            print("\n!!!!!!!!!!KONIEC!!!!!!!!!!")
            break

        # Restore the empty clusters at the end of the centroid list.
        vectors.extend(emptyVectors)
        nrOFCluster += len(emptyVectors)
        # NOTE(review): the saved histories are re-attached crosswise
        # (slot 0 -> listD[1], slot 1 -> listD[0]). Kept exactly as it was;
        # confirm whether this is intended given the history shift above.
        listD[1].extend(emptyListD[0])
        listD[0].extend(emptyListD[1])

        # Single-argument print(...) produces the same bytes as the former
        # multi-item print statements.
        print("%s %s" % ("@@ centr= ", vectors))
        print("%s %s %s" % ("\n*******END OF LOOP ", loopCounter, "\n"))
        loopCounter += 1

    # Look the file list up once instead of once per printed row.
    files = getAllFiles('../input') if assignment else []
    for i, cluster in enumerate(assignment):
        print("%s %s" % (cluster, files[i]))
# Script driver. The statements that used to precede it on this line were a
# stray copy of the end of kMeans' loop (including a `break` outside any loop)
# and are already part of the function itself.
nrOFCluster = 3
nrOfGivenIterations = 3
# NOTE(review): presumably 10 limits the features per document and the text
# file lists words to ignore - confirm against prepareInput.
data = Dataset(prepareInput('../input', 10, '/home/kchrusci/Workspace/repo/k-means/projektpython/ForbiddenWords.txt'))
# Single-argument print(...) emits the same bytes as `print "data = ", x`.
print("%s %s" % ("data = ", data.getListOfVectors()))
kMeans(data, nrOFCluster, nrOfGivenIterations)
print(getAllFiles('/home/koper/PycharmProjects/First/Samples'))