Ejemplo n.º 1
0
def findCentroids(nrOfClusters, data):
    # TODO write algorithm setting initial location of centroids
    print "-->IN findCentroids"
    centroidsList = Dataset([0  for i in range(0, nrOFCluster)])

    jump = (int((data.getSize()+1)/nrOfClusters) )
    for i in range(0,nrOfClusters):
        centroidsList.getListOfVectors()[i]= data.getListOfVectors()[jump * i]
    print "\tOUT Found centroids= ", centroidsList.getListOfVectors(), "\n"
    return centroidsList
Ejemplo n.º 2
0
def calculateNewCentroids(nrOfClusters, data, assignment, centroids):
    print "-->IN calculateNewCentroids"
    print "\tcentroids= ", centroids.getListOfVectors()

    # create DataSet object for centroids
    newCentroidsDataset = Dataset([Vector() for i in range(0, nrOfClusters)])

    # init with Zeros all Vectors
    for t in  range(0, newCentroidsDataset.getListOfVectors().__len__()):
        for q in range(0, data.getListOfVectors()[0].__len__()):
            newCentroidsDataset.getListOfVectors()[t].append(0)

    # arithmetic mean
    for i in range(0, data.getListOfVectors().__len__()):
        for x in range(0, data.getListOfVectors()[0].__len__()):
            newCentroidsDataset.getListOfVectors()[assignment[i]][x] += data.getListOfVectors()[i][x]

    for q in range(0, nrOfClusters):
        for x in range(0, data.getListOfVectors()[0].__len__()):
            if assignment.count(q) != 0:
                newCentroidsDataset.getListOfVectors()[q][x] = newCentroidsDataset.getListOfVectors()[q][x] / assignment.count(q)
            else:
                newCentroidsDataset.getListOfVectors()[q][x] = 0


    print "\t\nOUT list of new centroids= ", newCentroidsDataset.getListOfVectors(), "\n"
    return newCentroidsDataset
Ejemplo n.º 3
0
def findCentroids(nrOfClusters, data):
    # TODO write algo finding location of centroids
    print "-->IN findCentroids"
    print "\tnrOfClusters= ", nrOfClusters, "\n\tdata= ", data.getListOfVectors()
    centroidsList = Dataset([Vector() for i in range(0, nrOFCluster)])
    # print "\tcentroidsList= " , centroidsList.getListOfVectors()

    jump = (int((data.getSize()+1)/nrOfClusters) )
    # print "\tjump between indexes= ", jump
    for i in range(0,nrOfClusters):
        # print "\t\tposition=", i, " Vector= ",data.getListOfVectors()[jump * i]
        centroidsList.getListOfVectors()[i]= data.getListOfVectors()[jump * i]
    print "\tOUT Founded centroids= ", centroidsList.getListOfVectors(), "\n"
    return centroidsList
Ejemplo n.º 4
0
def reconcile(data, centroids, nrOFCluster, assignment, Dlist, whichCluster):
    print "-->IN reconcile"

    newData = Dataset([])
    newAssignment = []
    newCentroids = Dataset([])
    newDList= [[],[]]

    for x in range(0, data.getListOfVectors().__len__()):
        if assignment[x] == whichCluster:
            newData.getListOfVectors().append(data.getListOfVectors()[x])
            newAssignment.append(assignment[x])
            if newCentroids.getListOfVectors().__len__() == 0:

                newlist = [field for field in assignment]
                newlist.sort()
                counterTmp = [newlist[0]]
                for w in range(0, newlist.__len__()-1):
                    if newlist == whichCluster:
                        break
                    if newlist[w] != newlist[w+1]:
                        counterTmp.append(newlist[w+1])
                position = counterTmp.index(whichCluster)
                print newDList
                print Dlist
                print assignment[x]

                newCentroids.getListOfVectors().append(centroids.getListOfVectors()[position])
                newDList[0].append(Dlist[0][position])
                newDList[1].append(Dlist[1][position])
            print "\t ", x
    # newNrOFCluster = nrOFCluster - 1

    print "\nRECONCILIE:"
    print "## Data= ", newData.getListOfVectors()
    print "## Centroids= " , newCentroids.getListOfVectors()
    # print "## nrOFCluster" , newNrOFCluster
    print "## assignment=", newAssignment
    print "## Dlist=", newDList
    print "## Q= ", q
    print "which cluster=", whichCluster
    print "\n"

    return [newData, newCentroids, newAssignment, newDList]
Ejemplo n.º 5
0
def calculateNewCentroids(nrOfClusters, data, assignment, centroids):
    print "-->IN calculateNewCentroids"
    print "\tnrOfClusters= ", nrOfClusters, "\n\tdata= ", data.getListOfVectors(), "\n\tassignment=", assignment, "\n\tcentroids= ", centroids.getListOfVectors()

    # create DataSet object for centroids
    newCentroidsDataset = Dataset([Vector() for i in range(0, nrOfClusters)])
    # init with Zeros all Vectors
    for t in  range(0, newCentroidsDataset.getListOfVectors().__len__()):
        for q in range(0, data.getListOfVectors()[0].__len__()):
            newCentroidsDataset.getListOfVectors()[t].append(0)

    # arithmetic mean
    for i in range(0, data.getListOfVectors().__len__()):
        for x in range(0, data.getListOfVectors()[0].__len__()):
            # print "list of new centroids= ", listOfNewCentroids
            newCentroidsDataset.getListOfVectors()[assignment[i]][x] += data.getListOfVectors()[i][x]
            # print "list of new centroids after loop= ", listOfNewCentroids
    for q in range(0, nrOfClusters):
        for x in range(0, data.getListOfVectors()[0].__len__()):
            newCentroidsDataset.getListOfVectors()[q][x] = newCentroidsDataset.getListOfVectors()[q][x] / data.getListOfVectors()[0].__len__()


    print "\tOUT list of new centroids= ", newCentroidsDataset.getListOfVectors(), "\n"
    return  newCentroidsDataset
Ejemplo n.º 6
0
def kMeans(data, nrOFCluster, nrOfGivenIterations):

    centroids = findCentroids(nrOFCluster, data)
    listD = [[0 for i in range(0, nrOFCluster)], [0 for i in range(0, nrOFCluster)]]
    loopCounter = 0
    givenError = 0.5

    while loopCounter < nrOfGivenIterations:
        assignment = assignToCluster(nrOFCluster, data, centroids)

        # seeking for empty centroids
        empty = []
        for e in range(0, nrOFCluster):
            if assignment.count(e) == 0:
                empty.append(e)
        # print "empty ", empty, "\n\n"

        #collecting empty centroids
        emptyCentroid = Dataset([])
        emptyListD = [[], []]
        for q in range(0, empty.__len__()):
            emptyCentroid.getListOfVectors().append(centroids.getListOfVectors()[empty[q]])
            emptyListD[0].append(listD[0][empty[q]])
            emptyListD[1].append(listD[1][empty[q]])

        #calculation
        D = meanQuantizationError(nrOFCluster, data, assignment, centroids)
        centroids = calculateNewCentroids(nrOFCluster, data, assignment, centroids)
        numberOfExistingCluster = []

        if (loopCounter == 0):
            listD[0] = D
        else:
            if loopCounter == 1:
                listD[1] = D
            else:
                listD[0] = listD[1]
                listD[1] = D

            # check if all cluster have at least one Vector assigned
            # otherwise del from cluster, decrement nrOFCluster
            centroidsTmp = Dataset([])
            listDTmp= [[],[]]

            for z in range(0, centroids.getListOfVectors().__len__()):
                ctr = 0
                for q in range(0, centroids.getListOfVectors()[0].__len__()):
                    if centroids.getListOfVectors()[z][q] != 0:
                        ctr += 1
                if ctr == 0:
                    centroidsTmp.getListOfVectors().append(centroids.getListOfVectors()[z])
                    listDTmp[0].append(listD[0][z])
                    listDTmp[1].append(listD[1][z])
                else:
                    numberOfExistingCluster.append(z)

            #remove empty data from processing
            for q in range(0,centroidsTmp.getListOfVectors().__len__()):
                centroids.getListOfVectors().remove(centroidsTmp.getListOfVectors()[q])
                listD[0].remove(listDTmp[0][q])
                listD[1].remove(listDTmp[1][q])
                nrOFCluster -= 1

            #check if given error tha calculated
            QoS = 0
            for q in range(0, numberOfExistingCluster.__len__()):
                if countDiff(listD[0][q], listD[1][q]) < givenError:
                    QoS += 1


            if QoS == centroids.getListOfVectors().__len__():
                print "\n!!!!!!!!!!KONIEC!!!!!!!!!!"
                break

            #restore empty centroids
            centroids.getListOfVectors().extend(emptyCentroid.getListOfVectors())
            nrOFCluster += emptyCentroid.getListOfVectors().__len__()
            listD[1].extend(emptyListD[0])
            listD[0].extend(emptyListD[1])

            print "@@ centr= " , centroids.getListOfVectors()

        print "\n*******END OF LOOP " ,loopCounter, "\n"
        loopCounter += 1

    for i in range(len(assignment)):
        print assignment[i], getAllFiles('../input')[i]
Ejemplo n.º 7
0
            if QoS == centroids.getListOfVectors().__len__():
                print "\n!!!!!!!!!!KONIEC!!!!!!!!!!"
                break

            #restore empty centroids
            centroids.getListOfVectors().extend(emptyCentroid.getListOfVectors())
            nrOFCluster += emptyCentroid.getListOfVectors().__len__()
            listD[1].extend(emptyListD[0])
            listD[0].extend(emptyListD[1])

            print "@@ centr= " , centroids.getListOfVectors()

        print "\n*******END OF LOOP " ,loopCounter, "\n"
        loopCounter += 1

    for i in range(len(assignment)):
        print assignment[i], getAllFiles('../input')[i]



nrOFCluster = 3
nrOfGivenIterations = 3

data = Dataset(prepareInput('../input', 10, '/home/kchrusci/Workspace/repo/k-means/projektpython/ForbiddenWords.txt'))
print "data = ", data.getListOfVectors()

kMeans(data,nrOFCluster,nrOfGivenIterations)
print(getAllFiles('/home/koper/PycharmProjects/First/Samples'))


Ejemplo n.º 8
0
def kMeans(data, nrOFCluster, nrOfGivenIterations):

    centroids = findCentroids(nrOFCluster, data)
    listD = [[0 for i in range(0, nrOFCluster)], [0 for i in range(0, nrOFCluster)]]
    loopCounter = 0
    givenError = 0.5

    while loopCounter < nrOfGivenIterations:
        assignment = assignToCluster(nrOFCluster, data, centroids)
        D = meanQuantizationError(nrOFCluster, data, assignment, centroids)
        centroids = calculateNewCentroids(nrOFCluster, data, assignment, centroids)
        numberOfExistingCluster = []

        if (loopCounter == 0):
            listD[0] = D
            print "@ListD in loop 0= " , listD
        else:
            if loopCounter == 1:
                listD[1] = D
            else:
                listD[0] = listD[1]
                listD[1] = D
            print "@nrOFCluster before loop=", nrOFCluster, "listD= ", listD



            # check if all cluster have at least one Vector assigned
            # otherwise del from cluster, decrement nrOFCluster
            centroidsTmp = Dataset([])
            listDTmp= [[],[]]

            for z in range(0, centroids.getListOfVectors().__len__()):
                ctr = 0
                for q in range(0, centroids.getListOfVectors()[0].__len__()):
                    if centroids.getListOfVectors()[z][q] != 0:
                        ctr += 1
                if ctr == 0:
                    centroidsTmp.getListOfVectors().append(centroids.getListOfVectors()[z])
                    listDTmp[0].append(listD[0][z])
                    listDTmp[1].append(listD[1][z])

                else:
                    numberOfExistingCluster.append(z)
                    # print z
                    # print assignment[z]
            print "listDTmp=" , listDTmp
            print "centroidsTmp= ", centroidsTmp.getListOfVectors()
            print "~\n\n\nnumberOfExistingCluster= " ,numberOfExistingCluster


            for q in range(0,centroidsTmp.getListOfVectors().__len__()):
                centroids.getListOfVectors().remove(centroidsTmp.getListOfVectors()[q])
                listD[0].remove(listDTmp[0][q])
                listD[1].remove(listDTmp[1][q])
                nrOFCluster -= 1
            print "\n\nExtracted valid data "
            print "#Data= ", data.getListOfVectors()
            print "#centroids= ",  centroids.getListOfVectors()
            print "#nrOFCluster" , nrOFCluster
            print "#assignment=", assignment
            print "#listD= ", listD
            print "\n\n"


            newData = Dataset([])
            newAssignment = []
            newCentroids = Dataset([])
            newNrOfCluster = -1
            newDlist = [[],[]]
            datVecList = newData.getListOfVectors()
            centVecList = newCentroids.getListOfVectors()

            nrOFClustertemp = nrOFCluster
            for q in range(0, numberOfExistingCluster.__len__()):
                print "@@ ListD in loop= " , listD
                print "@@ Data= ", data.getListOfVectors()
                print "@@ centr= " , centroids.getListOfVectors()
                print "@@ nrOFCluster" , nrOFCluster
                print "@@ assignment=", assignment
                print "@@ Q in RECONCILIATION LOOP= ", q
                print "\n"

                if countDiff(listD[0][q], listD[1][q]) < givenError:
                    # print "nrOFCluster= " , nrOFCluster
                    datatemp, centroidstemp, assignmenttemp, listDtemp = reconcile(data,centroids,nrOFCluster, assignment, listD, numberOfExistingCluster[q])
                    datVecList += datatemp.getListOfVectors()
                    centVecList += centroidstemp.getListOfVectors()
                    nrOFClustertemp -= 1
                    newAssignment += assignmenttemp
                    newDlist[0] += listDtemp[0]
                    newDlist[1] += listDtemp[1]
                    print " #########x" , datVecList
                    print " #########x" , centVecList
                    print " #########x" , newNrOfCluster
                    print " #########x" , newAssignment
                    print " #########x" , newDlist


                # print "\nData after reconc:"
                # print "newData=", newData.getListOfVectors()
                # print "newCentroids", newCentroids.getListOfVectors()
                # print "nrOFCluster" , nrOFClustertemp
                # print "newAssignment" ,newAssignment
                # print "\n"

            for k in range(0, newData.getListOfVectors().__len__()):
                data.getListOfVectors().remove(newData.getListOfVectors()[k])
                assignment.remove(newAssignment[k])

            print "\nData after DELETION="
            print "newData=", data.getListOfVectors()
            print "assignment " , assignment

            for q in range(0,newCentroids.getListOfVectors().__len__()):
                centroids.getListOfVectors().remove(newCentroids.getListOfVectors()[q])
                listD[0].remove(newDlist[0][q])
                listD[1].remove(newDlist[1][q])
            print "newCentroids", centroids.getListOfVectors()
            print "Dlist ", listD

            # centroids = newCentroids
            # listD = newDlist
            nrOFCluster = nrOFClustertemp
            # data = newData
            # assignment = newAssignment

        print "*******Loop counter= " ,loopCounter
        loopCounter += 1

        if (data.getListOfVectors().__len__() == 0 ):
            break