def update(self):
    """
    Promote the pending dataset to the covered one and re-position the mean.

    Returns False, after replacing ``nextDataset`` with a fresh empty
    normalized Dataset, when the pending dataset compares equal to the
    currently covered one. Otherwise a copy of ``nextDataset`` becomes
    ``coveredDataset``, ``position`` is set to that copy's ``median()`` and
    True is returned.
    """
    assignmentUnchanged = self.coveredDataset == self.nextDataset
    if not assignmentUnchanged:
        # Snapshot the pending points so later cover() calls on nextDataset
        # do not alter what this mean currently covers.
        snapshot = self.nextDataset.copy()
        self.coveredDataset = snapshot
        self.position = snapshot.median()
        return True

    # Nothing moved: only reset the pending dataset.
    self.nextDataset = Dataset(normalized=True, data=[])
    return False
def __init__(self, id, dimensions, key=lambda x: x):
    """
    Set up a mean with a random starting position and no covered points.

    id         - Identifier stored on the mean
    dimensions - Number of coordinates of the position; when None the
                 position starts as an empty list
    key        - Callable stored as ``self.key`` (identity by default)
    """
    self.id = id
    self.dimensions = dimensions
    self.key = key

    # One random.random() coordinate per dimension, or no coordinates at
    # all when the dimension count is unknown.
    self.position = (
        []
        if self.dimensions is None
        else [random.random() for _ in range(dimensions)]
    )

    self.coveredDataset = None
    self.nextDataset = Dataset(normalized=True, data=[])

    # Error-cache state.
    self.dirt = False
    self.meanSquaredError = None
    self.totalSquaredError = None
def getOffsetsGrouped(self, groups, image, kpsA, kpsB, matches, status):
    """
    Cluster the offsets returned by ``self.getOffsets`` into ``groups`` means.

    The offsets are wrapped in a Dataset (kept on ``self.offsetsDataset``)
    and handed to a BisectingKmeans run with 5 trials and 10 max rounds.
    Only elements 0, 2 and 3 of each offset are used as the clustering key.
    ``image`` is accepted but not used here.

    Returns the ``means`` attribute of the finished BisectingKmeans.
    """
    self.offsetsDataset = Dataset(
        data=self.getOffsets(kpsA, kpsB, matches, status))

    def clusteringKey(offset):
        # Element 1 of the offset is deliberately left out of the key.
        return [offset[0], offset[2], offset[3]]

    clusterer = BisectingKmeans(
        dataset=self.offsetsDataset,
        k=groups,
        trials=5,
        maxRounds=10,
        key=clusteringKey,
    )
    clusterer.run()
    return clusterer.means
class Mean():
    """
    A mean of the k-means.

    Holds a position, the dataset it currently covers (``coveredDataset``)
    and the dataset being collected for the next round (``nextDataset``).
    Squared-error figures over ``coveredDataset`` are cached and refreshed
    together whenever the ``dirt`` flag is set or a cached value is missing.
    """

    def __init__(self, id, dimensions, key=lambda x: x):
        """
        id         - Identifier stored on the mean (used by __repr__)
        dimensions - Number of coordinates of the position; when None the
                     position starts as an empty list
        key        - Callable applied to a point before measuring its
                     distance to the position (identity by default)
        """
        self.id = id
        self.dimensions = dimensions
        self.key = key
        if self.dimensions is None:
            self.position = []
        else:
            self.position = [random.random() for _ in range(dimensions)]
        self.coveredDataset = None
        self.nextDataset = Dataset(normalized=True, data=[])
        # Cache state for the squared-error getters.
        self.dirt = False
        self.meanSquaredError = None
        self.totalSquaredError = None

    def update(self):
        """
        Promote the pending dataset to the covered one.

        Returns False (after resetting ``nextDataset``) when the pending
        dataset equals the covered one; otherwise stores a copy of it as
        ``coveredDataset``, moves ``position`` to its ``median()`` and
        returns True.
        """
        if self.coveredDataset == self.nextDataset:
            self.nextDataset = Dataset(normalized=True, data=[])
            return False
        self.coveredDataset = self.nextDataset.copy()
        self.position = self.coveredDataset.median()
        return True

    def clear(self):
        """Replace ``nextDataset`` with a fresh empty normalized Dataset."""
        self.nextDataset = Dataset(normalized=True, data=[])

    def cover(self, point):
        """Append ``point`` to ``nextDataset`` and mark the caches dirty."""
        self.nextDataset.append(point)
        self.dirt = True

    def __repr__(self):
        return "<Mean id: %i position:%s dataset:%s>" % (
            self.id, str(self.position), repr(self.coveredDataset))

    def distanceSqrd(self, point):
        """
        Return the squared Euclidean distance between ``key(point)`` and the
        position, summed over the coordinates of ``key(point)``.
        """
        projected = self.key(point)
        position = self.position
        # Indexing (rather than zip) keeps the IndexError when the position
        # has fewer coordinates than the projected point.
        return sum((value - position[i]) ** 2
                   for i, value in enumerate(projected))

    def _refreshErrors(self):
        """Recompute both cached errors together when either may be stale."""
        upToDate = (not self.dirt
                    and self.totalSquaredError is not None
                    and self.meanSquaredError is not None)
        if upToDate:
            return
        # A mean that has not covered anything yet is treated as empty.
        points = self.coveredDataset if self.coveredDataset is not None else []
        total = sum(self.distanceSqrd(point) for point in points)
        count = len(points)
        self.totalSquaredError = total
        self.meanSquaredError = total / count if count else float('inf')
        # Cleared only after BOTH values are fresh, so neither getter can
        # hand back a value computed for a previous state.
        self.dirt = False

    def getMeanSquaredError(self, key=lambda x: x):
        """
        Return the mean squared distance of the covered points to the
        position, or ``float('inf')`` when nothing is covered.

        ``key`` is accepted for interface compatibility and is not used.
        """
        if self.coveredDataset is None or len(self.coveredDataset) == 0:
            return float('inf')
        self._refreshErrors()
        return self.meanSquaredError

    def getTotalSquaredError(self, key=lambda x: x):
        """
        Return the sum of squared distances of the covered points to the
        position (0 when nothing is covered).

        ``key`` is accepted for interface compatibility and is not used.
        """
        self._refreshErrors()
        return self.totalSquaredError

    def getCoveredDataset(self, limits=None, normalized=True):
        """
        Return ``coveredDataset.genUnnormalized(limits)`` when ``normalized``
        is true, otherwise the stored ``coveredDataset`` itself.
        """
        if normalized:
            return self.coveredDataset.genUnnormalized(limits)
        return self.coveredDataset
def clear(self):
    """Drop the pending points by installing a fresh empty normalized Dataset."""
    emptyPending = Dataset(normalized=True, data=[])
    self.nextDataset = emptyPending
def __init__(self, dataset, k, trials, maxRounds, key=lambda x: x):
    """
    Store the configuration of a k-means experiment.

    dataset   - The dataset to cluster
    k         - The number of means
    trials    - How many independent executions are performed
    maxRounds - The maximum number of iterations of each execution
    key       - Callable forwarded to every KmeansSolution
    """
    self.dataset, self.k = dataset, k
    self.trials, self.maxRounds = trials, maxRounds
    self.key = key


def run(self):
    """Build one KmeansSolution per trial and sort them by meanSquaredError."""
    trialResults = []
    for _ in range(self.trials):
        trialResults.append(
            KmeansSolution(self.dataset, self.k, self.maxRounds,
                           key=self.key))
    self.solutions = trialResults
    # Ascending order, so the lowest-error solution ends up first.
    self.solutions.sort(key=lambda candidate: candidate.meanSquaredError)


def getBestSolution(self):
    """Return the first entry of ``self.solutions``."""
    best = self.solutions[0]
    return best


def showResults(self):
    """Print every stored solution, one per line."""
    for candidate in self.solutions:
        print(candidate)


if __name__ == "__main__":
    # Tiny smoke run on a two-point dataset.
    ds = Dataset(data=[[0, 0], [1, 1]])
    k = Kmeans(dataset=ds, k=2, trials=5, maxRounds=3)
    k.run()
    k.showResults()
# Iris demo script: load 'iris.data', run and show a KneeFinder over the
# dataset, then run a two-way BisectingKmeans and show its means.
from kmeans.dataset import Dataset
from kmeans.bisectingKmeans import BisectingKmeans
from utils import DataLoader, MeansVisualizer, KneeFinder

# Field descriptors passed to both DataLoader and MeansVisualizer: four
# name-only fields followed by the class field with its label-to-int table.
irisFields = [
    {'name': fieldName}
    for fieldName in (
        'sepal length',
        'sepal width',
        'petal length',
        'petal width',
    )
] + [{
    'name': 'class',
    'types': {
        'Iris-setosa': 0,
        'Iris-versicolor': 1,
        'Iris-virginica': 2,
    },
}]

loader = DataLoader('iris.data', irisFields)
ds = Dataset(data=loader.data)

kf = KneeFinder(dataset=ds, krange=[1, 10], trials=50, maxRounds=100)
kf.run()
kf.show()

bisection = BisectingKmeans(dataset=ds, k=2, trials=30, maxRounds=100)
bisection.run()

visualizer = MeansVisualizer(bisection.means, irisFields)
visualizer.show()
worstDataset = worstCluster.coveredDataset bisection = Kmeans(dataset=worstDataset, k=2, trials=self.trials, maxRounds=self.trials, key=self.key) bisection.run() bisectionSolution = bisection.getBestSolution() self.means += bisectionSolution.means worstCluster = max(self.means, key=lambda m: m.getMeanSquaredError()) # if the number of means is not enouth remove the worst cluster # found to bisect it in the next iteration. if len(self.means) < self.k: self.means.remove(worstCluster) self.setMeanSquaredError() def showResults(self): print('\n\n'.join([str(mean) for mean in self.means])) if __name__ == "__main__": ds = Dataset(data=[[0, 0], [1, 1], [0.9, 0.9], [0.5, 0.5]]) bisection = BisectingKmeans(dataset=ds, k=4, trials=20, maxRounds=3) bisection.run() bisection.showResults()