def _test_k_means(self, seed):
    """Return the average k-means clustering accuracy over repeated runs.

    The first 100 documents of ``self.model`` are split into two clusters
    with ``vector.kmeans``; each run is scored by how many documents whose
    ``type`` equals ``False`` end up together in one cluster.

    Args:
        seed: Forwarded unchanged as the ``seed`` argument of
            ``vector.kmeans``.

    Returns:
        The mean, over 30 runs, of ``max(c0, c1) * 2.0 / n`` where ``c0``
        and ``c1`` are the per-cluster counts of ``type == False`` documents.
    """
    runs = 30  # Single source of truth for both the loop and the average.
    n = 100
    # The sample does not change between runs, so slice the model once.
    documents = self.model[:n]
    # Vector id => document type, to look up the type of clustered vectors.
    types = {d.vector.id: d.type for d in documents}
    vectors = [d.vector for d in documents]
    scores = []
    for _ in range(runs):
        # Create two clusters of vectors.
        # A fresh list is passed on every run, as the original code did.
        clusters = vector.kmeans(list(vectors), k=2, seed=seed)
        # Count the documents with type == False in each cluster.
        # Ideally, one cluster holds all of them and the other holds none.
        # "== False" is deliberate: it keeps the original matching rules.
        first = sum(1 for v in clusters[0] if types[v.id] == False)  # noqa: E712
        second = sum(1 for v in clusters[1] if types[v.id] == False)  # noqa: E712
        # NOTE(review): the 2.0 / n scaling presumably assumes that half of
        # the n sampled documents have type == False -- confirm against the
        # test corpus.
        scores.append(max(first, second) * 2.0 / n)
    # Return the average accuracy over all runs.
    return sum(scores) / float(runs)