def findCenter(vector, centroids):
    """Return the index of the centroid closest to ``vector``.

    Closeness is measured with ``Vectors.squared_distance``. When several
    centroids are equally close, the first one encountered wins.

    Args:
        vector: The point to assign.
        centroids: Iterable of candidate cluster centres.

    Returns:
        The zero-based position of the nearest centroid in ``centroids``,
        or ``-1`` when ``centroids`` is empty.
    """
    # float("inf") instead of sys.maxint: the latter is gone in Python 3.
    best_dist = float("inf")
    cluster = -1
    for i, center in enumerate(centroids):
        # Compute the distance once and reuse it for the comparison and
        # the update.
        d = Vectors.squared_distance(center, vector)
        if d < best_dist:
            best_dist = d
            cluster = i
    return cluster
def diferencia_minima(self, x):
    """Return the index of the centroid nearest to ``x``.

    Distances are computed with ``Vectors.squared_distance`` against
    ``self.centroides[0] .. self.centroides[self.k - 1]``. Ties are
    resolved in favour of the lowest index.

    Args:
        x: The point to classify.

    Returns:
        The index (``0 <= index < self.k``) of the closest centroid.

    Raises:
        ValueError: If ``self.k`` is smaller than 1, i.e. there is no
            centroid to compare against.
    """
    if self.k < 1:
        raise ValueError("diferencia_minima requires at least one centroid")

    llave_centroide = 0
    dist_minima = Vectors.squared_distance(x, self.centroides[0])
    for centroide in range(1, self.k):
        dist = Vectors.squared_distance(x, self.centroides[centroide])
        if dist < dist_minima:
            # Track the running minimum; without this update the function
            # would return the last centroid closer than centroid 0 rather
            # than the closest one overall.
            dist_minima = dist
            llave_centroide = centroide
    return llave_centroide
def silhoutte(point, err, num_clusters):
    """Compute a silhouette-style score for one labelled point.

    Args:
        point: ``(vector, cluster_index)`` pair to score.
        err: Iterable of ``(vector, cluster_index)`` pairs that the point
            is compared against.
        num_clusters: Number of clusters; every cluster index must lie in
            ``range(num_clusters)``.

    Returns:
        ``(b - a) / max(b, a)`` where ``a`` is the mean squared distance
        from ``point`` to the entries of ``err`` in its own cluster and
        ``b`` is the smallest such mean over the other clusters that have
        at least one entry. Returns ``0.0`` when no other cluster has any
        entry, or when both ``a`` and ``b`` are zero.

    Raises:
        ZeroDivisionError: If ``err`` holds no entry for the point's own
            cluster.
    """
    own = point[1]
    avg = [0.0] * num_clusters
    avgi = [0] * num_clusters
    for er in err:
        avg[er[1]] += Vectors.squared_distance(point[0], er[0])
        avgi[er[1]] += 1

    a = avg[own] / avgi[own]

    # Clusters without any entry are skipped: they have no mean distance
    # and would otherwise trigger a division by zero.
    # float("inf") stands in for sys.maxint, which Python 3 removed.
    b = float("inf")
    for i in range(num_clusters):
        if i != own and avgi[i] > 0:
            mean_dist = avg[i] / avgi[i]
            if mean_dist < b:
                b = mean_dist

    if b == float("inf"):
        # No neighbouring cluster to compare with.
        return 0.0
    denom = max(b, a)
    if denom == 0:
        # All compared points coincide with ``point``.
        return 0.0
    return (b - a) / denom
def runSequential(points, k):
    """Greedily pick ``k`` points by repeatedly taking the farthest pair.

    In each of ``k // 2`` rounds the two not-yet-selected points with the
    largest ``Vectors.squared_distance`` between them are added to the
    result and removed from the candidate pool. When ``k`` is odd, one
    more still-unselected point is added, chosen by scanning cyclically
    from a random start index.

    Args:
        points: Sequence of vectors to choose from.
        k: Number of points to select.

    Returns:
        ``points`` itself when ``k >= len(points)``; otherwise a new list
        with the selected points in selection order.
    """
    n = len(points)
    if k >= n:
        return points

    result = []
    candidates = np.full(n, True)
    for _ in range(int(k // 2)):
        # Start below any real distance so that a pair of coincident
        # points (distance 0) can still be selected.
        maxDist = -1.0
        maxI = -1
        maxJ = -1
        for i in range(n):
            if not candidates[i]:
                continue
            # The distance is symmetric, so only j > i needs checking.
            for j in range(i + 1, n):
                # Both ends of the pair must still be available; otherwise
                # an already selected point could be chosen a second time.
                if not candidates[j]:
                    continue
                d = Vectors.squared_distance(points[i], points[j])
                if d > maxDist:
                    maxDist = d
                    maxI = i
                    maxJ = j
        if maxI < 0:
            # No comparable pair was found (e.g. every distance is NaN).
            break
        result.append(points[maxI])
        result.append(points[maxJ])
        candidates[maxI] = False
        candidates[maxJ] = False

    if k % 2 != 0:
        s = np.random.randint(n)
        for i in range(n):
            j = (i + s) % n
            if candidates[j]:
                # Append the point whose availability was just checked.
                result.append(points[j])
                break
    return result
# Input into the algorithm: train k-means and label every vector with the
# index of its closest centre.
km = KMeans()
kme = km.train(vector_df, k=num_clusters, maxIterations=10, seed=2018)
centers = kme.clusterCenters
err = vector_df.map(lambda x: (x[0], findCenter(x[0], centers))).collect()

# Silhouette value comparison: average the per-point score over all
# collected points. This compares every point with every other point, so
# the cost grows quadratically with len(err).
ag = 0
agi = 0
for er in err:
    avg = [0.0] * num_clusters
    avgi = [0] * num_clusters
    for e in err:
        avg[e[1]] += Vectors.squared_distance(er[0], e[0])
        avgi[e[1]] += 1

    # Mean squared distance to the point's own cluster (never empty here,
    # because the point itself is part of err).
    a = avg[er[1]] / avgi[er[1]]

    # Smallest mean squared distance to any other non-empty cluster.
    # float("inf") replaces sys.maxint, which Python 3 removed; clusters
    # with no points are skipped to avoid dividing by zero.
    b = float("inf")
    for i in range(len(avg)):
        if i != er[1] and avgi[i] > 0:
            if avg[i] / avgi[i] < b:
                b = avg[i] / avgi[i]

    if b != float("inf") and max(b, a) != 0:
        ag += (b - a) / max(b, a)
    # Otherwise the point contributes 0: either there is no other cluster
    # to compare with, or all compared points coincide.
    agi += 1

sil = (ag / agi)
print(sil)
# Number of points in each cluster
def distance(p1, p2):
    """Return the Euclidean distance between ``p1`` and ``p2``.

    The value is the NumPy square root of
    ``Vectors.squared_distance(p1, p2)``.
    """
    squared = Vectors.squared_distance(p1, p2)
    return np.sqrt(squared)
realCenters = [] with open('/home/ronald/centers.csv', 'r') as f: csvReader = csv.DictReader(f) for row in csvReader: center = [] for i in row: center.append(row[i]) realCenters.append(Vectors.dense(center)) perm = list(permutations([i for i in range(8)])) totalDist = [] for i in perm: dist = 0 for j in range(len(i)): dist += Vectors.squared_distance(modelCenters[j], realCenters[i[j]]) totalDist.append(dist) ref = [] minIndex, minValue = min(enumerate(totalDist), key=operator.itemgetter(1)) ref = perm[minIndex] # dataPoint = [] correct = 0 incorrect = 0 with open('/home/ronald/data.csv', 'r') as f: csvReader = csv.DictReader(f) for row in csvReader: data = [] for i in row: if i != 'target':
modelCenters = model.clusterCenters

# Load the reference centres, one dense vector per CSV row.
realCenters = []
with open('/home/ronald/centers.csv', 'r') as f:
    csvReader = csv.DictReader(f)
    for row in csvReader:
        # Follow the header order explicitly so the vector components do
        # not depend on the iteration order of the row mapping.
        center = [float(row[name]) for name in csvReader.fieldnames]
        realCenters.append(Vectors.dense(center))

# distTable[m][r] is the squared distance between model centre m and
# reference centre r.
distTable = [
    [Vectors.squared_distance(i, j) for j in realCenters]
    for i in modelCenters
]

# ref[m] is the index of the reference centre nearest to model centre m.
# NOTE(review): two model centres may map to the same reference centre;
# nothing here enforces a one-to-one assignment.
ref = []
for distRow in distTable:
    minIndex, minValue = min(enumerate(distRow), key=operator.itemgetter(1))
    ref.append(minIndex)

correct = 0
incorrect = 0
with open('/home/ronald/data.csv', 'r') as f:
    csvReader = csv.DictReader(f)
    for row in csvReader:
        data = []