def findCenter(vector, centroids):
    """Return the index of the centroid closest to ``vector``.

    Closeness is measured with ``Vectors.squared_distance``. When several
    centroids are equally close, the earliest one wins.

    Args:
        vector: The point to classify; passed as the second argument to
            ``Vectors.squared_distance``.
        centroids: Iterable of candidate centers.

    Returns:
        The zero-based position of the nearest centroid within
        ``centroids``, or -1 when ``centroids`` is empty (or no centroid
        yields a distance below infinity).
    """
    # float("inf") replaces sys.maxint, which does not exist on Python 3 and
    # could also be exceeded by a very large squared distance.
    best_dist = float("inf")
    cluster = -1
    for i, center in enumerate(centroids):
        # Compute the distance once per centroid instead of twice.
        d = Vectors.squared_distance(center, vector)
        if d < best_dist:
            best_dist = d
            cluster = i
    return cluster
Exemple #2
0
 def diferencia_minima(self, x):
     """Return the index of the centroid in ``self.centroides`` nearest to ``x``.

     Distances are measured with ``Vectors.squared_distance``; on ties the
     lowest index is kept.

     Args:
         x: The point to assign to a centroid.

     Returns:
         An index in ``range(self.k)`` identifying the closest centroid.

     Raises:
         ValueError: If ``self.k`` is not positive, i.e. there is no
             centroid to compare against.
     """
     if self.k <= 0:
         raise ValueError("diferencia_minima requires at least one centroid")

     # Start from centroid 0 and scan the remaining ones.
     llave_centroide = 0
     dist_minima = Vectors.squared_distance(x, self.centroides[0])
     for centroide in range(1, self.k):
         dist = Vectors.squared_distance(x, self.centroides[centroide])
         if dist_minima > dist:
             # The running minimum must be updated together with the key;
             # otherwise every later centroid is only compared against
             # centroid 0 and the last "closer than 0" one wins.
             dist_minima = dist
             llave_centroide = centroide
     return llave_centroide
def silhoutte(point, err, num_clusters):
    """Return a silhouette-style score for a single labelled point.

    For each cluster, the mean ``Vectors.squared_distance`` from ``point``
    to the members of that cluster is computed. ``a`` is the mean for the
    point's own cluster, ``b`` is the smallest mean among the other
    non-empty clusters, and the score is ``(b - a) / max(b, a)``.

    Args:
        point: Indexable whose item 0 is the vector and item 1 is its
            cluster index.
        err: Iterable of entries with the same layout as ``point``; these
            are the labelled points the means are taken over.
        num_clusters: Number of clusters; every cluster index must lie in
            ``range(num_clusters)``.

    Returns:
        The score as a float. 0.0 is returned when the score is undefined:
        the point's own cluster has no members in ``err``, no other cluster
        has members, or both means are zero.
    """
    # NOTE(review): as in the original code, squared distances are used and
    # ``point`` itself (if present in ``err``) counts towards ``a``; the
    # textbook silhouette excludes the point and uses plain distances.
    totals = [0.0] * num_clusters
    counts = [0] * num_clusters
    for er in err:
        totals[er[1]] += Vectors.squared_distance(point[0], er[0])
        counts[er[1]] += 1

    own = point[1]
    if counts[own] == 0:
        # No members to average over; avoid dividing by zero.
        return 0.0
    a = totals[own] / counts[own]

    # float("inf") replaces sys.maxint (removed in Python 3). Empty
    # clusters are skipped so they cannot trigger a ZeroDivisionError.
    b = float("inf")
    for i in range(num_clusters):
        if i != own and counts[i] > 0:
            b = min(b, totals[i] / counts[i])

    if b == float("inf"):
        # There is no other non-empty cluster to compare against.
        return 0.0
    denom = max(b, a)
    if denom == 0:
        return 0.0
    return (b - a) / denom
Exemple #4
0
def runSequential(points, k):
    """Greedily select ``k`` mutually distant points from ``points``.

    The farthest pair (by ``Vectors.squared_distance``) among the points
    not yet selected is picked repeatedly, ``k // 2`` times. When ``k`` is
    odd, one more not-yet-selected point is added, chosen by scanning from
    a random starting index.

    Args:
        points: Sequence of vectors accepted by ``Vectors.squared_distance``.
        k: Number of points to select.

    Returns:
        ``points`` itself when ``k >= len(points)``; an empty list when
        ``k <= 0``; otherwise a new list with ``k`` distinct entries of
        ``points``.
    """
    n = len(points)
    if k >= n:
        return points
    if k <= 0:
        return []

    result = []
    candidates = np.full(n, True)

    for _ in range(int(k // 2)):
        # Start below any possible distance so that a pair is still chosen
        # when all remaining points coincide (distance 0).
        maxDist = -1.0
        maxI = -1
        maxJ = -1
        for i in range(n):
            if not candidates[i]:
                continue
            # Only pair with later, still-available points: the distance is
            # symmetric, and an already selected point must not be reused.
            for j in range(i + 1, n):
                if not candidates[j]:
                    continue
                d = Vectors.squared_distance(points[i], points[j])
                if d > maxDist:
                    maxDist = d
                    maxI = i
                    maxJ = j
        result.append(points[maxI])
        result.append(points[maxJ])
        candidates[maxI] = False
        candidates[maxJ] = False

    if k % 2 != 0:
        # Random rotation of the scan order so the extra point is not
        # always the lowest-indexed remaining candidate.
        s = np.random.randint(n)
        for i in range(n):
            j = (i + s) % n
            if candidates[j]:
                # Append the candidate that was actually tested (index j),
                # not points[i], which may already have been selected.
                result.append(points[j])
                break

    return result
# Input into the algorithm: fit k-means on vector_df and pair every vector
# with the index of its nearest learned center.
km = KMeans()
kme = km.train(vector_df, k = num_clusters, maxIterations = 10, seed=2018)
centers = kme.clusterCenters

err = vector_df.map(lambda x:(x[0], findCenter(x[0], centers))).collect()

# Silhouette value comparison: for every point, compare the mean squared
# distance to its own cluster (a) with the smallest mean squared distance
# to any other non-empty cluster (b), then average (b - a) / max(b, a).
ag = 0
agi = 0
for er in err:
    avg = [0] * num_clusters
    avgi = [0] * num_clusters
    for e in err:
        avg[e[1]] += Vectors.squared_distance(er[0], e[0])
        avgi[e[1]] += 1
    # er is itself part of err, so its own cluster has at least one member.
    a = avg[er[1]] / avgi[er[1]]
    # float("inf") replaces sys.maxint, which was removed in Python 3.
    b = float("inf")
    for i in range(len(avg)):
        # Skip the point's own cluster and clusters with no members, which
        # would otherwise raise ZeroDivisionError.
        if i != er[1] and avgi[i] > 0:
            b = min(b, avg[i] / avgi[i])
    denom = max(b, a)
    # A point contributes 0 when no other cluster has members or when both
    # means are zero; in those cases the ratio is undefined.
    if b != float("inf") and denom > 0:
        ag += (b - a) / denom
    agi += 1

# Guard against an empty data set.
sil = (ag / agi) if agi else 0.0

print(sil)

# Number of points in each cluster
Exemple #6
0
def distance(p1, p2):
    """Return the square root of ``Vectors.squared_distance(p1, p2)``."""
    squared = Vectors.squared_distance(p1, p2)
    return np.sqrt(squared)
Exemple #7
0
# Load the reference centers: one dense vector per CSV row, built from the
# row's values in column order.
realCenters = []
with open('/home/ronald/centers.csv', 'r') as f:
    csvReader = csv.DictReader(f)
    for row in csvReader:
        realCenters.append(Vectors.dense([row[column] for column in row]))

# Every ordering of the eight center indices.
perm = list(permutations(range(8)))

# Cost of an ordering: the summed squared distance between model center j
# and the real center that the ordering places at position j.
totalDist = [
    sum(
        Vectors.squared_distance(modelCenters[pos], realCenters[real_idx])
        for pos, real_idx in enumerate(ordering)
    )
    for ordering in perm
]

# ref is the cheapest ordering, i.e. the model-to-real center assignment.
minIndex, minValue = min(enumerate(totalDist), key=operator.itemgetter(1))
ref = perm[minIndex]

# dataPoint = []
correct = 0
incorrect = 0
with open('/home/ronald/data.csv', 'r') as f:
    csvReader = csv.DictReader(f)
    for row in csvReader:
        data = []
        for i in row:
            if i != 'target':
Exemple #8
0
modelCenters = model.clusterCenters

# Load the reference centers: one dense vector per CSV row, built from the
# row's values in column order.
realCenters = []
with open('/home/ronald/centers.csv', 'r') as f:
    csvReader = csv.DictReader(f)
    for row in csvReader:
        realCenters.append(Vectors.dense(list(row.values())))

# distTable[m][r] is the squared distance between model center m and real
# center r.
distTable = [
    [Vectors.squared_distance(fitted, actual) for actual in realCenters]
    for fitted in modelCenters
]

# For each model center, record the index of its closest real center.
ref = []
for distRow in distTable:
    minIndex, minValue = min(enumerate(distRow), key=operator.itemgetter(1))
    ref.append(minIndex)

# dataPoint = []
correct = 0
incorrect = 0
with open('/home/ronald/data.csv', 'r') as f:
    csvReader = csv.DictReader(f)
    for row in csvReader:
        data = []