def kmedoidsWithScores(filenameData, filenameSilhMean, nameDBS, nameCHS,
                       kClusters, measure):
    """Run k-medoids on a data file and write three quality scores.

    Parameters
    ----------
    filenameData : str
        Name of the sample file located under ``root``.
    filenameSilhMean, nameDBS, nameCHS : str
        Output file names for the mean silhouette, Davies-Bouldin and
        Calinski-Harabasz scores respectively.
    kClusters : int
        Number of clusters to form.
    measure :
        Distance measure, forwarded to ``kmedoidsRun``.

    Silently does nothing when the data file does not exist.
    """
    # pathlib join is portable; the original str(root) + '\\' + name
    # concatenation only produced valid paths on Windows.
    path = pathlib.Path(root) / filenameData
    if path.is_file():
        data = read_sample(path)

        clusters, predicted = kmedoidsRun(data, kClusters, measure)

        # The same annotation accompanies every score entry.
        note = filenameData + " k: " + str(kClusters)

        meanSilhouetteScore = meanSilh(data, clusters)
        witTXT(meanSilhouetteScore, filenameSilhMean, filepath=root, note=note)

        dbsScore = dbs(data, predicted)
        witTXT(dbsScore, nameDBS, filepath=root, note=note)

        chsScore = chs(data, predicted)
        witTXT(chsScore, nameCHS, filepath=root, note=note)
Beispiel #2
0
def kmedoidsWithScores(filenameData, filenameSilhMean, filenameDBS,
                       filenameCHS, kClusters):
    """Cluster a sample file with k-medoids and log three quality scores."""
    sample = read_sample(str(root) + '\\' + filenameData)

    #kClusters = canoc(data, kmin, kmax)

    # Random initial medoids; the metric comes from module configuration.
    medoids = randomCenters(len(sample), kClusters)
    model = kmedoids(sample, medoids, metric=metricResearch)
    model.process()

    found_clusters = model.get_clusters()
    labels = model.predict(sample)

    # Mean silhouette over all points.
    pointScores = silhouette(sample, found_clusters).process().get_score()
    witTXT(np.mean(pointScores),
           filenameSilhMean,
           filepath=root,
           note='k: ' + str(kClusters))

    # Davies-Bouldin index.
    witTXT(dbs(sample, labels),
           filenameDBS,
           filepath=root,
           note='k: ' + str(kClusters))

    # Calinski-Harabasz index.
    witTXT(chs(sample, labels),
           filenameCHS,
           filepath=root,
           note='k: ' + str(kClusters))
Beispiel #3
0
def kFun(D, X):
    """Fitness of a binary-encoded cluster count over data ``D``.

    ``X`` is a bit vector (most significant bit first) decoding to an
    integer; the cluster count ``K`` is that integer plus one.  K random
    rows of ``D`` become centroids, each sample is assigned to its nearest
    centroid, and the Davies-Bouldin score of that labelling is returned.

    Returns 2 when K == 1 (DBS is undefined for a single cluster) and
    when the computed score is not positive.
    """
    m, n = np.shape(D)
    # Decode the bit string: X[0] carries the most significant bit.
    K = int(sum((2 ** i) * X[len(X) - 1 - i] for i in range(len(X)))) + 1
    if K == 1:
        return 2
    # Guard: the original rejection-sampling loop never terminated when
    # K exceeded the number of samples; clamp instead.
    K = min(K, m)
    # Pick K distinct sample indices as centroids in one call.
    U = D[random.sample(range(m), K), :]
    C = np.zeros(m)
    # Assign each sample to its nearest centroid.
    for i in range(m):
        p = 0
        minDistance = distance(D[i], U[0])
        for j in range(1, K):
            d = distance(D[i], U[j])
            if d < minDistance:
                p = j
                minDistance = d
        C[i] = p
    a = dbs(D, C)
    return a if a > 0 else 2
Beispiel #4
0
def kmediansWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS,
                      k_clusters, measure, kmin, kmax):
    """Run k-medians on a sample file and compute clustering quality scores."""
    sample = read_sample(str(root) + '\\' + nameData)

    # k-means++-style seeding for the initial medians.
    seeds = kppi(sample, k_clusters).initialize()
    model = kmedians(sample, seeds)
    model.process()

    found_clusters = model.get_clusters()
    #    final_medians = kmedians_instance.get_medians()

    labels = model.predict(sample)

    # Silhouette: per-point scores, reduced to their mean.
    pointScores = silhouette(sample, found_clusters).process().get_score()
    meanSilhouetteScore = np.mean(pointScores)
    #wlitCSV(silhouetteScore, filenameSilhouette, '', root)
    #witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root)

    dbsScore = dbs(sample, labels)
    #witCSV(dbsScore, nameDBS, '', root)

    chsScore = chs(sample, labels)
    #witCSV(chsScore, nameCHS, '', root)

    # Elbow method: estimate the most likely cluster count in [kmin, kmax].
    elbow_instance = elbow(sample, kmin, kmax)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount(
    )  # most probable amount of clusters
    wce = elbow_instance.get_wce()
def averFitness(func, X, K, number, maxIter):
    """Run ``func`` ``number`` times and report max/min/mean Davies-Bouldin.

    ``func(X, K, maxIter)`` must return (centroids, labels, iterations);
    only the labels are used for scoring.
    """
    scores = []
    for _ in range(number):
        _, labels, _ = func(X, K, maxIter)
        scores.append(dbs(X, labels))
    return max(scores), min(scores), sum(scores) / number
def kmeans(data, k, maxIter):
    """Plain k-means with random-range centers, tracking DBS per iteration.

    Returns ``(centroids, label, dbsList)`` where ``dbsList`` is padded to
    length 100 with its last value.  ``maxIter`` is accepted for interface
    compatibility but unused: iteration stops only on convergence.
    Clusters that lose all members are dropped (``k`` shrinks).
    """
    def _distance(p1, p2):
        """Return Euclidean distance between two points."""
        return np.sqrt(np.sum((p1 - p2) ** 2))

    def _rand_center(data, k):
        """Generate k centers within the per-feature range of the data."""
        n = data.shape[1]  # features
        centroids = np.zeros((k, n))
        for i in range(n):
            dmin, dmax = np.min(data[:, i]), np.max(data[:, i])
            centroids[:, i] = dmin + (dmax - dmin) * np.random.rand(k)
        return centroids

    def _converged(centroids1, centroids2):
        """Centroids unchanged (compared as sets of tuples) => converged."""
        set1 = set([tuple(c) for c in centroids1])
        set2 = set([tuple(c) for c in centroids2])
        return (set1 == set2)

    dbsList = [float('inf')]
    n = data.shape[0]  # number of entries
    centroids = _rand_center(data, k)
    # BUG FIX: np.int was removed in NumPy 1.24; the builtin int is the
    # documented replacement and is what np.int aliased.
    label = np.zeros(n, dtype=int)  # nearest-centroid index per sample
    assement = np.zeros(n)  # squared distance to the assigned centroid
    converged = False
    curIter = 0
    while not converged:
        curIter += 1
        old_centroids = np.copy(centroids)
        for i in range(n):
            # Determine the nearest centroid and track it with label.
            min_dist, min_index = np.inf, -1
            for j in range(k):
                dist = _distance(data[i], centroids[j])
                if dist < min_dist:
                    min_dist, min_index = dist, j
                    label[i] = j
            assement[i] = _distance(data[i], centroids[label[i]]) ** 2

        dbsList.append(dbs(data, label))
        # Update centroids; drop clusters that lost all their members.
        new_centroids = []
        for m in range(k):
            if len(data[label == m]) == 0:
                k -= 1
            else:
                centroids[m] = np.mean(data[label == m], axis=0)
                new_centroids.append(centroids[m])
        centroids = new_centroids
        converged = _converged(old_centroids, centroids)
    # Pad the score trace to a fixed length of 100 with the final value.
    dbsList = dbsList + [dbsList[-1] for _ in range(100 - len(dbsList))]
    print('dbsList', dbsList)
    return centroids, label, dbsList
def kcluster(rows, k, maxIter):
    """Cluster ``rows`` into ``k`` groups using BOA-seeded k-means.

    Initial centroids are decoded from the best position returned by
    ``BOAK`` (an external optimiser -- TODO confirm its contract), then
    refined by up to 100 assignment/update rounds.

    Returns ``(bestmatches, C, dbsList)``: per-cluster lists of row
    indices, the flat label array, and the Davies-Bouldin trace padded to
    length 100 with its last value.  ``maxIter`` is accepted but unused;
    the loop is hard-capped at 100 rounds.
    """
    m, n = np.shape(rows)
    #   # determine each feature's min and max to bound the random centers
    #   ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows]))
    #   for i in range(len(rows[0]))]

    m, dim = np.shape(rows)
    # NOTE(review): `pop` is read from module scope here -- confirm it is set.
    GbestScore, GbestPositon, Curve = BOAK(pop, k, rows)
    U = GbestPositon[0]
    # randomly build k centroids (superseded by the BOA seeding above)
    #   clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0]
    #   for i in range(len(rows[0]))] for j in range(k)]
    # Decode the flat best-position vector into k centroids of size dim.
    clusters = np.zeros([k, dim])
    for i in range(k):
        clusters[i] = U[i * dim:(i + 1) * dim]

    print('clusters', clusters)

    lastmatches = None
    # loop a fixed 100 times; adjust to taste for your data size

    dbsList = [float('inf')]
    C = np.zeros(m)
    for t in range(100):
        bestmatches = [[] for i in range(k)]

        # find the nearest centroid for every row
        for j in range(len(rows)):
            row = rows[j]
            bestmatch = 0
            for i in range(k):
                d = distance(clusters[i], row)
                if d < distance(clusters[bestmatch], row): bestmatch = i
            C[j] = bestmatch
            bestmatches[bestmatch].append(j)

        # if the assignment matches the previous round, we have converged
        if bestmatches == lastmatches: break
        lastmatches = bestmatches
        dbsList.append(dbs(rows, C))
        # move each centroid to the mean position of its members
        for i in range(k):
            avgs = [0.0] * len(rows[0])
            if len(bestmatches[i]) > 0:
                for rowid in bestmatches[i]:
                    for m in range(len(rows[rowid])):
                        avgs[m] += rows[rowid][m]
                for j in range(len(avgs)):
                    avgs[j] /= len(bestmatches[i])
                clusters[i] = avgs
    # pad the DBS trace to a fixed length of 100 with its last value
    dbsList = dbsList + [
        dbsList[len(dbsList) - 1] for i in range(100 - len(dbsList))
    ]
    print('dbsList', dbsList)
    return bestmatches, C, dbsList
def run_trial(X, labels, k):
    """Time a KMeans fit on ``X`` and compute five clustering scores.

    Returns ``[k, elapsed, ari, nmi, silhouette, vrc, dbs, errors]`` where
    ``errors`` is a double-quoted, semicolon-separated string of any
    exceptions raised while scoring (NaN is recorded for failed scores).
    """
    error_parts = []

    def _safe(scorer):
        # Evaluate one metric; record the failure and return NaN on error.
        try:
            return scorer()
        except Exception as e:
            error_parts.append(str(e) + '; ')
            return np.nan

    start = time()
    # NOTE(review): KMeans' n_jobs parameter was removed in scikit-learn
    # 1.0 -- drop it when upgrading past the currently pinned version.
    db = KMeans(k, n_jobs=12)
    pred_labels = db.fit_predict(X)
    elapsed = time() - start

    ari_score = _safe(lambda: ari(pred_labels, labels))
    nmi_score = _safe(
        lambda: nmi(pred_labels, labels, average_method='arithmetic'))
    ss_score = _safe(lambda: ss(X, pred_labels))
    vrc_score = _safe(lambda: vrc(X, pred_labels))
    dbs_score = _safe(lambda: dbs(X, pred_labels))

    errors = '"' + ''.join(error_parts) + '"'

    return [
        k, elapsed, ari_score, nmi_score, ss_score, vrc_score, dbs_score,
        errors
    ]
def kFun(D, X, K):
    """Fitness of a flat centroid encoding for ``K`` clusters over ``D``.

    ``X`` concatenates K centroids of dimension ``dim`` (row-major).  Each
    sample is assigned to its nearest centroid; the Davies-Bouldin score
    of the labelling is returned, or ``inf`` when every sample lands in a
    single cluster (DBS is undefined there).
    """
    m, dim = np.shape(D)
    # Decode the flat position vector into K centroids.
    U = np.zeros([K, dim])
    for i in range(K):
        U[i] = X[i * dim:(i + 1) * dim]
    C = np.zeros(m)
    # Assign every sample to its nearest centroid.
    for i in range(m):
        p = 0
        minDistance = distance(D[i], U[0])
        for j in range(1, K):
            # Compute each distance once instead of twice per comparison.
            d = distance(D[i], U[j])
            if d < minDistance:
                p = j
                minDistance = d
        C[i] = p
    if len(set(C)) == 1:
        return float('inf')
    return dbs(D, C)
    except Exception as e:
        print(e)
        ss_seu = str(np.nan)

    try:
        ss_cor = str(ss(X, labels, metric='correlation'))
    except Exception as e:
        print(e)
        ss_cor = str(np.nan)

    try:
        ss_cos = str(ss(X, labels, metric='cosine'))
    except Exception as e:
        print(e)
        ss_cos = str(np.nan)

    try:
        vrc_score = str(vrc(X, labels))
    except Exception as e:
        print(e)
        vrc_score = str(np.nan)

    try:
        dbs_score = str(dbs(X, labels))
    except Exception as e:
        print(e)
        dbs_score = str(np.nan)

    print(','.join(
        [sys.argv[1], ss_euc, ss_seu, ss_cor, ss_cos, vrc_score, dbs_score]))
        # 判断质心是否发生变化,如果发生变化则继续迭代,否则结束
        for i in range(K):
            newU[i] /= cnt[i]
            for j in range(n):
                if U[i, j] != newU[i, j]:
                    changed = 1
                    U[i, j] = newU[i, j]
        if changed == 0:
            return U, C, maxIter - curIter
    return U, C, maxIter - curIter


# Demo driver: run Kmeans on the preloaded `data` (3 clusters, at most 4
# iterations) and visualise the resulting assignment.
U, C, iter = Kmeans(data, 3, 4)

f1 = plt.figure(1)
plt.title('watermelon_4')
plt.xlabel('density')
plt.ylabel('ratio')
# Samples in green; final centroids in red (larger markers).
plt.scatter(data[:, 0], data[:, 1], marker='o', color='g', s=50)
plt.scatter(U[:, 0], U[:, 1], marker='o', color='r', s=100)
m, n = np.shape(data)
# Draw a dashed cyan line from each sample to its assigned centroid.
for i in range(m):
    plt.plot([data[i, 0], U[int(C[i]), 0]], [data[i, 1], U[int(C[i]), 1]],
             "c--",
             linewidth=0.3)
plt.show()

from sklearn.metrics import davies_bouldin_score as dbs

# Davies-Bouldin score of the final labelling (lower is better).
print(dbs(data, C))
# %%
def kmeans(data, K, maxIter):
    """BOA-seeded k-means that performs a single refinement pass.

    Centroids are decoded from the best position found by ``BOAK``, every
    sample is assigned to its nearest centroid once, and centroids are
    recomputed once -- the iterative refinement loop was deliberately
    disabled in the original and has been removed here as dead code.

    Returns ``(centroids, label, dbsList)``.  ``maxIter`` is unused.
    """
    m, dim = np.shape(data)
    k = K
    # NOTE(review): `pop` is read from module scope -- confirm it is defined.
    GbestScore, GbestPositon, Curve = BOAK(pop, k, data)
    U = GbestPositon[0]

    def _distance(p1, p2):
        """Return the Euclidean distance between two points."""
        return np.sqrt(np.sum(np.square(np.array(p1) - np.array(p2))))

    def _converged(centroids1, centroids2):
        """Centroids unchanged (compared as sets of tuples) => converged."""
        set1 = set([tuple(c) for c in centroids1])
        set2 = set([tuple(c) for c in centroids2])
        return (set1 == set2)

    dbsList = [float('inf')]
    n = data.shape[0]  # number of entries
    # Decode the flat BOA position vector into k centroids.
    centroids = np.zeros([k, dim])
    for i in range(k):
        centroids[i] = U[i * dim:(i + 1) * dim]
    # BUG FIX: np.int was removed in NumPy 1.24; builtin int is equivalent.
    label = np.zeros(n, dtype=int)  # nearest-centroid index per sample
    assement = np.zeros(n)  # squared distance to the assigned centroid
    converged = False
    old_centroids = np.copy(centroids)
    for i in range(n):
        # Determine the nearest centroid and track it with label.
        min_dist, min_index = np.inf, -1
        for j in range(k):
            dist = _distance(data[i], centroids[j])
            if dist < min_dist:
                min_dist, min_index = dist, j
                label[i] = j
        assement[i] = _distance(data[i], centroids[label[i]]) ** 2

    # Single centroid update; clusters that lost all members are dropped.
    dbsList.append(dbs(data, label))
    new_centroids = []
    for m in range(k):
        if len(data[label == m]) == 0:
            k -= 1
        else:
            centroids[m] = np.mean(data[label == m], axis=0)
            new_centroids.append(centroids[m])
    centroids = new_centroids
    converged = _converged(old_centroids, centroids)
    print('dbsList', dbsList)
    return centroids, label, dbsList
Beispiel #13
0
def run_trial(X, labels, eps, minPts, metric, V):
    """Run DBSCAN with the given parameters and score the clustering.

    Every score is computed twice: on all points, and on the "nn"
    (no-noise) subset with DBSCAN's -1 labels removed.  ``V`` is the
    variance vector required by the seuclidean metric.

    Returns ``[metric, eps, minPts, n_clust, perc_noise, elapsed,
    ari, nn_ari, nmi, nn_nmi, ss, nn_ss, vrc, nn_vrc, dbs, nn_dbs,
    errors]`` where ``errors`` is a double-quoted, semicolon-separated
    string of scoring failures (NaN is recorded for failed scores).
    """
    error_parts = []

    def _safe(scorer):
        # Evaluate one metric; record the failure and return NaN on error.
        try:
            return scorer()
        except Exception as e:
            error_parts.append(str(e) + '; ')
            return np.nan

    def _silhouette(data, preds):
        # Silhouette must use the same metric (and V) as the clustering.
        if metric == 'seuclidean':
            return ss(data, preds, metric=metric, V=V)
        return ss(data, preds, metric=metric)

    start = time()
    # Keyword arguments: scikit-learn made estimator parameters after the
    # first keyword-only, so positional (eps, minPts) breaks on >=0.25.
    if metric == 'seuclidean':
        db = DBSCAN(eps=eps,
                    min_samples=minPts,
                    metric=metric,
                    metric_params={'V': V},
                    n_jobs=6)
    else:
        db = DBSCAN(eps=eps, min_samples=minPts, metric=metric, n_jobs=6)
    pred_labels = db.fit_predict(X)
    elapsed = time() - start
    perc_noise = np.sum(pred_labels == -1) / len(pred_labels)
    n_clust = pred_labels.max()

    # Remove noisy points for the no-noise score variants.
    clean_idx = np.where(pred_labels != -1)
    nn_preds = pred_labels[clean_idx]
    nn_labels = labels[clean_idx]
    nn_X = X[clean_idx]

    ari_score = _safe(lambda: ari(pred_labels, labels))
    nmi_score = _safe(
        lambda: nmi(pred_labels, labels, average_method='arithmetic'))
    ss_score = _safe(lambda: _silhouette(X, pred_labels))
    vrc_score = _safe(lambda: vrc(X, pred_labels))
    dbs_score = _safe(lambda: dbs(X, pred_labels))

    nn_ari_score = _safe(lambda: ari(nn_preds, nn_labels))
    nn_nmi_score = _safe(
        lambda: nmi(nn_preds, nn_labels, average_method='arithmetic'))
    nn_ss_score = _safe(lambda: _silhouette(nn_X, nn_preds))
    nn_vrc_score = _safe(lambda: vrc(nn_X, nn_preds))
    nn_dbs_score = _safe(lambda: dbs(nn_X, nn_preds))

    errors = '"' + ''.join(error_parts) + '"'

    return [
        metric, eps, minPts, n_clust, perc_noise, elapsed, ari_score,
        nn_ari_score, nmi_score, nn_nmi_score, ss_score, nn_ss_score,
        vrc_score, nn_vrc_score, dbs_score, nn_dbs_score, errors
    ]
Beispiel #14
0

def kmedoidsWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    """Run k-medoids on a sample file and compute clustering quality scores.

    The cluster count is chosen by ``canoc`` over ``[kmin, kmax]``; the
    ``k_clusters`` and ``measure`` arguments are currently unused but kept
    for interface compatibility with the sibling functions.
    """
    # BUG FIX: the original read the module-level `filenameData` instead
    # of the `nameData` parameter, and mixed a tab with spaces on this
    # line (an IndentationError at import time).
    data = read_sample(str(root) + '\\' + nameData)

    kClusters = canoc(data, kmin, kmax)

    initial_medoids = rci(data, kClusters).initialize()

    kmedoids_instance = kmedoids(data, initial_medoids)
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    predicted = kmedoids_instance.predict(data)

    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    #wlitCSV(silhouetteScore, filenameSilhouette, '', root)
    #witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root)

    dbsScore = dbs(data, predicted)
    #witCSV(dbsScore, nameDBS, '', root)

    chsScore = chs(data, predicted)
    #witCSV(chsScore, nameCHS, '', root)

    # elbow_instance = elbow(data, kmin, kmax)
    # elbow_instance.process()
    # amount_clusters = elbow_instance.get_amount()  # most probable amount of clusters
    # wce = elbow_instance.get_wce()

# Module-level driver: score k-medoids using the globally configured
# data set, output names and clustering parameters.
kmedoidsWithScore(filenameData, filenameSilhouetteMean, filenameDBS, filenameCHS, k, metric, k_min, k_max)
def Kmeans(D, K, maxIter):
    """Classic k-means over ``D`` with random-range initial centers.

    Returns ``(U, C, iterations_used)``: the centroid matrix, the flat
    label array, and how many iterations ran.  As a degenerate case,
    when ``K >= len(D)`` the data itself is returned unchanged.
    """
    m, n = np.shape(D)
    if K >= m:
        return D

    def _rand_center(data, k):
        """Generate k centers within the per-feature range of the data."""
        n = data.shape[1]  # features
        centroids = np.zeros((k, n))
        for i in range(n):
            dmin, dmax = np.min(data[:, i]), np.max(data[:, i])
            centroids[:, i] = dmin + (dmax - dmin) * np.random.rand(k)
        return centroids

    U = _rand_center(D, K)

    C = np.zeros(m)
    curIter = maxIter  # iterations remaining
    dbsList = [float('inf')]
    while curIter > 0:
        curIter -= 1
        # Assign every sample to its nearest centroid.
        for i in range(m):
            p = 0
            minDistance = distance(D[i], U[0])
            for j in range(1, K):
                # Compute each distance once instead of twice per test.
                d = distance(D[i], U[j])
                if d < minDistance:
                    p = j
                    minDistance = d
            C[i] = p
        # Accumulate per-cluster coordinate sums and member counts.
        newU = np.zeros((K, n))
        cnt = np.zeros(K)
        for i in range(m):
            newU[int(C[i])] = newU[int(C[i])] + D[i]
            cnt[int(C[i])] += 1
        dbsList.append(dbs(D, C))
        changed = 0
        print('newU', newU)
        print('cnt', cnt)
        # Recompute centroids; stop when none of them moved.
        for i in range(K):
            if cnt[i] == 0:
                # BUG FIX: an empty cluster made the original divide by
                # zero and fill the centroid with NaN; keep the old one.
                continue
            newU[i] /= cnt[i]
            for j in range(n):
                if U[i, j] != newU[i, j]:
                    changed = 1
                    U[i, j] = newU[i, j]
        if changed == 0:
            return U, C, maxIter - curIter
    return U, C, maxIter - curIter