Example 1

import copy
import math

import numpy as np
from scipy import stats
from sklearn.neighbors import KernelDensity

import newrb              # project module providing designrb (RBF network builder)
import PublicFunctions    # project module with distance and drawing helpers
def PSOClusteringAlgorithm(datapoints, dataname, run):
    """---------------------------------------------------------------------------------------
    :function:  PSO clustering algorithm
    :parameter: datapoints: data points
                dataname:   data set's name
                run:  indicates which time is it and use to set the seed for random
    :return:    lables: the labels of data points assigned by the algorithm
                k:      number of cluster detected
    ------------------------------------------------------------------------------------------"""
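    # KDE bandwidth per data set (tuned for each benchmark)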
    band_dict = {"data/aggregation.txt": 1.5, "data/flame.txt": 1.3, "data/DS850.txt": 0.3, "data/R15.txt": 0.3,
                 "data/D31.txt": 0.5, "data/dim512.txt": 0.40, "data/iris.txt": 0.09, "data/wdbc.txt": 0.09,
                 "data/seeds.txt": 0.10, "data/segmentation_all.txt": 0.16, "data/ecoli.txt": 0.07,
                 "data/appendicitis.txt": 0.12}

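    # RBF spread per data set (tuned for each benchmark)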
    spread_dict = {"data/aggregation.txt": 1.8, "data/flame.txt": 2.2, "data/DS850.txt": 0.3, "data/R15.txt": 0.5,
                   "data/D31.txt": 1.0, "data/dim512.txt": 0.40, "data/iris.txt": 0.10, "data/wdbc.txt": 0.08,
                   "data/seeds.txt": 0.20, "data/segmentation_all.txt": 0.15, "data/ecoli.txt": 0.09,
                   "data/appendicitis.txt": 0.18}
    # number of clusters in each data set (used when drawing figures)
    num_dict = {"data/aggregation.txt": 7, "data/flame.txt": 3, "data/DS850.txt": 5, "data/R15.txt": 15,
                "data/D31.txt": 31, "data/iris.txt": 3, "data/wdbc.txt": 2, "data/seeds.txt": 3,
                "data/segmentation_all.txt": 7, "data/ecoli.txt": 8}

    T = 100  # total time steps
    vmax = [0.0 for i in range(len(datapoints[0]))]
    axis_range = [[0.0, 0.0] for i in range(len(datapoints[0]))]

    c1 = 1.0  # acceleration constant for cognitive learning
    c2 = 0.5  # acceleration constant for social learning

    # obtain the range and the maximum velocity on each dimension
    dp = np.array(datapoints)
    for i in range(dp.shape[1]):
        col = dp[:, i]
        max_val = col.max()
        min_val = col.min()
        vmax[i] = (max_val - min_val) * 0.10  # cap velocity at 10% of the dimension's range
        axis_range[i][0] = min_val - 1
        axis_range[i][1] = max_val + 1

    np.random.seed(run)  # seed for random

    X = np.array(datapoints)  # particles' positions
    datapoints = X.tolist()
    n = len(X)                # data set size
    d = len(X[0])             # dimensionality

    # calculate the density of particle using KDE
    bandwidth = band_dict[dataname]  # select bandwidth
    kde = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(X.copy())
    f = [math.exp(a) for a in kde.score_samples(X)]  # densities of datapoints in KDE
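
    # The exact KDE above is evaluated only once, to produce training targets;
    # during the PSO iterations the RBF network trained below serves as a cheap
    # surrogate for the density at trial positions.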

    # train the RBF network
    t = np.array([[var] for var in f])
    net = newrb.designrb(X, t, 0, spread_dict[dataname], round(0.1*n), n)
    print("Building RBF network is done!")

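    # replace the KDE densities with the network's own estimates, so every density
    # compared during the search comes from the same surrogate model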
    for i in range(n):
        f[i] = net(X[i].reshape((1, d)))

    # particles sorted by density, high to low; this order also drives outliers' assignment in labeling()
    order = sorted(enumerate(f), key=lambda x: x[1], reverse=True)
    pbest = np.array(X)  # personal best positions
    pbest_density = copy.deepcopy(f)    # densities at pbest
    # determine the leader of each particle
    neibrs_indices = calculate_neighborhood(pbest, n)  # neighbors of each data point, assumed sorted by distance
    lbest_index = [i for i in range(n)]  # index of each particle's current leader
    lbest = np.array(X)  # leaders' current positions
    k = 2  # maximum number of leaders a particle can follow
    neibrs = []  # leader set of each particle
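    # Build each particle's leader set: scan neighbors from nearest to farthest and
    # keep the first k with strictly higher density. A particle with no denser
    # neighbor is a local density peak and falls back to neibrs_indices[i][0].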
    for i in range(n):
        tmp = []
        cnt = 0
        for j in range(1, n):
            if f[i] < f[neibrs_indices[i][j]]:
                tmp.append(neibrs_indices[i][j])
                cnt += 1
            if cnt == k or j == n - 1:
                if not tmp:
                    tmp.append(neibrs_indices[i][0])
                neibrs.append(tmp)
                lbest_index[i] = tmp[0]  # select the first particle in its leader set as the initial leader
                lbest[i] = np.array(X[lbest_index[i]])  # obtain the position of lbest
                break
    # compute the mean particle-to-leader distance and the distance list of each pair
    avg_distance, distance_list = PublicFunctions.getAvgerageDistance(lbest_index, datapoints)
    bound = [0] * n     # 1 if the particle has merged with its leader, 0 otherwise

    velocity = np.zeros((n, d))  # initialize all velocities to 0.0
    w = [1.0] * n   # initial inertia weight of each particle
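
    # Each iteration: (1) PSO velocity/position update with per-dimension clamping,
    # (2) density of the trial position estimated by the RBF surrogate, (3) a
    # probabilistic retreat to pbest when density drops, switching to the next
    # leader, (4) pbest updates applied in descending-density order, and
    # (5) particles close enough to their leaders are merged.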

    for t in range(T):  # start iteration
        pbest_Mat = pbest - X
        lbest_Mat = lbest - X
        # update velocity and position
        for i in range(n):
            # a particle already merged with its leader does not fly; its position simply tracks the leader
            if bound[i]:
                continue
            for j in range(d):
                r1 = np.random.random()
                r2 = np.random.random()
                # update velocity
                velocity[i][j] = w[i]*velocity[i][j] + c1 * r1 * (pbest_Mat[i][j]) + c2 * r2 * (lbest_Mat[i][j])
                # clamp velocity to [-vmax, vmax]
                if velocity[i][j] > vmax[j]:
                    velocity[i][j] = vmax[j]
                elif velocity[i][j] < -vmax[j]:
                    velocity[i][j] = -vmax[j]
                X[i][j] = X[i][j] + velocity[i][j]  # calculate trial position
            # use the RBF network to estimate the density at the trial position
            new_density = net(X[i].reshape((1, d)))
            w[i] = new_density/pbest_density[i]
            # if the trial position is no worse than pbest, keep it
            f[i] = new_density
            if new_density >= pbest_density[i]:
                continue
            delta = ((pbest_density[i] - new_density)/pbest_density[i])*10   # scaled relative density drop
            # map the density drop to an acceptance probability via the pdf of N(0, 1/sqrt(2*pi));
            # with this scale the pdf equals exactly 1 at delta = 0 and decays toward 0
            prob = stats.norm(0, 1/np.sqrt(2*np.pi)).pdf(delta)
            rand = np.random.random()   # generate a random number
            if rand > prob:     # if the random number exceeds the probability, the particle returns to pbest
                velocity[i] = 0.0   # reset velocity to 0.0
                X[i] = pbest[i]     # move the position back to pbest
                f[i] = pbest_density[i]     # restore the current density to pbest's density
                ind = (neibrs[i].index(lbest_index[i]) + 1) % len(neibrs[i])    # rotate to the next leader in the set
                lbest_index[i] = neibrs[i][ind]
        # end-for

        # update pbest in descending density order, so a merged particle copies its leader's already-updated pbest
        for i in range(n):
            if bound[order[i][0]]:
                pbest[order[i][0]] = np.array(pbest[lbest_index[order[i][0]]])
            # update pbest
            elif f[order[i][0]] > pbest_density[order[i][0]]:
                pbest_density[order[i][0]] = f[order[i][0]]
                pbest[order[i][0]] = np.array(X[order[i][0]])

        # end-for
        # judge whether each particle should merge with its leader; if so, its pbest snaps to the leader's
        mark = [0] * n  # 1 if the particle should merge with its leader this step
        for i in range(n):
            if not bound[i] and PublicFunctions.getDistance(pbest[i], pbest[lbest_index[i]]) <= avg_distance:
                mark[i] = 1
        # end-for
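        # apply merges in descending density order so that chains of merges collapse
        # onto the densest leader within a single pass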
        for i in range(n):
            if mark[order[i][0]] or bound[order[i][0]]:
                pbest[order[i][0]] = np.array(pbest[lbest_index[order[i][0]]])
                X[order[i][0]] = np.array(X[lbest_index[order[i][0]]])
                bound[order[i][0]] = 1
            # update the lbest's position
            lbest[order[i][0]] = np.array(X[lbest_index[order[i][0]]])
        # end-for

        if t == T - 1:
            labels, k = labeling(datapoints, bound, lbest_index, order)

            # Draw the final clustering result
            # PublicFunctions.drawClusteringResultGraph(pl, datapoints, labels, k, axis_range)
            # pl.show()
    # end-for
    return labels, k
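
The helpers calculate_neighborhood and labeling are defined elsewhere in the project. From the way it is called above, calculate_neighborhood(pbest, n) must return, for every point, the indices of all n points ordered by distance, nearest first. A minimal sketch under that assumption, using plain NumPy:

def calculate_neighborhood(points, n):
    """Sketch (assumption): row i holds the indices of all points sorted by
    Euclidean distance to point i, nearest first (entry 0 is i itself)."""
    pts = np.asarray(points)
    diff = pts[:, None, :] - pts[None, :, :]   # pairwise difference vectors
    dist = np.sqrt((diff ** 2).sum(axis=2))    # (n, n) distance matrix
    return np.argsort(dist, axis=1)

And a hypothetical driver, assuming the benchmark files hold one whitespace-separated point per row (the path and run index below are illustrative only):

if __name__ == "__main__":
    dataname = "data/iris.txt"                  # must be a key of band_dict and spread_dict
    datapoints = np.loadtxt(dataname).tolist()  # assumed format: one point per row
    labels, k = PSOClusteringAlgorithm(datapoints, dataname, run=1)
    print("detected %d clusters" % k)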