def compute_var_sing(df, centroids):
    """
    Compute every internal variance in clusters; clusters are found in df,
    whereas centroids are saved in centroids.

    :param df: input dataframe built by agg_clust/agg_clust_mod, listing the cluster
               and the x and y coordinates of each point.
    :param centroids: dataframe of the centroids of clusters, with their x and y coordinates.
    :return var_int: list of intra-cluster variances.
    """
    even_num = [i for i in range(2, len(df) + 1) if i % 2 == 0]

    var_int = []
    for i in list(df.index):
        az = df.loc[i].values
        z1 = [i for i in even_num if i <= len(az)]
        # split the flattened row into (x, y) pairs
        az = [az[:z1[0]]] + [az[z1[i]:z1[i + 1]] for i in range(len(z1) - 1)]
        # discard the placeholder pairs, i.e. those with both coordinates infinite
        az = [az[i] for i in range(len(az)) if np.isinf(az[i]).sum() != 2]

        internal_dist = []
        for el in az:
            distance = (dist1(el, centroids.loc[i, ["cx", "cy"]].values)) ** 2
            internal_dist.append(distance)
        var_int.append(np.sum(internal_dist))

    return var_int
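# Illustrative sketch (not part of the library): the quantity accumulated by
# compute_var_sing for each cluster is the sum of squared distances of the
# cluster's points from its centroid, assuming dist1 is the Euclidean distance
# used throughout this module.
def _intra_cluster_var_sketch():
    pts = np.array([[0.0, 0.0], [1.0, 1.0]])  # one toy cluster
    centroid = pts.mean(axis=0)                # (0.5, 0.5)
    return np.sum((pts - centroid) ** 2)       # -> 1.0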
def sl_dist(a, b):
    """Distance for single_linkage method, i.e. min[dist(x,y)] for x in a & y in b."""
    distances = []
    for i in a:
        for j in b:
            distances.append(dist1(i, j))
    distances = [i for i in distances if not np.isnan(i)]
    return np.min(distances)
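# Usage sketch for sl_dist (illustrative; assumes dist1 is the module's
# Euclidean distance): the single-linkage distance between two clusters is the
# distance between their closest pair of points.
def _sl_dist_sketch():
    a = [np.array([0.0, 0.0]), np.array([1.0, 0.0])]
    b = [np.array([2.0, 0.0]), np.array([5.0, 5.0])]
    return sl_dist(a, b)  # -> 1.0, from the pair (1, 0) and (2, 0)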
def compute_ward_ij(data, df):
    """
    Compute the difference in total within-cluster variance, using the squared
    Euclidean distance, and find the best pair of clusters to merge according to
    the Ward criterion.

    :param data: input data array.
    :param df: input dataframe built by agg_clust/agg_clust_mod, listing the cluster
               and the x and y coordinates of each point.
    :return: (i,j) indices of the best pair of clusters (the one for which the increase
             in intra-cluster variance is minimum)
             new_summ: new total intra-cluster variance
             par_var: increment in total intra-cluster variance, i.e. the minimum
             increase in total intra-cluster variance
    """
    even_num = [i for i in range(2, len(data) + 1) if i % 2 == 0]

    (centroids, summ) = compute_var(data, df)
    variances = {}
    k = 0
    ind = list(df.index)

    partial_var = {}

    for i in ind:
        for j in ind[k:]:
            if i != j:
                az = df.loc[i].values
                bz = df.loc[j].values
                z1 = [i for i in even_num if i <= len(az)]
                z2 = [i for i in even_num if i <= len(bz)]
                # split the flattened rows into (x, y) pairs
                az = [az[:z1[0]]] + [az[z1[i]:z1[i + 1]] for i in range(len(z1) - 1)]
                bz = [bz[:z2[0]]] + [bz[z2[i]:z2[i + 1]] for i in range(len(z2) - 1)]

                d = az + bz
                # keep only the real points, discarding the placeholder pairs
                valid = [d[i] for i in range(len(d)) if np.isinf(d[i]).sum() != 2]

                centroid = np.mean(valid, axis=0)
                var_int_par = []
                for el in valid:
                    var_int_par.append(dist1(el, centroid) ** 2)
                var_intz = np.sum(var_int_par)
                partial_var[(i, j)] = var_intz - centroids.loc[i]["var"] - centroids.loc[j]["var"]

                var_new = summ + partial_var[(i, j)]
                variances[(i, j)] = var_new
        k += 1

    (i, j) = min(variances, key=variances.get)
    new_summ = np.min(list(variances.values()))
    par_var = partial_var[(i, j)]
    if new_summ == summ:
        print("wrong")

    return (i, j), new_summ, par_var
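# Illustrative sketch of the Ward criterion applied above (assumption: dist1 is
# the Euclidean distance). Merging clusters u and v increases the total
# within-cluster variance by
#     delta(u, v) = sum_{p in u+v} ||p - c_uv||^2 - var(u) - var(v),
# where c_uv is the centroid of the merged cluster; compute_ward_ij evaluates
# this delta for every pair and returns the pair with the smallest one.
def _ward_increment_sketch(u, v):
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    merged = np.vstack([u, v])

    def ssq(pts):
        # sum of squared distances of pts from their centroid
        return np.sum((pts - pts.mean(axis=0)) ** 2)

    return ssq(merged) - ssq(u) - ssq(v)

# _ward_increment_sketch([[0, 0], [1, 0]], [[4, 0]])  # -> ~8.17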
def avg_dist(a, b):
    """Distance for average_linkage method, i.e. mean[dist(x,y)] for x in a & y in b."""
    distances = []
    for i in a:
        for j in b:
            distances.append(dist1(i, j))
    distances = [i for i in distances if (not np.isnan(i)) and (not np.isinf(i))]
    return np.mean(distances)
def cl_dist(a, b):
    """Distance for complete_linkage method, i.e. max[dist(x,y)] for x in a & y in b."""
    distances = []
    for i in a:
        for j in b:
            distances.append(dist1(i, j))
    distances = [i for i in distances if (not np.isnan(i)) and (not np.isinf(i))]
    return np.max(distances)
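# Usage sketch for avg_dist and cl_dist (illustrative; assumes dist1 is the
# module's Euclidean distance): on the same pair of clusters, average linkage
# takes the mean of all pairwise distances, complete linkage the maximum.
def _avg_cl_dist_sketch():
    a = [np.array([0.0, 0.0])]
    b = [np.array([3.0, 0.0]), np.array([0.0, 4.0])]
    return avg_dist(a, b), cl_dist(a, b)  # -> (3.5, 4.0)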
def dist_clust_cure(rep_u, rep_v):
    """
    Compute the distance between two clusters as the minimum distance found between
    the representatives of one cluster and those of the other.

    :param rep_u: list of representatives of the first cluster.
    :param rep_v: list of representatives of the second cluster.
    :return: distance between the two clusters.
    """
    rep_u = np.array(rep_u)
    rep_v = np.array(rep_v)
    distances = []
    for i in rep_u:
        for j in rep_v:
            distances.append(dist1(i, j))
    return np.min(distances)
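# Usage sketch for dist_clust_cure (illustrative; assumes dist1 is the module's
# Euclidean distance): the CURE distance between two clusters is the minimum
# distance over all pairs of their representative points.
def _dist_clust_cure_sketch():
    rep_u = [[0.0, 0.0], [1.0, 0.0]]
    rep_v = [[2.0, 0.0], [3.0, 3.0]]
    return dist_clust_cure(rep_u, rep_v)  # -> 1.0, from (1, 0) vs (2, 0)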
def scan_neigh1_mod(data, point, eps):
    """
    Neighborhood search for a point of a given dataset-dictionary (data) with a fixed
    eps; unlike scan_neigh1 of OPTICS, it also returns the point itself.

    :param data: input dictionary.
    :param point: point whose neighborhood is to be examined.
    :param eps: radius of search.
    :return: dictionary of neighborhood points.
    """
    neigh = {}

    for i, element in enumerate(data.values()):
        d = dist1(element, point)
        if d <= eps:
            neigh.update({str(i): element})

    return neigh
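# Usage sketch for scan_neigh1_mod (illustrative; assumes dist1 is the module's
# Euclidean distance): the returned dictionary is keyed by the position of each
# neighbor in the input dictionary, and includes the query point itself.
def _scan_neigh1_mod_sketch():
    data = {"a": np.array([0.0, 0.0]),
            "b": np.array([0.5, 0.0]),
            "c": np.array([3.0, 3.0])}
    return scan_neigh1_mod(data, np.array([0.0, 0.0]), eps=1.0)
    # -> {"0": array([0., 0.]), "1": array([0.5, 0.])}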
def sel_rep_fast(prec_reps, clusters, name, c, alpha):
    """
    Select c representatives of the clusters from the previously computed
    representatives, so it is faster than sel_rep.

    :param prec_reps: list of previously computed representatives.
    :param clusters: dictionary of clusters.
    :param name: name of the cluster we want to select representatives from.
    :param c: number of representatives we want to extract.
    :param alpha: 0<=float<=1, it determines how much the representative points are
                  moved toward the centroid: 0 means they aren't modified, 1 means
                  that all points collapse to the centroid.
    :return others: list of representative points.
    """
    com = np.mean(clusters[name], axis=0)

    # if the cluster has c points or fewer, just take all of them as representatives
    # and shrink them according to the parameter alpha
    if len(prec_reps) <= c:
        others = prec_reps
        for i in range(len(others)):
            others[i] = others[i] + alpha * (com - others[i])
        return others

    # if the cluster has more than c points, use the procedure described in the
    # documentation to pick the representative points
    else:
        others = []   # the representatives
        indexes = []  # their indexes, to avoid picking one point multiple times

        points = prec_reps  # use old representatives

        distances_com = {i: dist1(points[i], com) for i in range(len(points))}
        index = max(distances_com, key=distances_com.get)

        indexes.append(index)
        others.append(np.array(points[index]))  # first point

        # selecting the other c-1 points
        for step in range(min(c - 1, len(points) - 1)):
            # here we store the distances of the current point from the already
            # selected representatives
            partial_distances = {str(i): [] for i in range(len(points))}
            for i in range(len(points)):
                if i not in indexes:
                    for k in range(len(others)):
                        partial_distances[str(i)].append([dist1(points[i], np.array(others[k]))])
            partial_distances = dict((k, [np.sum(v)]) for k, v in partial_distances.items())
            index2 = max(partial_distances, key=partial_distances.get)
            indexes.append(int(index2))
            # the other points are the farthest from the already selected representatives
            others.append(points[int(index2)])

        # perform the shrinking according to the parameter alpha
        for i in range(len(others)):
            others[i] = others[i] + alpha * (com - others[i])

        return others
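# Usage sketch for sel_rep_fast (illustrative): with four previous
# representatives, c=2 and alpha=0.5, the first pick is the point farthest from
# the center of mass and the second is the point farthest from the first pick;
# both are then shrunk halfway toward the center of mass.
def _sel_rep_fast_sketch():
    pts = [np.array([0.0, 0.0]), np.array([2.0, 0.0]),
           np.array([1.0, 3.0]), np.array([1.0, 1.0])]
    clusters = {"c0": np.array([[0.0, 0.0], [2.0, 0.0], [1.0, 3.0], [1.0, 1.0]])}
    return sel_rep_fast(pts, clusters, "c0", c=2, alpha=0.5)
    # -> [array([1., 2.]), array([0.5, 0.5])] with these toy points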
def point_plot_mod2(X, a, reps, level_txt, level2_txt=None,
                    par_index=None, u=None, u_cl=None, initial_ind=None, last_reps=None,
                    not_sampled=None, not_sampled_ind=None, n_rep_fin=None):
    """
    Scatter plot of input data points, colored according to the cluster they belong to.
    A rectangle with red borders is displayed around the last merged cluster; the
    representative points of the last merged cluster are also plotted in red, along with
    the center of mass, plotted as a red cross. The current number of clusters and the
    current distance are also displayed in the upper right corner.
    In the last phase of the CURE algorithm variation for large datasets, arrows are
    displayed from every non-sampled point to its closest representative point; moreover,
    representative points are surrounded by small circles, to make them more visible.
    Representative points of different clusters are plotted in different shades of red.

    :param X: input data array.
    :param a: input dataframe built by the CURE algorithm, listing the cluster and the
              x and y coordinates of each point.
    :param reps: list of the coordinates of representative points.
    :param level_txt: distance at which the current merging occurs, displayed in the
                      upper right corner.
    :param level2_txt: incremental distance (not used).
    :param par_index: partial index to take the shuffling of indexes into account.
    :param u: first cluster to be merged.
    :param u_cl: second cluster to be merged.
    :param initial_ind: initial partial index.
    :param last_reps: dictionary of last representative points.
    :param not_sampled: coordinates of points that have not been initially sampled, in
                        the large dataset version.
    :param not_sampled_ind: indexes of not_sampled points.
    :param n_rep_fin: number of representatives to use for each cluster in the final
                      assignment phase of the large dataset version.
    :return list_keys_diz: if par_index is not None, returns the new indexes of par_index.
    """
    # diz is used to take the shuffling of data into account, e.g. if the first row
    # doesn't correspond to point 0: this is useful for the large dataset version of
    # CURE, where data points are randomly sampled, but the initial indices are kept
    # to be plotted.
    if par_index is not None:
        diz = dict(zip(par_index, [i for i in range(len(par_index))]))

    fig, ax = plt.subplots(figsize=(14, 6))

    # points that still need to be processed are plotted in lime color
    plt.scatter(X[:, 0], X[:, 1], s=300, color="lime", edgecolor="black")

    # drop the totally null columns, so that the number of columns goes down to
    # 2 * (cardinality of the biggest cluster)
    a = a.dropna(axis=1, how="all")

    colors = {0: "seagreen", 1: 'lightcoral', 2: 'yellow', 3: 'grey',
              4: 'pink', 5: 'turquoise', 6: 'orange', 7: 'purple', 8: 'yellowgreen',
              9: 'olive', 10: 'brown', 11: 'tan', 12: 'plum', 13: 'rosybrown',
              14: 'lightblue', 15: "khaki", 16: "gainsboro", 17: "peachpuff"}

    color_dict_rect = convert_colors(colors, alpha=0.3)

    # to speed things up, this splits all points inside the clusters' names, and start
    # gives the starting index that shows where clusters with more than 1 element start
    # (because they are always appended to a)
    len_ind = [len(i.split("-")) for i in list(a.index)]
    start = np.min([i for i in range(len(len_ind)) if len_ind[i] > 1])

    # for each cluster, take the single points composing it and plot them in the
    # appropriate color, if necessary taking the labels of par_index into account
    for ind, i in enumerate(range(start, len(a))):
        point = a.iloc[i].name.replace("(", "").replace(")", "").split("-")

        if par_index is not None:
            X_clust = [X[diz[point[j]], 0] for j in range(len(point))]
            Y_clust = [X[diz[point[j]], 1] for j in range(len(point))]
            ax.scatter(X_clust, Y_clust, s=350, color=colors[ind % 18])
        else:
            point = [int(i) for i in point]
            X_clust = [X[point[j], 0] for j in range(len(point))]
            Y_clust = [X[point[j], 1] for j in range(len(point))]
            ax.scatter(X_clust, Y_clust, s=350, color=colors[ind % 18])

    # last merged cluster, i.e. the last element of matrix a
    point = a.iloc[-1].name.replace("(", "").replace(")", "").split("-")
    # finding the new center of mass of the newly merged cluster
    if par_index is not None:
        point = [diz[point[i]] for i in range(len(point))]
        com = X[point].mean(axis=0)
    else:
        point = [int(i) for i in point]
        com = X[point].mean(axis=0)

    # plotting the center of mass, marked with an X
    plt.scatter(com[0], com[1], s=400, color="r", marker="X", edgecolor="black")

    # plotting representative points in red
    x_reps = [i[0] for i in reps]
    y_reps = [i[1] for i in reps]
    plt.scatter(x_reps, y_reps, s=360, color="r", edgecolor="black")

    # finding the right measures for the rectangle
    rect_min = X[point].min(axis=0)
    rect_diff = X[point].max(axis=0) - rect_min

    xmin, xmax, ymin, ymax = plt.axis()
    xwidth = xmax - xmin
    ywidth = ymax - ymin

    # adding the rectangle, using two rectangles one above the other to use different
    # colors for the border and for the inside
    if len(point) <= 2:
        ax.add_patch(Rectangle((rect_min[0] - xwidth * 0.02, rect_min[1] - ywidth * 0.04),
                               rect_diff[0] + xwidth * 0.04, rect_diff[1] + ywidth * 0.08,
                               fill=True, color=color_dict_rect[ind % 18], linewidth=3,
                               ec="red"))
    else:
        encircle(X_clust, Y_clust, ax=ax, color=color_dict_rect[ind % 18],
                 linewidth=3, ec="red")

    # adding labels to points in the plot
    if initial_ind is not None:
        for i, txt in enumerate(initial_ind):
            ax.annotate(txt, (X[:, 0][i], X[:, 1][i]),
                        fontsize=10, size=10, ha='center', va='center')
    else:
        for i, txt in enumerate([i for i in range(len(X))]):
            ax.annotate(txt, (X[:, 0][i], X[:, 1][i]),
                        fontsize=10, size=10, ha='center', va='center')

    # adding the annotations
    ax.annotate("min_dist: " + str(round(level_txt, 5)),
                (xmax * 0.75, ymax * 0.9), fontsize=12, size=12)

    if level2_txt is not None:
        ax.annotate("dist_incr: " + str(round(level2_txt, 5)),
                    (xmax * 0.75, ymax * 0.8), fontsize=12, size=12)

    ax.annotate("n° clust: " + str(len(a)),
                (xmax * 0.75, ymax * 0.7), fontsize=12, size=12)

    plt.show()

    # everything from here on refers to the last phase of the large dataset version,
    # the assignment phase
    if last_reps is not None:

        fig, ax = plt.subplots(figsize=(14, 6))

        # plot all the points in lime color
        plt.scatter(X[:, 0], X[:, 1], s=300, color="lime", edgecolor="black")

        # find the centers of mass of the clusters, using the matrix a to find which
        # points belong to which cluster
        coms = []
        for ind, i in enumerate(range(0, len(a))):
            point = a.iloc[i].name.replace("(", "").replace(")", "").split("-")
            for j in range(len(point)):
                plt.scatter(X[diz[point[j]], 0], X[diz[point[j]], 1], s=350,
                            color=colors[ind % 18])
            point = [diz[point[i]] for i in range(len(point))]
            coms.append(X[point].mean(axis=0))

        # variations of red to plot the representative points of the various clusters
        colors_reps = ["red", "crimson", "indianred", "lightcoral",
                       "salmon", "darksalmon", "firebrick"]

        # flattening the last_reps values
        flat_reps = [item for sublist in list(last_reps.values()) for item in sublist]

        # plotting the representatives, surrounded by small circles, and the centers
        # of mass, marked with X
        for i in range(len(last_reps)):
            len_rep = len(list(last_reps.values())[i])

            x = [list(last_reps.values())[i][j][0]
                 for j in range(min(n_rep_fin, len_rep))]
            y = [list(last_reps.values())[i][j][1]
                 for j in range(min(n_rep_fin, len_rep))]

            plt.scatter(x, y, s=400, color=colors_reps[i], edgecolor="black")
            plt.scatter(coms[i][0], coms[i][1], s=400,
                        color=colors_reps[i], marker="X", edgecolor="black")

            for num in range(min(n_rep_fin, len_rep)):
                plt.gcf().gca().add_artist(plt.Circle((x[num], y[num]), xwidth * 0.03,
                                                      color=colors_reps[i], fill=False,
                                                      linewidth=3, alpha=0.7))

        plt.scatter(not_sampled[:, 0], not_sampled[:, 1], s=400,
                    color="lime", edgecolor="black")

        # find the closest representative for the non-sampled points, and draw an arrow
        # connecting each of them to its closest representative
        for ind in range(len(not_sampled)):
            dist_int = []
            for el in flat_reps:
                dist_int.append(dist1(not_sampled[ind], el))
            ind_min = np.argmin(dist_int)

            plt.arrow(not_sampled[ind][0], not_sampled[ind][1],
                      flat_reps[ind_min][0] - not_sampled[ind][0],
                      flat_reps[ind_min][1] - not_sampled[ind][1],
                      length_includes_head=True, head_width=0.03, head_length=0.05)

        # plotting the indexes for each point
        for i, txt in enumerate(initial_ind):
            ax.annotate(txt, (X[:, 0][i], X[:, 1][i]),
                        fontsize=10, size=10, ha='center', va='center')

        if not_sampled_ind is not None:
            for i, txt in enumerate(not_sampled_ind):
                ax.annotate(txt, (not_sampled[:, 0][i], not_sampled[:, 1][i]),
                            fontsize=10, size=10, ha='center', va='center')

        plt.show()

    # if par_index is not None, diz is updated with the last merged cluster and its
    # keys are returned
    if par_index is not None:
        diz["(" + u + ")" + "-" + "(" + u_cl + ")"] = len(diz)
        list_keys_diz = list(diz.keys())

        return list_keys_diz
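# Usage sketch for point_plot_mod2 (illustrative): a minimal call on three
# points, where cluster names in a's index follow the "(u)-(v)" convention of
# the CURE dataframe and the row of a merged cluster lists the coordinates of
# its points, padded with NaN. pandas is imported locally to keep the sketch
# self-contained; convert_colors and encircle are assumed to be the module's
# own helpers.
def _point_plot_mod2_sketch():
    import pandas as pd
    X = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 5.0]])
    a = pd.DataFrame([[5.0, 5.0, np.nan, np.nan],   # singleton cluster "2"
                      [0.0, 0.0, 1.0, 0.0]],        # merged cluster "(0)-(1)"
                     index=["2", "(0)-(1)"])
    reps = [np.array([0.5, 0.0])]                   # representative of the merge
    point_plot_mod2(X, a, reps, level_txt=1.0)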