예제 #1
0
    def __update_clusters(self):
        """Repeat the medoid swap until it stops paying off.

        At most ``self.max_iter`` rounds are run. In every round the distances
        returned by ``__swap_and_recalculate_clusters`` are handed to
        ``__is_new_cluster_dist_small``; when that check answers ``True`` the
        clusters and their distances are recomputed from ``self.medoids``,
        logged and plotted, otherwise the loop ends.
        """
        # the cap on the number of rounds stops the search when convergence
        # is not reached within max_iter iterations
        for step in range(1, self.max_iter + 1):
            self.log.appendPlainText("")
            self.log.appendPlainText("iteration n°: {}".format(step))

            # distances obtained by swapping medoids in the clusters
            swapped_distances = self.__swap_and_recalculate_clusters()
            improved = self.__is_new_cluster_dist_small(swapped_distances)

            if improved is not True:
                # the sum of cluster distances did not improve: stop here
                self.log.appendPlainText("termination")
                break

            self.log.appendPlainText("new is smaller")
            # clusters and cluster distances for the new medoids
            recomputed = self.__calculate_clusters(self.medoids)
            self.clusters, self.cluster_distances = recomputed
            self.log.appendPlainText("clusters: {}".format(self.clusters))

            if self.delay != 0:
                pause_execution(self.delay)

            self.plot_pam_gui(
                data=self.__data,
                cl=self.clusters,
                ax=self.ax,
                canvas=self.canvas,
                ind_run=self.ind_run,
                ind_fig=step,
                save_plots=self.save_fig,
            )
예제 #2
0
    def __insert_data(self, plotting=False):
        """!
        @brief Feeds the input points into the tree, one at a time.

        @param[in] plotting (bool): When it is exactly True, the tree and its leaves are drawn before every
                    insertion except the first one, and the index used for saving the plots is advanced.

        @remark Whenever the tree reports more entries than the entry size limit, it is replaced by the
                 result of the rebuild routine.

        """

        total_points = len(self.__pointer_data)

        for index_point in range(total_points):
            draw_current_state = (plotting is True) and (index_point != 0)

            if draw_current_state:
                if self.delay != 0:
                    pause_execution(self.delay)

                plot_tree_fin_gui(
                    tree=self.__tree,
                    log=self.log,
                    ind_run=self.ind_run,
                    ind_fig=self.index_for_saving_plot,
                    label_graphviz=self.label_graphviz,
                    save_plots=self.save_fig,
                )
                plot_birch_leaves_gui(
                    tree=self.__tree,
                    data=self.__pointer_data,
                    ax=self.ax,
                    canvas=self.canvas,
                    ind_run=self.ind_run,
                    ind_fig=self.index_for_saving_plot,
                    save_plots=self.save_fig,
                )
                self.index_for_saving_plot += 1

            self.log.appendPlainText("")
            self.log.appendPlainText("index: {}".format(index_point))

            point = self.__pointer_data[index_point]
            x_rounded = round(point[0], 2)
            y_rounded = round(point[1], 2)
            self.log.appendPlainText("point [{}, {}]".format(x_rounded, y_rounded))

            # every point enters the tree as a one-element cluster
            self.__tree.insert_cluster([point])

            too_many_entries = self.__tree.amount_entries > self.__entry_size_limit
            if too_many_entries:
                self.log.appendPlainText("rebuilding tree")
                self.__tree = self.__rebuild_tree(index_point)
예제 #3
0
    def __start_algo(self):
        """Initialise the medoids, show the first clustering, then refine it.

        The clusters and cluster distances computed from the initial medoids
        are stored on the instance, written to the log and plotted as figure
        0 before ``__update_clusters`` takes over.
        """
        self.log.appendPlainText("starting algorithm")

        # initial medoids, followed by the clustering they induce
        self.__initialize_medoids()
        initial_state = self.__calculate_clusters(self.medoids)
        self.clusters, self.cluster_distances = initial_state

        clusters_line = "clusters: {}".format(self.clusters)
        distances_line = "clusters_distances: {}".format(self.cluster_distances)
        self.log.appendPlainText(clusters_line)
        self.log.appendPlainText(distances_line)

        if self.delay != 0:
            pause_execution(self.delay)

        first_plot = dict(
            data=self.__data,
            cl=self.clusters,
            ax=self.ax,
            canvas=self.canvas,
            ind_run=self.ind_run,
            ind_fig=0,
            save_plots=self.save_fig,
        )
        self.plot_pam_gui(**first_plot)

        self.__update_clusters()
    def plot2d_data_gui(self,
                        df,
                        canvas,
                        ax,
                        save_plots,
                        ind_fig=None,
                        col_i=None):
        """
        Scatter plot of the rows of df, coloured according to their "cluster" value.

        :param df: dataframe plotted with x=0 and y=1; its "cluster" column selects the colour
                   of every point.
        :param canvas: canvas that is redrawn and whose figure is saved when save_plots is True.
        :param ax: axis that is cleared and drawn on.
        :param save_plots: if it is exactly True, the figure is saved as
                           "<name>_<ind_run>/fig_<ind_fig>.png" under the "Images/" resource.
        :param ind_fig: index used in the name of the saved figure.
        :param col_i: if not None, the points whose cluster equals col_i are drawn again,
                      larger and in black with a white edge.

        """

        if self.delay != 0:
            pause_execution(self.delay)

        ax.clear()
        ax.set_title(self.name + " Merging")

        # kept as a dict so that labels stored as floats (1.0) still hit the key 1
        colors = dict(enumerate((
            "seagreen", "dodgerblue", "yellow", "grey", "pink", "turquoise",
            "orange", "purple", "yellowgreen", "olive", "brown", "tan",
            "plum", "rosybrown", "lightblue", "khaki", "gainsboro",
            "peachpuff", "lime", "peru", "beige", "teal", "royalblue",
            "tomato", "bisque", "palegreen",
        )))

        # wrap around the palette, as plot2d_graph_gui does, so that a cluster label
        # beyond the last colour does not raise a KeyError
        color_list = [colors[i % len(colors)] for i in df["cluster"]]

        df.plot(kind="scatter", c=color_list, x=0, y=1, ax=ax, s=100)

        ax.set_xlabel("")
        ax.set_ylabel("")

        if col_i is not None:
            # filter once and reuse the selection for both coordinates
            highlighted = df[df.cluster == col_i]
            ax.scatter(
                highlighted.iloc[:, 0],
                highlighted.iloc[:, 1],
                color="black",
                s=140,
                edgecolors="white",
                alpha=0.8,
            )

        canvas.draw()

        if save_plots is True:
            # NOTE(review): ind_fig defaults to None, which "{:02}" cannot format;
            # callers that save plots have to pass an integer index.
            canvas.figure.savefig(
                appctxt.get_resource("Images/") + "/" +
                "{}_{:02}/fig_{:02}.png".format(self.name, self.ind_run,
                                                ind_fig))

        QCoreApplication.processEvents()
    def plot2d_graph_gui(self,
                         graph,
                         canvas,
                         ax,
                         save_plots,
                         ind_fig=None,
                         print_clust=True):
        """
        Draw graph on ax, colouring the nodes by their "cluster" attribute.

        :param graph: graph whose nodes carry a "pos" attribute (used as layout) and, optionally,
                      a "cluster" attribute (used to pick the colour, wrapping around the palette).
        :param canvas: canvas that is redrawn and whose figure is saved when save_plots is True.
        :param ax: axis that is cleared and drawn on.
        :param save_plots: if it is exactly True, the figure is saved as
                           "<name>_<ind_run>/fig_<ind_fig>.png" under the "Images/" resource.
        :param ind_fig: index used in the name of the saved figure.
        :param print_clust: if it is exactly True, the cluster labels with their counts
                            (most common first) are written to the log.

        """

        if self.delay != 0:
            pause_execution(self.delay)

        ax.clear()
        ax.set_title(self.name + " Graph Clustering")

        palette = (
            "seagreen", "dodgerblue", "yellow", "grey", "pink", "turquoise",
            "orange", "purple", "yellowgreen", "olive", "brown", "tan",
            "plum", "rosybrown", "lightblue", "khaki", "gainsboro",
            "peachpuff", "lime", "peru", "beige", "teal", "royalblue",
            "tomato", "bisque", "palegreen",
        )
        # same index -> colour mapping as a literal dict with keys 0..25
        colors = dict(enumerate(palette))

        pos = nx.get_node_attributes(graph, "pos")
        cluster_labels = list(nx.get_node_attributes(graph, "cluster").values())
        label_counts = Counter(cluster_labels).most_common()
        node_colors = [colors[label % len(colors)] for label in cluster_labels]

        if print_clust is True:
            self.log.appendPlainText("clusters: {}".format(label_counts))

        draw_options = {"node_size": 60, "edgecolors": "black", "ax": ax}
        if cluster_labels:
            # colours are passed only when the cluster attribute is set
            draw_options["node_color"] = node_colors
        nx.draw(graph, pos, **draw_options)

        canvas.draw()

        if save_plots is True:
            target = "{}_{:02}/fig_{:02}.png".format(self.name, self.ind_run,
                                                     ind_fig)
            canvas.figure.savefig(
                appctxt.get_resource("Images/") + "/" + target)

        QCoreApplication.processEvents()
예제 #6
0
    def DBSCAN_gui(self, plotting=True, print_details=True, delay=0):
        """
        DBSCAN algorithm.

        Every row of self.X is identified by its position written as a string ("0", "1", ...).
        Points whose neighborhood (as returned by scan_neigh1_mod with radius self.eps) has fewer
        than self.mp elements are labelled -1 (noise); the others start a new cluster, which is
        then grown by repeatedly drawing a random point from the neighborhood still to be explored.

        :param plotting: if True, executes point_plot_mod_gui every time a point is labelled
                         (as noise or as member of a cluster).
        :param print_details: if True, logs the length of the "external" NearestNeighborhood
                              and of the "internal" one (in the while loop).
        :param delay: seconds for which to delay the algorithm, so that the images displayed in the GUI
                      show at a slower pace. It is only used when plotting is True.
        :return: no return statement is visible in this block; the labelling is stored in
                 self.ClustDict, a dictionary of the form point_index:cluster_label.

        """
        self.update_log(initial=True)
        index_for_saving_plots = 0

        # initialize dictionary of clusters
        self.ClustDict = {}

        # -1 so that the first cluster found gets id 0
        clust_id = -1

        # map the position of every row, as a string, to the row itself
        X_dict = dict(zip([str(i) for i in range(len(self.X))], self.X))

        # points whose neighborhood has already been scanned
        processed = []

        # points already drawn while growing the current cluster
        processed_list = []

        # for every point in the dataset
        for point in X_dict:

            # if it hasn't been visited
            if point not in processed:
                # mark it as visited
                processed.append(point)
                # scan its neighborhood
                N = scan_neigh1_mod(X_dict, X_dict[point], self.eps)

                if print_details == True:
                    self.update_log(point,
                                    "  initial len(N): " + str(len(N)),
                                    change_current=True)
                # if there are less than minPTS in its neighborhood, classify it as noise
                if len(N) < self.mp:

                    self.ClustDict.update({point: -1})

                    if plotting == True:
                        if delay != 0:
                            pause_execution(delay)

                        self.point_plot_mod_gui(X_dict,
                                                point,
                                                save_plots=self.save_plots,
                                                ind_fig=index_for_saving_plots)
                        index_for_saving_plots += 1
                        self.update_log(noise=True)
                # else if it is a Core point
                else:
                    # increase current id of cluster
                    clust_id += 1
                    # put it in the cluster dictionary
                    self.ClustDict.update({point: clust_id})

                    if plotting == True:
                        if delay != 0:
                            pause_execution(delay)

                        self.point_plot_mod_gui(X_dict,
                                                point,
                                                save_plots=self.save_plots,
                                                ind_fig=index_for_saving_plots)
                        index_for_saving_plots += 1
                    # restart the temporary processed list from the core point
                    processed_list = [point]
                    # remove it from the neighborhood N
                    # NOTE(review): presumes scan_neigh1_mod includes the point itself in N,
                    # otherwise this raises KeyError -- confirm against the helper
                    del N[point]
                    # until the neighborhood is empty
                    while len(N) > 0:

                        # take a random point in neighborhood
                        n = random.choice(list(N.keys()))

                        if print_details == True:
                            self.update_log(n,
                                            "     updated len(N): " +
                                            str(len(N)),
                                            change_subcurrent=True)

                        # but the point must not be in processed_list aka already visited;
                        # within this block every point is deleted from N as soon as it is
                        # appended to processed_list and is never added back, so this redraw
                        # acts only as a safeguard
                        while n in processed_list:
                            n = random.choice(list(N.keys()))
                        # put it in processed_list
                        processed_list.append(n)
                        # remove it from the neighborhood
                        del N[n]
                        # if it hasn't been visited
                        if n not in processed:
                            # mark it as visited
                            processed.append(n)
                            # scan its neighborhood
                            N_2 = scan_neigh1_mod(X_dict, X_dict[n], self.eps)

                            if print_details == True:
                                # NOTE(review): the line is logged under `point`, not `n` --
                                # confirm this is intended
                                self.update_log(
                                    point, "     len(N_sub): " + str(len(N_2)))
                            # if it is a core point
                            if len(N_2) >= self.mp:
                                # add each element of its neighborhood to the neighborhood N
                                for element in N_2:

                                    if element not in processed_list:
                                        N.update({element: X_dict[element]})

                        # if n has not been inserted into cluster dictionary or if it has previously been
                        # classified as noise, update the cluster dictionary
                        if (n not in self.ClustDict) or (self.ClustDict[n]
                                                         == -1):
                            self.ClustDict.update({n: clust_id})

                        if plotting == True:
                            if delay != 0:
                                pause_execution(delay)
                            self.point_plot_mod_gui(
                                X_dict,
                                n,
                                save_plots=self.save_plots,
                                ind_fig=index_for_saving_plots)
                            index_for_saving_plots += 1
예제 #7
0
    def process(self, plotting=False):
        """!
        @brief Performs cluster analysis in line with rules of CLARANS algorithm.

        @param[in] plotting (bool): When it is exactly True, the configuration reached at the end of every
                    local search and the final one are drawn through the PAM plotting routine.

        @return (clarans) Returns itself (CLARANS instance).

        @see get_clusters()
        @see get_medoids()

        """

        def show_configuration(use_optimal, figure_index):
            # Pause if requested, then draw either the best medoids found so far
            # or a copy of the current ones, paired with the current clusters.
            if self.delay != 0:
                pause_execution(self.delay)
            if use_optimal:
                shown_medoids = self.__optimal_medoids
            else:
                shown_medoids = self.__current[:]
            self.PAM.plot_pam_gui(
                data=self.__pointer_data,
                name="CLARANS",
                cl=dict(zip(shown_medoids, self.__clusters)),
                ax=self.ax,
                canvas=self.canvas,
                ind_run=self.ind_run,
                ind_fig=figure_index,
                save_plots=self.save_fig,
            )

        random.seed()

        # one local search per iteration; the iteration number doubles as plot index
        for local_index in range(self.__numlocal):

            self.log.appendPlainText("")
            self.log.appendPlainText(
                "numlocal (iteration): {}".format(local_index + 1))

            # start from randomly chosen medoids and assign the points to them
            self.__current = random.sample(range(0, len(self.__pointer_data)),
                                           self.__number_clusters)
            self.__update_clusters(self.__current)

            # optimize configuration
            self.__optimize_configuration()

            # cost of the configuration reached, compared with the best one so far
            estimation = self.__calculate_estimation()
            improved = estimation < self.__optimal_estimation

            if improved:
                self.log.appendPlainText("Better configuration found with "
                                         "medoids: {0} and cost: {1}".format(
                                             self.__current[:], estimation))
                self.__optimal_medoids = self.__current[:]
                self.__optimal_estimation = estimation
            else:
                self.log.appendPlainText(
                    "Configuration found does not improve current "
                    "best one because its cost is {0}".format(estimation))

            if plotting is True:
                if improved:
                    self.__update_clusters(self.__optimal_medoids)
                else:
                    self.__update_clusters(self.__current[:])
                show_configuration(improved, local_index)

        self.__update_clusters(self.__optimal_medoids)

        if plotting is True:
            self.log.appendPlainText("")
            self.log.appendPlainText("FINAL RESULT")
            show_configuration(True, None)

        return self
예제 #8
0
    def cure_gui(
        self,
        data,
        k,
        ax,
        canvas,
        plotting=True,
        preprocessed_data=None,
        partial_index=None,
        n_rep_finalclust=None,
        not_sampled=None,
        not_sampled_ind=None,
        delay=0,
        ind_fig_bis=None,
    ):
        """
        CURE algorithm: hierarchical agglomerative clustering using representatives.
        :param data: input data.
        :param k: the desired number of clusters.
        :param ax: axis that is cleared at the start and passed to the plotting routine.
        :param canvas: canvas passed to the plotting routine.
        :param plotting: if True, plots all intermediate steps. The matrix a is only updated with
                         the merged clusters when plotting is True.

        #the following parameter are used for the large dataset variation of CURE

        :param preprocessed_data: if not None, must be of the form (clusters,representatives,matrix_a,X_dist1),
                                  which is used to perform a warm start.
        :param partial_index: if not None, it is used as index of the matrix_a, of cluster points and of
                              representatives.
        :param n_rep_finalclust: the final representative points used to classify the not_sampled points.
        :param not_sampled: points not sampled in the initial phase.
        :param not_sampled_ind: indexes of not_sampled points.
        :param delay: value handed to pause_execution before every plot; 0 disables the pause.
        :param ind_fig_bis: forwarded unchanged to the plotting routine.
        :return (clusters, rep, a): returns the clusters dictionary, the dictionary of representatives,
                                    the matrix a


        """
        ax.cla()

        index_for_saving_plots = 0
        # starting from raw data
        if preprocessed_data is None:
            n_points = len(data)

            # one "<i>x" and one "<i>y" column per point: "0x", "0y", "1x", "1y", ...
            col = [
                str(i) + axis for i in range(n_points) for axis in ("x", "y")
            ]

            # using the original indexes if necessary
            if partial_index is not None:
                a = pd.DataFrame(index=partial_index, columns=col)
            else:
                a = pd.DataFrame(index=[str(i) for i in range(n_points)],
                                 columns=col)

            # adding the real coordinates
            a["0x"] = data.T[0]
            a["0y"] = data.T[1]

            b = a.dropna(axis=1, how="all")

            # initial clusters and representatives: every point on its own
            if partial_index is not None:
                clusters = dict(zip(partial_index, data))
                rep = {partial_index[i]: [data[i]] for i in range(n_points)}
            else:
                clusters = {str(i): np.array(data[i]) for i in range(n_points)}
                rep = {str(i): [data[i]] for i in range(n_points)}

            # build Xdist
            X_dist1 = dist_mat_gen(b)

        # use precomputed data
        else:
            clusters = preprocessed_data[0]
            rep = preprocessed_data[1]
            a = preprocessed_data[2]
            X_dist1 = preprocessed_data[3]

        # just as placeholder for while loop: only its length matters before the first merge
        heap = [1] * len(X_dist1)

        # store minimum distances between clusters for each iteration
        levels = []

        # store original index
        if partial_index is not None:
            initial_index = deepcopy(partial_index)

        # while the desired number of clusters has not been reached
        while len(heap) > k:

            # find minimum value of heap queue, which stores clusters according to the distance from
            # their closest cluster
            list_argmin = list(X_dist1.apply(lambda x: np.argmin(x)).values)
            list_min = list(X_dist1.min(axis=0).values)
            heap = dict(zip(list(X_dist1.index), list_min))
            heap = dict(sorted(heap.items(), key=lambda kv: kv[1]))
            closest = dict(zip(list(X_dist1.index), list_argmin))

            # get minimum keys and delete them from heap and closest dictionaries
            u = min(heap, key=heap.get)
            levels.append(heap[u])
            del heap[u]
            u_cl = X_dist1.columns[closest[u]]
            del closest[u]

            # form the new cluster; a shape of (2, ) marks a cluster still made of a single point
            u_is_point = np.array(clusters[u]).shape == (2, )
            u_cl_is_point = np.array(clusters[u_cl]).shape == (2, )
            if u_is_point and u_cl_is_point:
                w = [clusters[u], clusters[u_cl]]
            elif u_cl_is_point:
                clusters[u].append(clusters[u_cl])
                w = clusters[u]
            elif u_is_point:
                clusters[u_cl].append(clusters[u])
                w = clusters[u_cl]
            else:
                w = clusters[u] + clusters[u_cl]

            # delete old cluster
            del clusters[u]
            del clusters[u_cl]

            # set new name
            name = "(" + u + ")" + "-" + "(" + u_cl + ")"
            clusters[name] = w

            # update representatives
            rep[name] = sel_rep_fast(rep[u] + rep[u_cl], clusters, name,
                                     self.n_repr, self.alpha_cure)

            # update distance matrix
            X_dist1 = update_mat_cure(X_dist1, u, u_cl, rep, name)

            # delete old representatives
            del rep[u]
            del rep[u_cl]

            if plotting is True:
                if delay != 0:
                    # pause for the value that was just checked, i.e. the delay argument
                    pause_execution(delay)

                dim1 = int(a.loc[u].notna().sum())
                # update the matrix a with the new cluster: the coordinates of u_cl are
                # shifted past the ones of u
                a.loc[name, :] = a.loc[u].fillna(0) + a.loc[u_cl].shift(
                    dim1, fill_value=0)
                # axis is passed by keyword: recent pandas no longer accepts it positionally
                a = a.drop(u, axis=0)
                a = a.drop(u_cl, axis=0)

                # arguments shared by every variant of the plotting call
                plot_kwargs = dict(
                    a=a,
                    reps=rep[name],
                    ax=ax,
                    canvas=canvas,
                    level_txt=levels[-1],
                    save_plots=self.save_plots,
                    ind_fig=index_for_saving_plots,
                    ind_fig_bis=ind_fig_bis,
                )

                if partial_index is None:
                    self.point_plot_mod2_gui(**plot_kwargs)

                # in the large dataset version of CURE
                else:
                    plot_kwargs.update(
                        data=data,
                        par_index=partial_index,
                        u=u,
                        u_cl=u_cl,
                        initial_ind=initial_index,
                    )

                    # only in last step of large dataset version of CURE
                    if ((len(heap) == k) and (not_sampled is not None)
                            and (not_sampled_ind is not None)):

                        # take random representative points from the final representatives
                        final_reps = {
                            key: random.sample(
                                points, min(n_rep_finalclust, len(points)))
                            for key, points in rep.items()
                        }
                        plot_kwargs.update(
                            last_reps=final_reps,
                            not_sampled=not_sampled,
                            not_sampled_ind=not_sampled_ind,
                            n_rep_fin=n_rep_finalclust,
                        )

                    partial_index = self.point_plot_mod2_gui(**plot_kwargs)

            index_for_saving_plots += 1

        return clusters, rep, a
예제 #9
0
    def clara(self, _df, _k, _fn):
        """Run the iterative CLARA clustering and keep the cheapest outcome.

        Every run draws 40 + 2 * _k random rows, clusters them with
        ``k_medoids``, plots the sample and evaluates the chosen medoids on
        the whole input with ``average_cost``.

        :param _df: Input data; it is wrapped in a dataframe.
        :param _k: Number of medoids.
        :param _fn: The distance function to use.
        :return: The minimized cost, the best medoid choices and the final configuration.
                 Nothing (None) when the input has fewer than 40 + 2 * _k rows; in that
                 case the log is cleared and an error message is written to it.
        """
        _df = pd.DataFrame(_df)
        size = len(_df)
        sample_size = 40 + _k * 2

        # very large inputs get a single run with a fixed iteration budget
        niter, runs = (1000, 1) if size > 100000 else (self.max_iter, 5)

        # no configuration seen yet: any real cost beats infinity
        min_avg_cost = np.inf
        best_choices = []
        best_results = {}

        # the run number is also the index of the saved plot
        for run_index in range(runs):
            self.log.appendPlainText("")
            self.log.appendPlainText("run number: {}".format(run_index))

            if size < sample_size:
                self.log.clear()
                for line in (
                        "ERROR",
                        "",
                        "The dimension of the input dataset must be at least 40 + 2*n_medoids",
                ):
                    self.log.appendPlainText(line)
                return

            # random row positions and the sample dataframe built from them
            sampling_idx = random.sample(list(range(size)), sample_size)
            sampled_rows = [_df.iloc[idx] for idx in sampling_idx]
            sampled_df = pd.DataFrame(sampled_rows, index=sampling_idx)

            # total cost, medoids and clusters of the sample
            pre_cost, pre_choice, pre_medoids = self.k_medoids(
                sampled_df, _k, _fn, niter)

            if self.delay != 0:
                pause_execution(self.delay)

            self.plot_pam_mod_gui(
                data=sampled_df,
                ax=self.ax,
                canvas=self.canvas,
                cl=pre_medoids,
                full=_df,
                ind_run=self.ind_run,
                ind_fig=run_index,
                save_plots=self.save_fig,
            )

            for line in (
                    "",
                    "RESULTS OF K-MEDOIDS",
                    "pre_cost: {}".format(pre_cost),
                    "pre_choice: {}".format(pre_choice),
                    "pre_medoids: {}".format(pre_medoids),
            ):
                self.log.appendPlainText(line)

            # average cost and clusters over the whole input
            tmp_avg_cost, tmp_medoids = self.average_cost(_df, _fn, pre_choice)

            for line in (
                    "",
                    "RESULTS OF WHOLE DATASET EVALUATION",
                    "tmp_avg_cost: {}".format(tmp_avg_cost),
                    "tmp_medoids: {}".format(tmp_medoids),
            ):
                self.log.appendPlainText(line)

            if tmp_avg_cost < min_avg_cost:
                previous_cost = round(min_avg_cost, 4)
                current_cost = round(tmp_avg_cost, 4)
                self.log.appendPlainText(
                    "new_cost is lower, from {0} to {1}".format(
                        previous_cost, current_cost))
                min_avg_cost = tmp_avg_cost
                best_choices = list(pre_choice)
                best_results = dict(tmp_medoids)
            elif tmp_avg_cost == min_avg_cost:
                self.log.appendPlainText("new_cost is equal")
            else:
                self.log.appendPlainText("new_cost is higher")

        self.log.appendPlainText("")
        self.log.appendPlainText("FINAL RESULT")

        if self.delay != 0:
            pause_execution(self.delay)

        self.plot_pam_mod_gui(
            data=_df,
            ax=self.ax,
            canvas=self.canvas,
            cl=best_results,
            full=_df,
            ind_run=self.ind_run,
            ind_fig=None,
            save_plots=self.save_fig,
        )

        return min_avg_cost, best_choices, best_results
예제 #10
0
    def agg_clust_mod_gui(self, delay=0):
        """
        Perform hierarchical agglomerative clustering with the provided linkage method, plotting every step
        of cluster aggregation.

        Clusters are merged two at a time until only self.n_clust of them remain. With
        self.linkage == "ward" the pair to merge is chosen by compute_ward_ij; for every other
        linkage it is the pair at the minimum of the distance matrix, which is then refreshed
        with update_mat. After each merge the current state is drawn with point_plot_mod_gui.

        :param delay: seconds for which to delay the algorithm, so that the images displayed in the GUI
                      show at a slower pace. 0 disables the pause.

        """

        levels = []
        levels2 = []
        ind_list = []
        index_for_saving_plots = 0

        n_points = len(self.X)

        # build matrix a, used to store points of clusters with their coordinates:
        # one row per cluster, columns named "0x", "0y", "1x", "1y", ...
        col = [str(k) + axis for k in range(n_points) for axis in ("x", "y")]
        a = pd.DataFrame(index=[str(k) for k in range(n_points)], columns=col)

        # at the start every cluster holds a single point, stored in the first x/y slot
        a["0x"] = self.X.T[0]
        a["0y"] = self.X.T[1]

        b = a.dropna(axis=1, how="all")

        # initial distance matrix
        X_dist1 = dist_mat_gen(b)
        var_sum = 0
        levels.append(var_sum)
        levels2.append(var_sum)

        # until the desired number of clusters is reached
        while len(a) > self.n_clust:

            if self.linkage == "ward":
                # find indexes corresponding to the minimum increase in total intra-cluster variance
                b = a.dropna(axis=1, how="all")
                b = b.fillna(np.inf)
                ((i, j), var_sum, par_var) = compute_ward_ij(self.X, b)

                levels.append(var_sum)
                levels2.append(par_var)
                ind_list.append((i, j))
                # here the pair is looked up by label (.loc)
                new_clust = a.loc[[i, j], :]

            else:
                # find indexes corresponding to the minimum distance
                dist_values = np.array(X_dist1)
                (i, j) = np.unravel_index(dist_values.argmin(),
                                          dist_values.shape)
                levels.append(np.min(dist_values))
                ind_list.append((i, j))
                # here the pair is looked up by position (.iloc)
                new_clust = a.iloc[[i, j], :]

                # update distance matrix
                X_dist1 = update_mat(X_dist1, i, j, self.linkage)

            first = new_clust.iloc[0]
            second = new_clust.iloc[1]

            # remove the two merged clusters; "index" is passed by keyword because
            # pandas >= 2.0 no longer accepts the axis as a positional argument of drop
            a = a.drop(index=[first.name, second.name])

            # number of coordinate slots already occupied by the first cluster
            dim1 = int(first.notna().sum())

            new_cluster_name = "({})-({})".format(first.name, second.name)

            # the merged row keeps the first cluster's values and appends the second
            # cluster's values shifted right by dim1 slots, so they follow the first ones
            a.loc[new_cluster_name, :] = first.fillna(0) + second.shift(
                dim1, fill_value=0)

            if delay != 0:
                # pause for the requested argument, i.e. the same value that was just tested
                pause_execution(delay)

            if self.linkage != "ward":
                self.point_plot_mod_gui(a,
                                        levels[-1],
                                        save_plots=self.save_plots,
                                        ind_fig=index_for_saving_plots)
            else:
                self.point_plot_mod_gui(a,
                                        levels[-2],
                                        levels2[-1],
                                        save_plots=self.save_plots,
                                        ind_fig=index_for_saving_plots)
            index_for_saving_plots += 1
예제 #11
0
    def OPTICS_gui(self, plot=True, plot_reach=False, delay=0):
        """
        Executes the OPTICS algorithm. Similar to DBSCAN, but uses a priority queue.

        Nothing is returned: the results are stored on the instance as self.ClustDist, a dictionary
        of the form point_index:reach_dist, and self.CoreDist, a dictionary of the form
        point_index:core_dist. Once every point has been processed, self.start_EXTRACT_OPTICS()
        is called.

        :param plot: if True, the scatter plot of the function point_plot_gui is displayed at each step.
        :param plot_reach: if True (and plot is also True), the reachability plot is displayed at each step.
        :param delay: seconds for which to delay the algorithm, so that the images displayed in the GUI
                      show at a slower pace. 0 disables the pause.
        """

        self.ClustDist = {}
        self.CoreDist = {}
        Seed = {}
        processed = []
        # mirror of `processed` used only for O(1) membership tests; the list itself
        # is kept because it is what gets handed to point_plot_gui
        processed_lookup = set()
        index_for_saving_plots = 0

        # create dictionary mapping the point index (as a string) to its coordinates
        X_dict = dict(zip([str(i) for i in range(len(self.X))], self.X))

        # until all points have been processed
        while len(processed) != len(self.X):

            # if queue is empty take a random point
            if not Seed:

                unprocessed = list(set(list(X_dict.keys())) - processed_lookup)

                (o, r) = (random.choice(unprocessed), np.inf)

                self.clear_seed_log(Seed, o)

            # else take the minimum and delete it from the queue
            else:

                o = min(Seed, key=Seed.get)
                r = Seed[o]

                # NOTE(review): clear_seed_log is called both before and after the
                # removal, presumably to show the queue in both states - confirm
                self.clear_seed_log(Seed, o)

                del Seed[o]

                self.clear_seed_log(Seed, o)

            # scan the neighborhood of the point
            N = scan_neigh1(X_dict, X_dict[o], self.eps)

            # update the cluster dictionary and the core distance dictionary
            self.ClustDist.update({o: r})

            self.CoreDist.update({o: minPTSdist(X_dict, o, self.mp, self.eps)})

            if delay != 0:
                pause_execution(delay)

            if plot:
                self.point_plot_gui(
                    X_dict,
                    X_dict[o],
                    N,
                    processed,
                    save_plots=self.save_plots,
                    ind_fig=index_for_saving_plots,
                )

                if plot_reach:
                    self.reach_plot_gui(
                        X_dict,
                        save_plots=self.save_plots,
                        ind_fig=index_for_saving_plots,
                    )

                # advance the figure index once per plotted step, so that successive
                # point plots do not all share index 0 when plot_reach is False
                index_for_saving_plots += 1

            # mark o as processed
            processed.append(o)
            processed_lookup.add(o)

            # if the point is core
            if len(N) >= self.mp - 1:
                # for each unprocessed point in the neighborhood
                for n in N:

                    if n in processed_lookup:
                        continue

                    # compute its reach_dist from o
                    p = reach_dist(X_dict, n, o, self.mp, self.eps)

                    # if it is in Seed, update its reach_dist if it is lower
                    if n in Seed:

                        if p < Seed[n]:
                            Seed[n] = p

                            self.clear_seed_log(Seed, o)
                    # else, insert it into the Seed
                    else:

                        Seed.update({n: p})

                        self.clear_seed_log(Seed, o)

        self.start_EXTRACT_OPTICS()