def template_compare_output(self, path, k, candidates, random_state, metric):
        """Check K-Means++ yields identical center indexes for points and a distance matrix.

        With a fixed random_state, initialization over raw points and over the
        precomputed pairwise distance matrix must select the same indexes.
        """
        sample = read_sample(path)
        matrix = calculate_distance_matrix(sample, metric=metric)

        result1 = kmeans_plusplus_initializer(sample, k, candidates, random_state=random_state, data_type='points', metric=metric).initialize(return_index=True)
        result2 = kmeans_plusplus_initializer(matrix, k, candidates, random_state=random_state, data_type='distance_matrix', metric=metric).initialize(return_index=True)

        assertion.eq(result1, result2)
def perform_clustering(embs, method='kmeans'):
    """Perform clustering of embeddings based on the specified method.

    Args:
        embs: Array-like of embedding vectors, one row per sample.
        method: One of 'kmeans', 'plda_kmeans', 'cosine_kmeans', 'rcc',
            'spectral' or 'spectral_cosine'.

    Returns:
        Predicted cluster labels, one per embedding.

    Raises:
        ValueError: If the method is unknown, or (for 'spectral_cosine') if
            the affinity matrix contains NaN/inf values.
    """

    if method == 'kmeans':

        num_speakers = est_num_clusters(embs, max_num=7, init_num=2)
        cluster = KMeans(n_clusters=num_speakers, init='k-means++', n_jobs=-1, random_state=0)
        # Labels are re-derived via cosine similarity to the fitted centroids
        # instead of using cluster.labels_ (KMeans' own Euclidean assignment).
        cluster.fit(embs)
        centroids = cluster.cluster_centers_
        pred_labels = np.argmax(cosine_similarity(embs, centroids), axis=1)

    elif method == 'plda_kmeans':

        num_speakers = est_num_clusters(embs, max_num=7, init_num=2)
        cluster = IvecKMeans(np.array(kmeans_plusplus_initializer(embs, num_speakers).initialize()),
                             num_speakers, score_method='plda')
        cluster.fit(embs)
        pred_labels = cluster.old_labels

    elif method == 'cosine_kmeans':

        num_speakers = est_num_clusters(embs, max_num=7, init_num=2)
        cluster = IvecKMeans(np.array(kmeans_plusplus_initializer(embs, num_speakers).initialize()),
                             num_speakers, score_method='cosine')
        cluster.fit(embs)
        pred_labels = cluster.labels()

    elif method == 'rcc':
        cluster = rcc.RccCluster(k=10, measure='cosine', clustering_threshold=1, verbose=False)
        pred_labels = cluster.fit(embs)

    elif method == 'spectral':
        cluster = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', n_neighbors=10, n_jobs=-1)
        cluster.fit_predict(embs)
        pred_labels = cluster.labels_

    elif method == 'spectral_cosine':
        # Gaussian kernel over cosine distance with fixed sigma^2 = 0.5.
        sigma_squared = 0.5
        cosine_dist = 1 - cosine_similarity(embs)
        affinity = np.exp(-np.power(cosine_dist, 2) / sigma_squared)
        if np.isnan(affinity).any() or np.isinf(affinity).any():
            raise ValueError('Affinity matrix contains NaN.')
        # Debug output: range of embedding norms and affinity values.
        norms = np.linalg.norm(embs, axis=1)
        print(np.max(norms), np.min(norms))
        print(np.max(affinity), np.min(affinity))
        cluster = SpectralClustering(n_clusters=2, affinity='precomputed', n_jobs=-1)
        cluster.fit(affinity)
        pred_labels = cluster.labels_

    else:
        raise ValueError('Clustering method not defined.')

    return pred_labels
    def get_modelo(self, algoritmo, eps, neig):
        """Build, run and post-process a pyclustering model for the requested algorithm.

        Args:
            algoritmo: Algorithm name: 'AGNES', 'BIRCH', 'CLARANS', 'CURE',
                'DBSCAN', 'FCM', 'KMEANS', 'KMEDOIDS', 'OPTICS' or 'ROCK'.
            eps: Radius parameter used by DBSCAN/OPTICS/ROCK.
            neig: Neighbor count used by DBSCAN/OPTICS; when non-zero, the
                method retries recursively with neig + 1 if too many groups
                were produced.

        Returns:
            numpy array with one group label per sample in self.amostras.

        Raises:
            ValueError: If `algoritmo` is not one of the supported names.
        """
        print(algoritmo + ' ' + str(eps) + ' - ' + str(neig))
        instance = None

        if algoritmo == 'AGNES':
            instance = agglomerative(self.amostras,
                                     self.numero_clusters,
                                     link=None)
        elif algoritmo == 'BIRCH':
            instance = birch(self.amostras,
                             self.numero_clusters,
                             entry_size_limit=10000)
        elif algoritmo == 'CLARANS':
            instance = clarans(self.amostras,
                               self.numero_clusters,
                               numlocal=100,
                               maxneighbor=1)
        elif algoritmo == 'CURE':
            instance = cure(self.amostras,
                            self.numero_clusters,
                            number_represent_points=5,
                            compression=0.5)
        elif algoritmo == 'DBSCAN':
            instance = dbscan(self.amostras, eps=eps, neighbors=neig)
        elif algoritmo == 'FCM':
            initial_centers = kmeans_plusplus_initializer(
                self.amostras, self.numero_clusters).initialize()
            instance = fcm(self.amostras, initial_centers)
        elif algoritmo == 'KMEANS':
            initial_centers = kmeans_plusplus_initializer(
                self.amostras, self.numero_clusters).initialize()
            instance = kmeans(self.amostras, initial_centers, tolerance=0.001)
        elif algoritmo == 'KMEDOIDS':
            # TODO(review): the initial medoids are seven duplicated zero
            # indexes; adjust to self.numero_clusters distinct indexes.
            instance = kmedoids(self.amostras,
                                initial_index_medoids=[0, 0, 0, 0, 0, 0, 0],
                                tolerance=0.0001)
        elif algoritmo == 'OPTICS':
            instance = optics(self.amostras, eps=eps, minpts=neig)
        elif algoritmo == 'ROCK':
            instance = rock(self.amostras,
                            eps=eps,
                            number_clusters=self.numero_clusters,
                            threshold=0.5)
        else:
            # Fail fast: previously this fell through with `instance = None`
            # and crashed below with AttributeError on `instance.process()`.
            raise ValueError('Unknown algorithm: %s' % algoritmo)

        instance.process()
        lista_agrupada = self.get_lista_agrupada(instance.get_clusters())
        lista_agrupada = np.array(lista_agrupada)

        if (neig != 0):
            n_grupos = len(np.unique(lista_agrupada))
            # Retry with a larger neighborhood until the number of groups
            # no longer exceeds the requested number of clusters.
            if n_grupos > self.numero_clusters:
                lista_agrupada = self.get_modelo(algoritmo, eps, neig + 1)
        return lista_agrupada
    def xmeans_cluster(self, domain_features):
        """Run X-Means five times and keep the run that allocated the most clusters.

        Returns:
            Tuple (final_centers, final_radiuses, final_clusters) from the best
            run, where each radius is the mean distance of a cluster's points
            to that cluster's center.
        """
        final_centers = None
        final_radiuses = None
        final_clusters = None
        for run in range(5):
            seed_centers = kmeans_plusplus_initializer(domain_features,
                                                       2).initialize()
            # X-Means starts from 2 clusters; allow at most half as many
            # clusters as there are points.
            upper_bound = int(len(domain_features) / 2)
            xmeans_instance = xmeans(domain_features, seed_centers, upper_bound)
            xmeans_instance.process()
            centers = xmeans_instance.get_centers()
            # First run always wins; afterwards only a run with more clusters
            # replaces the current best.
            if run == 0 or len(centers) > len(final_centers):
                radiuses = []
                for index, members in enumerate(xmeans_instance.get_clusters()):
                    total = 0.0
                    for point in members:
                        total += np.linalg.norm(domain_features[point] -
                                                centers[index])
                    radiuses.append(total / len(members))
                final_centers = xmeans_instance.get_centers()
                final_radiuses = radiuses
                final_clusters = xmeans_instance.get_clusters()

        return final_centers, final_radiuses, final_clusters
Exemple #5
0
    def _search_optimal_parameters(self, data, amount):
        """!
        @brief Performs cluster analysis for specified data several times to find optimal clustering result in line
                with WCE.

        @param[in] data (array_like): Input data that should be clustered.
        @param[in] amount (uint): Amount of clusters that should be allocated.

        @return (tuple) Optimal clustering result: (clusters, centers, wce).

        """
        best_wce, best_clusters, best_centers = float('+inf'), [], []
        for _ in range(self.__repeat):
            initial_centers = kmeans_plusplus_initializer(
                data, amount, random_state=self.__random_state).initialize()
            # NOTE(review): relies on kmeans.process() returning the kmeans
            # instance (fluent API) -- confirm the pyclustering version in use.
            solver = kmeans(data,
                            initial_centers,
                            tolerance=self.__tolerance,
                            ccore=False).process()

            candidate_wce = solver.get_total_wce()
            if candidate_wce < best_wce:
                best_wce = candidate_wce
                best_clusters = solver.get_clusters()
                best_centers = solver.get_centers()

            if len(initial_centers) == 1:
                break  # No need to rerun clustering for one initial center.

        return best_clusters, best_centers, best_wce
def elbow_analysis(sample_file_path, kmin, kmax, **kwargs):
    """Pick a cluster count via the elbow method, run K-Means and plot both results.

    Args:
        sample_file_path: Path to a sample file readable by read_sample().
        kmin, kmax: Cluster-count range evaluated by the elbow method.
        **kwargs: 'initializer' -- center initializer class (defaults to
            kmeans_plusplus_initializer).
    """
    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
    sample = read_sample(sample_file_path)

    elbow_instance = elbow(sample, kmin, kmax, initializer=initializer)
    elbow_instance.process()

    amount_clusters = elbow_instance.get_amount()
    wce = elbow_instance.get_wce()

    # Cluster with the amount suggested by the elbow analysis.
    centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
    kmeans_instance = kmeans(sample, centers)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    print("Sample '%s': Obtained amount of clusters: '%d'." %
          (sample_file_path, amount_clusters))

    # Plot the WCE curve and mark the detected elbow point in red.
    figure = plt.figure(1)
    ax = figure.add_subplot(111)
    ax.plot(range(kmin, kmax), wce, color='b', marker='.')
    ax.plot(amount_clusters,
            wce[amount_clusters - kmin],
            color='r',
            marker='.',
            markersize=10)
    ax.annotate("Elbow",
                (amount_clusters + 0.1, wce[amount_clusters - kmin] + 5))
    ax.grid(True)
    plt.ylabel("WCE")
    plt.xlabel("K")
    plt.show()

    kmeans_visualizer.show_clusters(sample, clusters, centers)
Exemple #7
0
    def __init__(self,
                 data,
                 initial_centers=None,
                 kmax=20,
                 tolerance=0.001,
                 criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION,
                 ccore=True,
                 **kwargs):
        """!
        @brief Constructor of clustering algorithm X-Means.
        
        @param[in] data (array_like): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] initial_centers (list): Initial coordinates of centers of clusters that are represented by list: `[center1, center2, ...]`,
                    if it is not specified then X-Means starts from the random center.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing.
        @param[in] criterion (splitting_type): Type of splitting creation (by default `splitting_type.BAYESIAN_INFORMATION_CRITERION`).
        @param[in] ccore (bool): Defines if C++ pyclustering library should be used instead of Python implementation.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `repeat`, `random_state`, `metric`, `alpha`, `beta`).

        <b>Keyword Args:</b><br>
            - repeat (uint): How many times K-Means should be run to improve parameters (by default is `1`).
               With larger `repeat` values suggesting higher probability of finding global optimum.
            - random_state (int): Seed for random state (by default is `None`, current system time is used).
            - metric (distance_metric): Metric that is used for distance calculation between two points (by default
               euclidean square distance).
            - alpha (double): Parameter distributed [0.0, 1.0] for alpha probabilistic bound \f$Q\left(\alpha\right)\f$.
               The parameter is used only in case of MNDL splitting criterion, in all other cases this value is ignored.
            - beta (double): Parameter distributed [0.0, 1.0] for beta probabilistic bound \f$Q\left(\beta\right)\f$.
               The parameter is used only in case of MNDL splitting criterion, in all other cases this value is ignored.

        """

        self.__pointer_data = numpy.array(data)
        self.__clusters = []
        self.__random_state = kwargs.get('random_state', None)
        # The metric is copied so the algorithm does not share state with the
        # caller's metric object.
        self.__metric = copy.copy(
            kwargs.get('metric',
                       distance_metric(type_metric.EUCLIDEAN_SQUARE)))

        # Without explicit centers, seed two centers with K-Means++.
        if initial_centers is not None:
            self.__centers = numpy.array(initial_centers)
        else:
            self.__centers = kmeans_plusplus_initializer(
                data, 2, random_state=self.__random_state).initialize()

        self.__kmax = kmax
        self.__tolerance = tolerance
        self.__criterion = criterion
        self.__total_wce = 0.0
        self.__repeat = kwargs.get('repeat', 1)
        self.__alpha = kwargs.get('alpha', 0.9)
        self.__beta = kwargs.get('beta', 0.9)

        # The C++ core cannot execute user-defined Python metrics, and is only
        # used when the compiled library is actually available.
        self.__ccore = ccore and self.__metric.get_type(
        ) != type_metric.USER_DEFINED
        if self.__ccore is True:
            self.__ccore = ccore_library.workable()

        self.__verify_arguments()
def elbow_analysis(sample_file_path, kmin, kmax, **kwargs):
    """Determine the optimal cluster count with the elbow method, then run
    K-Means with that count and visualize the WCE curve and the clusters.

    Keyword Args:
        initializer: Center initializer class used by the elbow search
            (default: kmeans_plusplus_initializer).
    """
    center_initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
    sample = read_sample(sample_file_path)

    analyser = elbow(sample, kmin, kmax, initializer=center_initializer)
    analyser.process()
    amount_clusters = analyser.get_amount()
    wce = analyser.get_wce()

    # Cluster the sample with the amount suggested by the elbow.
    seeds = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
    solver = kmeans(sample, seeds)
    solver.process()
    allocated_clusters = solver.get_clusters()
    final_centers = solver.get_centers()

    print("Sample '%s': Obtained amount of clusters: '%d'." % (sample_file_path, amount_clusters))

    # Plot the WCE curve and highlight the detected elbow point.
    elbow_x = amount_clusters
    elbow_y = wce[amount_clusters - kmin]
    figure = plt.figure(1)
    axis = figure.add_subplot(111)
    axis.plot(range(kmin, kmax), wce, color='b', marker='.')
    axis.plot(elbow_x, elbow_y, color='r', marker='.', markersize=10)
    axis.annotate("Elbow", (elbow_x + 0.1, elbow_y + 5))
    axis.grid(True)
    plt.ylabel("WCE")
    plt.xlabel("K")
    plt.show()

    kmeans_visualizer.show_clusters(sample, allocated_clusters, final_centers)
def subcluster(dataset):
    """Cluster `dataset` with K-Means, choosing k via the elbow method.

    For a very narrow k range (kmax - kmin <= 3) the midpoint is used instead
    of running the elbow analysis; k is always clamped to len(dataset).

    Args:
        dataset: Points to cluster.

    Returns:
        List of clusters as returned by pyclustering's kmeans.get_clusters().
    """
    kmin = 1
    kmax = 20
    if kmax > len(dataset):
        kmax = len(dataset)
    optimal_clusters = 1
    # Determining Clusters
    # Might potentially be inefficient technique
    # Instead of elbow, could again repeat what is done
    # in the main clustering, going through K values
    # Choosing one with lowest error from calcError
    # This could be very time intensive however
    if kmax - kmin <= 3:
        optimal_clusters = int((kmin + kmax) / 2)
    else:
        elbow_inst = elbow(dataset, kmin, kmax)
        elbow_inst.process()
        optimal_clusters = elbow_inst.get_amount()
    if optimal_clusters > len(dataset):
        optimal_clusters = len(dataset)
    initial_centers = kmeans_plusplus_initializer(
        dataset, optimal_clusters).initialize()
    metric = distance_metric(type_metric.EUCLIDEAN)
    kmeans_instance = kmeans(dataset, initial_centers, metric=metric)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()

    return clusters
    def templateKmeasPlusPlusCenterInitializer(self, data, amount):
        """Check K-Means++ yields `amount` centers with the data's dimensionality."""
        centers = kmeans_plusplus_initializer(data, amount).initialize()

        self.assertEqual(amount, len(centers))
        self.assertEqual(len(data[0]), len(centers[0]))

        return centers
Exemple #11
0
    def __search_optimial_parameters(self, local_data):
        """!
        @brief Splits the given region into two clusters, running k-means
                repeatedly (the 'repeat' argument) to approach the global optimum.

        @param[in] local_data (list): Points of a region that should be split into two clusters.

        @return (tuple) List of allocated clusters, list of centers and total WCE (clusters, centers, wce).

        """
        best_wce = float('+inf')
        best_centers, best_clusters = None, None

        for _ in range(self.__repeat):
            # K-Means++ cannot evaluate more candidates than there are points.
            candidate_amount = min(5, len(local_data))

            seeds = kmeans_plusplus_initializer(
                local_data, 2, candidate_amount).initialize()

            solver = kmeans(local_data,
                            seeds,
                            tolerance=self.__tolerance,
                            ccore=False)
            solver.process()

            wce = solver.get_total_wce()
            if wce < best_wce:
                best_centers = solver.get_centers()
                best_clusters = solver.get_clusters()
                best_wce = wce

        return best_clusters, best_centers, best_wce
Exemple #12
0
    def templateKmeansPlusPlusSeveralRuns(self, path_sample, amount, candidates):
        """Check that three consecutive K-Means++ runs can produce fully unique medoids.

        Initialization is randomized, so the check is retried up to 10 times;
        the test fails only if every attempt produced duplicate indexes.
        """
        sample = read_sample(path_sample)

        attempts = 10
        for _ in range(attempts):
            medoids = kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
            medoids += kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
            medoids += kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)

            unique_medoids = set(medoids)
            if len(unique_medoids) != len(medoids):
                continue

            return

        self.assertTrue(False, "K-Means++ does not return unique medoids during %d attempts." % attempts)
def cluster_xMean_FixSize(binsList, amount_initial_centers=3, kmax=10):
    """Cluster bins with X-Means on (representation, variance, slope) features.

    Args:
        binsList: Iterable of bin objects exposing get_representation(),
            get_variance() and get_slope().
        amount_initial_centers: Number of K-Means++ seed centers, i.e. the
            cluster count X-Means starts its analysis from.
        kmax: Maximum number of clusters X-Means may allocate.

    Returns:
        Dict with keys "clusters" (lists of bin indexes) and "centers".
    """
    # One 3-feature vector per bin. (The original also built separate
    # `means`/`vars`/`slopes` lists that were never read -- removed, which
    # also stops shadowing the builtins `vars` and `bin`.)
    sample = [[b.get_representation(), b.get_variance(), b.get_slope()]
              for b in binsList]

    # Prepare initial centers - amount of initial centers defines amount of
    # clusters from which X-Means will start analysis.
    initial_centers = kmeans_plusplus_initializer(
        sample, amount_initial_centers).initialize()
    xmeans_instance = xmeans(sample, initial_centers, kmax)
    xmeans_instance.process()
    # Extract clustering results: clusters and their centers.
    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    print(len(clusters))

    return {"clusters": clusters, "centers": centers}
Exemple #14
0
    def get_kmeans_clusters(data, count_centers):
        """Cluster the rows of `data` with K-Means and wrap them into Cluster objects.

        Side effect: appends the RS statistic (SSB/SST) for this run to the
        module-level RS_RESULT list and prints it.

        Args:
            data: Object exposing getRows(); each row exposes getDataArray().
            count_centers: Number of clusters to allocate.

        Returns:
            List of Cluster objects, one per allocated cluster.
        """
        rows = data.getRows()
        input_data = list()
        result_clusters = list()
        for row in rows:
            input_data.append(row.getDataArray())
        SST = calculate_sst(input_data)

        # initialize initial centers using K-Means++ method
        initial_centers = kmeans_plusplus_initializer(
            input_data, count_centers).initialize()
        # create instance of K-Means algorithm with prepared centers
        kmeans_instance = kmeans(input_data, initial_centers)
        # run cluster analysis and obtain results
        kmeans_instance.process()
        clusters = kmeans_instance.get_clusters()
        colorRange = Constants.DEFAULT_COLOR_SET
        SSB = 0
        SSW = 0
        for i, cluster in enumerate(clusters):
            result_cluster = Cluster(
                KMeansWindow.get_rows_kmeans(data, cluster))
            ro = KMeansWindow.get_rows_kmeans(data, cluster)
            f = [x._dataArray for x in ro]
            SSW = SSW + calculate_ssw(f)
            # Each cluster is both named and colored after a randomly
            # chosen color from the default palette.
            colour = random.choice(colorRange)
            result_cluster.setName(colour)
            result_cluster.setColor(colour)
            result_clusters.append(result_cluster)
        SSB = calculate_ssb(SST, SSW)
        RS_RESULT.append(SSB / SST)

        print(RS_RESULT)
        return result_clusters
Exemple #15
0
    def __improve_parameters(self, centers, available_indexes=None):
        """!
        @brief Performs k-means clustering in the specified region.
        
        @param[in] centers (list): Cluster centers, if None then automatically generated two centers using center initialization method.
        @param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None then all points are used.
        
        @return (tuple) List of allocated clusters, list of centers and total WCE.
        
        """

        # Trivial region: a single point is its own cluster with zero WCE.
        # NOTE(review): the second element is the bare point, not a list of
        # centers as in the general path below -- callers appear to rely on
        # this shape; confirm before changing.
        if available_indexes and len(available_indexes) == 1:
            index_center = available_indexes[0]
            return [available_indexes], self.__pointer_data[index_center], 0.0

        local_data = self.__pointer_data
        if available_indexes:
            local_data = [self.__pointer_data[i] for i in available_indexes]

        local_centers = centers
        if centers is None:
            # Seed two centers with K-Means++ (farthest-candidate strategy).
            local_centers = kmeans_plusplus_initializer(local_data, 2, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()

        kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False)
        kmeans_instance.process()

        local_wce = kmeans_instance.get_total_wce()
        local_centers = kmeans_instance.get_centers()
        
        clusters = kmeans_instance.get_clusters()
        if available_indexes:
            # Map local cluster indexes back to indexes in the full dataset.
            clusters = self.__local_to_global_clusters(clusters, available_indexes)
        
        return clusters, local_centers, local_wce
Exemple #16
0
def calculate_fitness(ind, df, X, target, func_set, operations,
                      number_of_clusters):
    """Evaluate an evolved distance expression by clustering X with it.

    The individual's unrolled expression is used as a custom K-Means distance
    metric; fitness is the V-measure between the resulting labels and `target`.

    Args:
        ind: Individual exposing unroll_expression().
        df: DataFrame holding the samples; a temporary 'y_pred' column is
            written for scoring.
        X: Feature matrix to cluster.
        target: Ground-truth labels.
        func_set, operations: Primitive sets consumed by the evaluator.
        number_of_clusters: Number of clusters (k) for K-Means.

    Returns:
        V-measure score of the clustering against `target`.
    """
    ind_exp = ind.unroll_expression([])

    def fitness_distance(data1, data2):
        """Return the distance between two points under the evolved expression.

        Args:
            data1, data2: The two points used in the distance calculation.
        """
        # NOTE(review): `eval` is called with five arguments, so it must be a
        # project-defined evaluator shadowing the builtin -- confirm import.
        result = eval(data1, data2, func_set, operations, ind_exp)
        return result

    # distance function
    fitness_metric = distance_metric(type_metric.USER_DEFINED,
                                     func=fitness_distance)

    k = number_of_clusters

    initial_centers = kmeans_plusplus_initializer(X, k).initialize()
    kmeans_instance = kmeans(X, initial_centers, metric=fitness_metric)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()

    # Materialize the predicted labels on the dataframe for scoring.
    for i in range(len(clusters)):
        df.loc[clusters[i], 'y_pred'] = i

    score = v_measure_score(target, df.y_pred)
    # resetting dataframe
    # NOTE(review): drop() returns a copy and only rebinds the local name, so
    # the caller's dataframe keeps the 'y_pred' column -- confirm intent
    # (inplace=True would actually remove it).
    df = df.drop(['y_pred'], axis=1)

    return score
Exemple #17
0
    def __improve_parameters(self, centers, available_indexes = None):
        """!
        @brief Performs k-means clustering in the specified region.
        
        @param[in] centers (list): Centers of clusters.
        @param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None - then all points are used.
        
        @return (tuple) List of allocated clusters (each cluster contains indexes of objects in list of data) and list of centers.
        
        """

        # Trivial region: a single point is its own cluster.
        # NOTE(review): this branch returns the bare point as the second item,
        # while the general path returns a list of centers -- confirm callers
        # expect this shape.
        if available_indexes and len(available_indexes) == 1:
            index_center = available_indexes[0]
            return [ available_indexes ], self.__pointer_data[index_center]

        local_data = self.__pointer_data
        if available_indexes:
            local_data = [ self.__pointer_data[i] for i in available_indexes ]

        local_centers = centers
        if centers is None:
            # Seed two centers with K-Means++ (farthest-candidate strategy).
            local_centers = kmeans_plusplus_initializer(local_data, 2, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()

        kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False)
        kmeans_instance.process()

        local_centers = kmeans_instance.get_centers()
        
        clusters = kmeans_instance.get_clusters()
        if available_indexes:
            # Map local cluster indexes back to indexes in the full dataset.
            clusters = self.__local_to_global_clusters(clusters, available_indexes)
        
        return clusters, local_centers
Exemple #18
0
    def templateKmeansPlusPlusUnique(self, path_sample, amount, candidates):
        """Check that K-Means++ returns only distinct medoid indexes."""
        sample = read_sample(path_sample)
        medoid_indexes = kmeans_plusplus_initializer(
            sample, amount, candidates).initialize(return_index=True)

        # Deduplicating must not shrink the list.
        self.assertEqual(len(set(medoid_indexes)), len(medoid_indexes))
Exemple #19
0
def x_means(X, num_init_clusters=8, visualize=True):
    """Cluster X with X-Means seeded by K-Means++ centers.

    Args:
        X: Iterable of points; converted to a list for pyclustering.
        num_init_clusters: Number of initial centers X-Means starts from.
        visualize: If True, show the clusters with pyclustering's
            multi-dimensional visualizer.

    Returns:
        Tuple (centers, clusters) as produced by pyclustering's xmeans.
    """
    # Only the names actually used are imported (the original also pulled in
    # kmeans, kmeans_visualizer and cluster_visualizer, all unused).
    from pyclustering.cluster.xmeans import xmeans
    from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
    from pyclustering.cluster import cluster_visualizer_multidim

    X = list(X)

    start_centers = kmeans_plusplus_initializer(
        X, num_init_clusters).initialize()

    # kmax=32; criterion=0 -- presumably BAYESIAN_INFORMATION_CRITERION (the
    # first splitting_type member); confirm against the pyclustering version.
    xmeans_instance = xmeans(X, start_centers, 32, ccore=True, criterion=0)

    # Run cluster analysis and obtain results.
    xmeans_instance.process()
    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    print('Number of cluster centers calculated :', len(centers))

    if visualize:
        visualizer = cluster_visualizer_multidim()
        visualizer.append_clusters(clusters, X)
        visualizer.show()
    return centers, clusters
Exemple #20
0
def cl_xmeans(sample):
    """Cluster `sample` with X-Means (K-Means++ seeding, 2 starting centers, kmax=20)."""
    seeds = kmeans_plusplus_initializer(sample, 2).initialize()
    instance = xmeans(sample, seeds, 20)
    instance.process()
    return instance.get_clusters()


# slc: single linkage clustering
Exemple #21
0
def consensus_clustering(input_path,CLUSTERING_PATH):
    """Consensus clustering over repeated K-Medoids runs with several k values.

    Reads a precomputed distance matrix from CLUSTERING_PATH, runs K-Medoids
    `iterations` times for each rule-of-thumb k, accumulates a weighted
    co-association (consensus) matrix, converts it into a distance matrix and
    saves it, then runs average-link agglomerative clustering for each k and
    saves the resulting clusters.

    Args:
        input_path: Path forwarded to save_clusters() for output naming.
        CLUSTERING_PATH: Directory containing 'distance_matrix.csv' and where
            the consensus matrix CSV is written.
    """
    df = pd.read_csv(CLUSTERING_PATH+"distance_matrix.csv", delimiter=',', header=None)

    #read all the values
    sample = df.values
    #number of elements
    N= len(df.columns)
    print(isinstance(np.asmatrix(sample),np.matrix))
    #rule-of-thumb choices for k, all derived from N
    df1= pd.DataFrame(columns=['value'],index=['k_sqrtNBy4','k_sqrtNDiv4','k_sqrtNDiv2','k_sqrtNBy2','k_sqrtN',])
    #df1.at['k_1','value']= 1
    df1.at['k_sqrtN','value']= round(sqrt(N),0)
    df1.at['k_sqrtNDiv2', 'value'] = round(sqrt(N / 2),0)
    df1.at['k_sqrtNBy2', 'value'] = round(sqrt(N * 2),0)
    df1.at['k_sqrtNDiv4', 'value'] = round(sqrt(N / 4),0)
    df1.at['k_sqrtNBy4', 'value'] = round(sqrt(N*4),0)

    # Declare the weight of each vote: every (k, iteration) run contributes
    # equally so the consensus matrix stays in [0, 1].
    # consensus matrix is NxN
    iterations=20
    weight1 = 1 / iterations
    weight2 = 1 / len(df1.index)  # the amount of k values used
    consensus_matrix = np.zeros((N, N))

    for k in df1.index:
        # Run the same algorithm with several k values; each configuration is
        # run `iterations` times.
        for iteration in range(iterations):
            k_value=int(df1.loc[k].values[0])
            initial_medoids = kmeans_plusplus_initializer(sample,k_value).initialize(return_index=True)
            kmedoids_instance = kmedoids(np.asmatrix(sample), initial_medoids,data_type="distance_matrix")
            kmedoids_instance.process()
            clusters = kmedoids_instance.get_clusters()
            coassociations_matrix= np.zeros((N, N))
            for cluster in clusters:
                for crypto in cluster:
                    # set the diagonal elements with value 1
                    coassociations_matrix[crypto][crypto] = 1
                    # mark every pair of co-clustered elements symmetrically
                    for crypto1 in cluster:
                        coassociations_matrix[crypto][crypto1]= 1
                        coassociations_matrix[crypto1][crypto] = 1
            # accumulate this run's votes into the consensus matrix
            consensus_matrix=consensus_matrix+coassociations_matrix
    consensus_matrix = consensus_matrix*weight1*weight2
    # now, by doing (1 - consensus_matrix) we get the dissimilarity/distance matrix
    distance_matrix= 1-consensus_matrix
    df = pd.DataFrame(data=distance_matrix)
    df.to_csv(CLUSTERING_PATH+"consensus_matrix(distance).csv",sep=",")

    # Hierarchical clustering on the consensus distance matrix, once per k
    for k in df1.index:
        k_value = int(df1.loc[k].values[0])

        agglomerative_instance = agglomerative(distance_matrix,k_value, type_link.AVERAGE_LINK)
        agglomerative_instance.process()
        # Obtain results of clustering
        clusters = agglomerative_instance.get_clusters()
        save_clusters(input_path,clusters,k,CLUSTERING_PATH)
Exemple #22
0
    def templateLengthProcessWithMetric(path_to_file, initial_medoids,
                                        expected_cluster_length, metric,
                                        ccore_flag, **kwargs):
        """Run K-Medoids on a sample and assert the result is consistent.

        Retries up to 10 times when medoids are initialized via K-Means++,
        since randomized initialization can occasionally yield degenerate runs.

        Args:
            path_to_file: Sample file readable by read_sample().
            initial_medoids: Starting medoid indexes (replaced when the
                'initialize_medoids' kwarg is given).
            expected_cluster_length: Expected cluster sizes (order-insensitive),
                or None to skip that check.
            metric: Distance metric, or None for squared Euclidean.
            ccore_flag: Whether to use the C++ core implementation.
            **kwargs: 'data_type' ('points'/'distance_matrix'), 'input_type'
                ('list'/'numpy'), 'initialize_medoids' (K-Means++ amount).
        """
        sample = read_sample(path_to_file)
        data_type = kwargs.get('data_type', 'points')
        input_type = kwargs.get('input_type', 'list')
        initialize_medoids = kwargs.get('initialize_medoids', None)

        if metric is None:
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        input_data = sample
        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)

            if input_type == 'numpy':
                input_data = numpy.array(input_data)

        testing_result = False
        testing_attempts = 1
        if initialize_medoids is not None:  # in case center initializer randomization appears
            testing_attempts = 10

        for _ in range(testing_attempts):
            if initialize_medoids is not None:
                initial_medoids = kmeans_plusplus_initializer(
                    sample, initialize_medoids).initialize(return_index=True)

            kmedoids_instance = kmedoids(input_data,
                                         initial_medoids,
                                         0.025,
                                         ccore_flag,
                                         metric=metric,
                                         data_type=data_type)
            kmedoids_instance.process()

            clusters = kmedoids_instance.get_clusters()
            medoids = kmedoids_instance.get_medoids()

            # One medoid per cluster is required.
            if len(clusters) != len(medoids):
                continue

            # Medoids must be unique.
            if len(set(medoids)) != len(medoids):
                continue

            # Every sample point must be assigned to exactly one cluster.
            obtained_cluster_sizes = [len(cluster) for cluster in clusters]
            if len(sample) != sum(obtained_cluster_sizes):
                continue

            if expected_cluster_length is not None:
                obtained_cluster_sizes.sort()
                expected_cluster_length.sort()
                if obtained_cluster_sizes != expected_cluster_length:
                    continue

            testing_result = True

        assertion.true(testing_result)
    def xmeansRoutine(self):
        """Run X-Means on self.datalist, seeded with K-Means++ centers.

        Populates self.initial_centers, self.xmeans_instance, self.clusters
        and self.centers.
        """
        self.initial_centers = kmeans_plusplus_initializer(
            self.datalist, self.amount_initial_centers).initialize()
        self.xmeans_instance = xmeans(self.datalist, self.initial_centers,
                                      self.amount_max_centers)
        self.xmeans_instance.process()
        self.clusters = self.xmeans_instance.get_clusters()
        self.centers = self.xmeans_instance.get_centers()
    def templateKmeasPlusPlusCenterInitializer(self, data, amount):
        """Check K-Means++ yields `amount` centers, each with the data's dimensionality."""
        centers = kmeans_plusplus_initializer(data, amount).initialize()

        assertion.eq(amount, len(centers))

        for center in centers:
            assertion.eq(len(data[0]), len(center))

        return centers
    def templateKmeasPlusPlusCenterInitializer(self, data, amount):
        """Check K-Means++ yields `amount` centers, each with the data's dimensionality."""
        centers = kmeans_plusplus_initializer(data, amount).initialize()

        assertion.eq(amount, len(centers))

        for center in centers:
            assertion.eq(len(data[0]), len(center))

        return centers
def clusterData(data, numberOfConversations, getData):
    """Cluster messages into `numberOfConversations` groups with k-means.

    Returns the cluster membership lists when `getData` is truthy,
    otherwise the final cluster centers (conversation times).
    """
    seeds = kmeans_plusplus_initializer(data, numberOfConversations).initialize()
    model = kmeans(data, seeds)
    model.process()

    if getData:
        # Which messages belong to which conversation.
        return model.get_clusters()
    # Time of each conversation (one center per conversation).
    return model.get_centers()
    def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
        """Run k-medoids over a sample file and validate cluster/medoid invariants.

        Keyword args: 'data_type' ('points' or 'distance_matrix'),
        'input_type' ('list' or 'numpy', only used for distance matrices),
        'initialize_medoids' (if set, the amount of medoids to seed via
        k-means++ instead of using `initial_medoids`), 'itermax' (default 200).
        """
        sample = read_sample(path_to_file)
        data_type = kwargs.get('data_type', 'points')
        input_type = kwargs.get('input_type', 'list')
        initialize_medoids = kwargs.get('initialize_medoids', None)
        itermax = kwargs.get('itermax', 200)

        # Fall back to squared Euclidean when no metric is supplied.
        if metric is None:
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        input_data = sample
        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)

            if input_type == 'numpy':
                input_data = numpy.array(input_data)

        testing_result = False
        testing_attempts = 1
        if initialize_medoids is not None:  # in case center initializer randomization appears
            testing_attempts = 10

        for _ in range(testing_attempts):
            # Re-seed medoids each attempt when randomized initialization is used.
            if initialize_medoids is not None:
                initial_medoids = kmeans_plusplus_initializer(sample, initialize_medoids).initialize(return_index=True)

            kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001, ccore_flag, metric=metric, data_type=data_type, itermax=itermax)
            kmedoids_instance.process()

            clusters = kmedoids_instance.get_clusters()
            medoids = kmedoids_instance.get_medoids()

            # With zero iterations allowed nothing should have changed.
            if itermax == 0:
                assertion.eq([], clusters)
                assertion.eq(medoids, initial_medoids)
                return

            # The remaining checks retry (continue) on failure, since a
            # randomized start may occasionally produce a degenerate result.
            if len(clusters) != len(medoids):
                continue

            # Medoids must be unique.
            if len(set(medoids)) != len(medoids):
                continue

            # Every sample point must be assigned to exactly one cluster.
            obtained_cluster_sizes = [len(cluster) for cluster in clusters]
            if len(sample) != sum(obtained_cluster_sizes):
                continue

            # Cluster size distribution must match expectations (order-free).
            if expected_cluster_length is not None:
                obtained_cluster_sizes.sort()
                expected_cluster_length.sort()
                if obtained_cluster_sizes != expected_cluster_length:
                    continue

            testing_result = True

        assertion.true(testing_result)
    def templateKmeasPlusPlusCenterInitializerIndexReturn(self, data, amount):
        """Initialize centers as indices into `data` and validate their range."""
        centers = kmeans_plusplus_initializer(data, amount).initialize(return_index=True)

        assertion.eq(amount, len(centers))

        # Every returned index must address a point inside `data`.
        total_points = len(data)
        for index in centers:
            assertion.gt(total_points, index)
            assertion.le(0, index)

        return centers
Example #29
0
    def clustering_with_answer(data_file, answer_file, ccore, **kwargs):
        """Cluster `data_file` with k-medoids and verify the result against
        the expected clustering stored in `answer_file`."""
        data = read_sample(data_file)
        reader = answer_reader(answer_file)

        # Use as many medoids as the answer declares clusters.
        amount_medoids = len(reader.get_clusters())

        initial_medoids = kmeans_plusplus_initializer(
            data, amount_medoids, **kwargs).initialize(return_index=True)
        kmedoids_instance = kmedoids(data, initial_medoids, 0.001, ccore,
                                     **kwargs)

        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        expected_length_clusters = sorted(reader.get_cluster_lengths())

        # Structural checks: cluster count matches, and every point is
        # accounted for exactly once in the total size.
        assertion.eq(len(expected_length_clusters), len(medoids))
        assertion.eq(len(data), sum([len(cluster) for cluster in clusters]))
        assertion.eq(sum(expected_length_clusters),
                     sum([len(cluster) for cluster in clusters]))

        # Medoids must not repeat.
        unique_medoids = set()
        for medoid in medoids:
            assertion.false(
                medoid in unique_medoids,
                message="Medoids '%s' is not unique (actual medoids: '%s')" %
                (str(medoid), str(unique_medoids)))
            unique_medoids.add(medoid)

        # No point may belong to more than one cluster.
        unique_points = set()
        for cluster in clusters:
            for point in cluster:
                assertion.false(
                    point in unique_points,
                    message=
                    "Point '%s' is already assigned to one of the clusters." %
                    str(point))
                unique_points.add(point)

        # Cluster size distribution must match the expected answer.
        assertion.eq(expected_length_clusters,
                     sorted([len(cluster) for cluster in clusters]))

        # Every actual cluster must appear verbatim among expected clusters.
        expected_clusters = reader.get_clusters()
        for actual_cluster in clusters:
            cluster_found = False
            for expected_cluster in expected_clusters:
                if actual_cluster == expected_cluster:
                    cluster_found = True

            assertion.true(
                cluster_found,
                message="Actual cluster '%s' is not found among expected." %
                str(actual_cluster))
Example #30
0
    def __run_feature_xmeans(self, features, num_init_centers = 10, max_centers = 30, \
        clust_size_threshold = 1, dist_threshold = 10) -> list:
        """Cluster `features` with X-Means and merge centroids lying close together.

        Centroids whose rounded positions are within `dist_threshold` of each
        other (directly or through a chain of close centroids) are averaged
        into a single centroid. Remaining centroids are kept only if their
        cluster contains more than `clust_size_threshold` points.
        Returns the list of resulting centroids.
        """
        # run xmeans algorithm, seeded by k-means++
        initial_centers = kmeans_plusplus_initializer(
            features, num_init_centers).initialize()
        algo = xmeans(features,
                      initial_centers=initial_centers,
                      kmax=max_centers)
        algo.process()
        centroids, clusters = algo.get_centers(), algo.get_clusters()

        # round centroid coordinates to integer (row, col) positions
        p_centroids = []
        for coord in centroids:
            row, col = coord[0], coord[1]
            p_centroids.append((int(round(row)), int(round(col))))

        # collect index pairs of centroids within dist_threshold of each other
        comb_indices = set()
        for comb in itertools.combinations(range(len(p_centroids)), 2):
            cen, c_cen = p_centroids[comb[0]], p_centroids[comb[1]]
            dist = math.sqrt((cen[0] - c_cen[0])**2 + (cen[1] - c_cen[1])**2)
            if dist <= dist_threshold: comb_indices.add(frozenset(comb))

        # find transitive centroid clusters (connected components of pairs).
        # BUGFIX: the previous single-pass merge attached a pair only to the
        # FIRST intersecting group, so chains such as {0,1}, {2,3}, {1,2}
        # were never fully merged and an index could land in two groups
        # (double-counting its centroid). Merge every intersecting group.
        trans_centroids = []
        for comb in comb_indices:
            merged = set(comb)
            kept = []
            for group in trans_centroids:
                if group & merged:
                    merged |= group
                else:
                    kept.append(group)
            kept.append(frozenset(merged))
            trans_centroids = kept

        # replace each component with the mean of its member centroids
        c_centroids, added_indices = [], set()
        for combs in trans_centroids:
            n_centroid = [0, 0]
            for c_idx in combs:
                added_indices.add(c_idx)
                n_centroid[0] += centroids[c_idx][0]
                n_centroid[1] += centroids[c_idx][1]
            n_centroid[0] /= len(combs)
            n_centroid[1] /= len(combs)
            c_centroids.append(n_centroid)

        # keep unmerged centroids whose clusters are large enough
        for c_idx in range(len(centroids)):
            if c_idx in added_indices or len(clusters[c_idx]) \
                <= clust_size_threshold:
                continue
            c_centroids.append(centroids[c_idx])
        return c_centroids
def est_num_clusters(embs, max_num, init_num):
    """Use xmeans to estimate number of speakers.

    Starts from `init_num` k-means++ seeds and lets X-Means grow up to
    `max_num` clusters; the resulting cluster count is the estimate.
    """
    points = embs.tolist()
    seeds = kmeans_plusplus_initializer(points, init_num).initialize()

    estimator = xmeans(points, seeds, kmax=max_num, ccore=True)
    estimator.process()

    num_speakers = len(estimator.get_clusters())
    print('Estimated number of speakers: ' + str(num_speakers))
    return num_speakers
Example #32
0
 def templateDrawClustersNoFailure(self, data_path, amount_clusters):
     """Smoke test: clustering plus draw_clusters must yield a drawing axis."""
     sample = read_sample(data_path)

     initial_centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
     # NOTE(review): `amount_clusters` is passed as the third positional
     # argument of kmeans here; other call sites pass only two positionals —
     # confirm this matches the intended kmeans signature.
     kmeans_instance = kmeans(sample, initial_centers, amount_clusters)
     kmeans_instance.process()

     ax = draw_clusters(sample, kmeans_instance.get_clusters())
     assert None != ax
    def templateKmeasPlusPlusCenterInitializerIndexReturn(self, data, amount):
        """Initialize centers as indices and check range plus uniqueness."""
        centers = kmeans_plusplus_initializer(data, amount).initialize(return_index=True)

        assertion.eq(amount, len(centers))

        points_available = len(data)
        for idx in centers:
            # Index must fall inside the data set and appear exactly once.
            assertion.gt(points_available, idx)
            assertion.le(0, idx)
            assertion.eq(1, centers.count(idx))

        return centers
Example #34
0
def template_kmeans_plusplus_initializer(path, amount, draw=True):
    """Read a sample, pick `amount` k-means++ centers (one candidate per
    center) and optionally visualize sample and centers together."""
    sample = read_sample(path)
    centers = kmeans_plusplus_initializer(sample, amount, 1).initialize()

    if draw is True:
        view = cluster_visualizer()
        view.append_cluster(sample)
        view.append_cluster(centers, marker='*', markersize=10)
        view.show()

    return sample, centers
Example #35
0
def template_segmentation_image_amount_colors(source, amount):
    """Segment an image into `amount` colors via k-means and draw the mask."""
    pixels = read_image(source)

    # Seed with the farthest-candidate strategy for well-spread colors.
    seeds = kmeans_plusplus_initializer(
        pixels, amount,
        kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()

    segmenter = kmeans(pixels, seeds)
    segmenter.process()

    draw_image_mask_segments(source, segmenter.get_clusters())
def template_kmeans_plusplus_initializer(path, amount, draw=True):
    """Read a sample, compute `amount` k-means++ centers and optionally
    plot the sample with the chosen centers highlighted."""
    sample = read_sample(path)
    centers = kmeans_plusplus_initializer(sample, amount).initialize()

    if draw is True:
        canvas = cluster_visualizer()
        canvas.append_cluster(sample)
        canvas.append_cluster(centers, marker='*', markersize=10)
        canvas.show()

    return (sample, centers)
    def templateKmeansPlusPlusForKmedoidsClustering(self, path_sample, amount, expected_clusters_length):
        """Retry k-medoids with fresh k-means++ seeds up to three times.

        BUGFIX: `result_success` used to be initialized to True and never
        set to False, so the final assert was vacuous and could not fail
        even when every attempt raised AssertionError. Track the outcome
        of the attempts explicitly instead.
        """
        result_success = False
        for _ in range(3):
            try:
                sample = read_sample(path_sample)
                start_medoids = kmeans_plusplus_initializer(sample, amount).initialize(return_index=True)
                KmedoidsTestTemplates.templateLengthProcessData(path_sample, start_medoids, expected_clusters_length,
                                                                False)
            except AssertionError:
                # Randomized seeding may occasionally fail; try again.
                continue
            result_success = True
            break

        assert result_success
def find_optimal_amout_clusters(sample_path, kmin, kmax, algorithm):
    """Pick the best cluster count in [kmin, kmax] by silhouette search,
    then run k-means with that count and display the clusters."""
    sample = read_sample(sample_path)
    searcher = silhouette_ksearch(sample, kmin, kmax, algorithm=algorithm).process()

    amount = searcher.get_amount()
    scores = searcher.get_scores()

    print("Sample: '%s', Scores: '%s'" % (sample_path, str(scores)))

    seeds = kmeans_plusplus_initializer(sample, amount).initialize()
    model = kmeans(sample, seeds).process()

    view = cluster_visualizer()
    view.append_clusters(model.get_clusters(), sample)
    view.show()
Example #39
0
 def __initialize_kmeans(self):
     """Seed means and covariances from a k-means clustering of the sample."""
     seeds = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize()
     solver = kmeans(self.__sample, seeds, ccore = True)
     solver.process()

     means = solver.get_centers()

     # One covariance matrix per cluster. numpy.cov needs at least two
     # points, so singleton clusters get a small random constant matrix.
     covariances = []
     for members in solver.get_clusters():
         if len(members) > 1:
             subset = [self.__sample[idx] for idx in members]
             covariances.append(numpy.cov(subset, rowvar = False))
         else:
             dimension = len(self.__sample[0])
             covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)

     return means, covariances
def cluster_iris():
    """Cluster the iris sample with 4 k-means++ seeded centers via the
    shared clustering template."""
    start_centers = kmeans_plusplus_initializer(read_sample(FAMOUS_SAMPLES.SAMPLE_IRIS), 4).initialize()
    template_clustering(start_centers, FAMOUS_SAMPLES.SAMPLE_IRIS)