Beispiel #1
0
    def templateLengthProcessData(input_sample, start_centers, expected_cluster_length, type_splitting, kmax, ccore, **kwargs):
        """!
        @brief Runs X-Means on a sample and checks the structure of the clustering result.

        @details The following properties are asserted:
                  - every point of the sample belongs to exactly as many cluster slots as there are points
                    (sum of cluster sizes equals the sample size);
                  - the amount of clusters equals the amount of centers and does not exceed `kmax`;
                  - the total WCE reported by the algorithm equals the sum of squared Euclidean distances
                    between each point and the center of its cluster;
                  - when `expected_cluster_length` is given, the sorted cluster sizes match it.

        @param[in] input_sample (str|array_like): Path to a sample file (loaded with `read_sample`) or the data itself.
        @param[in] start_centers (list): Initial centers that are passed to X-Means.
        @param[in] expected_cluster_length (iterable|None): Expected cluster sizes in any order, or `None` to skip
                    the size comparison. The argument is never modified.
        @param[in] type_splitting: Splitting criterion that is passed to X-Means.
        @param[in] kmax (uint): Upper bound for the amount of clusters.
        @param[in] ccore (bool): Forwarded to X-Means.
        @param[in] **kwargs: Forwarded to X-Means unchanged.

        """
        sample = read_sample(input_sample) if isinstance(input_sample, str) else input_sample

        # 0.025 is the fourth positional argument of the X-Means constructor.
        xmeans_instance = xmeans(sample, start_centers, kmax, 0.025, type_splitting, ccore, **kwargs)
        xmeans_instance.process()

        clusters = xmeans_instance.get_clusters()
        centers = xmeans_instance.get_centers()
        wce = xmeans_instance.get_total_wce()

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]

        assertion.eq(len(sample), sum(obtained_cluster_sizes))
        assertion.eq(len(clusters), len(centers))
        assertion.le(len(centers), kmax)

        # The accumulation is kept as an explicit loop (not `sum(...)`) so that the floating point
        # summation order stays exactly the same as before.
        expected_wce = 0.0
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
        for index_cluster, cluster in enumerate(clusters):
            for index_point in cluster:
                expected_wce += metric(sample[index_point], centers[index_cluster])

        assertion.eq(expected_wce, wce)

        if expected_cluster_length is not None:
            assertion.eq(len(centers), len(expected_cluster_length))

            # Compare sorted copies: sorting in place would silently reorder the caller's list
            # and would fail for immutable sequences such as tuples.
            assertion.eq(sorted(obtained_cluster_sizes), sorted(expected_cluster_length))
def template_clustering(start_medoids, path, tolerance=0.25, show=True):
    """!
    @brief Clusters a sample file with K-Medoids, reports the execution time and optionally draws the result.

    @param[in] start_medoids (list): Indexes of the points in the sample that are used as initial medoids.
    @param[in] path (str): Path to the sample file that is loaded with `read_sample`.
    @param[in] tolerance (double): Passed to K-Medoids as the third positional argument.
    @param[in] show (bool): Visualization is displayed only when the value is exactly `True`.

    @return (tuple) Loaded sample and the allocated clusters.

    """
    sample = read_sample(path)

    kmedoids_instance = kmedoids(
        sample,
        start_medoids,
        tolerance,
        metric=distance_metric(type_metric.EUCLIDEAN_SQUARE, data=sample))

    # `timedcall` returns a pair; only the measured time is needed here.
    ticks, _ = timedcall(kmedoids_instance.process)

    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    if show is not True:
        return sample, clusters

    # Initial and final medoids are drawn with the same marker settings.
    medoid_style = {'marker': '*', 'markersize': 15}

    visualizer = cluster_visualizer(1)
    visualizer.append_clusters(clusters, sample, 0)
    visualizer.append_cluster([sample[index] for index in start_medoids], **medoid_style)
    visualizer.append_cluster(medoids, data=sample, **medoid_style)
    visualizer.show()

    return sample, clusters
Beispiel #3
0
    def __init__(self,
                 data,
                 initial_centers=None,
                 kmax=20,
                 tolerance=0.001,
                 criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION,
                 ccore=True,
                 **kwargs):
        r"""!
        @brief Constructor of clustering algorithm X-Means.

        @param[in] data (array_like): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] initial_centers (list): Initial coordinates of centers of clusters that are represented by list: `[center1, center2, ...]`,
                    if it is not specified then two initial centers are generated by `kmeans_plusplus_initializer`.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if maximum value of change of centers of clusters is less than tolerance then algorithm will stop processing.
        @param[in] criterion (splitting_type): Type of splitting creation (by default `splitting_type.BAYESIAN_INFORMATION_CRITERION`).
        @param[in] ccore (bool): Defines if C++ pyclustering library should be used instead of Python implementation. The request is honoured
                    only when the metric is not user-defined and `ccore_library.workable()` confirms that the library can be used.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `repeat`, `random_state`, `metric`, `alpha`, `beta`).

        <b>Keyword Args:</b><br>
            - repeat (uint): How many times K-Means should be run to improve parameters (by default is `1`).
               With larger `repeat` values suggesting higher probability of finding global optimum.
            - random_state (int): Seed for random state (by default is `None`).
            - metric (distance_metric): Metric that is used for distance calculation between two points (by default
               euclidean square distance). A shallow copy of the metric is stored.
            - alpha (double): Parameter distributed [0.0, 1.0] for alpha probabilistic bound \f$Q\left(\alpha\right)\f$ (by default is `0.9`).
               The parameter is used only in case of MNDL splitting criterion, in all other cases this value is ignored.
            - beta (double): Parameter distributed [0.0, 1.0] for beta probabilistic bound \f$Q\left(\beta\right)\f$ (by default is `0.9`).
               The parameter is used only in case of MNDL splitting criterion, in all other cases this value is ignored.

        """

        self.__pointer_data = numpy.array(data)
        self.__clusters = []
        self.__random_state = kwargs.get('random_state', None)

        # The metric is copied so that the instance owns its own metric object.
        default_metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
        self.__metric = copy.copy(kwargs.get('metric', default_metric))

        if initial_centers is None:
            # No centers were provided: start from two centers chosen by K-Means++.
            initializer = kmeans_plusplus_initializer(data, 2, random_state=self.__random_state)
            self.__centers = initializer.initialize()
        else:
            self.__centers = numpy.array(initial_centers)

        self.__kmax = kmax
        self.__tolerance = tolerance
        self.__criterion = criterion
        self.__total_wce = 0.0

        self.__repeat = kwargs.get('repeat', 1)
        self.__alpha = kwargs.get('alpha', 0.9)
        self.__beta = kwargs.get('beta', 0.9)

        # The metric type is inspected only when the C++ core was requested.
        self.__ccore = ccore and (self.__metric.get_type() != type_metric.USER_DEFINED)
        if self.__ccore is True:
            self.__ccore = ccore_library.workable()

        self.__verify_arguments()
Beispiel #4
0
def kmedoids(data):
    """!
    @brief Clusters two-dimensional points with K-Medoids and returns the result as plain dictionaries.

    @param[in] data (dict): Request with keys `x` and `y` (point coordinates), `k` (requested amount of clusters,
                converted with `int`) and `metric` (one of `manhattan`, `euclidean`, `chebyshev`, `canberra`,
                `chi-square`).

    @return (dict) Dictionary with keys `centroids` and `points`; both are lists of dictionaries with keys
             `x`, `y` and `label`. Both lists are empty when no `x` coordinates are provided.

    """
    x = np.array(data["x"], dtype='float')
    y = np.array(data["y"], dtype='float')

    # Nothing to cluster: answer before 'k' and 'metric' are even looked at.
    if len(x) == 0:
        return {'centroids': [], 'points': []}

    requested_amount = int(data['k'])

    # One row per point: column 0 holds x, column 1 holds y.
    points = np.array([x, y]).T
    # There cannot be more medoids than points.
    amount = min(requested_amount, points.shape[0])

    supported_metrics = (
        ("manhattan", type_metric.MANHATTAN),
        ("euclidean", type_metric.EUCLIDEAN),
        ("chebyshev", type_metric.CHEBYSHEV),
        ("canberra", type_metric.CANBERRA),
        ("chi-square", type_metric.CHI_SQUARE),
    )
    metrics = {name: distance_metric(kind, data=points) for name, kind in supported_metrics}
    metric = metrics[data['metric']]

    # The first 'amount' points serve as initial medoids.
    model = kmedoids_(points, list(range(amount)), metric=metric)
    model.process()

    labels = model.predict(points)
    # Medoids are reported as indexes; translate them to coordinates.
    medoid_points = points[np.array(model.get_medoids())]

    centroids = [
        {'x': medoid[0], 'y': medoid[1], 'label': index}
        for index, medoid in enumerate(medoid_points)
    ]
    labelled_points = [
        {'x': points[index, 0], 'y': points[index, 1], 'label': int(label)}
        for index, label in enumerate(labels)
    ]

    return {'centroids': centroids, 'points': labelled_points}
Beispiel #5
0
    def templatePredict(path_to_file, initial_medians, points, expected_closest_clusters, ccore, **kwargs):
        """!
        @brief Runs K-Medians on a sample file and checks the clusters predicted for the given points.

        @param[in] path_to_file (str): Path to the sample that is loaded with `read_sample`.
        @param[in] initial_medians (list): Initial medians that are passed to K-Medians.
        @param[in] points (array_like): Points for which the closest clusters are predicted.
        @param[in] expected_closest_clusters (array_like): Expected cluster index for each point.
        @param[in] ccore (bool): Forwarded to K-Medians.
        @param[in] **kwargs: Optional `metric` (by default euclidean square distance) and `itermax` (by default `200`).

        """
        sample = read_sample(path_to_file)

        options = {
            'metric': kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE)),
            'itermax': kwargs.get('itermax', 200),
        }

        kmedians_instance = kmedians(sample, initial_medians, 0.001, ccore, **options)
        kmedians_instance.process()

        closest_clusters = kmedians_instance.predict(points)

        # Compare the amount first, then the content element by element.
        assertion.eq(len(expected_closest_clusters), len(closest_clusters))
        expected = numpy.array(expected_closest_clusters)
        assertion.true(numpy.array_equal(expected, closest_clusters))
    def test_various_data_type_simple1(self):
        """!
        @brief Runs `template_compare_output` on sample 'Simple1' with the euclidean square metric for every
                combination of second argument (2, 3, 4) and third argument (1, 3).

        @details The meaning of the positional arguments is defined by `template_compare_output`;
                  a new metric object is created for every call.

        """
        for third_argument in (1, 3):
            for second_argument in (2, 3, 4):
                self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1,
                                             second_argument,
                                             third_argument,
                                             1000,
                                             distance_metric(type_metric.EUCLIDEAN_SQUARE))
Beispiel #7
0
    def predict(self, points):
        """!
        @brief Finds the closest cluster center for each of the specified points.

        @details Distances are always measured with the squared Euclidean metric that is created inside this method.

        @param[in] points (array_like): Points for which closest clusters are calculated.

        @return (array_like) Index of the closest cluster for each point. An empty list is returned when there are
                 no allocated clusters (for example, 'process()' method was not called).

        An example how to calculate (or predict) the closest cluster to specified points.
        @code
            from pyclustering.cluster.xmeans import xmeans
            from pyclustering.samples.definitions import SIMPLE_SAMPLES
            from pyclustering.utils import read_sample

            # Load list of points for cluster analysis.
            sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

            # Initial centers for sample 'Simple3'.
            initial_centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]

            # Create instance of X-Means algorithm with prepared centers.
            xmeans_instance = xmeans(sample, initial_centers)

            # Run cluster analysis.
            xmeans_instance.process()

            # Calculate the closest cluster to following two points.
            points = [[0.25, 0.2], [2.5, 4.0]]
            closest_clusters = xmeans_instance.predict(points)
            print(closest_clusters)
        @endcode

        """
        point_array = numpy.array(points)

        # Without clusters there is nothing to compare the points with.
        if len(self.__clusters) == 0:
            return []

        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)

        # Row 'i' holds the distances from point 'i' to every center.
        distances = numpy.zeros((len(point_array), len(self.__centers)))
        for row, point in enumerate(point_array):
            distances[row] = metric(point, self.__centers)

        return distances.argmin(axis=1)
Beispiel #8
0
    def correct_scores(sample_path, answer_path, ccore_flag, **kwargs):
        """!
        @brief Calculates silhouette scores for a reference clustering and checks their amount and bounds.

        @param[in] sample_path (str): Path to the sample that is loaded with `read_sample`.
        @param[in] answer_path (str): Path to the answer file whose clusters are read with `answer_reader`.
        @param[in] ccore_flag (bool): Passed to `silhouette` as `ccore`.
        @param[in] **kwargs: Optional `data_type` (by default `points`); when it is `distance_matrix` the sample is
                    converted to a distance matrix using euclidean square distance.

        @return Scores obtained from `silhouette(...).process().get_score()`.

        """
        data_type = kwargs.get('data_type', 'points')

        sample = read_sample(sample_path)
        if data_type == 'distance_matrix':
            square_metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
            sample = calculate_distance_matrix(sample, square_metric)

        clusters = answer_reader(answer_path).get_clusters()

        silhouette_instance = silhouette(sample, clusters, ccore=ccore_flag, data_type=data_type)
        scores = silhouette_instance.process().get_score()

        # One score per element of the sample, each compared against the bounds -1.0 and 1.0.
        assertion.eq(len(sample), len(scores))
        for value in scores:
            assertion.le(-1.0, value)
            assertion.ge(1.0, value)

        return scores
Beispiel #9
0
    def template_predict(self, path, amount, points, ccore):
        """!
        @brief Checks that G-Means assigns every point to a center that is not farther (Euclidean distance)
                than any other center.

        @param[in] path (str): Path to the sample that is loaded with `read_sample`.
        @param[in] amount: Second positional argument that is passed to `gmeans`.
        @param[in] points (list): Points for which the closest clusters are predicted.
        @param[in] ccore (bool): Third positional argument that is passed to `gmeans`.

        """
        metric = distance_metric(type_metric.EUCLIDEAN)

        sample = read_sample(path)
        gmeans_instance = gmeans(sample, amount, ccore).process()
        centers = gmeans_instance.get_centers()

        closest_clusters = gmeans_instance.predict(points)

        self.assertEqual(len(points), len(closest_clusters))

        for point_index, point in enumerate(points):
            assigned_index = closest_clusters[point_index]
            assigned_distance = metric(centers[assigned_index], point)

            for center_index, center in enumerate(centers):
                # The assigned center is the reference, there is no need to compare it with itself.
                if center_index == assigned_index:
                    continue

                self.assertLessEqual(assigned_distance, metric(center, point))
Beispiel #10
0
def template_clustering(start_medoids,
                        path,
                        tolerance=0.25,
                        show=True,
                        **kwargs):
    """!
    @brief Clusters a sample file with K-Medoids, prints diagnostic information and optionally draws the result.

    @param[in] start_medoids (list): Indexes of the points in the sample that are used as initial medoids.
    @param[in] path (str): Path to the sample file that is loaded with `read_sample`.
    @param[in] tolerance (double): Passed to K-Medoids as the third positional argument.
    @param[in] show (bool): Visualization is displayed only when the value is exactly `True`.
    @param[in] **kwargs: Optional `ccore` (by default `True`) and `data_type` (by default `points`); when
                `data_type` is `distance_matrix` the algorithm receives a distance matrix built from the sample.

    @return (tuple) Loaded sample (always the original points) and the allocated clusters.

    """
    ccore = kwargs.get('ccore', True)
    data_type = kwargs.get('data_type', 'points')

    original_data = read_sample(path)

    # The algorithm may work on a distance matrix, but drawing always needs the original points.
    if data_type == 'distance_matrix':
        sample = calculate_distance_matrix(original_data)
    else:
        sample = original_data

    kmedoids_instance = kmedoids(
        sample,
        start_medoids,
        tolerance,
        metric=distance_metric(type_metric.EUCLIDEAN_SQUARE, data=sample),
        ccore=ccore,
        data_type=data_type)

    # `timedcall` returns a pair; only the measured time is needed here.
    ticks, _ = timedcall(kmedoids_instance.process)

    clusters = kmedoids_instance.get_clusters()
    print("Iterations:", kmedoids_instance.get_iterations())
    print([len(cluster) for cluster in clusters])
    print(clusters)
    medoids = kmedoids_instance.get_medoids()
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    if show is not True:
        return original_data, clusters

    # Initial and final medoids are drawn with the same marker settings.
    medoid_style = {'marker': '*', 'markersize': 15}

    visualizer = cluster_visualizer(1)
    visualizer.append_clusters(clusters, original_data, 0)
    visualizer.append_cluster([original_data[index] for index in start_medoids], **medoid_style)
    visualizer.append_cluster(medoids, data=original_data, **medoid_style)
    visualizer.show()

    return original_data, clusters
Beispiel #11
0
    def predict(self, points):
        """!
        @brief Finds the closest cluster center for each of the specified points.

        @details Distances are always measured with the squared Euclidean metric that is created inside this method.

        @param[in] points (array_like): Points for which closest clusters are calculated.

        @return (array_like) Index of the closest cluster for each point. An empty list is returned when there are
                 no allocated clusters (for example, 'process()' method was not called).

        """
        point_array = numpy.array(points)

        # Without clusters there is nothing to compare the points with.
        if len(self.__clusters) == 0:
            return []

        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)
        center_array = numpy.array(self.__centers)

        # Row 'i' holds the distances from point 'i' to every center.
        distances = numpy.zeros((len(point_array), len(center_array)))
        for row, point in enumerate(point_array):
            distances[row] = metric(point, center_array)

        return distances.argmin(axis=1)
 def test_various_data_type_simple5(self):
     """!
     @brief Runs `template_compare_output` on sample 'Simple5' with the euclidean square metric while the
             second argument goes from 2 to 11 inclusive.

     """
     for second_argument in range(2, 12):
         self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE5,
                                      second_argument,
                                      1,
                                      1000,
                                      distance_metric(type_metric.EUCLIDEAN_SQUARE))
 def test_various_data_type_simple1_euclidean_chebyshev(self):
     """!
     @brief Runs `template_compare_output` on sample 'Simple1' using the Chebyshev metric.

     """
     # NOTE(review): the test name mentions 'euclidean', but only the Chebyshev metric is used here.
     chebyshev_metric = distance_metric(type_metric.CHEBYSHEV)
     self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, chebyshev_metric)
 def test_various_data_type_simple1_euclidean_manhattan(self):
     """!
     @brief Runs `template_compare_output` on sample 'Simple1' using the Manhattan metric.

     """
     # NOTE(review): the test name mentions 'euclidean', but only the Manhattan metric is used here.
     manhattan_metric = distance_metric(type_metric.MANHATTAN)
     self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, manhattan_metric)