Python distance_metric Beispiele, pyclustering.utils.distance_metric Python Beispiele

Beispiel #1

0

Datei anzeigen

    def templateLengthProcessData(input_sample, start_centers, expected_cluster_length, type_splitting, kmax, ccore, **kwargs):
        """!
        @brief Runs X-Means on a sample and checks the obtained clusters, centers and total WCE.

        @param[in] input_sample (str|array_like): Path that is loaded with `read_sample`, or already loaded data.
        @param[in] start_centers (list): Initial centers that are passed to `xmeans`.
        @param[in] expected_cluster_length (list|None): Expected cluster sizes in any order, `None` skips the size
                    comparison. The collection is not modified by this template.
        @param[in] type_splitting: Splitting criterion that is passed to `xmeans`.
        @param[in] kmax (uint): Passed to `xmeans`; the amount of obtained centers is asserted not to exceed it.
        @param[in] ccore: Forwarded to `xmeans` as is.
        @param[in] **kwargs: Additional keyword arguments forwarded to `xmeans`.

        """
        sample = read_sample(input_sample) if isinstance(input_sample, str) else input_sample

        xmeans_instance = xmeans(sample, start_centers, kmax, 0.025, type_splitting, ccore, **kwargs)
        xmeans_instance.process()

        clusters = xmeans_instance.get_clusters()
        centers = xmeans_instance.get_centers()
        wce = xmeans_instance.get_total_wce()

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]

        # Every point has to be allocated and every cluster has to own exactly one center.
        assertion.eq(len(sample), sum(obtained_cluster_sizes))
        assertion.eq(len(clusters), len(centers))
        assertion.le(len(centers), kmax)

        # Recalculate WCE independently: sum of squared euclidean distances from each point to the center
        # of the cluster the point belongs to.
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
        expected_wce = 0.0
        for cluster, center in zip(clusters, centers):
            for index_point in cluster:
                expected_wce += metric(sample[index_point], center)

        assertion.eq(expected_wce, wce)

        if expected_cluster_length is not None:
            assertion.eq(len(centers), len(expected_cluster_length))

            # Compare sorted copies so that the caller's collection is left untouched.
            assertion.eq(sorted(obtained_cluster_sizes), sorted(expected_cluster_length))

Beispiel #2

0

Datei anzeigen

Datei: kmedoids_examples.py Projekt: wuhongjian139/pyclustering

def template_clustering(start_medoids, path, tolerance=0.25, show=True):
    """Cluster the sample stored at `path` with `kmedoids` and optionally display the result.

    Args:
        start_medoids: Indexes into the loaded sample that are used as initial medoids.
        path: Location of the sample, loaded with `read_sample`.
        tolerance: Passed to `kmedoids` as its third positional argument.
        show: The visualizer is displayed only when this is exactly `True`.

    Returns:
        Tuple `(sample, clusters)` with the loaded data and the clusters reported by the algorithm.
    """
    sample = read_sample(path)

    square_euclidean = distance_metric(type_metric.EUCLIDEAN_SQUARE, data=sample)
    kmedoids_instance = kmedoids(sample, start_medoids, tolerance, metric=square_euclidean)

    # Only the first element of the pair returned by `timedcall` is reported below.
    ticks, _ = timedcall(kmedoids_instance.process)

    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    if show is not True:
        return sample, clusters

    star_style = {'marker': '*', 'markersize': 15}
    initial_medoid_points = [sample[index] for index in start_medoids]

    visualizer = cluster_visualizer(1)
    visualizer.append_clusters(clusters, sample, 0)
    # Initial medoids are given as points, final medoids as indexes into `sample`.
    visualizer.append_cluster(initial_medoid_points, **star_style)
    visualizer.append_cluster(medoids, data=sample, **star_style)
    visualizer.show()

    return sample, clusters

Beispiel #3

0

Datei anzeigen

Datei: xmeans.py Projekt: tjufan/pyclustering

    def __init__(self,
                 data,
                 initial_centers=None,
                 kmax=20,
                 tolerance=0.001,
                 criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION,
                 ccore=True,
                 **kwargs):
        """!
        @brief Creates an instance of the X-Means clustering algorithm.

        @param[in] data (array_like): Input data as a list of points (objects), where each point is a list or tuple.
        @param[in] initial_centers (list): Initial coordinates of cluster centers: `[center1, center2, ...]`.
                    If it is not specified then two centers are produced by `kmeans_plusplus_initializer`.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if the maximum change of cluster centers
                    is less than tolerance then the algorithm stops processing.
        @param[in] criterion (splitting_type): Type of splitting criterion (by default
                    `splitting_type.BAYESIAN_INFORMATION_CRITERION`).
        @param[in] ccore (bool): Defines if C++ pyclustering library should be used instead of Python implementation.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `repeat`, `random_state`, `metric`,
                    `alpha`, `beta`).

        <b>Keyword Args:</b><br>
            - repeat (uint): How many times K-Means should be run to improve parameters (by default is `1`).
               Larger `repeat` values suggest higher probability of finding global optimum.
            - random_state (int): Seed for random state (by default is `None`, current system time is used).
            - metric (distance_metric): Metric that is used for distance calculation between two points (by default
               euclidean square distance). A shallow copy of the metric is stored.
            - alpha (double): Parameter distributed [0.0, 1.0] for alpha probabilistic bound \f$Q\left(\alpha\right)\f$
               (by default is `0.9`). The parameter is used only in case of MNDL splitting criterion, in all other
               cases this value is ignored.
            - beta (double): Parameter distributed [0.0, 1.0] for beta probabilistic bound \f$Q\left(\beta\right)\f$
               (by default is `0.9`). The parameter is used only in case of MNDL splitting criterion, in all other
               cases this value is ignored.

        """

        # Input data and containers for results.
        self.__pointer_data = numpy.array(data)
        self.__clusters = []
        self.__total_wce = 0.0

        # Mandatory algorithm parameters.
        self.__kmax = kmax
        self.__tolerance = tolerance
        self.__criterion = criterion

        # Optional parameters with their defaults.
        self.__random_state = kwargs.get('random_state', None)
        self.__repeat = kwargs.get('repeat', 1)
        self.__alpha = kwargs.get('alpha', 0.9)
        self.__beta = kwargs.get('beta', 0.9)

        default_metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
        self.__metric = copy.copy(kwargs.get('metric', default_metric))

        if initial_centers is None:
            initializer = kmeans_plusplus_initializer(data, 2, random_state=self.__random_state)
            self.__centers = initializer.initialize()
        else:
            self.__centers = numpy.array(initial_centers)

        # A user-defined metric rules out the C++ core; otherwise the core is used only when it is workable.
        self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
        if self.__ccore is True:
            self.__ccore = ccore_library.workable()

        self.__verify_arguments()

Beispiel #4

0

Datei anzeigen

def kmedoids(data):
    """Cluster 2-D points with K-Medoids and return medoids and labelled points.

    Args:
        data: Mapping with the keys `"x"` and `"y"` (coordinate sequences), `"k"` (requested
            amount of clusters, converted with `int`) and `"metric"` (one of `"manhattan"`,
            `"euclidean"`, `"chebyshev"`, `"canberra"`, `"chi-square"`). `"k"` and `"metric"`
            are not read when `"x"` is empty.

    Returns:
        Dict with `'centroids'` (one `{'x', 'y', 'label'}` entry per medoid) and `'points'`
        (one `{'x', 'y', 'label'}` entry per input point). Both lists are empty for empty input.

    Raises:
        KeyError: If a required key is missing or the metric name is not supported.
    """
    x = np.array(data["x"], dtype='float')
    y = np.array(data["y"], dtype='float')

    if len(x) == 0:
        return {'centroids': [], 'points': []}

    k = int(data['k'])

    D = np.array([x, y]).T
    # There cannot be more medoids than points.
    k = min(k, D.shape[0])

    # Map names to metric types and build only the requested metric, instead of constructing
    # every metric object over the data and discarding all but one.
    metric_types = {
        "manhattan": type_metric.MANHATTAN,
        "euclidean": type_metric.EUCLIDEAN,
        "chebyshev": type_metric.CHEBYSHEV,
        "canberra": type_metric.CANBERRA,
        "chi-square": type_metric.CHI_SQUARE,
    }
    metric = distance_metric(metric_types[data['metric']], data=D)

    # The first k points serve as initial medoids.
    kmedoids_instance = kmedoids_(D, list(range(k)), metric=metric)
    kmedoids_instance.process()

    labels = kmedoids_instance.predict(D)
    # `get_medoids()` is used as an index array into D to obtain medoid coordinates.
    medoid_indexes = np.array(kmedoids_instance.get_medoids())
    medoids = D[medoid_indexes]

    centroids = [
        {'x': medoid[0], 'y': medoid[1], 'label': label}
        for label, medoid in enumerate(medoids)
    ]
    points = [
        {'x': D[i, 0], 'y': D[i, 1], 'label': int(label)}
        for i, label in enumerate(labels)
    ]

    return {'centroids': centroids, 'points': points}

Beispiel #5

0

Datei anzeigen

    def templatePredict(path_to_file, initial_medians, points, expected_closest_clusters, ccore, **kwargs):
        """!
        @brief Runs `kmedians` on a sample and checks cluster prediction for the specified points.

        @param[in] path_to_file (str): Path to the sample that is loaded with `read_sample`.
        @param[in] initial_medians (list): Initial medians passed to `kmedians`.
        @param[in] points (array_like): Points for which the closest clusters are predicted.
        @param[in] expected_closest_clusters (list): Expected result of `predict(points)`.
        @param[in] ccore: Forwarded to `kmedians` as is.
        @param[in] **kwargs: Optional `metric` (by default euclidean square distance) and `itermax` (by default `200`).

        """
        sample = read_sample(path_to_file)

        options = {
            'metric': kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE)),
            'itermax': kwargs.get('itermax', 200),
        }

        kmedians_instance = kmedians(sample, initial_medians, 0.001, ccore, **options)
        kmedians_instance.process()

        predicted = kmedians_instance.predict(points)
        expected = numpy.array(expected_closest_clusters)

        assertion.eq(len(expected_closest_clusters), len(predicted))
        assertion.true(numpy.array_equal(expected, predicted))

Beispiel #6

0

Datei anzeigen

Datei: ut_center_initializer.py Projekt: zzp-seeker/pyclustering

    def test_various_data_type_simple1(self):
        """!
        @brief Runs `template_compare_output` on SAMPLE_SIMPLE1 with euclidean square distance for every
                combination of second argument in (2, 3, 4) and third argument in (1, 3).

        """
        # NOTE(review): the second and third arguments presumably are the amount of centers and the amount
        # of candidates - confirm against `template_compare_output`.
        for third_argument in (1, 3):
            for second_argument in (2, 3, 4):
                # A fresh metric object is created for every call.
                metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
                self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, second_argument, third_argument,
                                             1000, metric)

Beispiel #7

0

Datei anzeigen

Datei: xmeans.py Projekt: Jianjun-Liu/pyclustering

    def predict(self, points):
        """!
        @brief Finds the closest cluster for each of the specified points.

        @param[in] points (array_like): Points for which the closest clusters are calculated.

        @return (array_like) Index of the closest cluster for each point. An empty list is returned if there are
                 no clusters (for example, 'process()' method was not called).

        An example how to calculate (or predict) the closest cluster to specified points.
        @code
            from pyclustering.cluster.xmeans import xmeans
            from pyclustering.samples.definitions import SIMPLE_SAMPLES
            from pyclustering.utils import read_sample

            # Load list of points for cluster analysis.
            sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

            # Initial centers for sample 'Simple3'.
            initial_centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]

            # Create instance of X-Means algorithm with prepared centers.
            xmeans_instance = xmeans(sample, initial_centers)

            # Run cluster analysis.
            xmeans_instance.process()

            # Calculate the closest cluster to following two points.
            points = [[0.25, 0.2], [2.5, 4.0]]
            closest_clusters = xmeans_instance.predict(points)
            print(closest_clusters)
        @endcode

        """
        nppoints = numpy.array(points)
        if not len(self.__clusters):
            return []

        square_euclidean = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)

        # Row i holds distances from point i to every center.
        amount_points, amount_centers = len(nppoints), len(self.__centers)
        differences = numpy.zeros((amount_points, amount_centers))
        for row, point in enumerate(nppoints):
            differences[row] = square_euclidean(point, self.__centers)

        return numpy.argmin(differences, axis=1)

Beispiel #8

0

Datei anzeigen

    def correct_scores(sample_path, answer_path, ccore_flag, **kwargs):
        """!
        @brief Calculates silhouette scores for reference clusters and checks that each score is in [-1.0, 1.0].

        @param[in] sample_path (str): Path to the sample that is loaded with `read_sample`.
        @param[in] answer_path (str): Path to the answer from which clusters are read with `answer_reader`.
        @param[in] ccore_flag: Forwarded to `silhouette` as `ccore`.
        @param[in] **kwargs: Optional `data_type` (by default `'points'`); in case of `'distance_matrix'` the sample
                    is converted to a distance matrix using euclidean square distance.

        @return Scores obtained from `silhouette`, one per sample item.

        """
        data_type = kwargs.get('data_type', 'points')

        sample = read_sample(sample_path)
        if data_type == 'distance_matrix':
            square_euclidean = distance_metric(type_metric.EUCLIDEAN_SQUARE)
            sample = calculate_distance_matrix(sample, square_euclidean)

        reference_clusters = answer_reader(answer_path).get_clusters()

        silhouette_instance = silhouette(sample, reference_clusters, ccore=ccore_flag, data_type=data_type)
        scores = silhouette_instance.process().get_score()

        assertion.eq(len(sample), len(scores))
        for value in scores:
            # -1.0 <= value <= 1.0
            assertion.le(-1.0, value)
            assertion.ge(1.0, value)

        return scores

Beispiel #9

0

Datei anzeigen

Datei: ut_gmeans.py Projekt: zzp-seeker/pyclustering

    def template_predict(self, path, amount, points, ccore):
        """!
        @brief Checks that `gmeans.predict` assigns each point to a center that is not farther (euclidean distance)
                than any other center.

        @param[in] path (str): Path to the sample that is loaded with `read_sample`.
        @param[in] amount: Second positional argument of `gmeans`.
        @param[in] points (list): Points for which the closest clusters are predicted.
        @param[in] ccore: Third positional argument of `gmeans`.

        """
        euclidean = distance_metric(type_metric.EUCLIDEAN)

        gmeans_instance = gmeans(read_sample(path), amount, ccore).process()
        centers = gmeans_instance.get_centers()

        closest_clusters = gmeans_instance.predict(points)
        self.assertEqual(len(points), len(closest_clusters))

        for point, cluster_index in zip(points, closest_clusters):
            distance = euclidean(centers[cluster_index], point)
            for center_index, center in enumerate(centers):
                if center_index == cluster_index:
                    continue

                self.assertLessEqual(distance, euclidean(center, point))

Beispiel #10

0

Datei anzeigen

def template_clustering(start_medoids,
                        path,
                        tolerance=0.25,
                        show=True,
                        **kwargs):
    """Cluster the sample stored at `path` with `kmedoids`, print statistics and optionally display the result.

    Args:
        start_medoids: Indexes into the loaded sample that are used as initial medoids.
        path: Location of the sample, loaded with `read_sample`.
        tolerance: Passed to `kmedoids` as its third positional argument.
        show: The visualizer is displayed only when this is exactly `True`.
        **kwargs: Optional `ccore` (default `True`) and `data_type` (default `'points'`). With
            `data_type='distance_matrix'` the algorithm receives `calculate_distance_matrix(sample)`
            instead of the points.

    Returns:
        Tuple `(original_data, clusters)` with the loaded points and the clusters reported by the algorithm.
    """
    data_type = kwargs.get('data_type', 'points')
    use_ccore = kwargs.get('ccore', True)

    original_data = read_sample(path)
    if data_type == 'distance_matrix':
        sample = calculate_distance_matrix(original_data)
    else:
        sample = original_data

    kmedoids_instance = kmedoids(
        sample,
        start_medoids,
        tolerance,
        metric=distance_metric(type_metric.EUCLIDEAN_SQUARE, data=sample),
        ccore=use_ccore,
        data_type=data_type)

    # Only the first element of the pair returned by `timedcall` is reported below.
    ticks, _ = timedcall(kmedoids_instance.process)

    clusters = kmedoids_instance.get_clusters()
    print("Iterations:", kmedoids_instance.get_iterations())
    print([len(cluster) for cluster in clusters])
    print(clusters)
    medoids = kmedoids_instance.get_medoids()
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    if show is not True:
        return original_data, clusters

    # Visualization always uses the original points, even when a distance matrix was clustered.
    star_style = {'marker': '*', 'markersize': 15}
    initial_medoid_points = [original_data[index] for index in start_medoids]

    visualizer = cluster_visualizer(1)
    visualizer.append_clusters(clusters, original_data, 0)
    visualizer.append_cluster(initial_medoid_points, **star_style)
    visualizer.append_cluster(medoids, data=original_data, **star_style)
    visualizer.show()

    return original_data, clusters

Beispiel #11

0

Datei anzeigen

Datei: gmeans.py Projekt: tjufan/pyclustering

    def predict(self, points):
        """!
        @brief Finds the closest cluster for each of the specified points.

        @param[in] points (array_like): Points for which the closest clusters are calculated.

        @return (array_like) Index of the closest cluster for each point. An empty list is returned if there are
                 no clusters (for example, 'process()' method was not called).

        """
        nppoints = numpy.array(points)
        if not len(self.__clusters):
            return []

        square_euclidean = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)
        npcenters = numpy.array(self.__centers)

        # Row i holds distances from point i to every center.
        differences = numpy.zeros((len(nppoints), len(npcenters)))
        for row, point in enumerate(nppoints):
            differences[row] = square_euclidean(point, npcenters)

        return numpy.argmin(differences, axis=1)

Beispiel #12

0

Datei anzeigen

Datei: ut_center_initializer.py Projekt: zzp-seeker/pyclustering

 def test_various_data_type_simple5(self):
     """!
     @brief Runs `template_compare_output` on SAMPLE_SIMPLE5 with euclidean square distance, where the second
             argument takes each value from 2 to 11 inclusive.

     """
     for second_argument in range(2, 12):
         metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
         self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, second_argument, 1, 1000, metric)

Beispiel #13

0

Datei anzeigen

Datei: ut_center_initializer.py Projekt: zzp-seeker/pyclustering

 def test_various_data_type_simple1_euclidean_chebyshev(self):
     """!
     @brief Runs `template_compare_output` on SAMPLE_SIMPLE1 using Chebyshev distance.

     """
     chebyshev = distance_metric(type_metric.CHEBYSHEV)
     self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, chebyshev)

Beispiel #14

0

Datei anzeigen

Datei: ut_center_initializer.py Projekt: zzp-seeker/pyclustering

 def test_various_data_type_simple1_euclidean_manhattan(self):
     """!
     @brief Runs `template_compare_output` on SAMPLE_SIMPLE1 using Manhattan distance.

     """
     manhattan = distance_metric(type_metric.MANHATTAN)
     self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, manhattan)