Example #1
    def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
        """!
        @brief Constructor of clustering algorithm K-Medians.
        
        @param[in] data (list): Input data presented as a list of points (objects); each point should be represented by a list or tuple.
        @param[in] initial_centers (list): Initial coordinates of the cluster medians, represented by a list: [center1, center2, ...].
        @param[in] tolerance (double): Stop condition: if the maximum change of the cluster medians is less than tolerance, then the algorithm stops processing.
        @param[in] ccore (bool): Defines whether the CCORE library (C++ pyclustering library) should be used instead of the Python implementation.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'itermax').

        <b>Keyword Args:</b><br>
            - metric (distance_metric): Metric that is used for distance calculation between two points.
            - itermax (uint): Maximum number of iterations for cluster analysis.
        
        """
        self.__pointer_data = data
        self.__clusters = []
        self.__medians = initial_centers[:]
        self.__tolerance = tolerance

        self.__itermax = kwargs.get('itermax', 100)
        self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
        if self.__metric is None:
            self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
        if self.__ccore:
            self.__ccore = ccore_library.workable()
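A minimal usage sketch for the K-Medians constructor above (the toy points and initial medians below are illustrative assumptions, not taken from the library's bundled samples):

# Minimal K-Medians sketch; the data and initial medians are illustrative.
from pyclustering.cluster.kmedians import kmedians

sample = [[1.0, 1.0], [1.2, 0.8], [0.9, 1.1], [5.0, 5.0], [5.2, 4.8], [4.9, 5.1]]
initial_medians = [[1.0, 1.0], [5.0, 5.0]]

kmedians_instance = kmedians(sample, initial_medians, tolerance=0.001)
kmedians_instance.process()

clusters = kmedians_instance.get_clusters()   # lists of point indexes, one list per cluster
medians = kmedians_instance.get_medians()     # final median of each cluster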
    def clustering(path, amount, threshold, expected, ccore, **kwargs):
        metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))

        sample = read_sample(path)

        bsas_instance = bsas(sample, amount, threshold, ccore=ccore, metric=metric)
        bsas_instance.process()

        clusters = bsas_instance.get_clusters()
        representatives = bsas_instance.get_representatives()

        obtained_length = 0
        obtained_cluster_length = []
        for cluster in clusters:
            obtained_length += len(cluster)
            obtained_cluster_length.append(len(cluster))

        assertion.eq(len(sample), obtained_length)
        assertion.eq(len(expected), len(clusters))
        assertion.eq(len(expected), len(representatives))
        assertion.ge(amount, len(clusters))

        dimension = len(sample[0])
        for rep in representatives:
            assertion.eq(dimension, len(rep))

        expected.sort()
        obtained_cluster_length.sort()

        assertion.eq(expected, obtained_cluster_length)
    def templateLengthProcessData(path_to_file, start_centers, expected_cluster_length, ccore, **kwargs):
        sample = read_sample(path_to_file)

        metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
        itermax = kwargs.get('itermax', 200)
        
        kmeans_instance = kmeans(sample, start_centers, 0.001, ccore, metric=metric, itermax=itermax)
        kmeans_instance.process()
        
        clusters = kmeans_instance.get_clusters()
        centers = kmeans_instance.get_centers()
        wce = kmeans_instance.get_total_wce()

        if itermax == 0:
            assertion.eq(start_centers, centers)
            assertion.eq([], clusters)
            assertion.eq(0.0, wce)
            return

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]
        assertion.eq(len(sample), sum(obtained_cluster_sizes))
        
        assertion.eq(len(clusters), len(centers))
        for center in centers:
            assertion.eq(len(sample[0]), len(center))
        
        if expected_cluster_length is not None:
            obtained_cluster_sizes.sort()
            expected_cluster_length.sort()
            assertion.eq(obtained_cluster_sizes, expected_cluster_length)
Example #4
    def __init__(self, data, initial_index_medoids, tolerance=0.001, ccore=True, **kwargs):
        """!
        @brief Constructor of clustering algorithm K-Medoids.
        
        @param[in] data (list): Input data presented as a list of points (objects); each point should be represented by a list or tuple.
        @param[in] initial_index_medoids (list): Indexes of the initial medoids (indexes of points in the input data).
        @param[in] tolerance (double): Stop condition: if the maximum change of distance of the cluster medoids is less than tolerance, then the algorithm stops processing.
        @param[in] ccore (bool): If True, then the CCORE library (C++ pyclustering library) is used for clustering instead of the Python implementation.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'itermax').

        <b>Keyword Args:</b><br>
            - metric (distance_metric): Metric that is used for distance calculation between two points.
            - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
            - itermax (uint): Maximum number of iterations for cluster analysis.

        """
        self.__pointer_data = data
        self.__clusters = []
        self.__medoid_indexes = initial_index_medoids
        self.__tolerance = tolerance

        self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
        self.__data_type = kwargs.get('data_type', 'points')
        self.__itermax = kwargs.get('itermax', 200)

        self.__distance_calculator = self.__create_distance_calculator()

        self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
        if self.__ccore:
            self.__ccore = ccore_library.workable()
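A minimal usage sketch for the K-Medoids constructor above (toy data and medoid indexes are illustrative assumptions):

# Minimal K-Medoids sketch; the data and initial medoid indexes are illustrative.
from pyclustering.cluster.kmedoids import kmedoids

sample = [[1.0, 1.0], [1.1, 0.9], [0.9, 1.2], [4.0, 4.0], [4.1, 4.2], [3.9, 3.8]]
initial_medoid_indexes = [0, 3]   # indexes of points in 'sample'

kmedoids_instance = kmedoids(sample, initial_medoid_indexes, tolerance=0.001, data_type='points')
kmedoids_instance.process()

clusters = kmedoids_instance.get_clusters()
medoids = kmedoids_instance.get_medoids()     # indexes of the final medoids in 'sample'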
Example #5
    def __init__(self, data, maximum_clusters, threshold, ccore=True, **kwargs):
        """!
        @brief Creates classical BSAS algorithm.

        @param[in] data (list): Input data presented as a list of points (objects); each point should be represented by a list or tuple.
        @param[in] maximum_clusters: Maximum allowable number of clusters that can be allocated during processing.
        @param[in] threshold: Threshold of dissimilarity (maximum distance) between points.
        @param[in] ccore (bool): If True, then the CCORE library (C++ implementation) is used for processing.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').

        <b>Keyword Args:</b><br>
            - metric (distance_metric): Metric that is used for distance calculation between two points.

        """

        self._data = data
        self._amount = maximum_clusters
        self._threshold = threshold
        self._metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))
        self._ccore = ccore and self._metric.get_type() != type_metric.USER_DEFINED

        self._clusters = []
        self._representatives = []

        if self._ccore is True:
            self._ccore = ccore_library.workable()
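A minimal usage sketch for the BSAS constructor above (toy points are illustrative; the threshold is chosen only to separate the two groups):

# Minimal BSAS sketch; data, cluster limit and threshold are illustrative.
from pyclustering.cluster.bsas import bsas
from pyclustering.utils.metric import distance_metric, type_metric

sample = [[0.0, 0.0], [0.2, 0.1], [5.0, 5.0], [5.1, 4.9]]

bsas_instance = bsas(sample, 2, 1.0, metric=distance_metric(type_metric.EUCLIDEAN))
bsas_instance.process()

clusters = bsas_instance.get_clusters()
representatives = bsas_instance.get_representatives()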
Example #6
    def testCalculateMetric(self):
        assertion.eq(1.0, metric.distance_metric(metric.type_metric.EUCLIDEAN)([0.0, 1.0], [0.0, 0.0]))
        assertion.eq(4.0, metric.distance_metric(metric.type_metric.EUCLIDEAN_SQUARE)([2.0, 2.0], [4.0, 2.0]))
        assertion.eq(4.0, metric.distance_metric(metric.type_metric.MANHATTAN)([1.0, 1.0], [-1.0, -1.0]))
        assertion.eq(2.0, metric.distance_metric(metric.type_metric.CHEBYSHEV)([2.0, -2.0], [0.0, 0.0]))
        assertion.eq(2.0, metric.distance_metric(metric.type_metric.MINKOWSKI)([-3.0, -3.0], [-5.0, -3.0]))
        assertion.eq(2.0, metric.distance_metric(metric.type_metric.MINKOWSKI, degree=2)([-3.0, -3.0], [-5.0, -3.0]))
        assertion.eq(4.0, metric.distance_metric(metric.type_metric.USER_DEFINED, func=metric.euclidean_distance_square)([2.0, 2.0], [4.0, 2.0]))

        user_function = lambda point1, point2: point1[0] + point2[0] + 2
        assertion.eq(5.0, metric.distance_metric(metric.type_metric.USER_DEFINED, func=user_function)([2.0, 3.0], [1.0, 3.0]))
    def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
        sample = read_sample(path_to_file)
        data_type = kwargs.get('data_type', 'points')
        input_type = kwargs.get('input_type', 'list')
        initialize_medoids = kwargs.get('initialize_medoids', None)
        itermax = kwargs.get('itermax', 200)

        if metric is None:
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        input_data = sample
        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)

            if input_type == 'numpy':
                input_data = numpy.array(input_data)

        testing_result = False
        testing_attempts = 1
        if initialize_medoids is not None:  # in case center initializer randomization appears
            testing_attempts = 10

        for _ in range(testing_attempts):
            if initialize_medoids is not None:
                initial_medoids = kmeans_plusplus_initializer(sample, initialize_medoids).initialize(return_index=True)

            kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001, ccore_flag, metric=metric, data_type=data_type, itermax=itermax)
            kmedoids_instance.process()

            clusters = kmedoids_instance.get_clusters()
            medoids = kmedoids_instance.get_medoids()

            if itermax == 0:
                assertion.eq([], clusters)
                assertion.eq(medoids, initial_medoids)
                return

            if len(clusters) != len(medoids):
                continue

            if len(set(medoids)) != len(medoids):
                continue

            obtained_cluster_sizes = [len(cluster) for cluster in clusters]
            if len(sample) != sum(obtained_cluster_sizes):
                continue

            if expected_cluster_length is not None:
                obtained_cluster_sizes.sort()
                expected_cluster_length.sort()
                if obtained_cluster_sizes != expected_cluster_length:
                    continue

            testing_result = True

        assertion.true(testing_result)
Example #8
def template_clustering(path, amount, threshold, **kwargs):
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
    ccore = kwargs.get('ccore', False)
    draw = kwargs.get('draw', True)

    sample = read_sample(path)

    print("Sample: ", path)

    bsas_instance = bsas(sample, amount, threshold, ccore=ccore, metric=metric)
    bsas_instance.process()

    clusters = bsas_instance.get_clusters()
    representatives = bsas_instance.get_representatives()

    if draw is True:
        bsas_visualizer.show_clusters(sample, clusters, representatives)
def template_clustering(start_centers, path, tolerance=0.25, ccore=False):
    sample = read_sample(path)
    dimension = len(sample[0])

    metric = distance_metric(type_metric.MANHATTAN)

    observer = kmeans_observer()
    kmeans_instance = kmeans(sample, start_centers, tolerance, ccore, observer=observer, metric=metric)
    (ticks, _) = timedcall(kmeans_instance.process)
    
    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()
    
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    visualizer = cluster_visualizer_multidim()
    visualizer.append_clusters(clusters, sample)
    visualizer.show()

    if dimension <= 3:   # kmeans_visualizer supports only 1-, 2- and 3-dimensional data
        kmeans_visualizer.show_clusters(sample, clusters, centers, start_centers)
        kmeans_visualizer.animate_cluster_allocation(sample, observer)
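An illustrative call of the K-Means template above, in the style of the surrounding tests (SIMPLE_SAMPLES is assumed to come from pyclustering.samples.definitions, as elsewhere in this file; the start centers are the ones used by the Simple1 tests):

# Illustrative invocation; SIMPLE_SAMPLES.SAMPLE_SIMPLE1 and the start centers are
# assumptions borrowed from the Simple1 tests in this file.
template_clustering([[3.7, 5.5], [6.7, 7.5]], SIMPLE_SAMPLES.SAMPLE_SIMPLE1)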
Example #10
    def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
        """!
        @brief Constructor of clustering algorithm K-Means.
        @details A center initializer can be used to create the initial centers, for example the K-Means++ method.
        
        @param[in] data (array_like): Input data presented as an array of points (objects); each point should be represented by an array_like structure.
        @param[in] initial_centers (array_like): Initial coordinates of the cluster centers, represented by an array_like structure: [center1, center2, ...].
        @param[in] tolerance (double): Stop condition: if the maximum change of the cluster centers is less than tolerance, then the algorithm stops processing.
        @param[in] ccore (bool): Defines whether the CCORE library (C++ pyclustering library) should be used instead of the Python implementation.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer', 'metric', 'itermax').
        
        <b>Keyword Args:</b><br>
            - observer (kmeans_observer): Observer of the algorithm to collect information about clustering process on each iteration.
            - metric (distance_metric): Metric that is used for distance calculation between two points (by default euclidean square distance).
            - itermax (uint): Maximum number of iterations that is used for the clustering process (by default: 100).
        
        @see center_initializer
        
        """
        self.__pointer_data = numpy.array(data)
        self.__clusters = []
        self.__centers = numpy.array(initial_centers)
        self.__tolerance = tolerance
        self.__total_wce = 0

        self.__observer = kwargs.get('observer', None)
        self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
        self.__itermax = kwargs.get('itermax', 100)

        if self.__metric.get_type() != type_metric.USER_DEFINED:
            self.__metric.enable_numpy_usage()
        else:
            self.__metric.disable_numpy_usage()
        
        self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
        if self.__ccore is True:
            self.__ccore = ccore_library.workable()
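A minimal usage sketch for the K-Means constructor above, combining the 'metric' and 'itermax' keyword arguments (toy data and initial centers are illustrative assumptions):

# Minimal K-Means sketch with a custom metric; data and initial centers are illustrative.
from pyclustering.cluster.kmeans import kmeans
from pyclustering.utils.metric import distance_metric, type_metric

sample = [[1.0, 1.0], [1.2, 0.9], [0.8, 1.1], [6.0, 6.0], [6.1, 5.9], [5.9, 6.2]]
initial_centers = [[1.0, 1.0], [6.0, 6.0]]

kmeans_instance = kmeans(sample, initial_centers,
                         metric=distance_metric(type_metric.MANHATTAN),
                         itermax=100)
kmeans_instance.process()

clusters = kmeans_instance.get_clusters()
centers = kmeans_instance.get_centers()
wce = kmeans_instance.get_total_wce()   # total within-cluster error after processing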
 def testMndlClusterAllocationSampleSimple1MetricGowerByCore(self):
     metric = distance_metric(type_metric.GOWER, data=read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1))
     XmeansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, 20, True, metric=metric)
 def testBicClusterAllocationSampleSimple1EuclideanSquareByCore(self):
     metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
     XmeansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION, 20, True, metric=metric)
Example #13
 def testClusterAllocationSampleSimple1UserDefinedDistanceMatrixByCore(self):
     metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))
     KmedoidsTestTemplates.templateLengthProcessWithMetric(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, True, data_type='distance_matrix')
Example #14
 def testClusterAllocationSampleSimple1SquareEuclideanDistanceMatrixByCore(self):
     metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
     KmedoidsTestTemplates.templateLengthProcessWithMetric(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, True, data_type='distance_matrix')
Example #15
 def testClusterAllocationSampleSimple1ChebyshevCore(self):
     metric = distance_metric(type_metric.CHEBYSHEV)
     KmediansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], True, metric=metric)
 def testClusterAllocationSampleSimple1UserDefined(self):
     metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))
     KmediansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], False, metric=metric)
 def testClusterAllocationSampleSimple1Minkowski(self):
     metric = distance_metric(type_metric.MINKOWSKI, degree=2.0)
     KmediansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], False, metric=metric)
 def testClusterAllocationSampleSimple1Chebyshev(self):
     metric = distance_metric(type_metric.CHEBYSHEV)
     KmediansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], False, metric=metric)
 def testClusterAllocationSampleSimple1Manhattan(self):
     metric = distance_metric(type_metric.MANHATTAN)
     KmediansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], False, metric=metric)
 def testClusterAllocationSampleSimple1EuclideanSquare(self):
     metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
     KmediansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], False, metric=metric)
 def testPredictFivePointsUserMetric(self):
     centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
     to_predict = [[0.3, 0.2], [4.1, 1.1], [3.9, 1.1], [2.1, 1.9], [2.1, 4.1]]
     metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))
     KmediansTestTemplates.templatePredict(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, centers, to_predict, [0, 1, 1, 2, 3], False, metric=metric)
Example #22
 def testClusteringSampleSimple1Euclidean(self):
     ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 1.0, 2.0, [5, 5], True, metric=distance_metric(type_metric.EUCLIDEAN))
     ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 10.0, 20.0, [10], True, metric=distance_metric(type_metric.EUCLIDEAN))
Example #23
 def testClusteringSampleSimple1Manhattan(self):
     bsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1.0, [5, 5], True, metric=distance_metric(type_metric.MANHATTAN))
     bsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 10.0, [10], True, metric=distance_metric(type_metric.MANHATTAN))
 def testClusterAllocationSample1NumpyArrayUserDefined(self):
     metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))
     input_data = numpy.array(read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1))
     initial_centers = numpy.array([[3.7, 5.5], [6.7, 7.5]])
     KmediansTestTemplates.templateLengthProcessData(input_data, initial_centers, [5, 5], False, metric=metric)
Example #25
 def testClusterAllocationSampleSimple1EuclideanSquareCore(self):
     metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
     KmediansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], True, metric=metric)
 def testClusterAllocationSample2NumpyArrayUserDefined(self):
     metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN_SQUARE))
     input_data = numpy.array(read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2))
     initial_centers = numpy.array([[3.5, 4.8], [6.9, 7], [7.5, 0.5]])
     KmediansTestTemplates.templateLengthProcessData(input_data, initial_centers, [10, 5, 8], False, metric=metric)
Example #27
 def testClusterAllocationSampleSimple1UserDefinedCore(self):
     metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))
     KmediansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], True, metric=metric)
Example #28
    def templateLengthProcessWithMetric(path_to_file, initial_medoids,
                                        expected_cluster_length, metric,
                                        ccore_flag, **kwargs):
        sample = read_sample(path_to_file)
        data_type = kwargs.get('data_type', 'points')
        input_type = kwargs.get('input_type', 'list')
        initialize_medoids = kwargs.get('initialize_medoids', None)
        itermax = kwargs.get('itermax', 200)

        if metric is None:
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        input_data = sample
        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)

            if input_type == 'numpy':
                input_data = numpy.array(input_data)

        testing_result = False
        testing_attempts = 1
        if initialize_medoids is not None:  # in case center initializer randomization appears
            testing_attempts = 10

        for _ in range(testing_attempts):
            if initialize_medoids is not None:
                initial_medoids = kmeans_plusplus_initializer(
                    sample, initialize_medoids).initialize(return_index=True)

            kmedoids_instance = kmedoids(input_data,
                                         initial_medoids,
                                         0.001,
                                         ccore=ccore_flag,
                                         metric=metric,
                                         data_type=data_type,
                                         itermax=itermax)
            kmedoids_instance.process()

            clusters = kmedoids_instance.get_clusters()
            medoids = kmedoids_instance.get_medoids()

            if itermax == 0:
                assertion.eq([], clusters)
                assertion.eq(medoids, initial_medoids)
                return

            if len(clusters) != len(medoids):
                continue

            if len(set(medoids)) != len(medoids):
                continue

            obtained_cluster_sizes = [len(cluster) for cluster in clusters]
            if len(sample) != sum(obtained_cluster_sizes):
                continue

            if expected_cluster_length is not None:
                obtained_cluster_sizes.sort()
                expected_cluster_length.sort()
                if obtained_cluster_sizes != expected_cluster_length:
                    continue

            testing_result = True

        assertion.true(testing_result)
Example #29
 def testClusterAllocationSampleSimple1ChebyshevDistanceMatrixByCore(self):
     metric = distance_metric(type_metric.CHEBYSHEV)
     KmedoidsTestTemplates.templateLengthProcessWithMetric(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, True, data_type='distance_matrix')
Example #30
 def testClusterAllocationSampleSimple1Minkowski(self):
     metric = distance_metric(type_metric.MINKOWSKI, degree=2.0)
     kmedoids_test_template.templateLengthProcessWithMetric(
         SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, False)
 def testPredictOnePointUserMetric(self):
     centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
     metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))
     KmediansTestTemplates.templatePredict(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, centers, [[0.3, 0.2]], [0], False, metric=metric)
Example #32
 def testClusterAllocationSampleSimple1Gower(self):
     metric = distance_metric(type_metric.GOWER,
                              data=read_sample(
                                  SIMPLE_SAMPLES.SAMPLE_SIMPLE1))
     kmedoids_test_template.templateLengthProcessWithMetric(
         SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, False)
Example #33
 def testClusterAllocationSampleSimple1ChiSquare(self):
     metric = distance_metric(type_metric.CHI_SQUARE)
     KmeansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], False, metric=metric)
Example #34
 def testClusterAllocationSampleSimple1UserDefined(self):
     metric = distance_metric(type_metric.USER_DEFINED,
                              func=distance_metric(type_metric.EUCLIDEAN))
     kmedoids_test_template.templateLengthProcessWithMetric(
         SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, False)
Example #35
 def testClusterAllocationSampleSimple1Chebyshev(self):
     metric = distance_metric(type_metric.CHEBYSHEV)
     kmedoids_test_template.templateLengthProcessWithMetric(
         SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, False)
Example #36
    # NOTE: the definition of descriptors_similarity is truncated in this extract;
    # only the tail of the similarity expression and its return statement remain.
    Y *sqrt((vec1[53]-vec2[53])**2)
    return 1-exp(sim)/(1+exp(sim))

# transform data to list
names_dt = dt.index.values
dt = dt.values.tolist()
#


print(descriptors_similarity(dt[0],dt[1]))
Seeds = [5,10,20,30,40,50,60,70,80,90,100,200,300,400,500,600,700,800,900,1000]
for Nseeds in Seeds:
    print("----"+str(Nseeds)+"----")
    #
    ###
    metric = distance_metric(type_metric.USER_DEFINED, func=descriptors_similarity)
    initial_centers = kmeans_plusplus_initializer(dt, Nseeds).initialize()
    kmeans_instance = kmeans(dt, initial_centers, metric=metric, itermax=50)
    # Run cluster analysis and obtain results.
    start = time.time()
    print("hello")
    kmeans_instance.process()
    end = time.time()
    print(end - start)
    #
    clusters = kmeans_instance.get_clusters()
    final_centers = kmeans_instance.get_centers()
    
    
    #names_dt[clusters[i]][j]
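    # Illustrative sketch (an assumption, completing the commented-out idea above):
    # map each cluster's member indexes back to the row names in names_dt.
    for ci, cluster in enumerate(clusters):
        print("cluster", ci, ":", [names_dt[j] for j in cluster][:10])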
    
Example #37
 def testClusteringSampleSimple1Euclidean(self):
     mbsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1.0, [5, 5], False, metric=distance_metric(type_metric.EUCLIDEAN))
     mbsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 10.0, [10], False, metric=distance_metric(type_metric.EUCLIDEAN))
Example #38
def runkmeans(sample, clustnum):
    global minError

    initial_centers = kmeans_plusplus_initializer(sample,
                                                  clustnum).initialize()

    # user_function = lambda point1, point2: sum(l1 != 12 for l1, l2 in zip(point1, point2))

    user_function = lambda point1, point2: np.count_nonzero(
        np.array(point1) != np.array(point2))

    metricUser = distance_metric(type_metric.USER_DEFINED, func=user_function)

    metric = distance_metric(type_metric.EUCLIDEAN)

    kmeans_instance = kmeans(sample, initial_centers, metric=metric)
    print("Centroids: ", kmeans_instance.get_centers())

    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()

    print("Output Clusters", clusters)

    print("Centroids: ", kmeans_instance.get_centers())

    print("SSE: ", kmeans_instance.get_total_wce())

    origMulitDimen = np.array(sample, dtype=int)
    # Data read from text file is coordinates for row
    # Must be transposed in order to obtain proper
    # characteristic matrix
    numpyChar = np.transpose(origMulitDimen)

    mockDataArr = []
    # Column Number Tracking
    for p in range(len(sample)):
        mockDataArr.append(p)
    # Column Position Dictionary
    mockDataPos = {}
    for l in range(len(sample)):
        mockDataPos[l] = l
    # Clusters
    mockDataClustered = []
    # Clusters with Subclusters
    realmockdata = []

    origclustercount = len(clusters)
    # How many times subclustering will happen
    # Log 10 of Total Coord Points
    inception = int(math.log10(len(sample)))
    for num in range(inception):
        for part in clusters:
            newsubclustered = []
            for col in part:
                # Obtaining Cluster
                newsubclustered.append(origMulitDimen[col])
            if len(newsubclustered) > 0:
                # Retrieving Subclusters
                neworder = subcluster(newsubclustered)
                for b in range(len(neworder)):
                    for f in range(len(neworder[b])):
                        neworder[b][f] = part[neworder[b][f]]
                realmockdata.extend(neworder)
                if num == inception - 1:
                    # Appending to Final Cluster Order
                    mockDataClustered.append(neworder)
        # Resetting in case of next subclustering
        clusters = realmockdata.copy()
        realmockdata = []
    # Tracking each Original Cluster
    supercluster = []
    for i in range(len(mockDataClustered)):
        supercluster.append(i)
    # Superclustering based on edges
    # Current simplistic method is taking first
    # Cluster and iterating through, finding
    # which next cluster's left edge matches
    # best to the current clusters right edge
    for i in range(0, len(mockDataClustered) - 1):
        closest = supercluster[i + 1]
        didchange = closest
        closestidx = i + 1
        mindist = sys.maxsize
        # Iterating with both value and index
        for idx, k in enumerate(supercluster[i + 1:]):
            # Calculate Euclidean Distance
            curdist = distance.euclidean(
                sample[(mockDataClustered[i][len(mockDataClustered[i]) - 1][
                    len(mockDataClustered[i][len(mockDataClustered[i]) - 1]) -
                    1])], sample[mockDataClustered[idx + i + 1][0][0]])
            if curdist < mindist:
                mindist = curdist
                closest = k
                closestidx = idx + i + 1
        if didchange == closest:
            print("Nothing Happened")
        else:
            # Swap
            temp = supercluster[i + 1]
            supercluster[i + 1] = closest
            supercluster[closestidx] = temp

    supermockdata = []
    # Compiling Final Clusters
    for i in supercluster:
        supermockdata.append(mockDataClustered[i])
    # Flattening Cluster Nested Arrays
    for k in supermockdata:
        for f in k:
            realmockdata.extend(f)
    print("L: ", realmockdata)
    print("M: ", mockDataArr, "\n")

    originalSave = np.copy(numpyChar)

    print("Characteristic Matrix (First Row is Column Numbers)")
    printNumpy = np.insert(numpyChar, 0, mockDataArr, 0)
    print(printNumpy, "\n")
    # Swapping Actual Characteristic Array
    for i in range(len(mockDataArr) - 1):
        print("I: " + str(i))
        print("RealMockData: ", realmockdata)
        print("Length: ", len(realmockdata))
        # Checking if current position matches with the ideal value that should be there
        if i != mockDataPos[realmockdata[i]]:
            # Recording Swaps
            print("Index: " + str(i) + "    Value: " + str(numpyChar[:, i]) +
                  "  Swaps With -> " + "Index: " +
                  str(mockDataPos[realmockdata[i]]) + "  Value: " +
                  str(numpyChar[:, mockDataPos[realmockdata[i]]]))
            # Swapping column number array and actual characteristic matrix columns
            temp = np.copy(numpyChar[:, i])

            realTemp = mockDataArr[i]

            mockDataArr[i] = mockDataArr[mockDataPos[realmockdata[i]]]

            mockDataArr[mockDataPos[realmockdata[i]]] = realTemp

            numpyChar[:, i] = numpyChar[:, mockDataPos[realmockdata[i]]]

            numpyChar[:, mockDataPos[realmockdata[i]]] = temp

            temp2 = mockDataPos[realmockdata[i]]
            # Updating Positions
            mockDataPos[realmockdata[i]] = i

            mockDataPos[realTemp] = temp2

    print("\n\nColumn Positions After Swapping: ", mockDataArr)

    print(
        "\n\nFinal Swapped Characteristic Matrix (First Row is Column Numbers)"
    )
    printArray = np.insert(numpyChar, 0, np.array(mockDataArr), 0)

    print(printArray)
    # Calculating Error
    swappederror = calcError(numpyChar)
    defaulterror = calcError(originalSave)
    subclusts = 0
    for clust in mockDataClustered:
        subclusts += len(clust)
    # If error is below previously recorded low,
    # Show the visual and save it to image
    if swappederror < minError:
        fig, ax = plt.subplots(1, 2, figsize=(12, 8))
        clusteredcoltext = " "
        mid = int(len(mockDataClustered) / 2)

        # Convert subclusters to strings
        # clusteredcoltext += str(supermockdata[:mid])
        # clusteredcoltext += "\n"
        # clusteredcoltext += str(supermockdata[mid:])

        fig.text(.5,
                 .05,
                 'Clustered Columns: ' + str(supermockdata),
                 ha='center',
                 wrap=True)
        f = open("clusters.txt", "w+")
        f.write(str(supermockdata))
        fig.text(.5, .15, 'Original Error: ' + str(defaulterror), ha='center')
        fig.text(.5, .2, 'Clustered Error: ' + str(swappederror), ha='center')
        fig.suptitle('Sub-Clusters: ' + str(subclusts) +
                     '  Original Cluster Count: ' + str(origclustercount),
                     fontsize=20)
        # Show black and white representations of characteristic matrix

        ax[0].imshow(numpyChar, interpolation='nearest', cmap=plt.cm.Greys)

        ax[1].imshow(originalSave, interpolation='nearest', cmap=plt.cm.Greys)

        ax[0].title.set_text('Clustered Characteristic Matrix')

        ax[1].title.set_text('Original Characteristic Matrix')

        minError = swappederror
        # Save New Low Error Run
        plt.savefig("Winner.png", dpi=300)
        plt.show()
Example #39
 def testClusteringSampleSimple1EuclideanSquare(self):
     bsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1.0, [5, 5], True, metric=distance_metric(type_metric.EUCLIDEAN_SQUARE))
     bsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 10.0, [5, 5], True, metric=distance_metric(type_metric.EUCLIDEAN_SQUARE))
     bsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 100.0, [10], True, metric=distance_metric(type_metric.EUCLIDEAN_SQUARE))
Example #40
 def testPredictTwoPointsUserMetric(self):
     medoids = [4, 12, 25, 37]
     metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))
     kmedoids_test_template.templatePredict(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, medoids, [[0.3, 0.2], [2.1, 1.9]], [0, 2], False, metric=metric)
Example #41
 def testClusteringSampleSimple1Chebyshev(self):
     bsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1.0, [5, 5], True, metric=distance_metric(type_metric.CHEBYSHEV))
     bsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 10.0, [10], True, metric=distance_metric(type_metric.CHEBYSHEV))
Example #42
 def testPredictFivePointsUserMetric(self):
     medoids = [4, 12, 25, 37]
     to_predict = [[0.3, 0.2], [4.1, 1.1], [3.9, 1.1], [2.1, 1.9], [2.1, 4.1]]
     metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))
     kmedoids_test_template.templatePredict(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, medoids, to_predict, [0, 3, 3, 2, 1], False, metric=metric)
 def testMndlClusterAllocationSampleSimple1MetricChiSquareByCore(self):
     metric = distance_metric(type_metric.CHI_SQUARE)
     XmeansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, 20, True, metric=metric, alpha=0.1, beta=0.1, random_state=1000)
Example #44
 def test_initial_medoids_sample01_euclidean(self):
     metric = distance_metric(type_metric.EUCLIDEAN)
     kmedoids_test_template.initialize_medoids(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, [4, 8], False, metric=metric)
Example #45
 def testClusterAllocationSampleSimple1ManhattanCore(self):
     metric = distance_metric(type_metric.MANHATTAN)
     KmediansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], True, metric=metric)
Example #46
 def test_initial_medoids_sample01_euclidean_square_matrix(self):
     metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
     kmedoids_test_template.initialize_medoids(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, [4, 8], False, metric=metric, data_type='distance_matrix')
Example #47
 def testClusterAllocationSampleSimple1MinkowskiCore(self):
     metric = distance_metric(type_metric.MINKOWSKI, degree=2.0)
     KmediansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], True, metric=metric)
Example #48
 def testBuildGowerDistanceFromMetricWithData(self):
     metric = distance_metric(type_metric.GOWER,
                              data=[[-3.0, -3.0], [-4.0, -3.0],
                                    [-4.5, -3.0], [-5.0, -3.0]])
     ccore_metric = metric_wrapper.create_instance(metric)
     self.assertEqual(0.5, ccore_metric([-3.0, -3.0], [-5.0, -3.0]))
Example #49
 def testClusterAllocationSampleSimple1EuclideanByCore(self):
     metric = distance_metric(type_metric.EUCLIDEAN)
     KmedoidsTestTemplates.templateLengthProcessWithMetric(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, True)
Example #50
 def testClusteringSampleSimple1Manhattan(self):
     ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 1.0, 2.0, [5, 5], False, metric=distance_metric(type_metric.MANHATTAN))
     ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 10.0, 20.0, [10], False, metric=distance_metric(type_metric.MANHATTAN))
Example #51
 def testClusterAllocationSampleSimple1ManhattanDistanceMatrixByCore(self):
     metric = distance_metric(type_metric.MANHATTAN)
     KmedoidsTestTemplates.templateLengthProcessWithMetric(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, True, data_type='distance_matrix')
 def testMndlClusterAllocationSampleSimple1MetricMinkowski4ByCore(self):
     metric = distance_metric(type_metric.MINKOWSKI, degree=4)
     XmeansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, 20, True, metric=metric)
Example #53
 def testClusterAllocationSampleSimple1MinkowskiDistanceMatrixByCore(self):
     metric = distance_metric(type_metric.MINKOWSKI, degree=2.0)
     KmedoidsTestTemplates.templateLengthProcessWithMetric(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, True, data_type='distance_matrix')
Example #54
 def testClusterAllocationSampleSimple1Manhattan(self):
     metric = distance_metric(type_metric.MANHATTAN)
     kmedoids_test_template.templateLengthProcessWithMetric(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, False)
Example #55
 def test_initial_medoids_sample01_euclidean_manhattan_matrix(self):
     metric = distance_metric(type_metric.MANHATTAN)
     kmedoids_test_template.initialize_medoids(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, [4, 8], False, metric=metric, data_type='distance_matrix')
Example #56
 def testBuildGowerDistanceFromMetricWithNumpyMaxRange(self):
     metric = distance_metric(type_metric.GOWER,
                              max_range=numpy.array([2.0, 0.0]))
     ccore_metric = metric_wrapper.create_instance(metric)
     self.assertEqual(0.5, ccore_metric([-3.0, -3.0], [-5.0, -3.0]))
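The same GOWER metric can also be evaluated directly on the Python side, without the C++ wrapper; a short sketch mirroring the test above (the expected value 0.5 follows from the test itself):

# Direct (Python-side) evaluation of the GOWER metric configured with max_range.
from pyclustering.utils.metric import distance_metric, type_metric
import numpy

metric = distance_metric(type_metric.GOWER, max_range=numpy.array([2.0, 0.0]))
print(metric([-3.0, -3.0], [-5.0, -3.0]))   # expected: 0.5, as in the test above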
Example #57
 def testClusteringSampleSimple1EuclideanSquare(self):
     ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 1.0, 2.0, [5, 5], False, metric=distance_metric(type_metric.EUCLIDEAN_SQUARE))
     ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 10.0, 20.0, [5, 5], False, metric=distance_metric(type_metric.EUCLIDEAN_SQUARE))
     ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 100.0, 200.0, [10], False, metric=distance_metric(type_metric.EUCLIDEAN_SQUARE))
Example #58
 def test_initial_medoids_sample01_euclidean_chebyshev_matrix(self):
     metric = distance_metric(type_metric.CHEBYSHEV)
     kmedoids_test_template.initialize_medoids(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, [4, 8], False, metric=metric, data_type='distance_matrix')
Example #59
 def testClusteringSampleSimple1Chebyshev(self):
     ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 1.0, 2.0, [5, 5], False, metric=distance_metric(type_metric.CHEBYSHEV))
     ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 10.0, 20.0, [10], False, metric=distance_metric(type_metric.CHEBYSHEV))
Example #60
 def testClusterAllocationSampleSimple1SquareEuclideanDistanceMatrix(self):
     metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
     kmedoids_test_template.templateLengthProcessWithMetric(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [2, 9], [5, 5], metric, False, data_type='distance_matrix')