def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
    """!
    @brief Constructor of clustering algorithm K-Medians.

    @param[in] data (list): Input data presented as a list of points (objects); each point should be a list or tuple.
    @param[in] initial_centers (list): Initial coordinates of cluster medians: [center1, center2, ...].
                A slice copy of this sequence is stored.
    @param[in] tolerance (double): Stop condition: if the maximum change of cluster centers is less than tolerance
                then the algorithm stops processing.
    @param[in] ccore (bool): Defines whether the CCORE library (C++ pyclustering library) should be used instead of Python code.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'itermax').

    <b>Keyword Args:</b><br>
        - metric (distance_metric): Metric used for distance calculation between two points
           (squared Euclidean when omitted or None).
        - itermax (uint): Maximum number of iterations for cluster analysis (100 when omitted).

    """
    self.__pointer_data = data
    self.__clusters = []
    self.__medians = initial_centers[:]
    self.__tolerance = tolerance
    self.__itermax = kwargs.get('itermax', 100)

    chosen_metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
    if chosen_metric is None:
        chosen_metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
    self.__metric = chosen_metric

    # CCORE is never selected for a user-defined metric; when it is requested,
    # the final flag is whatever ccore_library.workable() reports.
    core_requested = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
    self.__ccore = ccore_library.workable() if core_requested else core_requested
def clustering(path, amount, threshold, expected, ccore, **kwargs):
    """!
    @brief Runs BSAS on a sample file and checks the produced clusters and representatives.

    @param[in] path (string): Path to the sample that is loaded with read_sample().
    @param[in] amount (uint): Maximum number of clusters passed to BSAS; the obtained amount must not exceed it.
    @param[in] threshold (double): Dissimilarity threshold passed to BSAS.
    @param[in] expected (list): Expected cluster sizes (order does not matter). The list is not modified.
    @param[in] ccore (bool): Value forwarded to BSAS as 'ccore'.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').

    <b>Keyword Args:</b><br>
        - metric (distance_metric): Metric forwarded to BSAS (Euclidean when omitted).

    """
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))

    sample = read_sample(path)

    bsas_instance = bsas(sample, amount, threshold, ccore=ccore, metric=metric)
    bsas_instance.process()

    clusters = bsas_instance.get_clusters()
    representatives = bsas_instance.get_representatives()

    obtained_cluster_length = [len(cluster) for cluster in clusters]

    # Every point of the sample has to be allocated to some cluster.
    assertion.eq(len(sample), sum(obtained_cluster_length))
    assertion.eq(len(expected), len(clusters))
    assertion.eq(len(expected), len(representatives))
    assertion.ge(amount, len(clusters))

    dimension = len(sample[0])
    for rep in representatives:
        assertion.eq(dimension, len(rep))

    # Compare sorted copies so that the caller's 'expected' list is left untouched.
    assertion.eq(sorted(expected), sorted(obtained_cluster_length))
def templateLengthProcessData(path_to_file, start_centers, expected_cluster_length, ccore, **kwargs):
    """!
    @brief Runs K-Means on a sample file and checks cluster sizes, centers and total WCE.

    @param[in] path_to_file (string): Path to the sample that is loaded with read_sample().
    @param[in] start_centers (list): Initial centers passed to K-Means.
    @param[in] expected_cluster_length (list): Expected cluster sizes (order does not matter), or None to skip
                the size comparison. The list is not modified.
    @param[in] ccore (bool): Value forwarded to K-Means as the 'ccore' argument.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'itermax').

    <b>Keyword Args:</b><br>
        - metric (distance_metric): Metric forwarded to K-Means (squared Euclidean when omitted).
        - itermax (uint): Maximum number of iterations forwarded to K-Means (200 when omitted).

    """
    sample = read_sample(path_to_file)

    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
    itermax = kwargs.get('itermax', 200)

    kmeans_instance = kmeans(sample, start_centers, 0.001, ccore, metric=metric, itermax=itermax)
    kmeans_instance.process()

    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()
    wce = kmeans_instance.get_total_wce()

    if itermax == 0:
        # Without iterations the algorithm must leave its initial state untouched.
        assertion.eq(start_centers, centers)
        assertion.eq([], clusters)
        assertion.eq(0.0, wce)
        return

    obtained_cluster_sizes = [len(cluster) for cluster in clusters]
    assertion.eq(len(sample), sum(obtained_cluster_sizes))

    assertion.eq(len(clusters), len(centers))
    for center in centers:
        assertion.eq(len(sample[0]), len(center))

    if expected_cluster_length is not None:
        # Compare sorted copies so that the caller's list is left untouched.
        assertion.eq(sorted(obtained_cluster_sizes), sorted(expected_cluster_length))
def __init__(self, data, initial_index_medoids, tolerance=0.001, ccore=True, **kwargs):
    """!
    @brief Constructor of clustering algorithm K-Medoids.

    @param[in] data (list): Input data presented as a list of points (objects); each point should be a list or tuple.
    @param[in] initial_index_medoids (list): Indexes of initial medoids (indexes of points in input data).
    @param[in] tolerance (double): Stop condition: if the maximum distance change of cluster medoids is less than
                tolerance then the algorithm stops processing.
    @param[in] ccore (bool): If specified then the CCORE library (C++ pyclustering library) is used for clustering
                instead of Python code.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'itermax').

    <b>Keyword Args:</b><br>
        - metric (distance_metric): Metric used for distance calculation between two points
           (squared Euclidean when omitted or None).
        - data_type (string): Data type of input sample 'data' that is processed by the algorithm
           ('points', 'distance_matrix'); 'points' when omitted.
        - itermax (uint): Maximum number of iterations for cluster analysis (200 when omitted).

    """
    self.__pointer_data = data
    self.__clusters = []
    self.__medoid_indexes = initial_index_medoids
    self.__tolerance = tolerance

    # An explicit metric=None gets the same default as an omitted metric,
    # as the K-Medians constructor does, instead of failing on None.get_type().
    self.__metric = kwargs.get('metric', None)
    if self.__metric is None:
        self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    self.__data_type = kwargs.get('data_type', 'points')
    self.__itermax = kwargs.get('itermax', 200)

    self.__distance_calculator = self.__create_distance_calculator()

    # CCORE is never selected for a user-defined metric.
    self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
    if self.__ccore:
        self.__ccore = ccore_library.workable()
def __init__(self, data, maximum_clusters, threshold, ccore=True, **kwargs):
    """!
    @brief Creates classical BSAS algorithm.

    @param[in] data (list): Input data presented as a list of points (objects); each point should be a list or tuple.
    @param[in] maximum_clusters: Maximum allowable number of clusters that can be allocated during processing.
    @param[in] threshold: Threshold of dissimilarity (maximum distance) between points.
    @param[in] ccore (bool): If True then DLL CCORE (C++ solution) is used for solving.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').

    <b>Keyword Args:</b><br>
        - metric (distance_metric): Metric used for distance calculation between two points (Euclidean when omitted).

    """
    self._data = data
    self._amount = maximum_clusters
    self._threshold = threshold
    self._metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))

    # CCORE is never selected for a user-defined metric.
    core_requested = ccore and self._metric.get_type() != type_metric.USER_DEFINED
    self._ccore = core_requested

    self._clusters = []
    self._representatives = []

    # Only an exact True triggers the library availability check.
    if core_requested is True:
        self._ccore = ccore_library.workable()
def testCalculateMetric(self):
    """! @brief Checks the value returned by each metric type for fixed pairs of points. """
    make = metric.distance_metric
    kinds = metric.type_metric

    assertion.eq(1.0, make(kinds.EUCLIDEAN)([0.0, 1.0], [0.0, 0.0]))
    assertion.eq(4.0, make(kinds.EUCLIDEAN_SQUARE)([2.0, 2.0], [4.0, 2.0]))
    assertion.eq(4.0, make(kinds.MANHATTAN)([1.0, 1.0], [-1.0, -1.0]))
    assertion.eq(2.0, make(kinds.CHEBYSHEV)([2.0, -2.0], [0.0, 0.0]))

    # Minkowski is checked both without and with an explicit degree of 2.
    assertion.eq(2.0, make(kinds.MINKOWSKI)([-3.0, -3.0], [-5.0, -3.0]))
    assertion.eq(2.0, make(kinds.MINKOWSKI, degree=2)([-3.0, -3.0], [-5.0, -3.0]))

    # User-defined metrics: first a library function, then a local one.
    assertion.eq(4.0, make(kinds.USER_DEFINED, func=metric.euclidean_distance_square)([2.0, 2.0], [4.0, 2.0]))

    def user_function(point1, point2):
        return point1[0] + point2[0] + 2

    assertion.eq(5.0, make(kinds.USER_DEFINED, func=user_function)([2.0, 3.0], [1.0, 3.0]))
def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
    """!
    @brief Runs K-Medoids on a sample file and checks that the obtained clusters are consistent.
    @details An attempt passes when the amount of clusters equals the amount of medoids, all medoids are unique,
              every point is allocated and (optionally) cluster sizes match the expected ones. The template
              succeeds when at least one attempt passes.

    @param[in] path_to_file (string): Path to the sample that is loaded with read_sample().
    @param[in] initial_medoids (list): Indexes of initial medoids; ignored when 'initialize_medoids' is given.
    @param[in] expected_cluster_length (list): Expected cluster sizes (order does not matter), or None to skip
                the size comparison. The list is not modified.
    @param[in] metric (distance_metric): Metric for K-Medoids; squared Euclidean is used when None.
    @param[in] ccore_flag (bool): Value forwarded to K-Medoids as the 'ccore' argument.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'data_type', 'input_type',
                'initialize_medoids', 'itermax').

    <b>Keyword Args:</b><br>
        - data_type (string): 'points' (default) or 'distance_matrix'; for the latter the distance matrix of the
           sample is passed to the algorithm.
        - input_type (string): 'list' (default) or 'numpy'; for the latter input data is converted to numpy.array.
        - initialize_medoids (uint): If given, that many initial medoids are generated by K-Means++ on every
           attempt and up to 10 attempts are made.
        - itermax (uint): Maximum number of iterations forwarded to K-Medoids (200 when omitted).

    """
    sample = read_sample(path_to_file)

    data_type = kwargs.get('data_type', 'points')
    input_type = kwargs.get('input_type', 'list')
    initialize_medoids = kwargs.get('initialize_medoids', None)
    itermax = kwargs.get('itermax', 200)

    if metric is None:
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    input_data = sample
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)

        if input_type == 'numpy':
            input_data = numpy.array(input_data)

    # Sort a copy once, outside the retry loop, so the caller's list is never mutated.
    expected_sizes = None
    if expected_cluster_length is not None:
        expected_sizes = sorted(expected_cluster_length)

    testing_result = False
    testing_attempts = 1
    if initialize_medoids is not None:  # in case center initializer randomization appears
        testing_attempts = 10

    for _ in range(testing_attempts):
        if initialize_medoids is not None:
            initial_medoids = kmeans_plusplus_initializer(sample, initialize_medoids).initialize(return_index=True)

        kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001, ccore_flag, metric=metric, data_type=data_type, itermax=itermax)
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        if itermax == 0:
            # Without iterations the algorithm must leave its initial state untouched.
            assertion.eq([], clusters)
            assertion.eq(medoids, initial_medoids)
            return

        if len(clusters) != len(medoids):
            continue

        if len(set(medoids)) != len(medoids):
            continue

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]
        if len(sample) != sum(obtained_cluster_sizes):
            continue

        if expected_sizes is not None and sorted(obtained_cluster_sizes) != expected_sizes:
            continue

        # One passing attempt is enough; further attempts cannot change the outcome.
        testing_result = True
        break

    assertion.true(testing_result)
def template_clustering(path, amount, threshold, **kwargs):
    """!
    @brief Example template: clusters a sample with BSAS and optionally displays the result.

    @param[in] path (string): Path to the sample that is loaded with read_sample().
    @param[in] amount (uint): Maximum number of clusters passed to BSAS.
    @param[in] threshold (double): Dissimilarity threshold passed to BSAS.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'ccore', 'draw').

    <b>Keyword Args:</b><br>
        - metric (distance_metric): Metric forwarded to BSAS (squared Euclidean when omitted).
        - ccore (bool): Value forwarded to BSAS as 'ccore' (False when omitted).
        - draw (bool): Clusters are shown only when this is exactly True (True when omitted).

    """
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
    ccore = kwargs.get('ccore', False)
    draw = kwargs.get('draw', True)

    sample = read_sample(path)
    print("Sample: ", path)

    algorithm = bsas(sample, amount, threshold, ccore=ccore, metric=metric)
    algorithm.process()

    clusters = algorithm.get_clusters()
    representatives = algorithm.get_representatives()

    if draw is not True:
        return

    bsas_visualizer.show_clusters(sample, clusters, representatives)
def template_clustering(start_centers, path, tolerance = 0.25, ccore = False):
    """!
    @brief Example template: clusters a sample with K-Means (Manhattan metric) and displays the result.

    @param[in] start_centers (list): Initial centers passed to K-Means.
    @param[in] path (string): Path to the sample that is loaded with read_sample().
    @param[in] tolerance (double): Stop condition passed to K-Means.
    @param[in] ccore (bool): Value forwarded to K-Means as the 'ccore' argument.

    """
    sample = read_sample(path)
    dimension = len(sample[0])

    observer = kmeans_observer()
    manhattan = distance_metric(type_metric.MANHATTAN)

    kmeans_instance = kmeans(sample, start_centers, tolerance, ccore, observer=observer, metric=manhattan)
    ticks, _ = timedcall(kmeans_instance.process)

    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    multidim_view = cluster_visualizer_multidim()
    multidim_view.append_clusters(clusters, sample)
    multidim_view.show()

    # NOTE(review): the K-Means specific visualizer is invoked only for data with
    # more than three dimensions - confirm this condition is not inverted. Both
    # calls below are assumed to belong to the conditional - confirm as well.
    if dimension > 3:
        kmeans_visualizer.show_clusters(sample, clusters, centers, start_centers)
        kmeans_visualizer.animate_cluster_allocation(sample, observer)
def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
    """!
    @brief Constructor of clustering algorithm K-Means.
    @details Center initializer can be used for creating initial centers, for example, K-Means++ method.

    @param[in] data (array_like): Input data presented as an array of points (objects); each point should be
                represented by an array_like data structure.
    @param[in] initial_centers (array_like): Initial coordinates of cluster centers represented by an array_like
                data structure: [center1, center2, ...].
    @param[in] tolerance (double): Stop condition: if the maximum change of cluster centers is less than tolerance
                then the algorithm stops processing.
    @param[in] ccore (bool): Defines whether the CCORE library (C++ pyclustering library) should be used instead of Python code.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer', 'metric', 'itermax').

    <b>Keyword Args:</b><br>
        - observer (kmeans_observer): Observer of the algorithm to collect information about the clustering process
           on each iteration.
        - metric (distance_metric): Metric used for distance calculation between two points (by default euclidean
           square distance). The algorithm works on a copy, so the passed object is not modified.
        - itermax (uint): Maximum number of iterations used for the clustering process (by default: 100).

    @see center_initializer

    """
    import copy

    self.__pointer_data = numpy.array(data)
    self.__clusters = []
    self.__centers = numpy.array(initial_centers)
    self.__tolerance = tolerance
    self.__total_wce = 0

    self.__observer = kwargs.get('observer', None)

    # Numpy usage is toggled on the metric below; do it on a copy so that a metric
    # object shared by the caller with other algorithms is not reconfigured.
    self.__metric = copy.copy(kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE)))
    self.__itermax = kwargs.get('itermax', 100)

    if self.__metric.get_type() != type_metric.USER_DEFINED:
        self.__metric.enable_numpy_usage()
    else:
        self.__metric.disable_numpy_usage()

    # CCORE is never selected for a user-defined metric.
    self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
    if self.__ccore is True:
        self.__ccore = ccore_library.workable()
def testMndlClusterAllocationSampleSimple1MetricGowerByCore(self):
    """! @brief X-Means length template on SAMPLE_SIMPLE1 with MNDL splitting and a Gower metric built from the same sample. """
    sample_path = SIMPLE_SAMPLES.SAMPLE_SIMPLE1
    gower = distance_metric(type_metric.GOWER, data=read_sample(sample_path))
    start_centers = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]

    XmeansTestTemplates.templateLengthProcessData(
        sample_path, start_centers, expected_sizes,
        splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, 20, True, metric=gower)
def testBicClusterAllocationSampleSimple1EuclideanSquareByCore(self):
    """! @brief X-Means length template on SAMPLE_SIMPLE1 with BIC splitting and the squared Euclidean metric. """
    start_centers = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    euclidean_square = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    XmeansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_centers, expected_sizes,
        splitting_type.BAYESIAN_INFORMATION_CRITERION, 20, True, metric=euclidean_square)
def testClusterAllocationSampleSimple1UserDefinedDistanceMatrixByCore(self):
    """! @brief K-Medoids on the SAMPLE_SIMPLE1 distance matrix with a user-defined metric that wraps the Euclidean one; CCORE flag is True. """
    wrapped = distance_metric(type_metric.EUCLIDEAN)
    user_metric = distance_metric(type_metric.USER_DEFINED, func=wrapped)

    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    KmedoidsTestTemplates.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, user_metric, True,
        data_type='distance_matrix')
def testClusterAllocationSampleSimple1SquareEuclideanDistanceMatrixByCore(self):
    """! @brief K-Medoids on the SAMPLE_SIMPLE1 distance matrix with the squared Euclidean metric; CCORE flag is True. """
    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    euclidean_square = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    KmedoidsTestTemplates.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, euclidean_square, True,
        data_type='distance_matrix')
def testClusterAllocationSampleSimple1ChebyshevCore(self):
    """! @brief K-Medians length template on SAMPLE_SIMPLE1 with the Chebyshev metric; True is passed as the flag argument. """
    start_medians = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    chebyshev = distance_metric(type_metric.CHEBYSHEV)

    KmediansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_medians, expected_sizes, True, metric=chebyshev)
def testClusterAllocationSampleSimple1UserDefined(self):
    """! @brief K-Medians length template on SAMPLE_SIMPLE1 with a user-defined metric that wraps the Euclidean one; False is passed as the flag argument. """
    wrapped = distance_metric(type_metric.EUCLIDEAN)
    user_metric = distance_metric(type_metric.USER_DEFINED, func=wrapped)

    start_medians = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    KmediansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_medians, expected_sizes, False, metric=user_metric)
def testClusterAllocationSampleSimple1Minkowski(self):
    """! @brief K-Medians length template on SAMPLE_SIMPLE1 with the Minkowski metric of degree 2.0; False is passed as the flag argument. """
    start_medians = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    minkowski = distance_metric(type_metric.MINKOWSKI, degree=2.0)

    KmediansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_medians, expected_sizes, False, metric=minkowski)
def testClusterAllocationSampleSimple1Chebyshev(self):
    """! @brief K-Medians length template on SAMPLE_SIMPLE1 with the Chebyshev metric; False is passed as the flag argument. """
    start_medians = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    chebyshev = distance_metric(type_metric.CHEBYSHEV)

    KmediansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_medians, expected_sizes, False, metric=chebyshev)
def testClusterAllocationSampleSimple1Manhattan(self):
    """! @brief K-Medians length template on SAMPLE_SIMPLE1 with the Manhattan metric; False is passed as the flag argument. """
    start_medians = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    manhattan = distance_metric(type_metric.MANHATTAN)

    KmediansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_medians, expected_sizes, False, metric=manhattan)
def testClusterAllocationSampleSimple1EuclideanSquare(self):
    """! @brief K-Medians length template on SAMPLE_SIMPLE1 with the squared Euclidean metric; False is passed as the flag argument. """
    start_medians = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    euclidean_square = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    KmediansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_medians, expected_sizes, False, metric=euclidean_square)
def testPredictFivePointsUserMetric(self):
    """! @brief K-Medians prediction template on SAMPLE_SIMPLE3: five points, user-defined metric wrapping the Euclidean one. """
    user_metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))

    centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
    to_predict = [[0.3, 0.2], [4.1, 1.1], [3.9, 1.1], [2.1, 1.9], [2.1, 4.1]]
    expected_labels = [0, 1, 1, 2, 3]

    KmediansTestTemplates.templatePredict(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE3, centers, to_predict, expected_labels, False, metric=user_metric)
def testClusteringSampleSimple1Euclidean(self):
    """! @brief TTSAS test template on SAMPLE_SIMPLE1 with the Euclidean metric for two threshold pairs; the flag argument is True. """
    cases = (
        (1.0, 2.0, [5, 5]),
        (10.0, 20.0, [10]),
    )
    for first_value, second_value, expected_sizes in cases:
        # A fresh metric object is created for every run.
        ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, first_value, second_value, expected_sizes, True,
                              metric=distance_metric(type_metric.EUCLIDEAN))
def testClusteringSampleSimple1Manhattan(self):
    """! @brief BSAS test template on SAMPLE_SIMPLE1 with the Manhattan metric for thresholds 1.0 and 10.0; the ccore argument is True. """
    cases = (
        (1.0, [5, 5]),
        (10.0, [10]),
    )
    for threshold, expected_sizes in cases:
        # A fresh metric object is created for every run.
        bsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, threshold, expected_sizes, True,
                                      metric=distance_metric(type_metric.MANHATTAN))
def testClusterAllocationSample1NumpyArrayUserDefined(self):
    """! @brief K-Medians length template fed with SAMPLE_SIMPLE1 and initial centers as numpy arrays, using a user-defined metric wrapping the Euclidean one. """
    user_metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))

    points = numpy.array(read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1))
    start_medians = numpy.array([[3.7, 5.5], [6.7, 7.5]])
    expected_sizes = [5, 5]

    KmediansTestTemplates.templateLengthProcessData(points, start_medians, expected_sizes, False, metric=user_metric)
def testClusterAllocationSampleSimple1EuclideanSquareCore(self):
    """! @brief K-Medians length template on SAMPLE_SIMPLE1 with the squared Euclidean metric; True is passed as the flag argument. """
    start_medians = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    euclidean_square = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    KmediansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_medians, expected_sizes, True, metric=euclidean_square)
def testClusterAllocationSample2NumpyArrayUserDefined(self):
    """! @brief K-Medians length template fed with SAMPLE_SIMPLE2 and initial centers as numpy arrays, using a user-defined metric wrapping the squared Euclidean one. """
    user_metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN_SQUARE))

    points = numpy.array(read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2))
    start_medians = numpy.array([[3.5, 4.8], [6.9, 7], [7.5, 0.5]])
    expected_sizes = [10, 5, 8]

    KmediansTestTemplates.templateLengthProcessData(points, start_medians, expected_sizes, False, metric=user_metric)
def testClusterAllocationSampleSimple1UserDefinedCore(self):
    """! @brief K-Medians length template on SAMPLE_SIMPLE1 with a user-defined metric that wraps the Euclidean one; True is passed as the flag argument. """
    wrapped = distance_metric(type_metric.EUCLIDEAN)
    user_metric = distance_metric(type_metric.USER_DEFINED, func=wrapped)

    start_medians = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    KmediansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_medians, expected_sizes, True, metric=user_metric)
def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
    """!
    @brief Runs K-Medoids on a sample file and checks that the obtained clusters are consistent.
    @details An attempt passes when the amount of clusters equals the amount of medoids, all medoids are unique,
              every point is allocated and (optionally) cluster sizes match the expected ones. The template
              succeeds when at least one attempt passes.

    @param[in] path_to_file (string): Path to the sample that is loaded with read_sample().
    @param[in] initial_medoids (list): Indexes of initial medoids; ignored when 'initialize_medoids' is given.
    @param[in] expected_cluster_length (list): Expected cluster sizes (order does not matter), or None to skip
                the size comparison. The list is not modified.
    @param[in] metric (distance_metric): Metric for K-Medoids; squared Euclidean is used when None.
    @param[in] ccore_flag (bool): Value forwarded to K-Medoids as 'ccore'.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'data_type', 'input_type',
                'initialize_medoids', 'itermax').

    <b>Keyword Args:</b><br>
        - data_type (string): 'points' (default) or 'distance_matrix'; for the latter the distance matrix of the
           sample is passed to the algorithm.
        - input_type (string): 'list' (default) or 'numpy'; for the latter input data is converted to numpy.array.
        - initialize_medoids (uint): If given, that many initial medoids are generated by K-Means++ on every
           attempt and up to 10 attempts are made.
        - itermax (uint): Maximum number of iterations forwarded to K-Medoids (200 when omitted).

    """
    sample = read_sample(path_to_file)

    data_type = kwargs.get('data_type', 'points')
    input_type = kwargs.get('input_type', 'list')
    initialize_medoids = kwargs.get('initialize_medoids', None)
    itermax = kwargs.get('itermax', 200)

    if metric is None:
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    input_data = sample
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)

        if input_type == 'numpy':
            input_data = numpy.array(input_data)

    # Sort a copy once, outside the retry loop, so the caller's list is never mutated.
    expected_sizes = None
    if expected_cluster_length is not None:
        expected_sizes = sorted(expected_cluster_length)

    testing_result = False
    testing_attempts = 1
    if initialize_medoids is not None:  # in case center initializer randomization appears
        testing_attempts = 10

    for _ in range(testing_attempts):
        if initialize_medoids is not None:
            initial_medoids = kmeans_plusplus_initializer(
                sample, initialize_medoids).initialize(return_index=True)

        kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001, ccore=ccore_flag, metric=metric,
                                     data_type=data_type, itermax=itermax)
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        if itermax == 0:
            # Without iterations the algorithm must leave its initial state untouched.
            assertion.eq([], clusters)
            assertion.eq(medoids, initial_medoids)
            return

        if len(clusters) != len(medoids):
            continue

        if len(set(medoids)) != len(medoids):
            continue

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]
        if len(sample) != sum(obtained_cluster_sizes):
            continue

        if expected_sizes is not None and sorted(obtained_cluster_sizes) != expected_sizes:
            continue

        # One passing attempt is enough; further attempts cannot change the outcome.
        testing_result = True
        break

    assertion.true(testing_result)
def testClusterAllocationSampleSimple1ChebyshevDistanceMatrixByCore(self):
    """! @brief K-Medoids on the SAMPLE_SIMPLE1 distance matrix with the Chebyshev metric; CCORE flag is True. """
    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    chebyshev = distance_metric(type_metric.CHEBYSHEV)

    KmedoidsTestTemplates.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, chebyshev, True,
        data_type='distance_matrix')
def testClusterAllocationSampleSimple1Minkowski(self):
    """! @brief K-Medoids length template on SAMPLE_SIMPLE1 with the Minkowski metric of degree 2.0; CCORE flag is False. """
    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    minkowski = distance_metric(type_metric.MINKOWSKI, degree=2.0)

    kmedoids_test_template.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, minkowski, False)
def testPredictOnePointUserMetric(self):
    """! @brief K-Medians prediction template on SAMPLE_SIMPLE3: a single point, user-defined metric wrapping the Euclidean one. """
    user_metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))

    centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
    to_predict = [[0.3, 0.2]]
    expected_labels = [0]

    KmediansTestTemplates.templatePredict(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE3, centers, to_predict, expected_labels, False, metric=user_metric)
def testClusterAllocationSampleSimple1Gower(self):
    """! @brief K-Medoids length template on SAMPLE_SIMPLE1 with a Gower metric built from the same sample; CCORE flag is False. """
    sample_path = SIMPLE_SAMPLES.SAMPLE_SIMPLE1
    gower = distance_metric(type_metric.GOWER, data=read_sample(sample_path))

    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    kmedoids_test_template.templateLengthProcessWithMetric(
        sample_path, initial_medoids, expected_sizes, gower, False)
def testClusterAllocationSampleSimple1ChiSquare(self):
    """! @brief K-Means length template on SAMPLE_SIMPLE1 with the Chi-square metric; the ccore argument is False. """
    start_centers = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    chi_square = distance_metric(type_metric.CHI_SQUARE)

    KmeansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_centers, expected_sizes, False, metric=chi_square)
def testClusterAllocationSampleSimple1UserDefined(self):
    """! @brief K-Medoids length template on SAMPLE_SIMPLE1 with a user-defined metric that wraps the Euclidean one; CCORE flag is False. """
    wrapped = distance_metric(type_metric.EUCLIDEAN)
    user_metric = distance_metric(type_metric.USER_DEFINED, func=wrapped)

    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    kmedoids_test_template.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, user_metric, False)
def testClusterAllocationSampleSimple1Chebyshev(self):
    """! @brief K-Medoids length template on SAMPLE_SIMPLE1 with the Chebyshev metric; CCORE flag is False. """
    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    chebyshev = distance_metric(type_metric.CHEBYSHEV)

    kmedoids_test_template.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, chebyshev, False)
Y *sqrt((vec1[53]-vec2[53])**2) return 1-exp(sim)/(1+exp(sim)) #tranform data to list names_dt = dt.index.values dt = dt.values.tolist() # print(descriptors_similarity(dt[0],dt[1])) Seeds = [5,10,20,30,40,50,60,70,80,90,100,200,300,400,500,600,700,800,900,1000] for Nseeds in Seeds: print("----"+str(Nseeds)+"----") # ### metric = distance_metric(type_metric.USER_DEFINED, func= descriptors_similarity) initial_centers = kmeans_plusplus_initializer(dt, Nseeds).initialize() kmeans_instance = kmeans(dt, initial_centers , metric=metric, itermax = 50) # Run cluster analysis and obtain results. start = time.time() print("hello") kmeans_instance.process() end = time.time() print(end - start) # clusters = kmeans_instance.get_clusters() final_centers = kmeans_instance.get_centers() #names_dt[clusters[i]][j]
def testClusteringSampleSimple1Euclidean(self):
    """! @brief MBSAS test template on SAMPLE_SIMPLE1 with the Euclidean metric for the values 1.0 and 10.0 of the third argument; the flag argument is False. """
    cases = (
        (1.0, [5, 5]),
        (10.0, [10]),
    )
    for threshold, expected_sizes in cases:
        # A fresh metric object is created for every run.
        mbsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, threshold, expected_sizes, False,
                                       metric=distance_metric(type_metric.EUCLIDEAN))
def runkmeans(sample, clustnum):
    """Cluster the rows of `sample` with K-Means, reorder columns accordingly and report the error.

    Steps performed:
      1. K-Means (K-Means++ initial centers, Euclidean metric) over `sample`.
      2. Repeated sub-clustering of every cluster via `subcluster`, int(log10(len(sample))) times.
      3. Greedy re-ordering of the final cluster groups by the Euclidean distance between the
         last point of one group and the first point of a candidate group.
      4. Swapping the columns of the characteristic matrix (transposed `sample`) into that order.
      5. Computing `calcError` for the swapped and the original matrix; if the swapped error is
         below the module-level `minError`, the figure is saved to "Winner.png", the clusters are
         written to "clusters.txt", `minError` is updated and the figure is shown.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed source - confirm the
    extent of the `if len(newsubclustered) > 0` body and of the final `if swappederror < minError`
    body against the original file.

    :param sample: sequence of points; converted with np.array(sample, dtype=int).
    :param clustnum: number of initial centers requested from the K-Means++ initializer.
    """
    global minError
    initial_centers = kmeans_plusplus_initializer(sample, clustnum).initialize()

    # NOTE(review): this user-defined (mismatch count) metric is built but never used;
    # clustering below runs with the Euclidean metric.
    user_function = lambda point1, point2: np.count_nonzero(
        np.array(point1) != np.array(point2))
    metricUser = distance_metric(type_metric.USER_DEFINED, func=user_function)
    metric = distance_metric(type_metric.EUCLIDEAN)

    kmeans_instance = kmeans(sample, initial_centers, metric=metric)
    print("Centroids: ", kmeans_instance.get_centers())
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    print("Output Clusters", clusters)
    print("Centroids: ", kmeans_instance.get_centers())
    print("SSE: ", kmeans_instance.get_total_wce())

    origMulitDimen = np.array(sample, dtype=int)
    # Data read from text file is coordinates for row
    # Must be transposed in order to obtain proper
    # characteristic matrix
    numpyChar = np.transpose(origMulitDimen)

    # Column Number Tracking
    mockDataArr = list(range(len(sample)))
    # Column Position Dictionary
    mockDataPos = {position: position for position in range(len(sample))}

    # Clusters
    mockDataClustered = []
    # Clusters with Subclusters
    realmockdata = []
    origclustercount = len(clusters)

    # How many times subclustering will happen
    # Log 10 of Total Coord Points
    inception = int(math.log10(len(sample)))
    for num in range(inception):
        for part in clusters:
            # Obtaining Cluster
            newsubclustered = [origMulitDimen[col] for col in part]
            if len(newsubclustered) > 0:
                # Retrieving Subclusters
                neworder = subcluster(newsubclustered)
                # Translate indexes local to the cluster back to indexes of the sample
                for b in range(len(neworder)):
                    for f in range(len(neworder[b])):
                        neworder[b][f] = part[neworder[b][f]]
                realmockdata.extend(neworder)
                if num == inception - 1:
                    # Appending to Final Cluster Order
                    mockDataClustered.append(neworder)
        # Resetting in case of next subclustering
        clusters = realmockdata.copy()
        realmockdata = []

    # Tracking each Original Cluster
    supercluster = list(range(len(mockDataClustered)))

    # Superclustering based on edges
    # Current simplistic method is taking first
    # Cluster and iterating through, finding
    # which next cluster's left edge matches
    # best to the current clusters right edge
    for i in range(0, len(mockDataClustered) - 1):
        closest = supercluster[i + 1]
        didchange = closest
        closestidx = i + 1
        mindist = sys.maxsize
        # Iterating with both value and index
        for idx, k in enumerate(supercluster[i + 1:]):
            # Calculate Euclidean Distance between the last point of group i
            # and the first point of the candidate group
            curdist = distance.euclidean(
                sample[mockDataClustered[i][-1][-1]],
                sample[mockDataClustered[idx + i + 1][0][0]])
            if curdist < mindist:
                mindist = curdist
                closest = k
                closestidx = idx + i + 1
        if didchange == closest:
            print("Nothing Happened")
        else:
            # Swap
            temp = supercluster[i + 1]
            supercluster[i + 1] = closest
            supercluster[closestidx] = temp

    # Compiling Final Clusters
    supermockdata = [mockDataClustered[i] for i in supercluster]

    # Flattening Cluster Nested Arrays
    for k in supermockdata:
        for f in k:
            realmockdata.extend(f)
    print("L: ", realmockdata)
    print("M: ", mockDataArr, "\n")

    originalSave = np.copy(numpyChar)
    print("Characteristic Matrix (First Row is Column Numbers)")
    printNumpy = np.insert(numpyChar, 0, mockDataArr, 0)
    print(printNumpy, "\n")

    # Swapping Actual Characteristic Array
    for i in range(len(mockDataArr) - 1):
        print("I: " + str(i))
        print("RealMockData: ", realmockdata)
        print("Length: ", len(realmockdata))
        # Checking if current position matches with the ideal value that should be there
        target = mockDataPos[realmockdata[i]]
        if i != target:
            # Recording Swaps
            print("Index: " + str(i) + " Value: " + str(numpyChar[:, i]) +
                  " Swaps With -> " + "Index: " +
                  str(target) + " Value: " +
                  str(numpyChar[:, target]))
            # Swapping column number array and actual characteristic matrix columns
            temp = np.copy(numpyChar[:, i])
            realTemp = mockDataArr[i]
            mockDataArr[i] = mockDataArr[target]
            mockDataArr[target] = realTemp
            numpyChar[:, i] = numpyChar[:, target]
            numpyChar[:, target] = temp
            # Updating Positions
            mockDataPos[realmockdata[i]] = i
            mockDataPos[realTemp] = target

    print("\n\nColumn Positions After Swapping: ", mockDataArr)
    print(
        "\n\nFinal Swapped Characteristic Matrix (First Row is Column Numbers)"
    )
    printArray = np.insert(numpyChar, 0, np.array(mockDataArr), 0)
    print(printArray)

    # Calculating Error
    swappederror = calcError(numpyChar)
    defaulterror = calcError(originalSave)
    subclusts = sum(len(clust) for clust in mockDataClustered)

    # If error is below previously recorded low,
    # Show the visual and save it to image
    if swappederror < minError:
        fig, ax = plt.subplots(1, 2, figsize=(12, 8))
        fig.text(.5,
                 .05,
                 'Clustered Columns: ' + str(supermockdata),
                 ha='center',
                 wrap=True)
        # Context manager guarantees the file is flushed and closed.
        with open("clusters.txt", "w+") as f:
            f.write(str(supermockdata))
        fig.text(.5, .15, 'Original Error: ' + str(defaulterror), ha='center')
        fig.text(.5, .2, 'Clustered Error: ' + str(swappederror), ha='center')
        fig.suptitle('Sub-Clusters: ' + str(subclusts) +
                     ' Original Cluster Count: ' + str(origclustercount),
                     fontsize=20)
        # Show black and white representations of characteristic matrix
        ax[0].imshow(numpyChar, interpolation='nearest', cmap=plt.cm.Greys)
        ax[1].imshow(originalSave, interpolation='nearest', cmap=plt.cm.Greys)
        ax[0].title.set_text('Clustered Characteristic Matrix')
        ax[1].title.set_text('Original Charecteristic Matrix')
        minError = swappederror
        # Save New Low Error Run
        plt.savefig("Winner.png", dpi=300)
        plt.show()
def testClusteringSampleSimple1EuclideanSquare(self):
    """! @brief BSAS test template on SAMPLE_SIMPLE1 with the squared Euclidean metric for thresholds 1.0, 10.0 and 100.0; the ccore argument is True. """
    cases = (
        (1.0, [5, 5]),
        (10.0, [5, 5]),
        (100.0, [10]),
    )
    for threshold, expected_sizes in cases:
        # A fresh metric object is created for every run.
        bsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, threshold, expected_sizes, True,
                                      metric=distance_metric(type_metric.EUCLIDEAN_SQUARE))
def testPredictTwoPointsUserMetric(self):
    """! @brief K-Medoids prediction template on SAMPLE_SIMPLE3: two points, user-defined metric wrapping the Euclidean one. """
    user_metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))

    medoids = [4, 12, 25, 37]
    to_predict = [[0.3, 0.2], [2.1, 1.9]]
    expected_labels = [0, 2]

    kmedoids_test_template.templatePredict(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE3, medoids, to_predict, expected_labels, False, metric=user_metric)
def testClusteringSampleSimple1Chebyshev(self):
    """! @brief BSAS test template on SAMPLE_SIMPLE1 with the Chebyshev metric for thresholds 1.0 and 10.0; the ccore argument is True. """
    cases = (
        (1.0, [5, 5]),
        (10.0, [10]),
    )
    for threshold, expected_sizes in cases:
        # A fresh metric object is created for every run.
        bsas_test_template.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, threshold, expected_sizes, True,
                                      metric=distance_metric(type_metric.CHEBYSHEV))
def testPredictFivePointsUserMetric(self):
    """! @brief K-Medoids prediction template on SAMPLE_SIMPLE3: five points, user-defined metric wrapping the Euclidean one. """
    user_metric = distance_metric(type_metric.USER_DEFINED, func=distance_metric(type_metric.EUCLIDEAN))

    medoids = [4, 12, 25, 37]
    to_predict = [[0.3, 0.2], [4.1, 1.1], [3.9, 1.1], [2.1, 1.9], [2.1, 4.1]]
    expected_labels = [0, 3, 3, 2, 1]

    kmedoids_test_template.templatePredict(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE3, medoids, to_predict, expected_labels, False, metric=user_metric)
def testMndlClusterAllocationSampleSimple1MetricChiSquareByCore(self):
    """! @brief X-Means length template on SAMPLE_SIMPLE1 with MNDL splitting and the Chi-square metric; alpha, beta and random_state are fixed. """
    start_centers = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    chi_square = distance_metric(type_metric.CHI_SQUARE)

    XmeansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_centers, expected_sizes,
        splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, 20, True,
        metric=chi_square, alpha=0.1, beta=0.1, random_state=1000)
def test_initial_medoids_sample01_euclidean(self):
    """! @brief Medoid initialization template on SAMPLE_SIMPLE1 with the Euclidean metric; [4, 8] is passed as the expected result argument. """
    expected_medoids = [4, 8]
    euclidean = distance_metric(type_metric.EUCLIDEAN)

    kmedoids_test_template.initialize_medoids(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, expected_medoids, False, metric=euclidean)
def testClusterAllocationSampleSimple1ManhattanCore(self):
    """! @brief K-Medians length template on SAMPLE_SIMPLE1 with the Manhattan metric; True is passed as the flag argument. """
    start_medians = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    manhattan = distance_metric(type_metric.MANHATTAN)

    KmediansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_medians, expected_sizes, True, metric=manhattan)
def test_initial_medoids_sample01_euclidean_square_matrix(self):
    """! @brief Medoid initialization template on SAMPLE_SIMPLE1 with the squared Euclidean metric and data_type='distance_matrix'. """
    expected_medoids = [4, 8]
    euclidean_square = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    kmedoids_test_template.initialize_medoids(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, expected_medoids, False,
        metric=euclidean_square, data_type='distance_matrix')
def testClusterAllocationSampleSimple1MinkowskiCore(self):
    """! @brief K-Medians length template on SAMPLE_SIMPLE1 with the Minkowski metric of degree 2.0; True is passed as the flag argument. """
    start_medians = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    minkowski = distance_metric(type_metric.MINKOWSKI, degree=2.0)

    KmediansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_medians, expected_sizes, True, metric=minkowski)
def testBuildGowerDistanceFromMetricWithData(self):
    """! @brief A wrapper instance built from a Gower metric configured with data must return 0.5 for the checked pair of points. """
    reference_data = [[-3.0, -3.0], [-4.0, -3.0], [-4.5, -3.0], [-5.0, -3.0]]
    gower = distance_metric(type_metric.GOWER, data=reference_data)

    ccore_metric = metric_wrapper.create_instance(gower)
    obtained = ccore_metric([-3.0, -3.0], [-5.0, -3.0])

    self.assertEqual(0.5, obtained)
def testClusterAllocationSampleSimple1EuclideanByCore(self):
    """! @brief K-Medoids length template on SAMPLE_SIMPLE1 with the Euclidean metric; CCORE flag is True. """
    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    euclidean = distance_metric(type_metric.EUCLIDEAN)

    KmedoidsTestTemplates.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, euclidean, True)
def testClusteringSampleSimple1Manhattan(self):
    """! @brief TTSAS test template on SAMPLE_SIMPLE1 with the Manhattan metric for two threshold pairs; the flag argument is False. """
    cases = (
        (1.0, 2.0, [5, 5]),
        (10.0, 20.0, [10]),
    )
    for first_value, second_value, expected_sizes in cases:
        # A fresh metric object is created for every run.
        ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, first_value, second_value, expected_sizes, False,
                              metric=distance_metric(type_metric.MANHATTAN))
def testClusterAllocationSampleSimple1ManhattanDistanceMatrixByCore(self):
    """! @brief K-Medoids on the SAMPLE_SIMPLE1 distance matrix with the Manhattan metric; CCORE flag is True. """
    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    manhattan = distance_metric(type_metric.MANHATTAN)

    KmedoidsTestTemplates.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, manhattan, True,
        data_type='distance_matrix')
def testMndlClusterAllocationSampleSimple1MetricMinkowski4ByCore(self):
    """! @brief X-Means length template on SAMPLE_SIMPLE1 with MNDL splitting and the Minkowski metric of degree 4. """
    start_centers = [[3.7, 5.5], [6.7, 7.5]]
    expected_sizes = [5, 5]
    minkowski = distance_metric(type_metric.MINKOWSKI, degree=4)

    XmeansTestTemplates.templateLengthProcessData(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, start_centers, expected_sizes,
        splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, 20, True, metric=minkowski)
def testClusterAllocationSampleSimple1MinkowskiDistanceMatrixByCore(self):
    """! @brief K-Medoids on the SAMPLE_SIMPLE1 distance matrix with the Minkowski metric of degree 2.0; CCORE flag is True. """
    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    minkowski = distance_metric(type_metric.MINKOWSKI, degree=2.0)

    KmedoidsTestTemplates.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, minkowski, True,
        data_type='distance_matrix')
def testClusterAllocationSampleSimple1Manhattan(self):
    """! @brief K-Medoids length template on SAMPLE_SIMPLE1 with the Manhattan metric; CCORE flag is False. """
    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    manhattan = distance_metric(type_metric.MANHATTAN)

    kmedoids_test_template.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, manhattan, False)
def test_initial_medoids_sample01_euclidean_manhattan_matrix(self):
    """! @brief Medoid initialization template on SAMPLE_SIMPLE1 with the Manhattan metric and data_type='distance_matrix'. """
    expected_medoids = [4, 8]
    manhattan = distance_metric(type_metric.MANHATTAN)

    kmedoids_test_template.initialize_medoids(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, expected_medoids, False,
        metric=manhattan, data_type='distance_matrix')
def testBuildGowerDistanceFromMetricWithNumpyMaxRange(self):
    """! @brief A wrapper instance built from a Gower metric configured with a numpy max_range must return 0.5 for the checked pair of points. """
    max_range = numpy.array([2.0, 0.0])
    gower = distance_metric(type_metric.GOWER, max_range=max_range)

    ccore_metric = metric_wrapper.create_instance(gower)
    obtained = ccore_metric([-3.0, -3.0], [-5.0, -3.0])

    self.assertEqual(0.5, obtained)
def testClusteringSampleSimple1EuclideanSquare(self):
    """! @brief TTSAS test template on SAMPLE_SIMPLE1 with the squared Euclidean metric for three threshold pairs; the flag argument is False. """
    cases = (
        (1.0, 2.0, [5, 5]),
        (10.0, 20.0, [5, 5]),
        (100.0, 200.0, [10]),
    )
    for first_value, second_value, expected_sizes in cases:
        # A fresh metric object is created for every run.
        ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, first_value, second_value, expected_sizes, False,
                              metric=distance_metric(type_metric.EUCLIDEAN_SQUARE))
def test_initial_medoids_sample01_euclidean_chebyshev_matrix(self):
    """! @brief Medoid initialization template on SAMPLE_SIMPLE1 with the Chebyshev metric and data_type='distance_matrix'. """
    expected_medoids = [4, 8]
    chebyshev = distance_metric(type_metric.CHEBYSHEV)

    kmedoids_test_template.initialize_medoids(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, expected_medoids, False,
        metric=chebyshev, data_type='distance_matrix')
def testClusteringSampleSimple1Chebyshev(self):
    """! @brief TTSAS test template on SAMPLE_SIMPLE1 with the Chebyshev metric for two threshold pairs; the flag argument is False. """
    cases = (
        (1.0, 2.0, [5, 5]),
        (10.0, 20.0, [10]),
    )
    for first_value, second_value, expected_sizes in cases:
        # A fresh metric object is created for every run.
        ttsas_test.clustering(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, first_value, second_value, expected_sizes, False,
                              metric=distance_metric(type_metric.CHEBYSHEV))
def testClusterAllocationSampleSimple1SquareEuclideanDistanceMatrix(self):
    """! @brief K-Medoids on the SAMPLE_SIMPLE1 distance matrix with the squared Euclidean metric; CCORE flag is False. """
    initial_medoids = [2, 9]
    expected_sizes = [5, 5]
    euclidean_square = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    kmedoids_test_template.templateLengthProcessWithMetric(
        SIMPLE_SAMPLES.SAMPLE_SIMPLE1, initial_medoids, expected_sizes, euclidean_square, False,
        data_type='distance_matrix')