def __init__(self,
             data,
             number_clusters,
             branching_factor = 5,
             max_node_entries = 5,
             initial_diameter = 0.1,
             type_measurement = measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
             entry_size_limit = 200,
             diameter_multiplier = 1.5,
             ccore = True):
    """!
    @brief Creates an instance of the BIRCH clustering algorithm.

    @param[in] data (list): Input data as a list of points (objects); every point is a list or tuple.
    @param[in] number_clusters (uint): Number of clusters to allocate.
    @param[in] branching_factor (uint): Maximum number of successors that a non-leaf CF-Tree node may contain.
    @param[in] max_node_entries (uint): Maximum number of entries that a leaf CF-Tree node may contain.
    @param[in] initial_diameter (double): Diameter the CF-Tree is initially built with; it can be increased
                when entry_size_limit is exceeded.
    @param[in] type_measurement (measurement_type): Measurement type used for distance calculation.
    @param[in] entry_size_limit (uint): Maximum number of entries that the CF-Tree may store; when it is
                exceeded during construction the diameter is increased and the tree is rebuilt.
    @param[in] diameter_multiplier (double): Factor applied to the diameter when entry_size_limit is exceeded.
    @param[in] ccore (bool): If True then CCORE (C++ solution) is used for solving the problem.

    @remark Only `data` and `number_clusters` are mandatory; every other argument has a default value.

    """
    # Input data and the settings of the algorithm.
    self.__pointer_data = data
    self.__number_clusters = number_clusters
    self.__measurement_type = type_measurement
    self.__entry_size_limit = entry_size_limit
    self.__diameter_multiplier = diameter_multiplier
    self.__ccore = ccore

    # Nothing has been computed yet: no features and empty results.
    self.__features = None
    self.__clusters = []
    self.__noise = []

    # CF-Tree that is built with the initial diameter.
    self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement)
def __init__(self,
             data,
             number_clusters,
             branching_factor = 5,
             max_node_entries = 5,
             initial_diameter = 0.1,
             type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE,
             entry_size_limit = 200,
             ccore = False):
    """!
    @brief Creates an instance of the BIRCH clustering algorithm.

    @param[in] data (list): Input data as a list of points (objects); every point is a list or tuple.
    @param[in] number_clusters (uint): Number of clusters to allocate.
    @param[in] branching_factor (uint): Maximum number of successors that a non-leaf CF-Tree node may contain.
    @param[in] max_node_entries (uint): Maximum number of entries that a leaf CF-Tree node may contain.
    @param[in] initial_diameter (double): Diameter the CF-Tree is initially built with; it can be increased
                when entry_size_limit is exceeded.
    @param[in] type_measurement (measurement_type): Measurement type used for distance calculation.
    @param[in] entry_size_limit (uint): Maximum number of entries that the CF-Tree may store; when it is
                exceeded during construction the diameter is increased and the tree is rebuilt.
    @param[in] ccore (bool): If True then CCORE (C++ solution) is used for solving the problem.

    @remark Only `data` and `number_clusters` are mandatory; every other argument has a default value.

    Example:
    @code
        birch_instance1 = birch(sample1, 2);    # two clusters should be allocated
        birch_instance2 = birch(sample2, 5);    # five clusters should be allocated

        # three clusters should be allocated; branching factor is 5, each leaf node can have
        # maximum 5 entries and the initial diameter is 0.05.
        birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
    @endcode

    """
    # Input data and the settings of the algorithm.
    self.__pointer_data = data
    self.__number_clusters = number_clusters
    self.__measurement_type = type_measurement
    self.__entry_size_limit = entry_size_limit
    self.__ccore = ccore

    # CF-Tree that is built with the initial diameter.
    self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement)
def __rebuild_tree(self, index_point):
    """!
    @brief Rebuilds the CF-Tree with an increased diameter when the entry size limit is exceeded.
    @details A new tree is built from the points of the input data with indexes from 0 to `index_point`
              (inclusive). While the new tree contains more entries than the entry size limit, the
              diameter is multiplied by the diameter multiplier and the tree is built again.

    @param[in] index_point (uint): Index of the last point (inclusive) that is inserted into the new tree.

    @return (cftree) Rebuilt tree that encodes the points up to the specified one.

    """
    source_tree = self.__tree
    diameter = source_tree.threshold * self.__diameter_multiplier

    while True:
        # A zero diameter cannot grow by multiplication, therefore start from 1.0 in this case.
        if diameter == 0.0:
            diameter = 1.0

        # Build a fresh tree with the updated diameter.
        tree = cftree(source_tree.branch_factor, source_tree.max_entries, diameter, source_tree.type_measurement)

        # A dedicated loop variable is used so that the 'index_point' argument is never overwritten
        # and every attempt inserts exactly the same range of points.
        for position in range(index_point + 1):
            tree.insert_cluster([self.__pointer_data[position]])

        if tree.amount_entries <= self.__entry_size_limit:
            # Re-build is successful.
            return tree

        # NOTE(review): the loop finishes only if the diameter really grows, i.e. presumably the
        # multiplier has to be greater than 1.0 - confirm that it is validated by the caller.
        diameter *= self.__diameter_multiplier
def __init__(self,
             data,
             number_clusters,
             branching_factor=5,
             max_node_entries=5,
             initial_diameter=0.1,
             type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
             entry_size_limit=500,
             diameter_multiplier=1.5,
             ccore=True):
    """!
    @brief Creates an instance of the BIRCH clustering algorithm.
    @details The arguments are stored first and then checked by `__verify_arguments()`; the CF-Tree is
              created only after the verification.

    @param[in] data (list): Input data that is stored for processing.
    @param[in] number_clusters (uint): Number of clusters to allocate.
    @param[in] branching_factor (uint): Branching factor that is passed to the CF-Tree.
    @param[in] max_node_entries (uint): Maximum amount of node entries that is passed to the CF-Tree.
    @param[in] initial_diameter (double): Diameter that the CF-Tree is created with.
    @param[in] type_measurement (measurement_type): Measurement type that is stored and passed to the CF-Tree.
    @param[in] entry_size_limit (uint): Limit of the CF-Tree entries that is stored by the instance.
    @param[in] diameter_multiplier (double): Multiplier of the diameter that is stored by the instance.
    @param[in] ccore (bool): Flag that is stored by the instance (presumably enables the C++ implementation).

    """
    # Input data and the settings of the algorithm.
    self.__pointer_data = data
    self.__number_clusters = number_clusters
    self.__measurement_type = type_measurement
    self.__entry_size_limit = entry_size_limit
    self.__diameter_multiplier = diameter_multiplier
    self.__ccore = ccore

    # Verification runs before anything else is created.
    self.__verify_arguments()

    # Nothing has been computed yet: no features and empty results.
    self.__features = None
    self.__clusters = []
    self.__noise = []

    self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement)
def templateCfTreeTotalNumberPoints(self, number_points, dimension, branching_factor, number_entries, diameter):
    """!
    @brief Checks that the CF-Tree keeps track of every inserted point.
    @details Points are inserted one by one and after each insertion the amount of points that is
              reported by leaf features must be equal to the amount of inserted points. At the end
              the same amount must be reported by the root feature.

    @param[in] number_points (uint): Amount of points that are inserted into the tree.
    @param[in] dimension (uint): Dimension of each generated point.
    @param[in] branching_factor (uint): Branching factor of the tree.
    @param[in] number_entries (uint): Maximum amount of entries of a node of the tree.
    @param[in] diameter (double): Diameter (threshold) of the tree.

    """
    tree = cftree(branching_factor, number_entries, diameter)

    def count_leaf_points():
        # Total amount of points according to the features of the leaf nodes.
        return sum(leaf.feature.number_points for leaf in tree.leafes)

    # Stays equal to the requested amount when there is nothing to insert.
    counted_points = number_points

    for index_point in range(0, number_points):
        # All coordinates of the point are equal to its index.
        tree.insert_cluster([[index_point] * dimension])

        counted_points = count_leaf_points()
        assert (index_point + 1) == counted_points

    number_leaf_points = count_leaf_points()

    assert counted_points == tree.root.feature.number_points

    # Output both values to simplify analysis of the failure below.
    if counted_points != number_leaf_points:
        print(counted_points, number_leaf_points)

    assert counted_points == number_leaf_points
def __rebuild_tree(self, index_point):
    """!
    @brief Rebuilds the CF-Tree with an increased diameter when the maximum number of entries is exceeded.
    @details A new tree is built from the points of the input data with indexes from 0 to `index_point`
              (inclusive). While the new tree contains more entries than the entry size limit, the
              diameter is increased by factor 1.5 and the tree is built again.

    @param[in] index_point (uint): Index of the last point (inclusive) that is inserted into the new tree.

    @return (cftree) Rebuilt tree that encodes the points up to the specified one.

    """
    growth_factor = 1.5

    source_tree = self.__tree
    diameter = source_tree.threshold * growth_factor

    while True:
        # A zero diameter cannot grow by multiplication, therefore start from 1.0 in this case.
        if diameter == 0.0:
            diameter = 1.0

        # Build a fresh tree with the updated diameter.
        tree = cftree(source_tree.branch_factor, source_tree.max_entries, diameter, source_tree.type_measurement)

        # A dedicated loop variable is used so that the 'index_point' argument is never overwritten
        # and every attempt inserts exactly the same range of points.
        for position in range(index_point + 1):
            tree.insert_cluster([self.__pointer_data[position]], self.__hadoop_address)

        if tree.amount_entries <= self.__entry_size_limit:
            # Re-build is successful.
            return tree

        diameter *= growth_factor
def __rebuild_tree(self, index_point):
    """!
    @brief Rebuilds the CF-Tree with an increased diameter when the maximum number of entries is exceeded.
    @details A new tree is built from the points of the input data with indexes from 0 to `index_point`
              (inclusive). While the new tree contains more entries than the entry size limit, the
              diameter is multiplied by the diameter multiplier and the tree is built again.

    @param[in] index_point (uint): Index of the last point (inclusive) that is inserted into the new tree.

    @return (cftree) Rebuilt tree that encodes the points up to the specified one.

    """
    source_tree = self.__tree
    diameter = source_tree.threshold * self.__diameter_multiplier

    while True:
        # A zero diameter cannot grow by multiplication, therefore start from 1.0 in this case.
        if diameter == 0.0:
            diameter = 1.0

        # Build a fresh tree with the updated diameter.
        tree = cftree(source_tree.branch_factor, source_tree.max_entries, diameter, source_tree.type_measurement)

        # A dedicated loop variable is used so that the 'index_point' argument is never overwritten
        # and every attempt inserts exactly the same range of points.
        for position in range(index_point + 1):
            tree.insert_cluster([self.__pointer_data[position]])

        if tree.amount_entries <= self.__entry_size_limit:
            # Re-build is successful.
            return tree

        # NOTE(review): the loop finishes only if the diameter really grows, i.e. presumably the
        # multiplier has to be greater than 1.0 - confirm that it is validated by the caller.
        diameter *= self.__diameter_multiplier
def templateTreeHeight(self, number_points, branching_factor):
    """!
    @brief Checks the lower bound of the height of a CF-Tree filled with one-dimensional points.
    @details The tree is created with one entry per node and diameter 0.1; its height must not be less
              than floor(log(number_points) by base branching_factor).

    @param[in] number_points (uint): Amount of points 0, 1, 2, ... that are inserted into the tree.
    @param[in] branching_factor (uint): Branching factor of the tree.

    """
    tree = cftree(branching_factor, 1, 0.1)

    for coordinate in range(number_points):
        # Every point is inserted as a cluster that consists of a single point.
        tree.insert_cluster([[coordinate]])

    minimum_height = math.floor(math.log(number_points, branching_factor))
    assert minimum_height <= tree.height
def templateTreeHeight(self, number_points, branching_factor):
    """!
    @brief Checks the lower bound of the height of a CF-Tree filled with one-dimensional points.
    @details The tree is created with one entry per node and diameter 0.1; its height must not be less
              than floor(log(number_points) by base branching_factor).

    @param[in] number_points (uint): Amount of points 0, 1, 2, ... that are inserted into the tree.
    @param[in] branching_factor (uint): Branching factor of the tree.

    """
    tree = cftree(branching_factor, 1, 0.1)

    for coordinate in range(number_points):
        tree.insert_point([coordinate])

    minimum_height = math.floor(math.log(number_points, branching_factor))
    assert minimum_height <= tree.height
def testCfTreeCreationWithoutMerging(self):
    """!
    @brief Inserts ten random clusters into a CF-Tree with zero diameter and checks the tree shape.
    @details Ten entries and ten leaf nodes are expected (one per cluster) and the height of the tree
              must be at least 4.

    """
    # Cluster 'j' consists of ten random points from the square [j, j + 1) x [j, j + 1).
    clusters = []
    for j in range(10):
        clusters.append([[random() + j, random() + j] for _ in range(10)])

    tree = cftree(2, 1, 0.0)
    for cluster in clusters:
        tree.insert_cluster(cluster)

    assert tree.height >= 4
    assert tree.amount_entries == 10
    assert len(tree.leafes) == 10
def templateCorrectEntryDiameter(self, sample_path, branching_factor, diameter):
    """!
    @brief Checks that no entry of a leaf node exceeds the diameter of the CF-Tree.

    @param[in] sample_path (string): Path to the sample whose points are inserted into the tree.
    @param[in] branching_factor (uint): Branching factor of the tree.
    @param[in] diameter (double): Diameter of the tree that limits the diameter of every leaf entry.

    """
    tree = cftree(branching_factor, 100, diameter)

    for point in read_sample(sample_path):
        tree.insert_point(point)

    for leaf in tree.leafes:
        for leaf_entry in leaf.entries:
            self.assertLessEqual(leaf_entry.get_diameter(), diameter)
def testCfTreeCreationWithoutMerging(self):
    """!
    @brief Inserts one hundred random points into a CF-Tree with zero diameter and checks the tree shape.
    @details One hundred entries and one hundred leaf nodes are expected (one per point) and the height
              of the tree must be at least 4.

    """
    # Group 'j' consists of ten random points from the square [j, j + 1) x [j, j + 1).
    clusters = []
    for j in range(10):
        clusters.append([[random() + j, random() + j] for _ in range(10)])

    tree = cftree(2, 1, 0.0)
    for group in clusters:
        for point in group:
            tree.insert_point(point)

    assert tree.height >= 4
    self.assertEqual(tree.amount_entries, 100)
    self.assertEqual(len(tree.leafes), 100)
def testCfTreeCreationWithOneEntry(self):
    """!
    @brief Checks the state of a CF-Tree after insertion of a single entry.
    @details The tree must consist of one node on one level with one entry, the feature of the root
              must be equal to the inserted entry and the root must not have a parent.

    """
    tree = cftree(2, 1, 1.0)
    entry = cfentry(5, [0.0, 0.1], 0.05)

    tree.insert(entry)

    assert tree.amount_nodes == 1
    assert tree.height == 1
    assert tree.amount_entries == 1

    assert entry == tree.root.feature

    # Identity check: a parent object with a custom '__eq__' must not be able to satisfy the assertion.
    assert tree.root.parent is None
def testCfTreeCreationWithOneEntry(self):
    """!
    @brief Checks the state of a CF-Tree after insertion of a single entry.
    @details The tree must consist of one node on one level with one entry, the feature of the root
              must be equal to the inserted entry and the root must not have a parent.

    """
    tree = cftree(2, 1, 1.0)
    entry = cfentry(5, [0.0, 0.1], 0.05)

    tree.insert(entry)

    assert tree.amount_nodes == 1
    assert tree.height == 1
    assert tree.amount_entries == 1

    assert entry == tree.root.feature

    # Identity check: a parent object with a custom '__eq__' must not be able to satisfy the assertion.
    assert tree.root.parent is None
def templateCfTreeLeafIntegrity(self, number_clusters, branching_factor, max_entries, threshold):
    """!
    @brief Inserts random clusters into a CF-Tree and checks after every insertion that entries are
            reachable through the leaf nodes.

    @param[in] number_clusters (uint): Amount of random clusters (ten points each) that are inserted.
    @param[in] branching_factor (uint): Branching factor of the tree.
    @param[in] max_entries (uint): Maximum amount of entries of a node of the tree.
    @param[in] threshold (double): Diameter (threshold) of the tree.

    """
    # Cluster 'j' consists of ten random points from the square [j, j + 1) x [j, j + 1).
    clusters = [[[random() + j, random() + j] for _ in range(10)] for j in range(number_clusters)]

    tree = cftree(branching_factor, max_entries, threshold)

    for cluster in clusters:
        tree.insert_cluster(cluster)

        # The check used to compare every entry with itself, which is true for any entry, so it
        # effectively verified only that at least one entry exists in the leaves. This is now stated
        # explicitly instead of being hidden behind a tautology.
        # NOTE(review): the intent was presumably to find the entry of the inserted cluster among the
        # leaf entries - confirm the expected condition and strengthen the assertion.
        entry_found = any(True for leaf in tree.leafes for _ in leaf.entries)

        assert entry_found
def templateLevelNodeObtaining(self, number_points, branching_factor):
    """!
    @brief Checks that nodes obtained level by level cover all nodes of the CF-Tree.
    @details The sum of the amounts of nodes returned by `get_level_nodes()` for each level must be
              equal to the total amount of nodes that is reported by the tree.

    @param[in] number_points (uint): Amount of one-dimensional points 0, 1, 2, ... that are inserted.
    @param[in] branching_factor (uint): Branching factor of the tree.

    """
    tree = cftree(branching_factor, 1, 0.1)

    for coordinate in range(number_points):
        # Every point is inserted as a cluster that consists of a single point.
        tree.insert_cluster([[coordinate]])

    nodes_on_levels = sum(len(tree.get_level_nodes(level)) for level in range(tree.height))

    assert tree.amount_nodes == nodes_on_levels
def templateLevelNodeObtaining(self, number_points, branching_factor):
    """!
    @brief Checks that nodes obtained level by level cover all nodes of the CF-Tree.
    @details The sum of the amounts of nodes returned by `get_level_nodes()` for each level must be
              equal to the total amount of nodes that is reported by the tree.

    @param[in] number_points (uint): Amount of one-dimensional points 0, 1, 2, ... that are inserted.
    @param[in] branching_factor (uint): Branching factor of the tree.

    """
    tree = cftree(branching_factor, 1, 0.1)

    for coordinate in range(number_points):
        # Every point is inserted as a cluster that consists of a single point.
        tree.insert_cluster([[coordinate]])

    nodes_on_levels = sum(len(tree.get_level_nodes(level)) for level in range(tree.height))

    assert tree.amount_nodes == nodes_on_levels
def templateLeafNodeAndEntriesAmount(self, number_points, branching_factor):
    """!
    @brief Checks that every inserted point produces exactly one entry and one leaf node.
    @details The tree is created with one entry per node and diameter 0.1. The amounts of entries and
              leaf nodes are checked after each insertion and once more at the end.

    @param[in] number_points (uint): Amount of one-dimensional points 0, 1, 2, ... that are inserted.
    @param[in] branching_factor (uint): Branching factor of the tree.

    """
    tree = cftree(branching_factor, 1, 0.1)

    for inserted_amount, coordinate in enumerate(range(number_points), start=1):
        # Every point is inserted as a cluster that consists of a single point.
        tree.insert_cluster([[coordinate]])

        assert inserted_amount == tree.amount_entries
        assert inserted_amount == len(tree.leafes)

    assert number_points == tree.amount_entries
    assert number_points == len(tree.leafes)
def templateLeafNodeAndEntriesAmount(self, number_points, branching_factor):
    """!
    @brief Checks that every inserted point produces exactly one entry and one leaf node.
    @details The tree is created with one entry per node and diameter 0.1. The amounts of entries and
              leaf nodes are checked after each insertion and once more at the end.

    @param[in] number_points (uint): Amount of one-dimensional points 0, 1, 2, ... that are inserted.
    @param[in] branching_factor (uint): Branching factor of the tree.

    """
    tree = cftree(branching_factor, 1, 0.1)

    for inserted_amount, coordinate in enumerate(range(number_points), start=1):
        tree.insert_point([coordinate])

        assert inserted_amount == tree.amount_entries
        assert inserted_amount == len(tree.leafes)

    assert number_points == tree.amount_entries
    assert number_points == len(tree.leafes)
def testCfTreeInserionOneLeafThreeEntries(self):
    """!
    @brief Inserts three clusters into a CF-Tree and checks that their entries share the nearest leaf.
    @details The tree is created with branching factor 3, four entries per node and zero diameter.
              The nearest leaf found for the entry of each cluster must be the same node.

    """
    clusters = [
        [[0.1, 0.1], [0.1, 0.2], [0.2, 0.1], [0.2, 0.2]],
        [[0.4, 0.4], [0.4, 0.5], [0.5, 0.4], [0.5, 0.5]],
        [[0.9, 0.9], [0.9, 1.0], [1.0, 0.9], [1.0, 1.0]],
    ]

    tree = cftree(3, 4, 0.0)
    for cluster in clusters:
        tree.insert_cluster(cluster)

    # Clustering features that describe the inserted clusters.
    entries = [cfentry(len(cluster), linear_sum(cluster), square_sum(cluster)) for cluster in clusters]

    # Compare neighbouring pairs: (first, second) and (second, third).
    for left_entry, right_entry in zip(entries, entries[1:]):
        assert tree.find_nearest_leaf(left_entry) == tree.find_nearest_leaf(right_entry)
def testCfTreeEntryAbsorbing(self):
    """!
    @brief Checks that a CF-Tree with a huge diameter absorbs every inserted entry into a single one.
    @details Ten entries built from random clusters are inserted into the tree with diameter 10000.0.
              After each insertion the tree must still consist of one node with one entry, the root
              must not have a parent and the root feature must be equal to the sum of inserted entries.

    """
    tree = cftree(2, 1, 10000.0)

    # Accumulates all inserted entries to obtain the expected feature of the root.
    absorbing_entry = cfentry(0, [0.0, 0.0], 0.0)

    for offset in range(10):
        cluster = [[random() + offset, random() + offset] for _ in range(10)]
        entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster))

        absorbing_entry += entry

        tree.insert(entry)

        assert tree.amount_entries == 1
        assert tree.amount_nodes == 1
        assert tree.height == 1

        # Identity check: a parent object with a custom '__eq__' must not be able to satisfy the assertion.
        assert tree.root.parent is None

        assert absorbing_entry == tree.root.feature
def testCfTreeEntryAbsorbing(self):
    """!
    @brief Checks that a CF-Tree with a huge diameter absorbs every inserted entry into a single one.
    @details Ten entries built from random clusters are inserted into the tree with diameter 10000.0.
              After each insertion the tree must still consist of one node with one entry, the root
              must not have a parent and the root feature must be equal to the sum of inserted entries.

    """
    tree = cftree(2, 1, 10000.0)

    # Accumulates all inserted entries to obtain the expected feature of the root.
    absorbing_entry = cfentry(0, [0.0, 0.0], 0.0)

    for offset in range(10):
        cluster = [[random() + offset, random() + offset] for _ in range(10)]
        entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster))

        absorbing_entry += entry

        tree.insert(entry)

        assert tree.amount_entries == 1
        assert tree.amount_nodes == 1
        assert tree.height == 1

        # Identity check: a parent object with a custom '__eq__' must not be able to satisfy the assertion.
        assert tree.root.parent is None

        assert absorbing_entry == tree.root.feature
def __init__(self,
             data,
             number_clusters,
             branching_factor=50,
             max_node_entries=200,
             diameter=0.5,
             type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
             entry_size_limit=500,
             diameter_multiplier=1.5,
             ccore=True):
    """!
    @brief Creates an instance of the BIRCH clustering algorithm.
    @details The arguments are stored first and then checked by `__verify_arguments()`; the CF-Tree is
              created only after the verification.

    @param[in] data (list): Input data as a list of points (objects) where each point is represented
                by a list of coordinates.
    @param[in] number_clusters (uint): Amount of clusters to allocate.
    @param[in] branching_factor (uint): Maximum number of successors that a non-leaf CF-Tree node may contain.
    @param[in] max_node_entries (uint): Maximum number of entries that a leaf CF-Tree node may contain.
    @param[in] diameter (double): CF-entry diameter used for the CF-Tree construction; it might be increased
                when 'entry_size_limit' is exceeded.
    @param[in] type_measurement (measurement_type): Measurement type used for distance calculation.
    @param[in] entry_size_limit (uint): Maximum number of entries that the CF-Tree may store; when it is
                exceeded during construction the 'diameter' is increased and the tree is rebuilt.
    @param[in] diameter_multiplier (double): Factor applied to the diameter when 'entry_size_limit' is exceeded.
    @param[in] ccore (bool): If True then the C++ part of the library is used for processing.

    """
    # Input data and the settings of the algorithm.
    self.__pointer_data = data
    self.__number_clusters = number_clusters
    self.__measurement_type = type_measurement
    self.__entry_size_limit = entry_size_limit
    self.__diameter_multiplier = diameter_multiplier
    self.__ccore = ccore

    # Verification runs before anything else is created.
    self.__verify_arguments()

    # Nothing has been computed yet: no features and empty results.
    self.__features = None
    self.__clusters = []
    self.__cf_clusters = []

    self.__tree = cftree(branching_factor, max_node_entries, diameter, type_measurement)
def testGetNearestEntry(self):
    """!
    @brief Checks that the nearest entry of a leaf and the index of the nearest entry are consistent.
    @details The parameters of the created tree are checked first. Then the tree is filled with the
              points of the sample SAMPLE_SIMPLE1, the nearest leaf is found for a probe entry and
              the entry returned by `get_nearest_entry()` must be equal to the entry that is located
              at the index returned by `get_nearest_index_entry()`.

    """
    metric = measurement_type.CENTROID_EUCLIDEAN_DISTANCE

    points = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)
    tree = cftree(10, 100, 0.2, metric)

    # The tree must expose the parameters it has been created with.
    self.assertEqual(10, tree.branch_factor)
    self.assertEqual(100, tree.max_entries)
    self.assertEqual(0.2, tree.threshold)
    self.assertEqual(metric, tree.type_measurement)

    for point in points:
        tree.insert_point(point)

    # Probe entry that is searched in the tree.
    probe_cluster = [[0.1, 0.1], [0.2, 0.2]]
    probe_entry = cfentry(len(probe_cluster), linear_sum(probe_cluster), square_sum(probe_cluster))

    leaf = tree.find_nearest_leaf(probe_entry)
    nearest_entry = leaf.get_nearest_entry(probe_entry, metric)
    nearest_index = leaf.get_nearest_index_entry(probe_entry, metric)

    self.assertEqual(leaf.entries[nearest_index], nearest_entry)
def __init__(self,
             data,
             number_clusters,
             branching_factor=5,
             max_node_entries=5,
             initial_diameter=0.1,
             type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
             entry_size_limit=200,
             diameter_multiplier=1.5,
             ccore=True):
    """!
    @brief Creates an instance of the BIRCH clustering algorithm.

    @param[in] data (list): Input data as a list of points (objects); every point is a list or tuple.
    @param[in] number_clusters (uint): Number of clusters to allocate.
    @param[in] branching_factor (uint): Maximum number of successors that a non-leaf CF-Tree node may contain.
    @param[in] max_node_entries (uint): Maximum number of entries that a leaf CF-Tree node may contain.
    @param[in] initial_diameter (double): Diameter the CF-Tree is initially built with; it can be increased
                when entry_size_limit is exceeded.
    @param[in] type_measurement (measurement_type): Measurement type used for distance calculation.
    @param[in] entry_size_limit (uint): Maximum number of entries that the CF-Tree may store; when it is
                exceeded during construction the diameter is increased and the tree is rebuilt.
    @param[in] diameter_multiplier (double): Factor applied to the diameter when entry_size_limit is exceeded.
    @param[in] ccore (bool): If True then CCORE (C++ solution) is used for solving the problem.

    @remark Only `data` and `number_clusters` are mandatory; every other argument has a default value.

    """
    # Input data and the settings of the algorithm.
    self.__pointer_data = data
    self.__number_clusters = number_clusters
    self.__measurement_type = type_measurement
    self.__entry_size_limit = entry_size_limit
    self.__diameter_multiplier = diameter_multiplier
    self.__ccore = ccore

    # Nothing has been computed yet: no features and empty results.
    self.__features = None
    self.__clusters = []
    self.__noise = []

    # CF-Tree that is built with the initial diameter.
    self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement)
def templateCfTreeTotalNumberPoints(self, number_points, dimension, branching_factor, number_entries, diameter):
    """!
    @brief Checks that the CF-Tree keeps track of every inserted point.
    @details Points are inserted one by one and after each insertion the amount of points that is
              reported by leaf features must be equal to the amount of inserted points. At the end
              the same amount must be reported by the root feature.

    @param[in] number_points (uint): Amount of points that are inserted into the tree.
    @param[in] dimension (uint): Dimension of each generated point.
    @param[in] branching_factor (uint): Branching factor of the tree.
    @param[in] number_entries (uint): Maximum amount of entries of a node of the tree.
    @param[in] diameter (double): Diameter (threshold) of the tree.

    """
    tree = cftree(branching_factor, number_entries, diameter)

    def count_leaf_points():
        # Total amount of points according to the features of the leaf nodes.
        return sum(leaf.feature.number_points for leaf in tree.leafes)

    # Stays equal to the requested amount when there is nothing to insert.
    counted_points = number_points

    for index_point in range(0, number_points):
        # All coordinates of the point are equal to its index.
        tree.insert_cluster([[index_point] * dimension])

        counted_points = count_leaf_points()
        assert (index_point + 1) == counted_points

    number_leaf_points = count_leaf_points()

    assert counted_points == tree.root.feature.number_points

    # Output both values to simplify analysis of the failure below.
    if counted_points != number_leaf_points:
        print(counted_points, number_leaf_points)

    assert counted_points == number_leaf_points