def templateDistanceCalculation(self, cluster1, cluster2, type_measurement):
    """!
    @brief Check the distance between two clustering features against the reference utility functions.
    @details A clustering feature is built for each cluster, the distance is verified to be symmetric and
              then compared with the stand-alone function that matches the requested measurement type.
              Measurement types that are not listed below are only checked for symmetry.

    @param[in] cluster1 (list): First cluster, represented by list of points.
    @param[in] cluster2 (list): Second cluster, represented by list of points.
    @param[in] type_measurement (measurement_type): Metric passed to cfentry.get_distance().

    """
    entry1 = cfentry(len(cluster1), linear_sum(cluster1), square_sum(cluster1))
    entry2 = cfentry(len(cluster2), linear_sum(cluster2), square_sum(cluster2))

    # The distance must not depend on the direction in which it is measured.
    distance12 = entry1.get_distance(entry2, type_measurement)
    distance21 = entry2.get_distance(entry1, type_measurement)
    assert distance12 == distance21

    # Cross-check with the utility implementation of the same metric.
    # Centroid based metrics are compared exactly, the remaining ones with numpy.isclose().
    if type_measurement == measurement_type.CENTROID_EUCLIDEAN_DISTANCE:
        assert distance12 == euclidean_distance_square(entry1.get_centroid(), entry2.get_centroid())

    elif type_measurement == measurement_type.CENTROID_MANHATTAN_DISTANCE:
        assert distance12 == manhattan_distance(entry1.get_centroid(), entry2.get_centroid())

    elif type_measurement == measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE:
        assert numpy.isclose(distance12, average_inter_cluster_distance(cluster1, cluster2))

    elif type_measurement == measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE:
        assert numpy.isclose(distance12, average_intra_cluster_distance(cluster1, cluster2))

    elif type_measurement == measurement_type.VARIANCE_INCREASE_DISTANCE:
        assert numpy.isclose(distance12, variance_increase_distance(cluster1, cluster2))
def templateDistanceCalculation(self, cluster1, cluster2, type_measurement):
    """!
    @brief Check the distance between two clustering features against the reference utility functions.
    @details A clustering feature is built for each cluster, the distance is verified to be symmetric and
              then compared with the stand-alone function that matches the requested measurement type.
              Measurement types that are not listed below are only checked for symmetry.

    @param[in] cluster1 (list): First cluster, represented by list of points.
    @param[in] cluster2 (list): Second cluster, represented by list of points.
    @param[in] type_measurement (measurement_type): Metric passed to cfentry.get_distance().

    """
    entry1 = cfentry(len(cluster1), linear_sum(cluster1), square_sum(cluster1))
    entry2 = cfentry(len(cluster2), linear_sum(cluster2), square_sum(cluster2))

    # The distance must not depend on the direction in which it is measured.
    distance12 = entry1.get_distance(entry2, type_measurement)
    distance21 = entry2.get_distance(entry1, type_measurement)
    assert distance12 == distance21

    # Cross-check with the utility implementation of the same metric.
    # Centroid based metrics are compared exactly, the remaining ones with numpy.isclose().
    if type_measurement == measurement_type.CENTROID_EUCLIDIAN_DISTANCE:
        assert distance12 == euclidean_distance_sqrt(entry1.get_centroid(), entry2.get_centroid())

    elif type_measurement == measurement_type.CENTROID_MANHATTAN_DISTANCE:
        assert distance12 == manhattan_distance(entry1.get_centroid(), entry2.get_centroid())

    elif type_measurement == measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE:
        assert numpy.isclose(distance12, average_inter_cluster_distance(cluster1, cluster2))

    elif type_measurement == measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE:
        assert numpy.isclose(distance12, average_intra_cluster_distance(cluster1, cluster2))

    elif type_measurement == measurement_type.VARIANCE_INCREASE_DISTANCE:
        assert numpy.isclose(distance12, variance_increase_distance(cluster1, cluster2))
def templateCfEntryValueDistance(self, cluster1, cluster2, value, tolerance, type_measurment):
    """!
    @brief Check that the distance between two clustering features lies strictly inside value +/- tolerance.

    @param[in] cluster1 (list): First cluster, represented by list of points.
    @param[in] cluster2 (list): Second cluster, represented by list of points.
    @param[in] value (double): Expected distance.
    @param[in] tolerance (double): Allowed absolute deviation from the expected distance (exclusive bounds).
    @param[in] type_measurment (measurement_type): Metric passed to cfentry.get_distance().

    """
    first_feature, second_feature = (
        cfentry(len(points), linear_sum(points), square_sum(points))
        for points in (cluster1, cluster2)
    )

    measured = first_feature.get_distance(second_feature, type_measurment)

    assert value - tolerance < measured and value + tolerance > measured
def templateCfEntryDistance(self, type_measurement):
    """!
    @brief Check that distances between three fixed clusters are ordered as expected for the given metric.
    @details The clusters are 2x2 grids placed around 0.15, 0.45 and 0.95 on the diagonal, so the distance
              1-2 must be smaller than 2-3, which in turn must be smaller than 1-3.

    @param[in] type_measurement (measurement_type): Metric passed to cfentry.get_distance().

    """
    clusters = (
        [[0.1, 0.1], [0.1, 0.2], [0.2, 0.1], [0.2, 0.2]],
        [[0.4, 0.4], [0.4, 0.5], [0.5, 0.4], [0.5, 0.5]],
        [[0.9, 0.9], [0.9, 1.0], [1.0, 0.9], [1.0, 1.0]],
    )

    near, middle, far = [
        cfentry(len(points), linear_sum(points), square_sum(points))
        for points in clusters
    ]

    near_to_middle = near.get_distance(middle, type_measurement)
    middle_to_far = middle.get_distance(far, type_measurement)
    near_to_far = near.get_distance(far, type_measurement)

    assert near_to_middle < middle_to_far
    assert middle_to_far < near_to_far
def templateCfEntryDistance(self, type_measurement):
    """!
    @brief Check that distances between three fixed clusters are ordered as expected for the given metric.
    @details Expected ordering: distance(1, 2) < distance(2, 3) < distance(1, 3).

    @param[in] type_measurement (measurement_type): Metric passed to cfentry.get_distance().

    """
    def build_feature(points):
        # Clustering feature: number of points, linear sum and square sum.
        return cfentry(len(points), linear_sum(points), square_sum(points))

    feature_a = build_feature([[0.1, 0.1], [0.1, 0.2], [0.2, 0.1], [0.2, 0.2]])
    feature_b = build_feature([[0.4, 0.4], [0.4, 0.5], [0.5, 0.4], [0.5, 0.5]])
    feature_c = build_feature([[0.9, 0.9], [0.9, 1.0], [1.0, 0.9], [1.0, 1.0]])

    distance_ab = feature_a.get_distance(feature_b, type_measurement)
    distance_bc = feature_b.get_distance(feature_c, type_measurement)
    distance_ac = feature_a.get_distance(feature_c, type_measurement)

    assert distance_ab < distance_bc
    assert distance_bc < distance_ac
def __get_nearest_feature(self, point, feature_collection):
    """!
    @brief Find nearest entry for specified point.

    @param[in] point (list): Point from input dataset.
    @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature
                for the specified point.

    @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
             For an empty collection the tuple is (inf, -1).

    """
    # The clustering feature of the point does not change between iterations, so it is built once.
    point_entry = cfentry(1, linear_sum([point]), square_sum([point]))

    minimum_distance = float("Inf")
    index_nearest_feature = -1

    for index_entry, feature in enumerate(feature_collection):
        distance = feature.get_distance(point_entry, self.__measurement_type)

        # Strict comparison: among equally distant features the first one wins.
        if distance < minimum_distance:
            minimum_distance = distance
            index_nearest_feature = index_entry

    return minimum_distance, index_nearest_feature
def testCfTreeInserionOneLeafThreeEntries(self):
    """!
    @brief Insert three clusters into a tree and check that all of them resolve to the same nearest leaf.

    """
    clusters = [
        [[0.1, 0.1], [0.1, 0.2], [0.2, 0.1], [0.2, 0.2]],
        [[0.4, 0.4], [0.4, 0.5], [0.5, 0.4], [0.5, 0.5]],
        [[0.9, 0.9], [0.9, 1.0], [1.0, 0.9], [1.0, 1.0]],
    ]

    tree = cftree(3, 4, 0.0)
    for points in clusters:
        tree.insert_cluster(points)

    first, second, third = [
        cfentry(len(points), linear_sum(points), square_sum(points))
        for points in clusters
    ]

    assert tree.find_nearest_leaf(first) == tree.find_nearest_leaf(second)
    assert tree.find_nearest_leaf(second) == tree.find_nearest_leaf(third)
def insert_point(self, point):
    """!
    @brief Insert a single point into the CF tree.
    @details The point is wrapped into a one-element cluster, a clustering feature is created for it and
              the feature is handed over to insert().

    @param[in] point (list): Point represented by list of coordinates that should be inserted to CF tree.

    """
    single_point_cluster = [point]

    amount_points = len(single_point_cluster)
    feature = cfentry(amount_points, linear_sum(single_point_cluster), square_sum(single_point_cluster))

    self.insert(feature)
def testCfEntryIncrease(self):
    """!
    @brief Check that adding a clustering feature to itself doubles its number of points, linear sum and
            square sum.

    """
    points = [[0.1, 0.1], [0.2, 0.2], [0.5, 0.5], [0.4, 0.4], [0.6, 0.6]]
    original = cfentry(len(points), linear_sum(points), square_sum(points))

    # 5 points -> 10 points after the first doubling.
    doubled = original + original
    assert cfentry(10, [3.6, 3.6], 3.28) == doubled

    # 10 points -> 20 points after the second doubling.
    quadrupled = doubled + doubled
    assert cfentry(20, [7.2, 7.2], 6.56) == quadrupled
def testCfEntryIncrease(self):
    """!
    @brief Check that adding a clustering feature to itself doubles its number of points, linear sum and
            square sum.

    """
    cluster = [[0.1, 0.1], [0.2, 0.2], [0.5, 0.5], [0.4, 0.4], [0.6, 0.6]]

    entry1 = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster))

    # 5 points summed with themselves: 10 points, linear sum 2 * 1.8, square sum 2 * 1.64.
    entry2 = entry1 + entry1
    assert cfentry(10, [3.6, 3.6], 3.28) == entry2

    # Doubling once more gives 20 points.
    entry2 = entry2 + entry2
    assert cfentry(20, [7.2, 7.2], 6.56) == entry2
def insert_cluster(self, cluster):
    """!
    @brief Insert a cluster, given as list of points, into the tree.
    @details The cluster is summarized by a clustering feature (number of points, linear sum, square sum)
              and that feature is handed over to insert().

    @param[in] cluster (list): Cluster that is represented by list of points, where each point is a list
                of coordinates.

    """
    amount_points = len(cluster)
    sum_linear = linear_sum(cluster)
    sum_square = square_sum(cluster)

    self.insert(cfentry(amount_points, sum_linear, sum_square))
def insert_cluster(self, cluster, hadoop_address_2):
    """!
    @brief Insert a cluster, given as list of points, into the tree together with its Hadoop address.
    @details The cluster is summarized by a clustering feature (number of points, linear sum, square sum);
              the address is forwarded unchanged as the fourth argument of cfentry and the resulting
              feature is handed over to insert().

    @param[in] cluster (list): Cluster that is represented by list of points, where each point is a list
                of coordinates.
    @param[in] hadoop_address_2: Hadoop address stored with the created clustering feature.
                NOTE(review): earlier notes suggest the address used to be the last element of each data
                point and was stripped before this call - confirm against the callers.

    """
    amount_points = len(cluster)
    sum_linear = linear_sum(cluster)
    sum_square = square_sum(cluster)

    feature = cfentry(amount_points, sum_linear, sum_square, hadoop_address_2)
    self.insert(feature)
def templateCfClusterRepresentation(self, cluster, centroid, radius, diameter, tolerance):
    """!
    @brief Check centroid, radius and diameter of the clustering feature built for the cluster.
    @details Every value has to lie strictly inside expected +/- tolerance. The centroid is compared
              per dimension only when it is given as a list; a scalar centroid is not compared.

    @param[in] cluster (list): Cluster that is represented by list of points.
    @param[in] centroid (list|double): Expected centroid.
    @param[in] radius (double): Expected radius.
    @param[in] diameter (double): Expected diameter.
    @param[in] tolerance (double): Allowed absolute deviation (exclusive bounds).

    """
    entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster))

    if isinstance(centroid, list):
        # Obtain the centroid once instead of once per comparison.
        actual_centroid = entry.get_centroid()
        for dimension, expected in enumerate(centroid):
            assert expected - tolerance < actual_centroid[dimension] < expected + tolerance

    actual_radius = entry.get_radius()
    assert radius - tolerance < actual_radius < radius + tolerance

    actual_diameter = entry.get_diameter()
    assert diameter - tolerance < actual_diameter < diameter + tolerance
def templateCfClusterRepresentation(self, cluster, centroid, radius, diameter, tolerance):
    """!
    @brief Check centroid, radius and diameter of the clustering feature built for the cluster.
    @details Every value has to be within the absolute tolerance of the expected one. The centroid is
              compared per dimension only when it is given as a list; a scalar centroid is not compared.

    @param[in] cluster (list): Cluster that is represented by list of points.
    @param[in] centroid (list|double): Expected centroid.
    @param[in] radius (double): Expected radius.
    @param[in] diameter (double): Expected diameter.
    @param[in] tolerance (double): Allowed absolute deviation from the expected values.

    """
    entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster))

    # The third positional argument of assertAlmostEqual() is 'places' (number of decimal places),
    # therefore the tolerance has to be passed explicitly as 'delta'.
    if isinstance(centroid, list):
        actual_centroid = entry.get_centroid()
        for dimension, expected in enumerate(centroid):
            self.assertAlmostEqual(expected, actual_centroid[dimension], delta=tolerance)

    self.assertAlmostEqual(radius, entry.get_radius(), delta=tolerance)
    self.assertAlmostEqual(diameter, entry.get_diameter(), delta=tolerance)
def __get_nearest_feature(self, point, feature_collection):
    """!
    @brief Find nearest entry for specified point.

    @param[in] point (list): Point from input dataset.
    @param[in] feature_collection (list): Feature collection that is searched for the nearest feature.

    @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
             For an empty collection the tuple is (inf, -1).

    """
    # The clustering feature of the point does not change between iterations, so it is built once.
    point_entry = cfentry(1, linear_sum([point]), square_sum([point]))

    minimum_distance = float("Inf")
    index_nearest_feature = -1

    for index_entry, feature in enumerate(feature_collection):
        distance = feature.get_distance(point_entry, self.__measurement_type)

        # Strict comparison: among equally distant features the first one wins.
        if distance < minimum_distance:
            minimum_distance = distance
            index_nearest_feature = index_entry

    return minimum_distance, index_nearest_feature
def testCfTreeEntryAbsorbing(self):
    """!
    @brief Check that a tree with a very large threshold absorbs every inserted entry into a single one.
    @details Ten random clusters are inserted. After every insertion the tree must still consist of one
              entry in one node, and the root feature must be equal to the sum of all inserted features.

    """
    tree = cftree(2, 1, 10000.0)
    absorbing_entry = cfentry(0, [0.0, 0.0], 0.0)

    for offset in range(0, 10):
        cluster = [[random() + offset, random() + offset] for _ in range(10)]
        entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster))

        # Keep a running sum to compare with the feature accumulated by the tree.
        absorbing_entry += entry

        tree.insert(entry)

        assert 1 == tree.amount_entries
        assert 1 == tree.amount_nodes
        assert 1 == tree.height

        assert tree.root.parent is None
        assert absorbing_entry == tree.root.feature
def testCfTreeEntryAbsorbing(self):
    """!
    @brief Check that a tree with a very large threshold absorbs every inserted entry into a single one.
    @details Ten random clusters are inserted. After every insertion the tree must still consist of one
              entry in one node, and the root feature must be equal to the sum of all inserted features.

    """
    tree = cftree(2, 1, 10000.0)
    absorbing_entry = cfentry(0, [0.0, 0.0], 0.0)

    for offset in range(0, 10):
        cluster = [[random() + offset, random() + offset] for _ in range(10)]
        entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster))

        # Keep a running sum to compare with the feature accumulated by the tree.
        absorbing_entry += entry

        tree.insert(entry)

        assert 1 == tree.amount_entries
        assert 1 == tree.amount_nodes
        assert 1 == tree.height

        assert tree.root.parent is None
        assert absorbing_entry == tree.root.feature
def testGetNearestEntry(self):
    """!
    @brief Check tree construction parameters and consistency of nearest-entry lookups on a leaf.
    @details After the sample points are inserted, the entry returned by get_nearest_entry() must be
              the one stored at the index returned by get_nearest_index_entry().

    """
    metric = measurement_type.CENTROID_EUCLIDEAN_DISTANCE

    sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)
    tree = cftree(10, 100, 0.2, metric)

    self.assertEqual(10, tree.branch_factor)
    self.assertEqual(100, tree.max_entries)
    self.assertEqual(0.2, tree.threshold)
    self.assertEqual(metric, tree.type_measurement)

    for point in sample:
        tree.insert_point(point)

    probe_cluster = [[0.1, 0.1], [0.2, 0.2]]
    probe = cfentry(len(probe_cluster), linear_sum(probe_cluster), square_sum(probe_cluster))

    leaf = tree.find_nearest_leaf(probe)
    nearest_entry = leaf.get_nearest_entry(probe, metric)
    nearest_index = leaf.get_nearest_index_entry(probe, metric)

    self.assertEqual(leaf.entries[nearest_index], nearest_entry)
def __get_nearest_feature(self, point):
    """!
    @brief Find nearest entry for specified point.
    @details The point is wrapped into a clustering feature that also carries the Hadoop address stored
              in the object and is compared with every stored feature.

    @param[in] point (list): Point from input dataset.

    @return (int) Index of nearest entry to the specified point, or -1 when there are no features.

    """
    # The clustering feature of the point does not change between iterations, so it is built once.
    point_entry = cfentry(1, linear_sum([point]), square_sum([point]), self.__hadoop_address)

    minimum_distance = float("Inf")
    index_nearest_feature = -1

    for index_entry, feature in enumerate(self.__features):
        distance = feature.get_distance(point_entry, self.__measurement_type)

        # Strict comparison: among equally distant features the first one wins.
        if distance < minimum_distance:
            minimum_distance = distance
            index_nearest_feature = index_entry

    return index_nearest_feature
def __get_nearest_feature(self, point):
    """!
    @brief Find nearest entry for specified point.

    @param[in] point (list): Point from input dataset.

    @return (int) Index of nearest entry to the specified point, or -1 when there are no features.

    """
    # The clustering feature of the point does not change between iterations, so it is built once.
    point_entry = cfentry(1, linear_sum([point]), square_sum([point]))

    minimum_distance = float("Inf")
    index_nearest_feature = -1

    for index_entry, feature in enumerate(self.__features):
        distance = feature.get_distance(point_entry, self.__measurement_type)

        # Strict comparison: among equally distant features the first one wins.
        if distance < minimum_distance:
            minimum_distance = distance
            index_nearest_feature = index_entry

    return index_nearest_feature
def __get_nearest_feature(self, point, feature_collection):
    """!
    @brief Find nearest entry for specified point.

    @param[in] point (list): Point from input dataset.
    @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature
                for the specified point.

    @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
             For an empty collection the tuple is (inf, -1).

    """
    # The clustering feature of the point does not change between iterations, so it is built once.
    point_entry = cfentry(1, linear_sum([point]), square_sum([point]))

    minimum_distance = float("Inf")
    index_nearest_feature = -1

    for index_entry, feature in enumerate(feature_collection):
        distance = feature.get_distance(point_entry, self.__measurement_type)

        # Strict comparison: among equally distant features the first one wins.
        if distance < minimum_distance:
            minimum_distance = distance
            index_nearest_feature = index_entry

    return (minimum_distance, index_nearest_feature)
def templateCfEntryValueDistance(self, cluster1, cluster2, value, tolerance, type_measurment):
    """!
    @brief Check that the distance between two clustering features lies strictly inside value +/- tolerance.

    @param[in] cluster1 (list): First cluster, represented by list of points.
    @param[in] cluster2 (list): Second cluster, represented by list of points.
    @param[in] value (double): Expected distance.
    @param[in] tolerance (double): Allowed absolute deviation from the expected distance (exclusive bounds).
    @param[in] type_measurment (measurement_type): Metric passed to cfentry.get_distance().

    """
    def build_feature(points):
        # Clustering feature: number of points, linear sum and square sum.
        return cfentry(len(points), linear_sum(points), square_sum(points))

    feature_from = build_feature(cluster1)
    feature_to = build_feature(cluster2)

    measured = feature_from.get_distance(feature_to, type_measurment)

    above_lower_bound = value - tolerance < measured
    assert above_lower_bound and value + tolerance > measured