def test_entropy(self):
    """Shannon entropy is 0 for empty/single-value buffers, positive for noise.

    Also checks the entropy of ~10k uniform draws from [0, 1000] stays
    below the theoretical maximum log2(1000).
    """
    import random
    buf = NumericRingBuffer(10000)
    # Empty buffer and a single repeated value both carry no information.
    self.assertEqual(buf.shannon_entropy(), 0)
    buf.append(10)
    self.assertEqual(buf.shannon_entropy(), 0)
    for _ in range(10000):
        buf.append(random.randint(0, 1000))
    # Random data must yield non-zero entropy in any base.
    self.assertTrue(buf.shannon_entropy(2) != 0)
    self.assertTrue(buf.shannon_entropy(10) != 0)
    # Bounded above by the entropy of a perfectly uniform distribution.
    self.assertTrue(buf.shannon_entropy(2) < math.log(1000, 2))
def test_probability(self):
    """p_x(v) returns the empirical probability of v among stored values."""
    buf = NumericRingBuffer(10)
    # Unknown value in an empty buffer: probability 0.
    self.assertEqual(buf.p_x(0), 0)
    buf.append(42)
    # Sole value: probability 1.
    self.assertEqual(buf.p_x(42), 1)
    buf.append(1234567890)
    # One of two distinct values: probability 0.5.
    self.assertEqual(buf.p_x(42), 0.5)
def test_mean(self):
    """mean() is 0 when empty, otherwise the arithmetic average."""
    buf = NumericRingBuffer(10)
    self.assertEqual(buf.mean(), 0)
    buf.append(42)
    self.assertEqual(buf.mean(), 42.0)
    buf.append(5)
    # (42 + 5) / 2
    self.assertEqual(buf.mean(), 23.5)
def __init__(self, output_folder):
    """Constructor.

    Args:
        output_folder: directory where 'anomalies.dat' and
            'original-serie.dat' are created.

    'counter' is the number of values we have added so far.
    """
    self.values = NumericRingBuffer(BUFFER_SIZE)
    self.nodes = []
    self.counter = 0
    # Open in text mode: the writers emit str lines ('... \n'); mode 'wb'
    # would raise TypeError under Python 3 when str is written to it.
    self.output = open(output_folder + '/anomalies.dat', 'w')
    self.orig = open(output_folder + '/original-serie.dat', 'w')
def test_distribution(self):
    """percentage(v) is the percentile rank of v, saturating at 100."""
    buf = NumericRingBuffer(100)
    # Fill with 100, 99, ..., 1 (same contents as appending 100 - i).
    for value in range(100, 0, -1):
        buf.append(value)
    self.assertEqual(buf.percentage(0), 1)
    self.assertEqual(buf.percentage(90), 91)
    self.assertEqual(buf.percentage(100), 100)
    # Values above everything seen are clamped to 100%.
    self.assertEqual(buf.percentage(101), 100)
def test_variance(self):
    """An empty buffer reports zero variance."""
    buf = NumericRingBuffer(10)
    self.assertEqual(buf.variance(), 0)
def test_expected_value(self):
    """expected_value() tracks the running average of appended samples."""
    buf = NumericRingBuffer(10)
    # Empty buffer: expectation defaults to 0.
    self.assertEqual(buf.expected_value(), 0)
    # Each (sample, expected running average) pair in order.
    for sample, expected in ((42, 42), (21, 31.5), (21, 28)):
        buf.append(sample)
        self.assertEqual(buf.expected_value(), expected)
class HierarchicalClassifier(object):
    """Detect anomalies in a numeric series via hierarchical clustering.

    Each incoming value is summarized by a feature vector (mean, Shannon
    entropy, variance, expected value) computed over a sliding window of
    recent values.  The vectors are then agglomeratively clustered; the
    two top-level subtrees of the resulting dendrogram are used as the
    two classes (0 / 1) written to 'anomalies.dat'.
    """

    def __init__(self, output_folder):
        """Constructor. 'Counter' is the number of values we have added so far.

        Args:
            output_folder: directory where 'anomalies.dat' and
                'original-serie.dat' are created.
        """
        self.values = NumericRingBuffer(BUFFER_SIZE)
        self.nodes = []
        self.counter = 0
        # Text mode is required: add() and find_anomalies() write str
        # lines; mode 'wb' raises TypeError under Python 3.
        self.output = open(output_folder + '/anomalies.dat', 'w')
        self.orig = open(output_folder + '/original-serie.dat', 'w')

    def __del__(self):
        """Destructor. Properly close opened file descriptors."""
        self.output.close()
        self.orig.close()

    def add(self, value):
        """Add a new value to the series.

        We store the current number of the value in the map of metadata
        under the 'n' key, and the raw value under 'v'.
        """
        self.values.append(value)
        self.counter += 1
        # Skip the very first sample: statistics over a single value are
        # degenerate (entropy/variance of one point).
        if self.values.size > 1:
            self.orig.write(str(value) + '\n')
            vector = [
                self.values.mean(),
                self.values.shannon_entropy(),
                self.values.variance(),
                self.values.expected_value(),
            ]
            metadata = {'n': self.counter, 'v': value}
            self.nodes.append(ClusterNode(vec=vector, meta=metadata))

    def build_set_rec(self, tree, marker):
        """Fill an array recursively from given tree.

        Returns a list of (sample number, marker) pairs for every leaf
        (id > 0) found in 'tree'; internal merge nodes carry negative ids
        and are skipped.
        """
        if not tree:
            return []
        current = []
        if tree.id > 0:
            current = [(tree.meta['n'], marker)]
        return current + self.build_set_rec(tree.left, marker) \
            + self.build_set_rec(tree.right, marker)

    def build_sets(self, tree):
        """Build two classes from the given tree.

        The left top-level subtree is labeled 0, the right one 1.
        """
        return [] + self.build_set_rec(tree.left, 0) \
            + self.build_set_rec(tree.right, 1)

    def find_anomalies(self):
        """Try to find anomalies according to what we have seen so far.

        Writes one '<sample number> <class>' line per sample, ordered by
        sample number, to the anomalies output file.
        """
        tree = self.hcluster(self.nodes, squared_euclidian)
        sets = self.build_sets(tree)
        sets = sorted(sets, key=lambda elt: elt[0])
        for elt in sets:
            self.output.write(str(int(elt[0])) + ' ' + str(elt[1]) + '\n')

    def hcluster(self, nodes, distance=euclidian):
        """Classify list of elements.

        Principle: each row starts within its individual cluster, then the
        matrix is processed to find closest rows until each row fits in a
        global hierarchical tree.

        Args:
            nodes: array of ClusterNode's
            distance: function computing distance between 2 vectors

        Returns:
            The root ClusterNode of the dendrogram.
        """
        distances = {}  # cache of (v, w) distances
        currentclustid = -1
        # clusters are initially just the individual rows
        clust = [ClusterNode(vec=array(nodes[i].vec), id=i, meta=nodes[i].meta)
                 for i in range(len(nodes))]
        while len(clust) > 1:
            print('%d remaining clusters' % len(clust))
            lowestpair = (0, 1)
            closest = distance(clust[0].vec, clust[1].vec)
            # loop through every pair looking for the smallest distance
            # v_id and w_id are made local variables to avoid slow lookup
            # several times. The try/except statement is preferred as well
            # for performance issues (compared to `key not in distances`)
            for i in range(len(clust)):
                for j in range(i + 1, len(clust)):
                    v_id = clust[i].id
                    w_id = clust[j].id
                    try:
                        d = distances[(v_id, w_id)]
                    except KeyError:
                        distances[(v_id, w_id)] = \
                            distance(clust[i].vec, clust[j].vec)
                        d = distances[(v_id, w_id)]
                    if d < closest:
                        closest = d
                        lowestpair = (i, j)
            # calculate the average of the two clusters
            merged_vector = merge_vectors(clust[lowestpair[0]].vec,
                                          clust[lowestpair[1]].vec)
            # create the new cluster
            newcluster = ClusterNode(array(merged_vector),
                                     left=clust[lowestpair[0]],
                                     right=clust[lowestpair[1]],
                                     distance=closest,
                                     id=currentclustid)
            # cluster ids that weren't in the original set are negative
            currentclustid -= 1
            # delete j first: j > i, so removing i first would shift j.
            del clust[lowestpair[1]]
            del clust[lowestpair[0]]
            clust.append(newcluster)
        return clust[0]