def mahalanobis_distance(self, point):
    """Return the normalized distance from ``point`` to this cluster's centroid.

    Each coordinate difference is divided by the matching entry of
    ``self.variance`` before being squared and summed; the square root of
    that sum is returned.  Index 0 of ``point`` is never read; only
    indices ``1 .. self.D - 1`` contribute.

    If ``self.stale`` is truthy the cached statistics are refreshed through
    ``self.update_statistics()`` before they are used.

    Note: a zero entry in ``self.variance`` for a used dimension makes the
    division fail (``ZeroDivisionError`` for plain Python numbers).
    """
    if self.stale:
        self.update_statistics()

    center = self.centroid
    spread = self.variance

    # Explicit left-to-right accumulation keeps the floating-point result
    # independent of how ``sum()`` is implemented.
    total = 0
    for axis in range(1, self.D):
        total += ((point[axis] - center[axis]) / spread[axis]) ** 2
    return math.sqrt(total)
def merge_CS(self):
    """Merge clusters in ``self.CS_set`` whose centroids lie close together.

    Steps:

    1. Refresh every cluster's statistics.
    2. For every index pair ``(s, t)`` with ``s < t``, measure the distance
       from cluster ``s``'s centroid to cluster ``t`` using
       ``mahalanobis_distance``.
    3. Walk the pairs from nearest to farthest.  Each source ``s`` is
       assigned to at most one destination ``t``, and only while the
       distance is strictly below ``alpha * sqrt(D)``.
    4. Collapse chains (``s -> t`` and ``t -> u``) so every source ends up
       under its final destination.
    5. Call ``merge_Cluster`` for each (destination, source) pair and
       rebuild ``self.CS_set`` from a deep copy, leaving out the sources.

    Returns ``None``; the result is stored in ``self.CS_set``.
    """
    if len(self.CS_set) == 0:
        return

    clusters = self.CS_set
    count = len(clusters)

    # alpha is fixed to 1 here rather than read from self.alpha.
    alpha = 1
    # NOTE(review): this uses D, whereas find_nearest_DS / find_nearest_CS
    # use D - 1 for the same kind of threshold -- confirm this is intended.
    cutoff = alpha * math.sqrt(clusters[0].D)

    index_pairs = list(itertools.combinations(list(range(count)), 2))

    for cluster in clusters:
        cluster.update_statistics()

    # (source, destination, distance) for every pair of clusters.
    scored_pairs = []
    for src, dst in index_pairs:
        src_center = clusters[src].centroid
        gap = clusters[dst].mahalanobis_distance(src_center)
        scored_pairs.append((src, dst, gap))
    scored_pairs.sort(key=lambda entry: (entry[2], entry[0], entry[1]))

    # destination index -> list of source indices to fold into it.
    merge_plan = {}
    unassigned = [True] * count
    for src, dst, gap in scored_pairs:
        if gap > cutoff:
            # Pairs are sorted by distance, so nothing later can qualify.
            break
        if not unassigned[src]:
            continue
        if gap < cutoff:
            unassigned[src] = False
            merge_plan.setdefault(dst, []).append(src)

    # Resolve chains: when a destination is itself a source of another
    # destination, hand its sources over to that other destination.
    for middle in range(count):
        if middle not in merge_plan:
            continue
        for final in range(count):
            if final in merge_plan and middle in merge_plan[final]:
                merge_plan[final].extend(merge_plan.pop(middle))

    absorbed = set()
    for dst, sources in merge_plan.items():
        for src in sources:
            clusters[dst].merge_Cluster(clusters[src])
            absorbed.add(src)

    # NOTE(review): the whole list is deep-copied before filtering, so the
    # surviving clusters are copies of the originals -- confirm whether the
    # copy is actually required.
    snapshot = copy.deepcopy(self.CS_set)
    self.CS_set = [
        cluster for position, cluster in enumerate(snapshot)
        if position not in absorbed
    ]
def update_statistics(self):
    """Recompute the cached centroid and per-dimension spread.

    Reads the running totals ``self.N`` (divisor), ``self.SUM`` and
    ``self.SUMSQ`` and, for every index ``1 .. self.D - 1``, stores

    * ``centroid[i] = SUM[i] / N``
    * ``variance[i] = sqrt(SUMSQ[i] / N - (SUM[i] / N) ** 2)``

    Index 0 is skipped and stays 0 in both lists.  The results are written
    to ``self.centroid`` and ``self.variance`` and ``self.stale`` is reset
    to ``False``.

    Note that ``self.variance`` holds the *square root* of the variance
    (a standard deviation); the attribute name is kept for compatibility.

    Raises:
        ZeroDivisionError: if ``self.N`` is 0 (for plain Python numbers).
    """
    centroid = [0] * self.D
    variance = [0] * self.D
    for i in range(1, self.D):
        mean = self.SUM[i] / self.N
        centroid[i] = mean
        # Floating-point rounding can push this difference slightly below
        # zero when the true variance is (almost) zero; clamp it so that
        # math.sqrt does not raise "math domain error".
        squared_spread = (self.SUMSQ[i] / self.N) - mean ** 2
        variance[i] = math.sqrt(max(squared_spread, 0.0))
    self.stale = False
    self.centroid, self.variance = centroid, variance
def find_nearest_DS(DS_set, alpha, point):
    """Return the label of the nearest cluster in ``DS_set`` that is close enough.

    Args:
        DS_set: mapping from label to cluster object.  Each cluster must
            provide ``D`` and ``mahalanobis_distance(point)``.
        alpha: multiplier for the acceptance threshold
            ``alpha * sqrt(D - 1)``.
        point: the point handed to each cluster's ``mahalanobis_distance``.

    Returns:
        The label of the cluster with the smallest distance, provided that
        distance is strictly below the threshold; otherwise ``-1``.  An
        empty ``DS_set`` also yields ``-1``, matching ``find_nearest_CS``.
    """
    if not DS_set:
        return -1

    # Read the dimensionality from whichever cluster comes first instead of
    # assuming that a cluster labelled 0 exists.
    first_label = next(iter(DS_set))
    dimension = DS_set[first_label].D - 1
    threshold = alpha * math.sqrt(dimension)

    nearest_distance = float("inf")
    nearest_label = -1
    for label in DS_set:
        dist = DS_set[label].mahalanobis_distance(point)
        if dist < nearest_distance:
            nearest_distance = dist
            # Only a new nearest cluster may become the answer, and only
            # when it is within the threshold.
            if dist < threshold:
                nearest_label = label
    return nearest_label
def find_nearest_CS(CS_set, alpha, point):
    """Return the index of the nearest cluster in ``CS_set`` within the threshold.

    Args:
        CS_set: list of cluster objects, each providing ``D`` and
            ``mahalanobis_distance(point)``.
        alpha: multiplier for the acceptance threshold
            ``alpha * sqrt(D - 1)``, with ``D`` taken from the first cluster.
        point: the point handed to each cluster's ``mahalanobis_distance``.

    Returns:
        The index of the cluster with the smallest distance strictly below
        the threshold (the first such index on ties), or ``-1`` when the
        list is empty or no cluster qualifies.
    """
    if len(CS_set) == 0:
        return -1

    # Starts at the threshold and shrinks to the best distance seen so far,
    # so one comparison covers both "under threshold" and "new nearest".
    limit = alpha * math.sqrt(CS_set[0].D - 1)

    chosen = -1
    for position, cluster in enumerate(CS_set):
        dist = cluster.mahalanobis_distance(point)
        if dist < limit:
            limit = dist
            chosen = position
    return chosen
def euclidean_distance(self, a, b):
    """Return the Euclidean distance between ``a`` and ``b``, ignoring index 0.

    The first element of each sequence is dropped before comparing.  The
    remaining elements are paired with ``zip``, so any surplus elements of
    the longer sequence are ignored.
    """
    squared_gaps = (pow(x - y, 2) for x, y in zip(a[1:], b[1:]))
    return math.sqrt(sum(squared_gaps))