def main_function(DATA_FILE):
    """Run the full anomaly-detection pipeline for one dataset.

    Args:
        DATA_FILE: dataset identifier (e.g. "2004DF"); also used as the
            output sub-directory name under 'graph/'.

    Returns:
        The result of engine.online_anomaly_detection: a list
        [detect_final_result, chartmess] (see that function).

    Side effects:
        Creates 'graph/<DATA_FILE>/' if missing; prints timing info.
    """
    if not os.path.exists('graph/' + DATA_FILE):
        os.makedirs('graph/' + DATA_FILE)

    start = time.time()

    # --- Load the raw series ---------------------------------------------
    # BUGFIX: previously hard-coded "2004DF", silently ignoring DATA_FILE.
    start_getting_data = time.time()
    raw_dataframe = data_engine.getGCZDataFrame(DATA_FILE)
    end_getting_data = time.time()
    # BUGFIX: was start - end, which printed a negative duration.
    print("Getting Data Time: {}".format(end_getting_data - start_getting_data))

    raw_dta = raw_dataframe.value.values

    # Second-order difference (absolute) of the signal; large jumps are
    # candidate breakpoints.
    sec_der = cmfunc.change_after_k_seconds_with_abs(raw_dta, k=1)
    median_sec_der = np.median(sec_der)
    std_sec_der = np.std(sec_der)

    # Score = amount by which the deviation from the median exceeds one
    # standard deviation; clamped at 0.
    breakpoint_candidates = list(
        map(
            lambda x: (x[1] - median_sec_der) - np.abs(std_sec_der)
            if (x[1] - median_sec_der) - np.abs(std_sec_der) > 0 else 0,
            enumerate(sec_der)))
    # Min-max normalize to [0, 1].
    # NOTE(review): divides by (max - min); a constant score vector would
    # divide by zero — presumably never happens on real data. TODO confirm.
    breakpoint_candidates = (
        breakpoint_candidates - np.min(breakpoint_candidates)) / (
            np.max(breakpoint_candidates) - np.min(breakpoint_candidates))
    # Differencing loses the first sample; re-align by prepending a 0 score.
    breakpoint_candidates = np.insert(breakpoint_candidates, 0, 0)

    # alpha is the decay value; 1/alpha is the neighborhood size used
    # downstream by online_anomaly_detection.
    alpha = 0.05
    print("Decay Value: %f" % alpha)

    new_data = raw_dataframe.copy()
    new_data = new_data.assign(
        anomaly_score=pd.Series(breakpoint_candidates).values)

    start_main_al = time.time()
    detect_final_result = engine.online_anomaly_detection(
        new_data, raw_dataframe, alpha, DATA_FILE)
    end_main_al = time.time()
    print("Execution time: {}".format(end_main_al - start_main_al))

    end = time.time()
    print("Total time: {}".format(end - start))
    return detect_final_result
def online_anomaly_detection(result_dta, raw_dta, alpha, DATA_FILE):
    """Separate anomaly points from change points in a scored time series.

    Args:
        result_dta: DataFrame with 'timestamp', 'value' and 'anomaly_score'
            columns (anomaly_score as produced by main_function).
            NOTE: mutated in place (index and anomaly_score are rewritten).
        raw_dta: the original raw DataFrame (has a 'value' column).
        alpha: decay value; 1/alpha defines the neighborhood size.
        DATA_FILE: dataset identifier, forwarded to the plotting helper.

    Returns:
        [detect_final_result, chartmess] where detect_final_result[0] is the
        list of detected change-point indices and detect_final_result[1] the
        detected anomaly-point indices.
    """
    # Second-order (absolute) difference of the raw signal.
    sec_der = cmfunc.change_after_k_seconds_with_abs(raw_dta.value, k=1)
    median_sec_der = np.median(sec_der)
    std_sec_der = np.std(sec_der)
    # Same candidate scoring as in main_function: positive excess over one
    # std around the median, min-max normalized, realigned with a leading 0.
    breakpoint_candidates = list(map(
        lambda x: (x[1] - median_sec_der) - np.abs(std_sec_der)
        if (x[1] - median_sec_der) - np.abs(std_sec_der) > 0 else 0,
        enumerate(sec_der)))
    breakpoint_candidates = (
        breakpoint_candidates - np.min(breakpoint_candidates)) / (
            np.max(breakpoint_candidates) - np.min(breakpoint_candidates))
    breakpoint_candidates = np.insert(breakpoint_candidates, 0, 0)

    # dta_full aliases result_dta: this re-indexing mutates the caller's frame.
    dta_full = result_dta
    dta_full.value.index = result_dta.timestamp

    std_anomaly_set = np.std(result_dta['anomaly_score'])
    # Anomaly candidates: points whose score exceeds 3 standard deviations.
    anomaly_index = np.array([
        i for i, value in enumerate(result_dta['anomaly_score'])
        if value > 3 * std_anomaly_set])
    # Neighborhood radius derived from the decay value.
    limit_size = int(1 / alpha)

    # Y is the anomaly spreading and Z is the normal spreading.
    Y = np.zeros(len(result_dta['anomaly_score']))
    Z = np.zeros(len(result_dta['anomaly_score']))
    # Points embedded as [index, value] pairs for the KD-tree.
    X = list(map(lambda x: [x, result_dta.values[x][1]],
                 np.arange(len(result_dta.values))))
    tree = nb.KDTree(X, leaf_size=50)
    potential_anomaly = []

    # --- Spread anomaly influence (Y) ------------------------------------
    start_time_calculate_Y = time.time()
    tasks = []
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    for anomaly_point in anomaly_index:
        tasks.append(asyncio.ensure_future(calculate_Y_value(
            alpha, anomaly_point, limit_size, median_sec_der,
            potential_anomaly, raw_dta, result_dta, std_sec_der,
            tree, X, Y)))
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    backup_draw = result_dta.copy()
    result_dta.anomaly_score = result_dta.anomaly_score + Y
    end_time_calculate_Y = time.time()
    # BUGFIX: was start - end (negative duration).
    print("Calculating Y Time: {}".format(
        end_time_calculate_Y - start_time_calculate_Y))

    # --- Spread normal influence (Z) -------------------------------------
    start_time_calculate_Z = time.time()
    # Normal points: the lowest-scoring 20th percentile, subsampled to 20%.
    normal_index = [
        i for i, value in enumerate(result_dta['anomaly_score'])
        if value <= np.percentile(result_dta['anomaly_score'], 20)]
    normal_index = np.random.choice(
        normal_index, int(len(normal_index) * 0.2), replace=False)
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    tasks = []
    for normal_point in normal_index:
        tasks.append(asyncio.ensure_future(calculate_z_value(
            alpha, limit_size, normal_point, result_dta, Z)))
    loop.run_until_complete(asyncio.wait(tasks))
    # BUGFIX: this loop was never closed (the Y loop above is).
    loop.close()
    result_dta.anomaly_score = result_dta.anomaly_score - Z
    end_time_calculate_Z = time.time()
    # BUGFIX: was start - end (negative duration).
    print("Calculating Z Time: {}".format(
        end_time_calculate_Z - start_time_calculate_Z))

    # Final score: clamp negatives to 0, then min-max normalize.
    final_score = list(map(lambda x: 0 if x < 0 else x,
                           result_dta.anomaly_score))
    final_score = (final_score - np.min(final_score)) / (
        np.max(final_score) - np.min(final_score))

    # --- Separate anomaly points from change points ----------------------
    start_time_calculate_changepoint = time.time()
    std_final_point = np.std(final_score)
    # Any point with a positive final score is considered.
    anomaly_set = [i for i, v in enumerate(final_score) if v > 0]
    # Re-embed as [index, index] so distance is purely positional.
    X = list(map(lambda x: [x, x], np.arange(len(result_dta.values))))
    newX = list(np.array(X)[anomaly_set])
    newtree = nb.KDTree(X, leaf_size=50)

    # Greedily merge neighboring flagged points into groups; sliding_index
    # keeps pointing at the last *open* group when merges occur.
    anomaly_group_set = []
    new_small_x = 0
    sliding_index = 1
    for index_value, new_small_x in enumerate(anomaly_set):
        anomaly_neighboor = np.array(
            cmfunc.find_inverneghboor_of_point_1(
                newtree, X, new_small_x, anomaly_set, limit_size),
            dtype=np.int32)
        tmp_array = list(map(lambda x: x[1], anomaly_neighboor))
        if index_value > 0:
            common_array = list(set(tmp_array).intersection(
                anomaly_group_set[index_value - sliding_index]))
            if len(common_array) != 0:
                # Overlaps the previous group: merge the new members in.
                union_array = list(set(tmp_array).union(
                    anomaly_group_set[index_value - sliding_index]))
                anomaly_group_set[index_value - sliding_index] = np.append(
                    anomaly_group_set[index_value - sliding_index],
                    list(set(tmp_array).difference(
                        anomaly_group_set[index_value - sliding_index])))
                sliding_index = sliding_index + 1
            else:
                anomaly_group_set.append(np.sort(tmp_array))
        else:
            anomaly_group_set.append(np.sort(tmp_array))

    new_array = [tuple(row) for row in anomaly_group_set]
    uniques = new_array

    # For each group: compare the std of its surrounding context with and
    # without the group's values. A large increase means a transient spike
    # (anomaly); otherwise the group starts a new regime (change point).
    std_example_data = []
    std_example_outer = []
    detect_final_result = [[], []]  # [change points, anomaly points]
    for detect_pattern in uniques:
        list_of_anomaly = [int(j) for i in anomaly_group_set for j in i]
        # Up to 3 non-flagged samples on each side of the group.
        example_data = [i for i in (
            list(raw_dta.value.values[list(
                z for z in range(int(min(detect_pattern) - 3),
                                 int(min(detect_pattern)))
                if z not in list_of_anomaly)]) +
            list(raw_dta.value.values[list(
                z for z in range(int(max(detect_pattern) + 1),
                                 int(max(detect_pattern) + 4))
                if z not in list_of_anomaly
                and z < len(raw_dta.value.values))]))]
        in_std_with_Anomaly = np.std(
            example_data + list(raw_dta.value.values[
                int(min(detect_pattern)): int(max(detect_pattern) + 1)]))
        std_example_data.append(in_std_with_Anomaly)
        example_data_iner = list(raw_dta.value.values[
            int(min(detect_pattern)): int(max(detect_pattern)) + 1])
        in_std_with_NonAnomaly = np.std(example_data)
        if (in_std_with_Anomaly > 1.5 * in_std_with_NonAnomaly):
            # BUGFIX: dtype=np.int was removed in NumPy 1.24; builtin int
            # is the documented replacement and is behaviorally identical.
            detect_final_result[1].extend(
                np.array(detect_pattern, dtype=int))
        else:
            detect_final_result[0].append(int(np.min(detect_pattern)))
        std_example_outer.append(in_std_with_NonAnomaly)

    final_changepoint_set = detect_final_result[0]
    data_file = DATA_FILE
    end_time_calculate_changepoint = time.time()
    # BUGFIX: was start - end (negative duration).
    print("Calculating Change Point Time: {}".format(
        end_time_calculate_changepoint - start_time_calculate_changepoint))

    chartmess = cmfunc.plot_data_all(
        DATA_FILE,
        [[list(range(0, len(raw_dta.value))), raw_dta.value],
         [detect_final_result[0], raw_dta.value[detect_final_result[0]]],
         [detect_final_result[1], raw_dta.value[detect_final_result[1]]]],
        ['lines', 'markers', 'markers'],
        [None, 'circle', 'circle', 'x', 'x'],
        ['Raw data', "Detected Change Point", "Detected Anomaly Point"])
    return [detect_final_result, chartmess]