Example #1
import os
import time
import asyncio

import numpy as np
import pandas as pd
import sklearn.neighbors as nb  # nb.KDTree is used below; sklearn.neighbors is assumed

# import data_engine, engine, cmfunc  # project-local modules; exact import paths unknown


def main_function(DATA_FILE):
    #DATA_FILE = "2004DF"
    if not os.path.exists('graph/' + DATA_FILE):
        os.makedirs('graph/' + DATA_FILE)
    start = time.time()
    start_getting_data = time.time()
    raw_dataframe = data_engine.getGCZDataFrame(DATA_FILE)
    end_getting_data = time.time()
    print("Getting Data Time: {}".format(start_getting_data -
                                         end_getting_data))
    raw_dta = raw_dataframe.value.values

    # first derivative
    #der = cmfunc.change_after_k_seconds(raw_dta.value, k=1)
    # second derivative
    sec_der = cmfunc.change_after_k_seconds_with_abs(raw_dta, k=1)

    median_sec_der = np.median(sec_der)
    std_sec_der = np.std(sec_der)

    # Score each point by how far its change exceeds median + std, clipped at zero.
    breakpoint_candidates = list(
        map(
            lambda x: (x[1] - median_sec_der) - np.abs(std_sec_der)
            if (x[1] - median_sec_der) - np.abs(std_sec_der) > 0 else 0,
            enumerate(sec_der)))
    breakpoint_candidates = (
        breakpoint_candidates - np.min(breakpoint_candidates)) / (
            np.max(breakpoint_candidates) - np.min(breakpoint_candidates))

    breakpoint_candidates = np.insert(breakpoint_candidates, 0, 0)

    from multiprocessing.pool import ThreadPool
    pool = ThreadPool(processes=4)
    final_f = []
    final_combination = []

    ############### To debug specific combination:############################
    final_index = 0
    alpha = 0.05
    print("Decay Value: %f" % alpha)
    new_data = raw_dataframe.copy()
    new_data = new_data.assign(
        anomaly_score=pd.Series(breakpoint_candidates).values)
    start_main_al = time.time()
    detect_final_result = engine.online_anomaly_detection(
        new_data, raw_dataframe, alpha, DATA_FILE)
    end_main_al = time.time()
    print("Execution time: {}".format(end_main_al - start_main_al))

    end = time.time()
    print("Total time: {}".format(end - start))
    return detect_final_result
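
# --- Illustrative sketch (not part of the original module) -------------------
# A minimal, self-contained version of the breakpoint scoring used in
# main_function, assuming cmfunc.change_after_k_seconds_with_abs(x, k=1)
# returns the absolute differences |x[i+1] - x[i]|; that helper lives in the
# project-local cmfunc module, so np.diff stands in for it here.
def _sketch_breakpoint_scores(values):
    values = np.asarray(values, dtype=float)
    abs_diff = np.abs(np.diff(values))          # stand-in for change_after_k_seconds_with_abs
    excess = abs_diff - np.median(abs_diff) - np.abs(np.std(abs_diff))
    scores = np.where(excess > 0, excess, 0.0)  # keep only changes beyond median + std
    scores = (scores - scores.min()) / (scores.max() - scores.min())  # min-max normalise to [0, 1]
    return np.insert(scores, 0, 0)              # pad so each score lines up with its raw sample
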
def online_anomaly_detection(result_dta, raw_dta, alpha, DATA_FILE):

    # second derivative
    sec_der = cmfunc.change_after_k_seconds_with_abs(raw_dta.value, k=1)

    median_sec_der = np.median(sec_der)
    std_sec_der = np.std(sec_der)

    breakpoint_candidates = list(
        map(
            lambda x: (x[1] - median_sec_der) - np.abs(std_sec_der)
            if (x[1] - median_sec_der) - np.abs(std_sec_der) > 0 else 0,
            enumerate(sec_der)))
    breakpoint_candidates = (
        breakpoint_candidates - np.min(breakpoint_candidates)) / (
            np.max(breakpoint_candidates) - np.min(breakpoint_candidates))

    breakpoint_candidates = np.insert(breakpoint_candidates, 0, 0)

    dta_full = result_dta

    dta_full.value.index = result_dta.timestamp

    std_anomaly_set = np.std(result_dta['anomaly_score'])
    np.argsort(result_dta['anomaly_score'])

    # Select candidate anomaly points whose score exceeds 3 * std
    # (the original top-5% selection is kept commented out below).
    # anomaly_index =
    # np.array(np.argsort(result_dta['anomaly_score']))[-five_percentage:]
    anomaly_index = np.array([i for i, value in enumerate(result_dta['anomaly_score']) if value > 3 * std_anomaly_set])

    limit_size = int(1 / alpha)
    # Y is the anomaly spreading and Z is the normal spreading.
    Y = np.zeros(len(result_dta['anomaly_score']))
    Z = np.zeros(len(result_dta['anomaly_score']))
    X = list(map(lambda x: [x, result_dta.values[x][1]], np.arange(len(result_dta.values))))
    # dt=DistanceMetric.get_metric('pyfunc',func=mydist)
    tree = nb.KDTree(X, leaf_size=50)
    potential_anomaly = []

    start_time_calculate_Y = time.time()
    # Calculate Y
    tasks = []
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    for anomaly_point in anomaly_index:
        #calculate_Y_value(alpha, anomaly_point, limit_size, median_sec_der, potential_anomaly, raw_dta, result_dta, std_sec_der, tree, X, Y)
        tasks.append(asyncio.ensure_future(calculate_Y_value(alpha, anomaly_point, limit_size, median_sec_der, potential_anomaly, raw_dta, result_dta, std_sec_der, tree, X, Y)))

    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    backup_draw = result_dta.copy()

    # Calculate final score
    result_dta.anomaly_score = result_dta.anomaly_score + Y

    end_time_calculate_Y = time.time()
    print("Calculating Y Time: {}".format(start_time_calculate_Y - end_time_calculate_Y))

    start_time_calculate_Z = time.time()
    # Find normal point
    # normal_index =
    # np.array(np.argsort(result_dta['anomaly_score']))[:int((0.4 *
    # len(result_dta['anomaly_score'])))]
    normal_index = [i for i, value in enumerate(result_dta['anomaly_score']) if
                    value <= np.percentile(result_dta['anomaly_score'], 20)]

    normal_index = np.random.choice(normal_index, int(len(normal_index) * 0.2), replace=False)

    # Calculate Z
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    tasks = []
    for normal_point in normal_index:
        tasks.append(asyncio.ensure_future(calculate_z_value(alpha, limit_size, normal_point, result_dta, Z)))
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()

    result_dta.anomaly_score = result_dta.anomaly_score - Z

    end_time_calculate_Z = time.time()
    print("Calculating Z Time: {}".format(start_time_calculate_Z - end_time_calculate_Z))

    final_score = list(map(lambda x: 0 if x < 0 else x, result_dta.anomaly_score))
    final_score = (final_score - np.min(final_score)) / (np.max(final_score) - np.min(final_score))

    # Calculating Change point.
    start_time_calculate_changepoint = time.time()

    ### Find potential anomaly point
    std_final_point = np.std(final_score)
    # anomaly_set = [i for i, v in enumerate(final_score) if v > 3 *
    # std_final_point]
    anomaly_set = [i for i, v in enumerate(final_score) if v > 0]

    # The algorithm to separate anomaly points from change points.
    X = list(map(lambda x: [x, x], np.arange(len(result_dta.values))))
    newX = list(np.array(X)[anomaly_set])
    newtree = nb.KDTree(X, leaf_size=50)

    anomaly_group_set = []
    new_small_x = 0
    sliding_index = 1
    for index_value, new_small_x in enumerate(anomaly_set):
        anomaly_neighboor = np.array(cmfunc.find_inverneghboor_of_point_1(newtree, X, new_small_x, anomaly_set, limit_size),
            dtype=np.int32)
        tmp_array = list(map(lambda x: x[1], anomaly_neighboor))
        if index_value > 0:
            common_array = list(set(tmp_array).intersection(anomaly_group_set[index_value - sliding_index]))
            # anomaly_group_set = np.concatenate((anomaly_group_set,
            # tmp_array))
            if len(common_array) != 0:
                union_array = list(set(tmp_array).union(anomaly_group_set[index_value - sliding_index]))
                anomaly_group_set[index_value - sliding_index] = np.append(anomaly_group_set[index_value - sliding_index],
                    list(set(tmp_array).difference(anomaly_group_set[index_value - sliding_index])))
                sliding_index = sliding_index + 1
            else:
                anomaly_group_set.append(np.sort(tmp_array))
        else:
            anomaly_group_set.append(np.sort(tmp_array))

    new_array = [tuple(row) for row in anomaly_group_set]
    uniques = new_array
    std_example_data = []
    std_example_outer = []
    detect_final_result = [[], []]
    for detect_pattern in uniques:
        # rest_anomaly_set = [i for i in anomaly_set if i not in
        # list(detect_pattern)]
        list_of_anomaly = [int(j) for i in anomaly_group_set for j in i]
        # Context window: up to 3 non-anomalous raw values just before and just
        # after the detected pattern.
        left_context = [z for z in range(int(min(detect_pattern) - 3), int(min(detect_pattern)))
                        if z not in list_of_anomaly]
        right_context = [z for z in range(int(max(detect_pattern) + 1), int(max(detect_pattern) + 4))
                         if z not in list_of_anomaly and z < len(raw_dta.value.values)]
        example_data = list(raw_dta.value.values[left_context]) + list(raw_dta.value.values[right_context])

        in_std_with_Anomaly = np.std(example_data + list(raw_dta.value.values[int(min(detect_pattern)): int(max(detect_pattern) + 1)]))
        std_example_data.append(in_std_with_Anomaly)

        example_data_iner = list(raw_dta.value.values[int(min(detect_pattern)): int(max(detect_pattern)) + 1])

        in_std_with_NonAnomaly = np.std(example_data)
        if in_std_with_Anomaly > 1.5 * in_std_with_NonAnomaly:
            detect_final_result[1].extend(np.array(detect_pattern, dtype=int))
        else:
            detect_final_result[0].append(int(np.min(detect_pattern)))
        std_example_outer.append(in_std_with_NonAnomaly)
    final_changepoint_set = detect_final_result[0]
    data_file = DATA_FILE

    end_time_calculate_changepoint = time.time()
    print("Calculating Change Point Time: {}".format(start_time_calculate_changepoint - end_time_calculate_changepoint))
    chartmess = cmfunc.plot_data_all(DATA_FILE,
                         [[list(range(0, len(raw_dta.value))), raw_dta.value],
                          [detect_final_result[0], raw_dta.value[detect_final_result[0]]],
                          [detect_final_result[1], raw_dta.value[detect_final_result[1]]]],
                         ['lines', 'markers', 'markers'], [None, 'circle', 'circle', 'x', 'x'],
                         ['Raw data', "Detected Change Point",
                          "Detected Anomaly Point"])

    return [detect_final_result, chartmess]
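
# Hypothetical usage sketch (not part of the original source): main_function
# returns whatever engine.online_anomaly_detection returns, i.e. a pair of
# [[change_point_indices, anomaly_indices], chart_message]. "2004DF" is the
# dataset name seen in the debug comment above; any name that
# data_engine.getGCZDataFrame can resolve should work.
if __name__ == "__main__":
    (change_points, anomaly_points), chart = main_function("2004DF")
    print("Change points:", change_points)
    print("Anomaly points:", anomaly_points)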