Exemple #1
0
def positive_negative_inconsistency(T, sus_dis_values):
    """Flag a lone repeated number whose sign disagrees with its column.

    For every column of ``T`` the distinct values are split by sign using
    ``isNumber`` (which yields a ``(flag, number)`` pair).  When a column has
    exactly one distinct non-negative number but more than two distinct
    negative ones (or the mirror situation), that lone value is reported,
    provided it occurs more than once.

    Args:
        T: table exposing ``.columns`` and ``T[col].value_counts()``
            (presumably a pandas DataFrame -- confirm with the caller).
        sus_dis_values: list of already collected suspects; extended in place.

    Returns:
        The same ``sus_dis_values`` list.
    """
    # The opposite sign needs strictly more distinct values than this.
    min_opposite_distinct = 2

    hist = {col: T[col].value_counts() for col in T.columns}
    for column, counts in hist.items():
        pos_distinct = 0
        neg_distinct = 0
        pos_repeated = []
        neg_repeated = []

        for raw, freq in counts.items():
            is_num, number = isNumber(raw)
            if not is_num:
                continue
            if number >= 0:
                pos_distinct += 1
                if freq > 1:
                    pos_repeated.append([raw, freq])
            elif number < 0:
                neg_distinct += 1
                if freq > 1:
                    neg_repeated.append([raw, freq])

        # The two conditions are mutually exclusive; at most one side is used.
        flagged = []
        if pos_distinct == 1 and neg_distinct > min_opposite_distinct:
            flagged.extend(pos_repeated)
        if neg_distinct == 1 and pos_distinct > min_opposite_distinct:
            flagged.extend(neg_repeated)

        for raw, freq in flagged:
            candidate = sus_disguised(column, raw, 1.0, freq, "SYN")
            if candidate not in sus_dis_values:
                sus_dis_values.append(candidate)
    return sus_dis_values
Exemple #2
0
def detect_single_char_strings(T, sus_dis_values):
    """Flag repeated punctuation-like cells and stray strings in numeric columns.

    Two rules are applied to every column of ``T``:

    * any distinct value that is a single non-alphanumeric character and
      occurs more than once is reported immediately;
    * if the column has one or two distinct non-numeric, non-null values
      while having more than two distinct numeric values, every such string
      occurring more than once is reported.

    Numeric-ness is decided by ``isNumber`` (a ``(flag, number)`` pair) and
    null-ness by ``isNull``.

    Args:
        T: table exposing ``.columns`` and ``T[col].value_counts()``;
            cell values are assumed to be strings (``len``/``isalnum`` are
            called on them) -- confirm with the caller.
        sus_dis_values: list of already collected suspects; extended in place.

    Returns:
        The same ``sus_dis_values`` list.
    """
    def remember(column, raw, freq):
        # Append a "SYN" suspect unless an equal one is already recorded.
        candidate = sus_disguised(column, raw, 1.0, freq, "SYN")
        if candidate not in sus_dis_values:
            sus_dis_values.append(candidate)

    hist = {col: T[col].value_counts() for col in T.columns}
    for column, counts in hist.items():
        string_distinct = 0
        numeric_distinct = 0
        repeated_strings = []

        for raw, freq in counts.items():
            is_num, _number = isNumber(raw)
            if is_num:
                numeric_distinct += 1
            elif not isNull(raw):
                string_distinct += 1
                if freq > 1:
                    repeated_strings.append([raw, freq])

            # A single non-alphanumeric character that appears more than once.
            if len(raw) == 1 and not raw.isalnum() and freq > 1:
                remember(column, raw, freq)

        if 0 < string_distinct <= 2 and numeric_distinct > 2:
            for raw, freq in repeated_strings:
                remember(column, raw, freq)
    return sus_dis_values
Exemple #3
0
def detect_outliers(T, sus_dis_values):
    """Report repeated numeric values that sit in very low-density regions.

    For each column, the numeric distinct values (as decided by ``isNumber``)
    are collected with their frequencies.  Columns with three or more
    distinct non-numeric values, or fewer than ten distinct numeric ones,
    are skipped.  For the rest, a density estimate is evaluated at every
    value that occurs more than once, using four bandwidths
    (``h0``, ``0.8*h0``, ``0.6*h0``, ``0.4*h0``); the value is reported with
    tag ``"OD"`` when its normalised score exceeds ``1.0 - 1e-16``.

    A suspect that is already present in ``sus_dis_values`` is passed to
    ``common.add_detected_by_more_than_one_tool`` instead of being appended.

    Args:
        T: table exposing ``.columns`` and ``T[col].value_counts()``.
        sus_dis_values: list of already collected suspects; extended in place.

    Returns:
        The same ``sus_dis_values`` list.
    """
    hist = {col: T[col].value_counts() for col in T.columns}

    for column, col_hist in hist.items():
        # numeric value -> frequency; a later cell parsing to the same number
        # overwrites the earlier frequency.
        numeric_data = {}
        numeric_distinct = 0
        for raw, freq in col_hist.items():
            is_num, number = isNumber(raw)
            if is_num:
                numeric_distinct += 1
                numeric_data[number] = freq

        non_numeric_distinct = len(col_hist) - numeric_distinct
        if non_numeric_distinct >= 3 or numeric_distinct < 10:
            continue

        _mean, num_tuples, std = compute_statistical_quantities(numeric_data)
        ordered = sorted(numeric_data)
        smallest, largest = ordered[0], ordered[-1]

        # Smallest gap between neighbouring values, capped by std.
        min_dist = std
        for left, right in zip(ordered, ordered[1:]):
            min_dist = min(min_dist, abs(left - right))

        h0 = compute_bandwidth(std, num_tuples)
        f_max = compute_max_pdf(numeric_data, smallest, largest, h0,
                                num_tuples)

        # Written as a negated "<=" so NaN operands behave as in a plain
        # `if min_dist <= h0:` block.
        if not (min_dist <= h0):
            continue

        for number, freq in numeric_data.items():
            if freq <= 1:
                continue

            # Best (largest) density over the four shrinking bandwidths.
            epdf = 0
            for step in range(4):
                h = h0 - (0.2 * step * h0)
                density = evaluate_pnt(numeric_data, number, h, num_tuples)
                if epdf < density:
                    epdf = density

            score = max(f_max - epdf, 0) / f_max
            if not (score > (1.0 - 1e-16)):
                continue

            # Integral values are reported without a trailing ".0".
            if int(number) == number:
                label = str(int(number))
            else:
                label = str(number)
            candidate = sus_disguised(column, label, score, freq, "OD")

            if candidate not in sus_dis_values:
                sus_dis_values.append(candidate)
            else:
                common.add_detected_by_more_than_one_tool(
                    sus_dis_values, candidate)
    return sus_dis_values
Exemple #4
0
def find_disguised_values(T, sus_dis_values):
    """Search each candidate column for its most suspicious repeated value.

    Columns with at least three distinct values are candidates.  For every
    candidate column, its repeated values (most frequent first, lower-cased,
    skipping ``'null'``) are scored with
    ``len(rows) / PT_num_rows * corr`` where ``corr`` and ``PT_num_rows``
    come from ``subtable_correlation``.  The best-scoring value of the column
    is reported with tag ``"Rand"`` when it covers more than 1% of the rows,
    the column's distinct-value ratio is above 1%, and the value occurs more
    than five times.

    A suspect already present in ``sus_dis_values`` is passed to
    ``common.add_detected_by_more_than_one_tool`` instead of being appended.

    Args:
        T: table exposing ``.columns``, ``T[col].value_counts()`` and
            ``T[cols].reset_index()``; cell values are assumed to be strings
            (``.lower()`` is called on them) -- confirm with the caller.
        sus_dis_values: list of already collected suspects; extended in place.

    Returns:
        The same ``sus_dis_values`` list.
    """
    hist = {col: T[col].value_counts() for col in T.columns}

    # Candidate columns: at least three distinct values.
    KK = [col for col, counts in hist.items() if len(counts) >= 3]

    # The analysis relates a column to the *other* candidate columns, so it
    # needs at least two of them.  The original guard (== 1) let the
    # zero-candidate case fall through and build an empty table.
    if len(KK) < 2:
        return sus_dis_values

    # reset_index() prepends the index as column 0; slice it off each row.
    Temp_T = [row[1:] for row in T[KK].reset_index().values.tolist()]
    num_rows = len(Temp_T)
    RandDMVD_Index_T = Table_Index_RandDMVD(Temp_T)

    for i, column in enumerate(KK):
        col_hist = hist[column]
        dis_value = None
        largest_DV = 0

        most_com = sorted(col_hist.items(), key=lambda kv: kv[1],
                          reverse=True)
        for raw, freq in most_com:
            # Sorted by descending frequency: everything after is unique too.
            if freq == 1:
                break
            value = raw.lower()
            if value == 'null':
                continue
            corr, PT_num_rows = subtable_correlation(Temp_T, value, i,
                                                     RandDMVD_Index_T)
            DV_Score = num_rows / PT_num_rows * corr
            if DV_Score > largest_DV:
                dis_value = sus_disguised(column, value, DV_Score, freq,
                                          "Rand")
                largest_DV = DV_Score

        if dis_value is None:
            continue

        ratio1 = dis_value.frequency / num_rows
        # Distinct-value ratio; reuses the histogram computed above instead
        # of running value_counts() a second time.
        ratio2 = len(col_hist) / num_rows
        if ratio1 > 0.01 and ratio2 > 0.01 and dis_value.frequency > 5:
            if dis_value not in sus_dis_values:
                sus_dis_values.append(dis_value)
            else:
                common.add_detected_by_more_than_one_tool(
                    sus_dis_values, dis_value)

    return sus_dis_values
Exemple #5
0
def find_all_patterns(T, sus_dis_values):
    """Flag repeated cells whose syntactic pattern is not dominating.

    For each column, patterns are first extracted with ``L1_patterns`` and
    then re-aggregated with ``L_patterns`` at levels 2..5 for as long as more
    than five distinct patterns remain.  ``determine_dominating_patterns``
    maps each resulting pattern to a truthy/falsy verdict; every cell value
    occurring more than once whose pattern has a falsy verdict is reported
    with tag ``"SYN"``.

    If a cell's pattern is missing from the verdict mapping the process
    prints a message and terminates via ``sys.exit(1)``.

    Args:
        T: table exposing ``.columns`` and ``T[col].value_counts()``.
        sus_dis_values: list of already collected suspects; extended in place.

    Returns:
        A tuple ``(sus_dis_values, output_ptrns)`` where ``output_ptrns``
        maps each column to its final pattern histogram.
    """
    min_num_ptrns = 5
    hist = {col: T[col].value_counts() for col in T.columns}
    output_ptrns = {}

    for column, col_hist in hist.items():
        # pttrns holds the pattern structures, pttrns_hist their frequencies.
        pttrns, pttrns_hist = L1_patterns(col_hist)

        # Coarsen step by step while too many distinct patterns remain.  Once
        # a level is skipped the histogram no longer changes, so every later
        # level would be skipped as well.
        agg_level = 1
        for level in (2, 3, 4, 5):
            if len(pttrns_hist) <= min_num_ptrns:
                break
            agg_level = level
            pttrns, pttrns_hist = L_patterns(pttrns, pttrns_hist, level)

        dominating = determine_dominating_patterns(pttrns_hist)
        output_ptrns[column] = pttrns_hist

        for raw, freq in col_hist.items():
            # Only values that occur more than once are considered.
            if freq <= 1:
                continue
            cell_pattern = get_cell_pttrn(raw, agg_level)
            # 123 is the "pattern missing" marker used by this lookup.
            verdict = dominating.get(cell_pattern, 123)
            if verdict == 123:
                print("Pattern not found ..\n")
                sys.exit(1)
            elif not verdict:
                candidate = sus_disguised(column, raw, 1.0, freq, "SYN")
                if candidate not in sus_dis_values:
                    sus_dis_values.append(candidate)
    return sus_dis_values, output_ptrns
Exemple #6
0
def check_repeated_substrings(T, sus_dis_values):
    """Flag repeated cells that ``check_str_repetition`` scores as zero.

    For each column, every distinct value is lower-cased and passed to
    ``check_str_repetition``; values for which it returns ``0`` are counted.
    When such values exist but make up fewer than 10% of the column's
    distinct values, those among them occurring more than once are reported
    with tag ``"SYN"``.

    Args:
        T: table exposing ``.columns`` and ``T[col].value_counts()``;
            cell values are assumed to be strings (``.lower()`` is called on
            them) -- confirm with the caller.
        sus_dis_values: list of already collected suspects; extended in place.

    Returns:
        The same ``sus_dis_values`` list.
    """
    # Maximum share of distinct values that may be zero-scored for the
    # column to be considered.
    threshold = 0.1

    hist = {col: T[col].value_counts() for col in T.columns}
    for column, counts in hist.items():
        zero_scored_distinct = 0
        zero_scored_repeated = []

        for raw, freq in counts.items():
            if check_str_repetition(raw.lower()) != 0:
                continue
            zero_scored_distinct += 1
            if freq > 1:
                zero_scored_repeated.append([raw, freq])

        if not (0 < zero_scored_distinct < threshold * len(counts)):
            continue

        for raw, freq in zero_scored_repeated:
            candidate = sus_disguised(column, raw, 1.0, freq, "SYN")
            if candidate not in sus_dis_values:
                sus_dis_values.append(candidate)

    return sus_dis_values