def positive_negative_inconsistency(T, sus_dis_values):
    """Flag a lone repeated value whose sign disagrees with the rest of a column.

    For each column of ``T`` the distinct values are parsed with ``isNumber``
    (which yields a ``(is_number, value)`` pair). If the column holds exactly
    one distinct non-negative number and more than two distinct negative
    numbers (or the mirror case), that single odd-signed value is recorded,
    provided it occurs more than once.

    Args:
        T: table supporting ``T.columns`` and ``T[col].value_counts()``
            (presumably a pandas DataFrame).
        sus_dis_values: list of already-suspected values; extended in place.

    Returns:
        The same ``sus_dis_values`` list.
    """
    min_opposite_distinct = 2
    histograms = {name: T[name].value_counts() for name in T.columns}

    def _record(column, value, freq):
        # Only add a finding that is not already present in the list.
        finding = sus_disguised(column, value, 1.0, freq, "SYN")
        if finding not in sus_dis_values:
            sus_dis_values.append(finding)

    for column, counts in histograms.items():
        distinct = {"pos": 0, "neg": 0}
        repeated = {"pos": [], "neg": []}

        for raw, freq in counts.items():
            is_num, number = isNumber(raw)
            if not is_num:
                continue
            if number >= 0:
                sign = "pos"
            elif number < 0:
                sign = "neg"
            else:
                # Neither comparison holds (e.g. NaN): counted on no side.
                continue
            distinct[sign] += 1
            if freq > 1:
                repeated[sign].append((raw, freq))

        # The two cases are mutually exclusive, so their order is irrelevant.
        if distinct["pos"] == 1 and distinct["neg"] > min_opposite_distinct:
            for raw, freq in repeated["pos"]:
                _record(column, raw, freq)
        if distinct["neg"] == 1 and distinct["pos"] > min_opposite_distinct:
            for raw, freq in repeated["neg"]:
                _record(column, raw, freq)

    return sus_dis_values
def detect_single_char_strings(T, sus_dis_values):
    """Flag repeated stray strings in columns that are otherwise numeric.

    Two checks run per column of ``T``:

    * any distinct value that is a single non-alphanumeric character and
      occurs more than once is recorded immediately;
    * if the column has one or two distinct non-number, non-null strings and
      more than two distinct numbers, every such string that occurs more
      than once is recorded.

    ``isNumber`` is used as a ``(is_number, value)`` classifier and ``isNull``
    as a boolean predicate; both are defined elsewhere.

    Args:
        T: table supporting ``T.columns`` and ``T[col].value_counts()``
            (presumably a pandas DataFrame with string cells).
        sus_dis_values: list of already-suspected values; extended in place.

    Returns:
        The same ``sus_dis_values`` list.
    """
    histograms = {name: T[name].value_counts() for name in T.columns}

    def _record(column, value, freq):
        # Only add a finding that is not already present in the list.
        finding = sus_disguised(column, value, 1.0, freq, "SYN")
        if finding not in sus_dis_values:
            sus_dis_values.append(finding)

    for column, counts in histograms.items():
        string_kinds = 0
        number_kinds = 0
        repeated_strings = []

        for raw, freq in counts.items():
            is_num, _parsed = isNumber(raw)
            if is_num:
                number_kinds += 1
            elif not isNull(raw):
                string_kinds += 1
                if freq > 1:
                    repeated_strings.append((raw, freq))

            # A single non-alphanumeric character that appears more than once.
            if len(raw) == 1 and not raw.isalnum() and freq > 1:
                _record(column, raw, freq)

        if 0 < string_kinds <= 2 and number_kinds > 2:
            for raw, freq in repeated_strings:
                _record(column, raw, freq)

    return sus_dis_values
def detect_outliers(T, sus_dis_values):
    """Flag repeated numeric values that the density-based test scores as outliers.

    For each column of ``T`` the distinct values are parsed with ``isNumber``
    and their occurrence counts gathered per numeric value. A column is
    skipped when it has three or more distinct non-numeric values or fewer
    than ten distinct numeric spellings. For the remaining columns the
    helpers ``compute_statistical_quantities``, ``compute_bandwidth``,
    ``compute_max_pdf`` and ``evaluate_pnt`` (defined elsewhere; presumably a
    kernel density estimate) provide a peak density ``f_max`` and a
    per-value density. Each value occurring more than once is scored as
    ``max(f_max - density, 0) / f_max`` and recorded with tool tag ``"OD"``
    when the score is effectively 1.

    Args:
        T: table supporting ``T.columns`` and ``T[col].value_counts()``
            (presumably a pandas DataFrame).
        sus_dis_values: list of already-suspected values; extended in place.
            A finding that is already present is passed to
            ``common.add_detected_by_more_than_one_tool`` instead.

    Returns:
        The same ``sus_dis_values`` list.
    """
    hist = {col: T[col].value_counts() for col in T.columns}

    for column, col_hist in hist.items():
        numeric_data = {}
        numeric_spellings = 0
        for raw, freq in col_hist.items():
            is_num, val = isNumber(raw)
            if is_num:
                numeric_spellings += 1
                # Different spellings can parse to the same number ("5" and
                # "5.0"); add their counts instead of overwriting them.
                numeric_data[val] = numeric_data.get(val, 0) + freq

        if len(col_hist) - numeric_spellings >= 3 or numeric_spellings < 10:
            continue

        _mean, num_tuples, std = compute_statistical_quantities(numeric_data)
        ordered = sorted(numeric_data)

        # Smallest gap between neighbouring values, capped at std.
        min_dist = std
        for left, right in zip(ordered, ordered[1:]):
            min_dist = min(min_dist, abs(left - right))

        h0 = compute_bandwidth(std, num_tuples)
        f_max = compute_max_pdf(
            numeric_data, ordered[0], ordered[-1], h0, num_tuples)

        if min_dist > h0:
            continue
        if f_max == 0:
            # The score divides by f_max; a zero peak density gives no
            # meaningful score, so the column is skipped rather than crashing.
            continue

        for number, freq in numeric_data.items():
            if freq <= 1:
                continue

            # Highest density over four shrinking bandwidths (100%..40% of h0).
            epdf = 0
            for step in range(4):
                h = h0 - (0.2 * step * h0)
                f_i = evaluate_pnt(numeric_data, number, h, num_tuples)
                if epdf < f_i:
                    epdf = f_i

            score = max(f_max - epdf, 0) / f_max
            if not score > (1.0 - 1e-16):
                continue

            # Report integral values without a trailing ".0".
            label = str(int(number)) if int(number) == number else str(number)
            sus_dis = sus_disguised(column, label, score, freq, "OD")
            if sus_dis not in sus_dis_values:
                sus_dis_values.append(sus_dis)
            else:
                common.add_detected_by_more_than_one_tool(
                    sus_dis_values, sus_dis)

    return sus_dis_values
def find_disguised_values(T, sus_dis_values):
    """Pick, per column, the repeated value with the highest correlation score.

    Only columns with at least three distinct values are considered, and at
    least two such columns are required. For each candidate column the
    distinct values are visited from most to least frequent (stopping at the
    first value that occurs once, skipping the literal ``'null'``). Each
    lower-cased value is scored as
    ``len(rows) / PT_num_rows * corr`` using ``subtable_correlation`` and the
    index built by ``Table_Index_RandDMVD`` (both defined elsewhere). The
    best-scoring value is recorded with tool tag ``"Rand"`` when it covers
    more than 1% of the rows, the column's distinct-value count exceeds 1%
    of the rows, and the value occurs more than five times.

    Args:
        T: table supporting ``T.columns``, ``T[col].value_counts()`` and
            ``T[list_of_cols]`` (presumably a pandas DataFrame with string
            cells, since values are lower-cased).
        sus_dis_values: list of already-suspected values; extended in place.
            A finding that is already present is passed to
            ``common.add_detected_by_more_than_one_tool`` instead.

    Returns:
        The same ``sus_dis_values`` list.
    """
    hist = {col: T[col].value_counts() for col in T.columns}
    candidate_cols = [col for col, counts in hist.items() if len(counts) >= 3]

    # Correlation across columns needs at least two candidate columns; this
    # also covers the case where no column qualifies at all.
    if len(candidate_cols) <= 1:
        return sus_dis_values

    # Row-major cell lists for the candidate columns, index column dropped.
    rows = [
        row[1:]
        for row in T[candidate_cols].reset_index().values.tolist()
    ]
    n_rows = len(rows)
    rand_index = Table_Index_RandDMVD(rows)

    for col_pos, column in enumerate(candidate_cols):
        col_hist = hist[column]
        best = None
        best_score = 0

        by_frequency = sorted(
            col_hist.items(), key=lambda kv: kv[1], reverse=True)
        for value, freq in by_frequency:
            if freq == 1:
                # Sorted by frequency: nothing repeated remains after this.
                break
            value = value.lower()
            if value == 'null':
                continue
            corr, PT_num_rows = subtable_correlation(
                rows, value, col_pos, rand_index)
            score = n_rows / PT_num_rows * corr
            if score > best_score:
                best = sus_disguised(column, value, score, freq, "Rand")
                best_score = score

        if best is None:
            continue

        value_ratio = best.frequency / n_rows
        distinct_ratio = len(col_hist) / n_rows
        if value_ratio > 0.01 and distinct_ratio > 0.01 and best.frequency > 5:
            if best not in sus_dis_values:
                sus_dis_values.append(best)
            else:
                common.add_detected_by_more_than_one_tool(
                    sus_dis_values, best)

    return sus_dis_values
def find_all_patterns(T,sus_dis_values):
    """Flag repeated values whose pattern is not a dominating one in its column.

    For each column the distinct values are turned into patterns with
    ``L1_patterns``; while more than five distinct patterns remain, they are
    aggregated one level further with ``L_patterns`` (levels 2 to 5).
    ``determine_dominating_patterns`` then yields a mapping from pattern to a
    truth value. Every value occurring more than once whose pattern (from
    ``get_cell_pttrn`` at the final level) maps to a falsy entry is recorded
    with tool tag ``"SYN"``. All pattern helpers are defined elsewhere.

    Args:
        T: table supporting ``T.columns`` and ``T[col].value_counts()``
            (presumably a pandas DataFrame).
        sus_dis_values: list of already-suspected values; extended in place.

    Returns:
        A tuple ``(sus_dis_values, output_ptrns)`` where ``output_ptrns`` maps
        each column to its final pattern histogram.

    Raises:
        SystemExit: with status 1 if a value's pattern is missing from the
            dominating-pattern mapping.
    """
    hist = {col: T[col].value_counts() for col in T.columns}
    min_num_ptrns = 5
    max_agg_level = 5
    # Private sentinel: unlike a magic number it can never equal a real entry.
    missing = object()
    output_ptrns = {}

    for column, col_hist in hist.items():
        pttrns, pttrns_hist = L1_patterns(col_hist)

        # Aggregate further only while too many distinct patterns remain.
        agg_level = 1
        for level in range(2, max_agg_level + 1):
            if len(pttrns_hist) <= min_num_ptrns:
                break
            agg_level = level
            pttrns, pttrns_hist = L_patterns(pttrns, pttrns_hist, level)

        dominating_pttrns = determine_dominating_patterns(pttrns_hist)
        output_ptrns[column] = pttrns_hist

        for value, freq in col_hist.items():
            # Only values that occur more than once are candidates.
            if freq <= 1:
                continue
            test_ptrn = get_cell_pttrn(value, agg_level)
            is_dominating = dominating_pttrns.get(test_ptrn, missing)
            if is_dominating is missing:
                print("Pattern not found ..\n")
                sys.exit(1)
            if not is_dominating:
                sus_dis = sus_disguised(column, value, 1.0, freq, "SYN")
                if sus_dis not in sus_dis_values:
                    sus_dis_values.append(sus_dis)

    return sus_dis_values, output_ptrns
def check_repeated_substrings(T, sus_dis_values):
    """Flag repeated values for which ``check_str_repetition`` returns 0.

    For each column of ``T`` every distinct value is lower-cased and passed
    to ``check_str_repetition`` (defined elsewhere; presumably a result of 0
    means the string is built from a repeated substring). When at least one
    distinct value returns 0 but fewer than 10% of the column's distinct
    values do, each of them that occurs more than once is recorded with tool
    tag ``"SYN"``.

    Args:
        T: table supporting ``T.columns`` and ``T[col].value_counts()``
            (presumably a pandas DataFrame with string cells).
        sus_dis_values: list of already-suspected values; extended in place.

    Returns:
        The same ``sus_dis_values`` list.
    """
    rare_fraction = 0.1
    histograms = {name: T[name].value_counts() for name in T.columns}

    for column, counts in histograms.items():
        zero_scored = [
            (raw, freq)
            for raw, freq in counts.items()
            if check_str_repetition(raw.lower()) == 0
        ]

        # Act only when such values exist but are rare within the column.
        if not 0 < len(zero_scored) < rare_fraction * len(counts):
            continue

        for raw, freq in zero_scored:
            if freq <= 1:
                continue
            finding = sus_disguised(column, raw, 1.0, freq, "SYN")
            if finding not in sus_dis_values:
                sus_dis_values.append(finding)

    return sus_dis_values