def lcs2(string1, string2, debug=False):
    """Return the longest common substring of ``string1`` and ``string2``.

    Builds a generalized suffix tree over both strings (via the project's
    ``STree``) and walks it depth-first; an internal node whose subtree
    contains leaves from BOTH strings spells a common substring, and the
    longest such spelled path is returned.

    Parameters:
        string1, string2: the two input strings.
        debug: when True, print the built suffix tree.

    Returns:
        The longest common substring ('' when there is none).
    """
    # Concatenation mirrors the tree's layout so leaf offsets can be mapped
    # back to their source string.  NOTE(review): assumes
    # build_with_automatic_end uses the same one-separator-per-string layout
    # — confirm against STree.
    string = string1 + "$" + string2 + "$"
    tree = STree()
    tree.build_with_automatic_end([string1, string2])
    if debug:
        print(tree)

    def dfs(state: State):
        """Return (longest common substring below state, belongs-to-string2 flag)."""
        if state.right == float('inf'):
            # Leaf: no substring contributed; report which input string the
            # suffix starting at state.left came from.
            return '', state.left >= len(string) - len(string2) - 1
        res, has_string1, has_string2 = [], False, False
        for child in state.transition.values():
            res_string, is_string2 = dfs(child)
            if len(res_string) > 0:
                # A common substring was already found deeper down; prepend
                # this node's edge label (root has left == -1, no label).
                res.append((string[state.left:state.right + 1]
                            if state.left != -1 else "") + res_string)
            else:
                if is_string2:
                    has_string2 = True
                else:
                    has_string1 = True
        if has_string1 and has_string2:
            # This node is the deepest point shared by both strings: its own
            # path label is a common substring.
            res.append(string[state.left:state.right + 1])
        # BUG FIX: the original returned `is_string2`, i.e. the flag of
        # whichever child happened to be iterated LAST (and a NameError if the
        # loop never ran).  The aggregated `has_string2` is the correct flag:
        # when res is empty at most one of has_string1/has_string2 is set
        # (both set would have appended to res above), so it identifies the
        # single source string of this subtree.
        return (max(res, key=len) if res else ''), has_string2

    return dfs(tree.root)[0]
def lcs(strings, debug=False):
    """Return the longest substring common to ALL strings in ``strings``.

    Generalization of ``lcs2``: builds one generalized suffix tree over the
    whole list and looks for the deepest node whose subtree contains leaves
    from every input string.

    Parameters:
        strings: list of input strings.
        debug: when True, print the built suffix tree.

    Returns:
        The longest common substring ('' when there is none).
    """
    # Joined layout: one '$' separator after every string, matching the
    # offsets computed in str_lens below.
    string = '$'.join(strings) + '$'
    # str_lens[i] = start offset of strings[i] inside `string`
    # (each step adds len(previous string) + 1 for its separator).
    str_lens = list(accumulate(strings, lambda x, y: x + len(y) + 1, initial=0))
    tree = STree()
    tree.build_with_automatic_end(strings)
    if debug:
        print(tree)

    def dfs(state: State):
        # Returns (longest substring common to all strings below this node,
        # set of input-string indices whose leaves appear below this node).
        if state.right == float('inf'):
            # Leaf: map the suffix start offset back to the index of the
            # input string it belongs to.
            return '', {
                next(i for i in range(len(str_lens) - 1)
                     if state.left < str_lens[i + 1])
            }
        res, string_set = [], set()
        for s in state.transition.values():
            res_string, string_idxes = dfs(s)
            if len(res_string) > 0:
                # A full common substring was found deeper; prepend this
                # node's edge label (root has left == -1, no label).
                res.append((string[state.left:state.right + 1]
                            if state.left != -1 else "") + res_string)
            else:
                # NOTE(review): indices are only accumulated from children
                # that did NOT yield a full common substring — presumably
                # because such children already cover all strings; confirm.
                string_set.update(string_idxes)
        if len(string_set) == len(strings):
            # Leaves from every input string occur below: this node's own
            # path label is common to all of them.
            res.append(string[state.left:state.right + 1])
        return max(res, key=lambda x: len(x)) if res else '', string_set

    return dfs(tree.root)[0]
def iter_detect(logdir, cfg, df_gpu, time_interval, threshold, iteration_times):
    """Detect repeated iteration patterns in a GPU trace dataframe.

    Encodes the trace as a comma-separated symbol string, mines repeated
    substrings with a suffix tree, then scans the trace with fuzzy matching
    to locate exactly ``cfg.num_iterations`` non-overlapping occurrences.
    Appends (begin_ts, end_ts) pairs to the module-global ``iteration_table``.

    NOTE(review): df_gpu is assumed to have a 'timestamp' column with one row
    per symbol in the generated string — confirm against
    main_string_generate_v1.
    """
    # NOTE(review): `iteration_table` is listed twice in this global
    # declaration — harmless but should be cleaned up.
    global iteration_timelines, iteration_table, blank_count, \
        iteration_table
    # Debug dump of the incoming trace.
    df_gpu.to_csv('dfgpu.csv')
    t_df_begin = df_gpu.iloc[0]['timestamp']
    t_df_end = df_gpu.iloc[-1]['timestamp']
    candidate_patterns=[]
    # Encode each trace event as a symbol id; name_table maps id -> name.
    (main_string,name_table) = main_string_generate_v1(df_gpu)
    #main_string = "0,1,1,1,1,1,0,2,3,2,3,2,3"
    #main_string = "49,49,49,49,49,49,1,1,2,2,1,2,1,2"
    #print('main_string: '+main_string)
    st = STree(main_string)
    st.find_repeat_pattern(candidate_patterns, iteration_times)
    # Try longest candidates first.
    candidate_patterns.sort(key = lambda s: len(s), reverse=True)
    filtered_candidate_patterns = pattern_filter(candidate_patterns)
    print('There are %d candidate patterns for %d-iteration'%(len(candidate_patterns),cfg.num_iterations))
    pattern_pre = ""
    pat_seq = []
    for pattern in filtered_candidate_patterns:
        #print('A: ',pattern_pre)
        #print('B: ',pattern)
        # Skip candidates nearly identical to the previously tried one.
        pp_ratio = fuzz.ratio(pattern,pattern_pre)
        if pp_ratio > 80:
            #print("pattern too similar: ",pp_ratio)
            continue
        else:
            pattern_pre = pattern
        #print('original string length of main_string = %d' % len(main_string))
        wid_seq = main_string.split(',')
        pat_seq = pattern.split(',')
        total_length = num_wids = len(wid_seq)
        #print('runtime string length = %d' % num_wids )
        #print('pattern length (block_size) = %d' % len(pat_seq))
        block_size = len(pat_seq)
        matched_block_end_pre = 0
        block_begin = 0
        block_end = 0
        block_end = block_size
        step = 1
        iteration_count = 0
        b_overlap = False
        fw_threshold = 80
        ind = []
        # Slide a pattern-sized window over the symbol sequence; on a fuzzy
        # match jump past the matched block (non-overlapping matches),
        # otherwise advance by one symbol.
        while block_begin <= (total_length - block_size):
            blockString = ",".join(wid_seq[block_begin:block_end])
            fuzz_ratio = fuzz.ratio(blockString,pattern)
            if fuzz_ratio >= fw_threshold:
                ind.append(block_begin)
                block_begin = block_end
                block_end = block_begin + block_size
            else:
                block_begin = block_begin + step
                block_end = block_begin + block_size
        #print('Non-overlapped %d-times pattern: %s'%(len(ind),pattern))
        # Accept the pattern only if it matched exactly the expected number
        # of iterations; record the timestamp span of each match.
        if len(ind) == cfg.num_iterations:
            for i in ind:
                iteration_table.append((df_gpu.iloc[i]['timestamp'],df_gpu.iloc[i+block_size-1]['timestamp']))
            break
        else:
            #print_warning("No matched strings by fuzzywuzzy of threshold %d."%fw_threshold)
            continue
    print("Selected pattern:")
    print(pat_seq)
    print("End of AISI")
def iter_detect(logdir, cfg, df_gpu, time_interval, threshold, iteration_times):
    """Detect iteration patterns in a GPU trace (older single-pattern variant).

    Unlike the multi-candidate version, this picks ONE pattern via
    select_pattern() and records every window whose fuzzy ratio exceeds the
    threshold (overlapping matches possible, since the scan always advances
    by `step`).  Results go into the module-global ``iteration_table``.

    NOTE(review): duplicate name with another iter_detect in this file —
    looks like concatenated revisions; only one can be live per module.
    """
    # NOTE(review): `iteration_table` appears twice in this global list.
    global iteration_timelines, iteration_table, blank_count, \
        iteration_table
    # Debug dump of the incoming trace.
    df_gpu.to_csv('dfgpu.csv')
    t_df_begin = df_gpu.iloc[0]['timestamp']
    t_df_end = df_gpu.iloc[-1]['timestamp']
    candidate_pattern = []
    main_string = main_string_generate_v1(df_gpu)
    st = STree(main_string)
    st.find_repeat_pattern(candidate_pattern, iteration_times)
    #print("main_string:", main_string)
    pattern = select_pattern(candidate_pattern)
    if pattern:
        print('original string length of main_string = %d' % len(main_string))
        wid_seq = main_string.split(',')
        pat_seq = pattern.split(',')
        total_length = num_wids = len(wid_seq)
        print('context string length = %d' % num_wids)
        print('pattern length = %d' % len(pat_seq))
        #block_size = pattern.count(',') + 1
        print(pat_seq)
        block_size = len(pat_seq)
        block_begin = 0
        block_end = block_size
        step = 1
        #step = int(block_size/8)
        iteration_count = 0
        fuzzyRatioTable = []
        # Score every window position against the pattern.
        while block_begin <= (total_length - block_size):
            blockString = ",".join(wid_seq[block_begin:block_end])
            #print('A(%d): %s' % (len(blockString),blockString))
            #print('B(%d): %s' % (len(pattern), pattern))
            fuzz_ratio = fuzz.ratio(blockString, pattern)
            fuzzyRatioTable.append((block_begin, fuzz_ratio))
            #print(fuzz_ratio)
            block_begin = block_begin + step
            block_end = block_begin + block_size
        #find largest fuzzy ratio n blocks (n = iteration_times)
        ind = []
        begTable = []
        endTable = []
        beg = 0
        end = 0
        fw_threshold = 80
        # Keep every window whose score clears the threshold.
        for i in range(len(fuzzyRatioTable)):
            if fuzzyRatioTable[i][1] > fw_threshold:
                ind.append(fuzzyRatioTable[i][0])
        if len(ind) > 0:
            #plot event by event
            for i in ind:
                iteration_table.append(
                    (df_gpu.iloc[i]['timestamp'],
                     df_gpu.iloc[i + block_size - 1]['timestamp']))
            begTable.append(ind[0])
            beg = ind[0]
            end = ind[0] + block_size
            #plot window by window
            #for index in ind:
            #    iteration_table.append((float(index*step) * time_interval + t_df_begin, float(index*step + block_size) * time_interval + t_df_begin))
            #begTable.append(ind[0])
            #beg = ind[0]
            #end = ind[0] + block_size
        else:
            print_warning("No matched strings by fuzzywuzzy of threshold %d." % fw_threshold)
    else:
        print('No iteration patterns detected.')
def iterationDetection(logdir, cfg, df_gpu, time_interval, threshold, iteration_times):
    """Detect training-iteration boundaries in a GPU trace (tick-based variant).

    Buckets the trace into fixed time intervals, builds a per-tick feature
    vector from the top-k event counts, labels each tick with a pattern id,
    then mines the resulting symbol string for repeats and fuzzy-scans it.
    Detected (begin_time, end_time) pairs are appended to the module-global
    ``iteration_table``; memcpy HtoD timestamps go to ``iteration_table_memcpy``.
    """
    global iteration_begin, iteration_end, iteration_index, iteration_timelines, iteration_table, blank_count, \
        iteration_table_memcpy
    t_df_begin = df_gpu.iloc[0]['timestamp']
    t_df_end = df_gpu.iloc[-1]['timestamp']
    tick_begin = 0
    # Number of time-interval buckets covering the whole trace.
    tick_end = int(round((t_df_end - t_df_begin) / time_interval))
    event_names = get_top_k_events(df_gpu, 10)
    event_names.append('copy_kind_1')
    event_names.append('copy_kind_2')
    events = event_names[:]
    HtoD = get_memcpyHtoD(df_gpu)
    # print(HtoD)
    HtoDtable = df_gpu.loc[df_gpu['name'] == HtoD[0], 'timestamp'].tolist()
    iteration_table_memcpy.extend(HtoDtable)
    iteration_table_memcpy.append(t_df_end)
    # print("HTOD:",iteration_table_memcpy)
    #events.append('timestamp')
    # NOTE(review): DataFrame.append (used below) is removed in pandas 2.x —
    # this code requires an old pandas.
    patternTable = pd.DataFrame(columns=events, dtype=int)
    candidate_pattern = []
    iteration_pattern_count = 1
    tick = 0
    # Create pattern table by extracting top-k features; label each tick's
    # vector with a pattern index (new index when no existing row matches).
    while (tick < tick_end):
        tick_next = tick + 1
        tick_event = (df_gpu.loc[:, "timestamp"] - t_df_begin) / time_interval
        # Slice the trace to the events falling inside this tick.
        df_block = df_gpu[(tick_event < tick_next) & (tick_event >= tick)]
        # Count occurrences of each tracked event name in this tick.
        vector = []
        for e in event_names:
            count = eventCount('name', e, df_block)
            vector.append(count)
        if sum(vector) == 0:
            # Empty tick: emit '0,' once patterns exist; leading empty ticks
            # are only counted (blank_count) so times can be shifted later.
            if not patternTable.empty:
                iteration_timelines.append('0,')
            else:
                blank_count += 1
            tick += 1
            continue
        patternMatch = patternMatching(patternTable, vector, threshold)
        if patternMatch != -1:
            # NOTE(review): reuses the global `iteration_index` rather than
            # the matched row's index — confirm this is intended.
            iteration_timelines.append(str(iteration_index) + ",")
        else:
            iteration_timelines.append(str(iteration_pattern_count) + ",")
            iteration_pattern_count += 1
        #vector.append(tick)
        vectorSerie = pd.Series(vector, index=events)
        patternTable = patternTable.append(vectorSerie, ignore_index=True)
        tick += 1
    # print("totaltick:",tick)
    # print('timelinescount:',len(iteration_timelines))
    # Build a suffix tree over the tick-label string to find repeats.
    #print(iteration_timelines)
    mainString = "".join(iteration_timelines)
    #mainString='00000101001001001'
    #mainString = 'aabbcccaabbcccaabbccc'
    st = STree(mainString)
    #print(mainString)
    st.find_repeat_pattern(candidate_pattern, iteration_times)
    #print("iteration_timelines:", iteration_timelines)
    print("mainString:", mainString)
    #print("candidate_pattern:",candidate_pattern)
    if candidate_pattern:
        pattern = select_pattern(candidate_pattern)
        print('mainStringlen', len(mainString))
        mainString = mainString.split(',')
        # Trailing comma in the joined string yields one empty element.
        total_length = len(mainString) - 1
        print('total_length', total_length)
        # Pattern length in symbols (characters minus separators).
        block_size = len(pattern) - pattern.count(',')
        block_beg = 0
        block_end = block_size
        step = 1
        #step = int(block_size/8)
        iteration_count = 0
        fuzzyRatioTable = []
        while block_beg < (total_length - block_size):
            blockString = ',' + ",".join(mainString[block_beg:block_end]) + ','
            # Use fuzzywuzzy as approximate match accuracy. TODO: use reasonable threshold.
            fuzz_ratio = fuzz.token_sort_ratio(blockString, pattern)
            fuzzyRatioTable.append(fuzz_ratio)
            block_beg += step
            block_end += step
        # print('fuzzyTable:',fuzzyRatioTable)
        # print('fuzzycount:',len(fuzzyRatioTable))
        # Find largest fuzzy ratio n blocks (n = iteration_times).
        ind = []
        begTable = []
        endTable = []
        beg = 0
        end = 0
        #print(fuzzyRatioTable)
        for i in range(len(fuzzyRatioTable)):
            if fuzzyRatioTable[i] > 78:
                ind.append(i)
        # print('ind',ind)
        # Convert window indices back to wall-clock times, shifting by the
        # leading blank ticks counted above.
        for index in ind:
            iteration_table.append(
                (float(index * step + blank_count) * time_interval + t_df_begin,
                 float(index * step + block_size + blank_count) * time_interval + t_df_begin))
        # NOTE(review): raises IndexError when `ind` is empty (no window
        # cleared the threshold) — guard like the sibling iter_detect does.
        begTable.append(ind[0])
        beg = ind[0]
        end = ind[0] + block_size
        # for idx in range(len(ind)):
        #     if (ind[idx] - beg) > block_size * 2.2:
        #         begTable.append(ind[idx])
        #         endTable.append(end)
        #         beg = ind[idx]
        #     else:
        #         beg = ind[idx]
        #     end = ind[idx] + block_size
        # endTable.append(end)
        # print("len of beg/end",len(begTable),len(endTable))
        #
        # #ind = np.argpartition(fuzzyRatioTable, -iteration_times)[-iteration_times:]
        # #comma_factor = total_length / comma_count
        # print('blank:',blank_count)
        # for index in range(len(begTable)):
        #     iteration_table.append((float(begTable[index]*step + blank_count) * time_interval + t_df_begin, float(endTable[index]*step + blank_count) * time_interval + t_df_begin))
    else:
        print('No iteration patterns detected.')
# NOTE(review): the block below, up to `return dfs(tree.root)[0]`, is an
# orphaned duplicate of the tail of lcs()'s inner dfs() (see the full
# definition earlier in the file).  Its enclosing `def` is not present here,
# so this fragment is not valid standalone Python — it should be deleted or
# re-merged with its function; kept verbatim pending confirmation.
            next(i for i in range(len(str_lens) - 1) if state.left < str_lens[i + 1])
        }
        res, string_set = [], set()
        for s in state.transition.values():
            res_string, string_idxes = dfs(s)
            if len(res_string) > 0:
                res.append((string[state.left:state.right + 1] if state.left != -1 else "") + res_string)
            else:
                string_set.update(string_idxes)
        if len(string_set) == len(strings):
            res.append(string[state.left:state.right + 1])
        return max(res, key=lambda x: len(x)) if res else '', string_set
    return dfs(tree.root)[0]


# Demo / manual test driver: build a suffix tree over a sample string and
# print it, then print the longest common substring of two samples.
if __name__ == '__main__':
    tree = STree()
    tree.build_with_automatic_end(["abacdacdacdbc"])
    # tree.build_with_automatic_end(["cacaocac", "ccaooc"])
    # tree.build("1234332214$")
    # tree.build("asjknx")
    print(tree)
    # print(lcs2("12335665464566321", "12366546456653321"))
    print(lcs2("cacaocac", "ccaooc"))
    # print(lcs(["abcdfds", "bfdbcdfew", "bcdrgde"], debug=True))
# NOTE(review): this `return` is the tail of a function whose `def` line is
# not visible in this chunk (called below as random_string(n): returns a
# random lowercase string of length n).
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(n))


# Benchmark driver: measure suffix-tree build time and average find() time
# for random texts of increasing length.
if __name__ == '__main__':
    nlens = [100, 1000, 10000, 100000, 1000000]
    build_times = {}
    find_times_st = {}
    find_times_native = {}
    for n in nlens:
        print("n:", n)
        r = random_string(n)
        build_start = time.time()
        st = STree.STree(r)
        build_times[n] = time.time() - build_start
        print(build_times[n])
        found = 0
        find_times_st[n] = []
        # For each needle length x, time 100 random lookups and record the
        # mean per-lookup time.
        for x in range(1, 300):
            search_strs = [random_string(x) for _ in range(100)]
            find_time_sum = 0
            # NOTE(review): loop variable `str` shadows the builtin.
            for str in search_strs:
                find_start = time.time()
                if st.find(str) > -1:
                    found += 1
                find_time_sum += time.time() - find_start
            find_times_st[n].append(find_time_sum / 100.0)
            #print(find_time_sum / 1000.0)
def iter_detect(logdir, cfg, df, time_interval, threshold, iteration_times):
    """Detect repeated iteration patterns in a trace dataframe (v0 encoder).

    Like the v1 variant, but uses main_string_generate_v0, a stricter fuzzy
    threshold (90), builds the iteration table locally, and RETURNS results
    instead of only mutating globals.

    Returns:
        (pat_seq, iteration_table, name_table) where pat_seq is the selected
        pattern split into symbols, iteration_table is a list of
        (begin_ts, end_ts) pairs, and name_table maps symbol ids to names.
    """
    global iteration_timelines, blank_count
    iteration_table = []
    t_df_begin = df.iloc[0]['timestamp']
    t_df_end = df.iloc[-1]['timestamp']
    candidate_patterns = []
    # Encode each trace event as a symbol id; name_table maps id -> name.
    (main_string, name_table) = main_string_generate_v0(df)
    #print('AISI Symbol Table:')
    #print(name_table)
    #main_string = "0,1,1,1,1,1,0,2,3,2,3,2,3"
    #main_string = "49,49,49,49,49,49,1,1,2,2,1,2,1,2"
    #print('main_string: '+main_string)
    #quit()
    st = STree(main_string)
    st.find_repeat_pattern(candidate_patterns, iteration_times)
    # Try longest candidates first.
    candidate_patterns.sort(key=lambda s: len(s), reverse=True)
    filtered_candidate_patterns = pattern_filter(candidate_patterns)
    #print('There are %d candidate patterns for %d-iteration'%(len(candidate_patterns),cfg.num_iterations))
    pattern_pre = ""
    pat_seq = []
    for pattern in filtered_candidate_patterns:
        # NOTE: To prevent using similar patterns for scanning.
        pp_ratio = fuzz.ratio(pattern, pattern_pre)
        if pp_ratio > 80:
            continue
        else:
            pattern_pre = pattern
        wid_seq = main_string.split(',')
        pat_seq = pattern.split(',')
        total_length = num_wids = len(wid_seq)
        #print('runtime string length = %d' % num_wids )
        #print('pattern length (block_size) = %d' % len(pat_seq))
        block_size = len(pat_seq)
        matched_block_end_pre = 0
        block_begin = 0
        block_end = 0
        block_end = block_size
        step = 1
        iteration_count = 0
        b_overlap = False
        fuzzy_threshold = 90
        ind = []
        # Slide a pattern-sized window over the symbol sequence; on a fuzzy
        # match jump past the matched block (non-overlapping matches),
        # otherwise advance by one symbol.
        while block_begin <= (total_length - block_size):
            blockString = ",".join(wid_seq[block_begin:block_end])
            fuzz_ratio = fuzz.ratio(blockString, pattern)
            if fuzz_ratio >= fuzzy_threshold:
                ind.append(block_begin)
                block_begin = block_end
                block_end = block_begin + block_size
            else:
                block_begin = block_begin + step
                block_end = block_begin + block_size
        #print('Non-overlapped %d-times pattern: %s'%(len(ind),pattern))
        #print('============ iteration_table ============================')
        # Accept only a pattern matching exactly the expected iteration count.
        if len(ind) == cfg.num_iterations:
            for i in ind:
                iteration_table.append(
                    (df.iloc[i]['timestamp'], df.iloc[i + block_size - 1]['timestamp']))
            break
        else:
            #print_warning("No matched strings by fuzzywuzzy of threshold %d."%fw_threshold)
            continue
    #print(iteration_table)
    #print('=========================================================')
    return pat_seq, iteration_table, name_table
import string
import random


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random identifier of ``size`` characters drawn from ``chars``.

    Defaults to 6 characters over uppercase letters and digits.
    """
    picks = (random.choice(chars) for _ in range(size))
    return ''.join(picks)


import STree

# Demo driver: exercise the suffix-tree API on a few sample inputs.
if __name__ == '__main__':
    samples = ["abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa", "aaabbbeeecceeeddaaaaabeceda"]
    generalized = STree.STree(samples)
    # Longest common substring across the whole sample list.
    print(generalized.lcs())

    corpus = "name language w en url http w namelanguage en url http"
    corpus_tree = STree.STree(corpus)
    print(corpus_tree.find('law'))

    small = STree.STree("abcdefghab")
    print(small.find("abc"))  # 0
    print(small.find_all("ab"))  # [0, 8] ---> [] :-(