Ejemplo n.º 1
0
def lcs2(string1, string2, debug=False):
    """Search a two-string suffix tree for the longest shared substring.

    A suffix tree is built from ``[string1, string2]`` and walked
    depth-first.  Edge labels are sliced out of the reference string
    ``string1 + "$" + string2 + "$"`` using each node's ``left``/``right``
    indices (assumes the tree indexes into that same layout -- TODO confirm
    against ``STree.build_with_automatic_end``).

    Args:
        string1: First input string.
        string2: Second input string.
        debug: When true, print the tree before searching it.

    Returns:
        The longest candidate found by the walk, or ``''`` if there is none.
    """
    joined = string1 + "$" + string2 + "$"
    suffix_tree = STree()
    suffix_tree.build_with_automatic_end([string1, string2])
    if debug:
        print(suffix_tree)

    # A terminal node whose ``left`` is at or beyond this index is counted
    # as belonging to ``string2``.
    second_start = len(joined) - len(string2) - 1

    def walk(node: State):
        # Nodes with an open-ended right bound terminate the recursion.
        if node.right == float('inf'):
            return '', node.left >= second_start

        candidates = []
        saw_first = saw_second = False
        for child in node.transition.values():
            found, in_second = walk(child)
            if found:
                # Prefix the child's result with this node's own label
                # (a node with left == -1 contributes no label).
                label = ("" if node.left == -1
                         else joined[node.left:node.right + 1])
                candidates.append(label + found)
            elif in_second:
                saw_second = True
            else:
                saw_first = True

        # Children from both inputs hang directly below this node, so its
        # own label is a candidate too.
        if saw_first and saw_second:
            candidates.append(joined[node.left:node.right + 1])

        best = max(candidates, key=len) if candidates else ''
        # The flag handed back is the one reported by the last child visited.
        return best, in_second

    return walk(suffix_tree.root)[0]
Ejemplo n.º 2
0
def lcs(strings, debug=False):
    """Search a multi-string suffix tree for the longest shared substring.

    A suffix tree is built from ``strings`` and walked depth-first.  Edge
    labels are sliced out of the reference string made by joining the inputs
    with ``'$'`` and appending a final ``'$'`` (assumes the tree's indices
    refer to that layout -- TODO confirm against
    ``STree.build_with_automatic_end``).

    Args:
        strings: Sequence of input strings.
        debug: When true, print the tree before searching it.

    Returns:
        The longest candidate found by the walk, or ``''`` if there is none.
    """
    joined = '$'.join(strings) + '$'
    # offsets[i] is where strings[i] starts inside ``joined``; the final
    # entry equals len(joined).
    offsets = list(accumulate(strings,
                              lambda total, item: total + len(item) + 1,
                              initial=0))
    upper_bounds = offsets[1:]
    suffix_tree = STree()
    suffix_tree.build_with_automatic_end(strings)
    if debug:
        print(suffix_tree)

    def walk(node: State):
        # Nodes with an open-ended right bound terminate the recursion and
        # report which input string their ``left`` index falls into.
        if node.right == float('inf'):
            owner = next(idx for idx, bound in enumerate(upper_bounds)
                         if node.left < bound)
            return '', {owner}

        candidates = []
        owners = set()
        for child in node.transition.values():
            found, child_owners = walk(child)
            if found:
                # Prefix the child's result with this node's own label
                # (a node with left == -1 contributes no label).
                label = ("" if node.left == -1
                         else joined[node.left:node.right + 1])
                candidates.append(label + found)
            else:
                owners.update(child_owners)

        # Every input string is represented below this node, so its own
        # label is a candidate too.
        if len(owners) == len(strings):
            candidates.append(joined[node.left:node.right + 1])

        best = max(candidates, key=len) if candidates else ''
        return best, owners

    return walk(suffix_tree.root)[0]
Ejemplo n.º 3
0
def iter_detect(logdir, cfg, df_gpu, time_interval, threshold, iteration_times):
    """Find a repeated pattern in a GPU trace and record its time spans.

    The trace is encoded as a comma-separated string by
    ``main_string_generate_v1``; repeated substrings are collected with a
    suffix tree, filtered, and tried from longest to shortest.  Each
    candidate is slid over the encoded trace and non-overlapping windows
    whose fuzzy ratio reaches the threshold are counted.  The first
    candidate that occurs exactly ``cfg.num_iterations`` times is selected
    and one ``(begin_timestamp, end_timestamp)`` pair per occurrence is
    appended to the module-level ``iteration_table``.

    Args:
        logdir: Unused by this function.
        cfg: Configuration object; ``cfg.num_iterations`` is read.
        df_gpu: Trace table; ``to_csv`` and ``iloc[i]['timestamp']`` are used.
        time_interval: Unused by this function.
        threshold: Unused by this function.
        iteration_times: Passed through to ``STree.find_repeat_pattern``.

    Side effects:
        Writes ``dfgpu.csv`` to the current directory, appends to the global
        ``iteration_table`` and prints progress information.
    """
    global iteration_timelines, iteration_table, blank_count
    # Dump of the input trace, written to the current working directory.
    df_gpu.to_csv('dfgpu.csv')

    candidate_patterns = []
    main_string, _name_table = main_string_generate_v1(df_gpu)
    st = STree(main_string)
    st.find_repeat_pattern(candidate_patterns, iteration_times)
    candidate_patterns.sort(key=len, reverse=True)
    filtered_candidate_patterns = pattern_filter(candidate_patterns)
    print('There are %d candidate patterns for %d-iteration'%(len(candidate_patterns),cfg.num_iterations))

    # The encoded trace does not depend on the candidate, so split it once.
    wid_seq = main_string.split(',')
    total_length = len(wid_seq)
    similarity_limit = 80  # skip candidates this similar to the previous one
    fw_threshold = 80      # minimum fuzzy ratio for a window to count as a hit

    pattern_pre = ""
    selected_seq = []  # stays empty unless a candidate is actually accepted
    for pattern in filtered_candidate_patterns:
        if fuzz.ratio(pattern, pattern_pre) > similarity_limit:
            continue
        pattern_pre = pattern

        pat_seq = pattern.split(',')
        block_size = len(pat_seq)

        # Slide a window of block_size tokens; after a hit jump past the
        # matched block so occurrences never overlap.
        ind = []
        block_begin = 0
        while block_begin <= total_length - block_size:
            block_end = block_begin + block_size
            block_string = ",".join(wid_seq[block_begin:block_end])
            if fuzz.ratio(block_string, pattern) >= fw_threshold:
                ind.append(block_begin)
                block_begin = block_end
            else:
                block_begin += 1

        if len(ind) == cfg.num_iterations:
            for i in ind:
                iteration_table.append((df_gpu.iloc[i]['timestamp'],df_gpu.iloc[i+block_size-1]['timestamp']))
            selected_seq = pat_seq
            break

    print("Selected pattern:")
    print(selected_seq)
    print("End of AISI")
Ejemplo n.º 4
0
def iter_detect(logdir, cfg, df_gpu, time_interval, threshold,
                iteration_times):
    """Locate occurrences of the selected repeat pattern in a GPU trace.

    The trace is encoded as a comma-separated string, repeated substrings
    are collected with a suffix tree and one of them is chosen by
    ``select_pattern``.  Every window of the pattern's token length is then
    scored with ``fuzz.ratio``; for each window scoring above 80 a
    ``(begin_timestamp, end_timestamp)`` pair is appended to the
    module-level ``iteration_table``.

    Args:
        logdir: Unused by this function.
        cfg: Unused by this function.
        df_gpu: Trace table; ``to_csv`` and ``iloc[i]['timestamp']`` are used.
        time_interval: Unused by this function.
        threshold: Unused by this function.
        iteration_times: Passed through to ``STree.find_repeat_pattern``.

    Side effects:
        Writes ``dfgpu.csv`` to the current directory, appends to the global
        ``iteration_table`` and prints diagnostics.
    """
    global iteration_timelines, iteration_table, blank_count
    df_gpu.to_csv('dfgpu.csv')
    # Neither timestamp is used below; the lookups are retained so that any
    # error they raise still surfaces at this point.
    t_first, t_last = df_gpu.iloc[0]['timestamp'], df_gpu.iloc[-1]['timestamp']

    repeats = []
    main_string = main_string_generate_v1(df_gpu)
    suffix_tree = STree(main_string)
    suffix_tree.find_repeat_pattern(repeats, iteration_times)

    pattern = select_pattern(repeats)
    if not pattern:
        print('No iteration patterns detected.')
        return

    print('original string length of main_string = %d' % len(main_string))
    wid_seq = main_string.split(',')
    pat_seq = pattern.split(',')
    print('context string length = %d' % len(wid_seq))
    print('pattern length = %d' % len(pat_seq))
    print(pat_seq)

    block_size = len(pat_seq)
    fw_threshold = 80

    # Score every window of block_size tokens, advancing one token at a time.
    scores = [
        (start,
         fuzz.ratio(",".join(wid_seq[start:start + block_size]), pattern))
        for start in range(len(wid_seq) - block_size + 1)
    ]
    matches = [start for start, score in scores if score > fw_threshold]

    if not matches:
        print_warning("No matched strings by fuzzywuzzy of threshold %d." %
                      fw_threshold)
        return

    # One (first event, last event) timestamp pair per matching window.
    for start in matches:
        span = (df_gpu.iloc[start]['timestamp'],
                df_gpu.iloc[start + block_size - 1]['timestamp'])
        iteration_table.append(span)
Ejemplo n.º 5
0
def iterationDetection(logdir, cfg, df_gpu, time_interval, threshold,
                       iteration_times):
    """Detect repeated iterations in a GPU trace via time-window labelling.

    The trace is cut into windows of ``time_interval``.  For each window a
    vector of event counts is built (the names from ``get_top_k_events``
    plus ``copy_kind_1`` and ``copy_kind_2``) and turned into a label that
    is appended to the global ``iteration_timelines``.  The concatenated
    labels are searched for repeats with a suffix tree; the pattern chosen
    by ``select_pattern`` is then fuzzy-matched against every window of
    labels, and each window scoring above 78 contributes a
    ``(begin_time, end_time)`` pair to the global ``iteration_table``.

    Args:
        logdir: Unused by this function.
        cfg: Unused by this function.
        df_gpu: Trace table; its ``timestamp`` and ``name`` columns are read.
        time_interval: Width of one window, in the unit of ``timestamp``.
        threshold: Passed through to ``patternMatching``.
        iteration_times: Passed through to ``STree.find_repeat_pattern``.

    Side effects:
        Extends ``iteration_table_memcpy``, ``iteration_timelines`` and
        ``iteration_table``, updates ``blank_count`` and prints diagnostics.
    """
    global iteration_begin, iteration_end, iteration_index, iteration_timelines, iteration_table, blank_count, \
            iteration_table_memcpy
    t_df_begin = df_gpu.iloc[0]['timestamp']
    t_df_end = df_gpu.iloc[-1]['timestamp']
    tick_end = int(round((t_df_end - t_df_begin) / time_interval))

    event_names = get_top_k_events(df_gpu, 10)
    event_names.append('copy_kind_1')
    event_names.append('copy_kind_2')
    events = event_names[:]

    # Timestamps of the first name returned by get_memcpyHtoD, followed by
    # the end of the trace.
    HtoD = get_memcpyHtoD(df_gpu)
    HtoDtable = df_gpu.loc[df_gpu['name'] == HtoD[0], 'timestamp'].tolist()
    iteration_table_memcpy.extend(HtoDtable)
    iteration_table_memcpy.append(t_df_end)

    patternTable = pd.DataFrame(columns=events, dtype=int)
    candidate_pattern = []
    iteration_pattern_count = 1

    # Fractional window index of every event; it does not change between
    # windows, so compute it once instead of once per tick.
    tick_event = (df_gpu.loc[:, "timestamp"] - t_df_begin) / time_interval

    # Build the pattern table: one count vector per window, each labelled
    # with an index.
    for tick in range(tick_end):
        df_block = df_gpu[(tick_event < tick + 1) & (tick_event >= tick)]
        vector = [eventCount('name', e, df_block) for e in event_names]

        if sum(vector) == 0:
            # Empty window: label it '0' once any pattern exists, otherwise
            # count it as leading blank time.
            if not patternTable.empty:
                iteration_timelines.append('0,')
            else:
                blank_count += 1
            continue

        patternMatch = patternMatching(patternTable, vector, threshold)
        if patternMatch != -1:
            # NOTE(review): iteration_index is only read here; presumably
            # patternMatching updates it -- confirm.
            iteration_timelines.append(str(iteration_index) + ",")
        else:
            iteration_timelines.append(str(iteration_pattern_count) + ",")
            iteration_pattern_count += 1
            vectorSerie = pd.Series(vector, index=events)
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # documented replacement and works on older versions as well.
            patternTable = pd.concat(
                [patternTable, vectorSerie.to_frame().T], ignore_index=True)

    # Build a suffix tree over the label string to find repeated patterns.
    mainString = "".join(iteration_timelines)
    st = STree(mainString)
    st.find_repeat_pattern(candidate_pattern, iteration_times)
    print("mainString:", mainString)

    if not candidate_pattern:
        print('No iteration patterns detected.')
        return

    pattern = select_pattern(candidate_pattern)
    print('mainStringlen', len(mainString))
    labels = mainString.split(',')
    # Labels appended above end with a comma, so the split leaves one
    # trailing empty string that is not counted.
    total_length = len(labels) - 1
    print('total_length', total_length)
    # NOTE(review): this counts non-comma characters, which equals the
    # number of labels only if every label is a single character -- confirm.
    block_size = len(pattern) - pattern.count(',')
    step = 1

    # Use fuzzywuzzy as approximate match accuracy.
    # TODO: use reasonable threshold.
    fuzzyRatioTable = []
    for block_beg in range(0, total_length - block_size, step):
        blockString = (',' +
                       ",".join(labels[block_beg:block_beg + block_size]) +
                       ',')
        fuzzyRatioTable.append(fuzz.token_sort_ratio(blockString, pattern))

    ind = [i for i, ratio in enumerate(fuzzyRatioTable) if ratio > 78]
    if not ind:
        # Previously this case raised IndexError on ind[0].
        print('No blocks matched the selected pattern.')
        return

    for index in ind:
        iteration_table.append(
            (float(index * step + blank_count) * time_interval +
             t_df_begin,
             float(index * step + block_size + blank_count) * time_interval
             + t_df_begin))
Ejemplo n.º 6
0
                next(i for i in range(len(str_lens) - 1)
                     if state.left < str_lens[i + 1])
            }
        res, string_set = [], set()
        for s in state.transition.values():
            res_string, string_idxes = dfs(s)
            if len(res_string) > 0:
                res.append((string[state.left:state.right +
                                   1] if state.left != -1 else "") +
                           res_string)
            else:
                string_set.update(string_idxes)
        if len(string_set) == len(strings):
            res.append(string[state.left:state.right + 1])
        return max(res, key=lambda x: len(x)) if res else '', string_set

    return dfs(tree.root)[0]


if __name__ == '__main__':
    # Demo 1: build a suffix tree over a single string and print it.
    # Other inputs that were tried at this point:
    #   build_with_automatic_end(["cacaocac", "ccaooc"])
    #   build("1234332214$")
    #   build("asjknx")
    demo_tree = STree()
    demo_tree.build_with_automatic_end(["abacdacdacdbc"])
    print(demo_tree)

    # Demo 2: longest common substring of two strings.
    # Other calls that were tried at this point:
    #   lcs2("12335665464566321", "12366546456653321")
    #   lcs(["abcdfds", "bfdbcdfew", "bcdrgde"], debug=True)
    shared = lcs2("cacaocac", "ccaooc")
    print(shared)
Ejemplo n.º 7
0
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(n))


if __name__ == '__main__':
    # Benchmark: suffix-tree build time and average lookup time for random
    # strings of increasing length.
    nlens = [100, 1000, 10000, 100000, 1000000]
    searches_per_length = 100  # random needles generated per needle length

    build_times = {}
    find_times_st = {}
    # Not filled in the part of the script visible here; kept because later
    # code may rely on it.
    find_times_native = {}

    for n in nlens:
        print("n:", n)
        r = random_string(n)

        build_start = time.time()
        st = STree.STree(r)
        build_times[n] = time.time() - build_start
        print(build_times[n])

        found = 0
        find_times_st[n] = []
        for x in range(1, 300):
            search_strs = [random_string(x)
                           for _ in range(searches_per_length)]
            find_time_sum = 0
            # Loop variable renamed from ``str`` so the builtin is not
            # shadowed for the rest of the script.
            for needle in search_strs:
                find_start = time.time()
                if st.find(needle) > -1:
                    found += 1
                find_time_sum += time.time() - find_start
            # Average seconds per lookup for needles of length x.
            find_times_st[n].append(find_time_sum / float(len(search_strs)))
Ejemplo n.º 8
0
def iter_detect(logdir, cfg, df, time_interval, threshold, iteration_times):
    """Find a repeated pattern in a trace and return its time spans.

    The trace is encoded as a comma-separated string by
    ``main_string_generate_v0``; repeated substrings are collected with a
    suffix tree, sorted longest first and filtered.  Each candidate is slid
    over the encoded trace and non-overlapping windows with a fuzzy ratio of
    at least 90 are counted.  Scanning stops at the first candidate that
    occurs exactly ``cfg.num_iterations`` times.

    Args:
        logdir: Unused by this function.
        cfg: Configuration object; ``cfg.num_iterations`` is read.
        df: Trace table; ``iloc[i]['timestamp']`` is used.
        time_interval: Unused by this function.
        threshold: Unused by this function.
        iteration_times: Passed through to ``STree.find_repeat_pattern``.

    Returns:
        A tuple ``(pat_seq, iteration_table, name_table)``.  ``pat_seq`` is
        the token list of the last candidate that was scanned, whether or
        not it was accepted (``[]`` if none was scanned).
        ``iteration_table`` holds one ``(begin_timestamp, end_timestamp)``
        pair per occurrence of the accepted candidate and is empty when no
        candidate was accepted.  ``name_table`` is returned as produced by
        ``main_string_generate_v0``.
    """
    global iteration_timelines, blank_count
    iteration_table = []
    # Neither timestamp is used below; the lookups are retained so that any
    # error they raise still surfaces at this point.
    t_first, t_last = df.iloc[0]['timestamp'], df.iloc[-1]['timestamp']

    candidate_patterns = []
    main_string, name_table = main_string_generate_v0(df)
    suffix_tree = STree(main_string)
    suffix_tree.find_repeat_pattern(candidate_patterns, iteration_times)
    candidate_patterns.sort(key=len, reverse=True)

    fuzzy_threshold = 90
    previous = ""
    pat_seq = []
    for pattern in pattern_filter(candidate_patterns):
        # NOTE: To prevent using similar patterns for scanning
        if fuzz.ratio(pattern, previous) > 80:
            continue
        previous = pattern

        wid_seq = main_string.split(',')
        pat_seq = pattern.split(',')
        block_size = len(pat_seq)
        last_start = len(wid_seq) - block_size

        # Slide a window of block_size tokens; after a hit jump past the
        # matched block so occurrences never overlap.
        starts = []
        cursor = 0
        while cursor <= last_start:
            window = ",".join(wid_seq[cursor:cursor + block_size])
            if fuzz.ratio(window, pattern) >= fuzzy_threshold:
                starts.append(cursor)
                cursor += block_size
            else:
                cursor += 1

        if len(starts) != cfg.num_iterations:
            continue

        iteration_table.extend(
            (df.iloc[s]['timestamp'],
             df.iloc[s + block_size - 1]['timestamp'])
            for s in starts)
        break

    return pat_seq, iteration_table, name_table
Ejemplo n.º 9
0
import string
import random
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random string of ``size`` characters drawn from ``chars``.

    Each character is picked independently with ``random.choice``; the
    default alphabet is the ASCII uppercase letters plus the digits.
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)

import STree

if __name__ == '__main__':
    # Longest common substring across several words.
    words = ["abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa", "aaabbbeeecceeeddaaaaabeceda"]
    word_tree = STree.STree(words)
    print(word_tree.lcs())

    # Substring lookup in a single text.
    sentence = "name language w en url http w namelanguage en url http"
    sentence_tree = STree.STree(sentence)
    print(sentence_tree.find('law'))

    # find / find_all on a short string.
    short_tree = STree.STree("abcdefghab")
    first_hit = short_tree.find("abc")
    print(first_hit)  # 0
    all_hits = short_tree.find_all("ab")
    print(all_hits)  # expected [0, 8], but the author observed []