def proc_build(tarbz2_file, work_dir, save_dir, config):
    """Build (caller + callee disassembly, return type) samples from one archive.

    The archive is unpacked into ``work_dir`` and the pickle it contains is
    loaded. Entries are read positionally: ``elem[0]`` is handed to the
    function-signature parser, ``elem[1]`` to the gdb-ptype parser,
    ``elem[2]`` is treated as the function name, ``elem[4]`` as the list of
    AT&T disassembly lines and ``elem[7]`` as the binary.

    For every entry, each disassembly line that ``disassembly_lib`` reports
    as a call is resolved to an entry of the same binary whose name equals
    the callee name. The first such entry with a usable return type yields
    one sample: the split caller disassembly concatenated with the split
    callee disassembly, labelled with the callee's return type.

    Args:
        tarbz2_file: Path of the ``.tar.bz2`` archive. Its base name without
            ``.tar.bz2`` is used as the name of the unpacked pickle file.
        work_dir: Directory the archive is unpacked into. It is concatenated
            as-is with the file name, so it should end with a path separator.
        save_dir: Not used by this function; output goes to
            ``config['save_dir']``.
        config: Mapping providing ``'save_file_type'`` (``'pickle'`` selects
            a pickle file, anything else a TFRecord file) and ``'save_dir'``
            (concatenated as-is with the output file name).

    Returns:
        The number of samples collected. Nothing is written when it is 0.
    """
    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)

    binaries = set()
    functions = set()
    # Index entries by (binary, function name), keeping file order inside
    # each bucket, so lookups below do not rescan the whole pickle content.
    entries_by_key = {}
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])
        entries_by_key.setdefault((elem[7], elem[2]), []).append(elem)

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = []

    # Same traversal order as a nested scan over binaries, then functions,
    # then entries in file order.
    for binary in binaries:
        for func in functions:
            for elem in entries_by_key.get((binary, func), ()):
                att_dis = elem[4]

                for item in att_dis:
                    if not disassembly_lib.find_call_in_disassembly_line(item):
                        continue

                    callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                        item)

                    # Candidates: entries of the same binary named like the
                    # callee (assumes the callee name is hashable, e.g. str).
                    for elem2 in entries_by_key.get((binary, callee_name), ()):
                        return_type_func_sign = return_type_lib.get_return_type_from_function_signature(
                            elem2[0])
                        return_type = return_type_lib.get_return_type_from_gdb_ptype(
                            elem2[1])

                        if return_type == 'unknown':
                            # Debug aid: should never show up. The message
                            # used to reference an undefined variable and
                            # crashed with NameError; print the value parsed
                            # from the function signature instead.
                            print(
                                f'string_before_func_name: {return_type_func_sign}'
                            )
                            continue
                        if return_type == 'delete':
                            # No return type found: drop this candidate.
                            continue
                        if return_type == 'process_further':
                            print('ERRROOOORRRR---------------')
                            continue

                        dis1_str = disassembly_lib.split_disassembly(
                            ' '.join(att_dis))
                        dis2_str = disassembly_lib.split_disassembly(
                            ' '.join(elem2[4]))

                        dataset_list.append((dis1_str + dis2_str, return_type))
                        counter += 1
                        # One sample per call line: stop at the first
                        # candidate that produced one.
                        break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            out_path = config['save_dir'] + os.path.basename(
                pickle_file).replace('.tar.bz2', '')
            # Context manager closes the file even if dumping fails.
            with open(out_path, 'wb+') as ret_file:
                pickle.dump(dataset_list, ret_file)
        else:
            ## save as tfrecord
            dis_list = [sample[0] for sample in dataset_list]
            ret_list = [sample[1] for sample in dataset_list]

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
def proc_build(tarbz2_file, work_dir, save_dir, config):
    """Build (caller + callee disassembly, second-argument type) samples.

    The archive is unpacked into ``work_dir`` and the pickle it contains is
    loaded. Entries are read positionally: ``elem[0]`` is handed to the
    function-signature parsers, ``elem[2]`` is treated as the function name,
    ``elem[4]`` as the list of AT&T disassembly lines and ``elem[7]`` as the
    binary.

    For every entry, each disassembly line that ``disassembly_lib`` reports
    as a call is resolved to an entry of the same binary whose name equals
    the callee name. A candidate callee is used when its signature has at
    least three arguments and ``common_stuff_lib.is_type_known`` accepts its
    second argument; the sample is the split, comment-cleaned caller
    disassembly concatenated with the callee's, labelled with that argument.

    Args:
        tarbz2_file: Path of the ``.tar.bz2`` archive. Its base name without
            ``.tar.bz2`` is used as the name of the unpacked pickle file.
        work_dir: Directory the archive is unpacked into. It is concatenated
            as-is with the file name, so it should end with a path separator.
        save_dir: Not used by this function; output goes to
            ``config['save_dir']``.
        config: Mapping providing ``'save_file_type'`` (``'pickle'`` selects
            a pickle file, anything else a TFRecord file) and ``'save_dir'``
            (concatenated as-is with the output file name).

    Returns:
        The number of samples collected. Nothing is written when it is 0.
    """
    # Callees with fewer arguments than this are skipped.
    arg_nr_we_want = 3
    # Longer sequences blow memory (>160GB RAM) in model.fit() later on.
    max_dis_length = 100000

    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)

    binaries = set()
    functions = set()
    # Index entries by (binary, function name), keeping file order inside
    # each bucket, so lookups below do not rescan the whole pickle content.
    entries_by_key = {}
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])
        entries_by_key.setdefault((elem[7], elem[2]), []).append(elem)

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = []

    # Same traversal order as a nested scan over binaries, then functions,
    # then entries in file order.
    for binary in binaries:
        for func in functions:
            for elem in entries_by_key.get((binary, func), ()):
                att_dis = elem[4]

                for item in att_dis:
                    if not disassembly_lib.find_call_in_disassembly_line(item):
                        continue

                    callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                        item)

                    # Candidates: entries of the same binary named like the
                    # callee (assumes the callee name is hashable, e.g. str).
                    for elem2 in entries_by_key.get((binary, callee_name), ()):
                        nr_of_args = return_type_lib.get_nr_of_args_from_function_signature(
                            elem2[0])
                        if nr_of_args < arg_nr_we_want:
                            # Too few arguments: give up on this call line.
                            break

                        arg_two = return_type_lib.get_arg_two_name_from_function_signature(
                            elem2[0])

                        result = common_stuff_lib.is_type_known(arg_two)
                        # Explicit comparison kept on purpose: only a result
                        # equal to False rejects the candidate.
                        if result == False:
                            continue

                        tmp_att_dis = disassembly_lib.clean_att_disassembly_from_comment(
                            att_dis)
                        callee_dis = disassembly_lib.clean_att_disassembly_from_comment(
                            elem2[4])

                        dis1_str = disassembly_lib.split_disassembly(
                            ' '.join(tmp_att_dis))
                        dis2_str = disassembly_lib.split_disassembly(
                            ' '.join(callee_dis))

                        caller_ok = 1 <= len(dis1_str) <= max_dis_length
                        callee_ok = 1 <= len(dis2_str) <= max_dis_length
                        if caller_ok and callee_ok:
                            dataset_list.append((dis1_str + dis2_str, arg_two))
                            counter += 1
                        else:
                            # Report the sizes of the rejected pair.
                            print(f'dis1_str >{len(dis1_str)}<')
                            print(f'dis2_str >{len(dis2_str)}<')

                        # A known-type candidate ends the search for this
                        # call line, whether or not it produced a sample.
                        break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            out_path = config['save_dir'] + os.path.basename(
                pickle_file).replace('.tar.bz2', '')
            # Context manager closes the file even if dumping fails.
            with open(out_path, 'wb+') as ret_file:
                pickle.dump(dataset_list, ret_file)
        else:
            ## save as tfrecord
            dis_list = [sample[0] for sample in dataset_list]
            ret_list = [sample[1] for sample in dataset_list]

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
Example #3
0
def proc_build(tarbz2_file, work_dir, save_dir, config):
    """Build (caller + callee disassembly, argument count) samples.

    The archive is unpacked into ``work_dir`` and the pickle it contains is
    loaded. Entries are read positionally: ``elem[0]`` is handed to the
    function-signature parser, ``elem[2]`` is treated as the function name,
    ``elem[4]`` as the list of AT&T disassembly lines and ``elem[7]`` as the
    binary.

    For every entry, each disassembly line that ``disassembly_lib`` reports
    as a call is resolved to an entry of the same binary whose name equals
    the callee name. Caller and callee must each fit into half of
    ``config['tokenized_disassembly_length']``, both as raw line lists and
    after cleaning and splitting. The sample is the split caller disassembly
    concatenated with the callee's, labelled with the callee's number of
    arguments.

    Args:
        tarbz2_file: Path of the ``.tar.bz2`` archive. Its base name without
            ``.tar.bz2`` is used as the name of the unpacked pickle file.
        work_dir: Directory the archive is unpacked into. It is concatenated
            as-is with the file name, so it should end with a path separator.
        save_dir: Not used by this function; output goes to
            ``config['save_dir']``.
        config: Mapping providing ``'tokenized_disassembly_length'``
            (converted with ``int``), ``'save_file_type'`` (``'pickle'``
            selects a pickle file, anything else a TFRecord file) and
            ``'save_dir'`` (concatenated as-is with the output file name).

    Returns:
        The number of samples collected. Nothing is written when it is 0.
    """
    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)

    binaries = set()
    functions = set()
    # Index entries by (binary, function name), keeping file order inside
    # each bucket, so lookups below do not rescan the whole pickle content.
    entries_by_key = {}
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])
        entries_by_key.setdefault((elem[7], elem[2]), []).append(elem)

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = []

    # Same traversal order as a nested scan over binaries, then functions,
    # then entries in file order.
    for binary in binaries:
        for func in functions:
            for elem in entries_by_key.get((binary, func), ()):
                att_dis = elem[4]

                for item in att_dis:
                    if not disassembly_lib.find_call_in_disassembly_line(item):
                        continue

                    callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                        item)

                    # Candidates: entries of the same binary named like the
                    # callee (assumes the callee name is hashable, e.g. str).
                    for elem2 in entries_by_key.get((binary, callee_name), ()):
                        # Caller and callee each get half of the budget.
                        # Read here, not up front, so the config key is only
                        # required once a candidate callee actually exists.
                        half_length = int(
                            config['tokenized_disassembly_length']) / 2

                        # Cheap pre-filter on the raw line counts.
                        if not (1 <= len(elem2[4]) <= half_length
                                and 1 <= len(att_dis) <= half_length):
                            continue

                        nr_of_args = return_type_lib.get_nr_of_args_from_function_signature(
                            elem2[0])

                        if nr_of_args == -1:
                            print('Error nr_of_args')
                            continue

                        # Progress output, overwritten in place.
                        print(f'nr_of_args >{nr_of_args}<', end='\r')

                        tmp_att_dis = disassembly_lib.clean_att_disassembly_from_comment(
                            att_dis)
                        callee_dis = disassembly_lib.clean_att_disassembly_from_comment(
                            elem2[4])

                        dis1_str = disassembly_lib.split_disassembly(
                            ' '.join(tmp_att_dis))
                        dis2_str = disassembly_lib.split_disassembly(
                            ' '.join(callee_dis))

                        ## the max-seq-length blows memory (>160GB ram) with
                        ## model.fit() if e.g. over 6 million
                        caller_ok = 1 <= len(dis1_str) <= half_length
                        callee_ok = 1 <= len(dis2_str) <= half_length
                        if caller_ok and callee_ok:
                            dataset_list.append(
                                (dis1_str + dis2_str, nr_of_args))
                            counter += 1
                        else:
                            print(
                                f'tokenized_disassembly_length caller >{len(dis1_str)}<'
                            )
                            print(
                                f'tokenized_disassembly_length callee >{len(dis2_str)}<'
                            )

                        # A candidate with a valid argument count ends the
                        # search for this call line, sample or not.
                        break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            out_path = config['save_dir'] + os.path.basename(
                pickle_file).replace('.tar.bz2', '')
            # Context manager closes the file even if dumping fails.
            with open(out_path, 'wb+') as ret_file:
                pickle.dump(dataset_list, ret_file)
        else:
            ## save as tfrecord
            dis_list = [sample[0] for sample in dataset_list]
            ret_list = [sample[1] for sample in dataset_list]

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter