def main():
    """Entry point: process every pickle .tar.bz2 archive in parallel.

    Reads the run configuration, validates that all pipeline directories
    exist, then fans the archives out over all logical CPUs via
    ``proc_build``.  Follow-up step: build_ret_type__vocab__seq_len.py.
    """
    config = parseArgs()

    print(f'config >{config}<')

    # Every directory the pipeline reads from / writes to must pre-exist.
    check_if_dir_exists(config['pickle_dir'])
    check_if_dir_exists(config['work_dir'])
    check_if_dir_exists(config['save_dir'])
    check_if_dir_exists(config['tfrecord_save_dir'])

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{nr_of_cpus}<')

    ### get all pickle files, print a small sample for a sanity check
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### build: each worker gets (archive path, work dir, save dir, config)
    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))

    p = Pool(nr_of_cpus)
    try:
        # Return values are not needed by this step, so they are discarded.
        p.starmap(proc_build, star_list)
    finally:
        # Guarantee worker cleanup even if a task raises.
        p.close()
        p.join()

    print("Done. Run build_ret_type__vocab__seq_len.py next")
# Ejemplo n.º 2
# 0
def main():
    """Entry point: copy build inputs, then process pickle archives in parallel.

    Validates the config, stages the files needed to build the dataset,
    and runs ``proc_build`` over every .tar.bz2 pickle archive using all
    logical CPUs.  Follow-up step: build_ret_type__vocab__seq_len.py.
    """
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    copy_files_to_build_dataset(config)

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')
    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### build: one task per archive, fanned out over all logical CPUs
    p = Pool(nr_of_cpus)

    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))
    # Return values are not used by this step.
    p.starmap(proc_build, star_list)
    p.close()
    p.join()

    # Plain string: the original used an f-string with no placeholders (F541).
    print('Run build_ret_type__vocab__seq_len.py next')
def main():
    """Entry point: process pickle archives in parallel via joblib.

    Same pipeline step as the multiprocessing.Pool variant, but uses
    ``joblib.Parallel`` with one process per core (n_jobs=-1).
    Follow-up step: build_ret_type__vocab__seq_len.py.
    """
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    # One blank separator line; the original printed '\n' plus print(),
    # producing an unintended double blank line.
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    copy_files_to_build_dataset(config)

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')
    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### build: each task gets (archive path, work dir, save dir, config)
    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))

    # joblib spawns worker processes (prefer="processes"); the per-file
    # return values are not needed by this step.
    joblib.Parallel(n_jobs=-1, prefer="processes")(
        joblib.delayed(proc_build)(a, b, c, d) for a, b, c, d in star_list)

    print("Done. Run build_ret_type__vocab__seq_len.py next")
def main():
    """Build the return-type dict, vocabulary and max sequence length.

    Fans the balanced-dataset pickle files out over all logical CPUs;
    each ``proc_build`` call returns a (ret_types, vocab, seq_length)
    triple.  The triples are merged and the three artifacts are saved to
    the paths named in the config.  Follow-up: transform_ret_type_to_int.py.
    """
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')

    if not pickle_files:
        print(f"There are no files in >{config['balanced_dataset_dir']}<")
        # raise SystemExit instead of the site-dependent exit() builtin.
        raise SystemExit

    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    print('Building return-type dict, vocabulary and max-sequence-length')
    print()

    p = Pool(nr_of_cpus)

    pickle_files = [
        config['balanced_dataset_dir'] + "/" + f for f in pickle_files
    ]
    star_list = zip(pickle_files, repeat(config))

    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## merge per-file results: union the sets, keep the longest sequence
    ret_set = set()
    vocab = set()
    seq_length = 0
    for file_ret_set, file_vocab, file_seq_len in all_ret_types:
        ret_set.update(file_ret_set)
        vocab.update(file_vocab)
        seq_length = max(seq_length, file_seq_len)

    print(f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<")
    print()
    ## build ret-type-dict and save; sort first so labels are assigned
    ## deterministically across runs (set iteration order is arbitrary)
    ret_type_dict = {elem: label for label, elem in enumerate(sorted(ret_set))}

    print("ret-type-dict :")
    for key, label in ret_type_dict.items():
        print(f"nr-of-args >{key}<  label >{label}<")
    print()

    pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    print()
    ## build vocabulary list from set and save
    pickle_lib.save_to_pickle_file(list(vocab), config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    print()
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run transform_ret_type_to_int.py next")
def main():
    """Build the return-type dict, vocabulary and max sequence length.

    Parallel variant: each ``proc_build`` worker returns a
    (ret_types, vocab, seq_length) triple for one balanced-dataset
    pickle file; the triples are merged and the three artifacts are
    saved.  Follow-up step: transform_ret_type_to_int.py.
    """
    config = parseArgs()
    print(f'config >{config}<')
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{nr_of_cpus}<')

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    print('Building return-type dict, vocabulary and max-sequence-length')

    p = Pool(nr_of_cpus)

    pickle_files = [
        config['balanced_dataset_dir'] + "/" + f for f in pickle_files
    ]

    star_list = zip(pickle_files, repeat(config))

    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## merge the per-file (ret_set, vocab, seq_length) triples
    ret_set = set()
    vocab = set()
    seq_length = 0
    for file_ret_set, file_vocab, file_seq_len in all_ret_types:
        ret_set.update(file_ret_set)
        vocab.update(file_vocab)
        seq_length = max(seq_length, file_seq_len)

    print(
        f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<"
    )
    ## build ret-type-dict and save; sorting first keeps the labels
    ## deterministic across runs (set iteration order is arbitrary)
    ret_type_dict = {elem: label for label, elem in enumerate(sorted(ret_set))}

    print("ret-type-dict :")
    for key, label in ret_type_dict.items():
        print(f"nr-of-args >{key}<  label >{label}<")

    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    pickle_lib.save_to_pickle_file(list(vocab), config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run transform_ret_type_to_int.py next")
def main():
    """End-to-end build: untar pickles, build dict/vocab/seq-len, write TFRecords.

    Pipeline: (1) run ``proc_build`` over every .tar.bz2 archive in
    parallel, (2) scan the resulting .pickle files to collect the
    return-type set, vocabulary and max sequence length, (3) map each
    sample's return type to its integer label and write one .tfrecord
    per pickle file, then split into train/val/test.
    """
    config = parseArgs()

    print(f'config >{config}<')

    check_if_dir_exists(config['pickle_dir'])
    check_if_dir_exists(config['work_dir'])
    check_if_dir_exists(config['save_dir'])
    check_if_dir_exists(config['tfrecord_save_dir'])

    ### get all pickle files, print a small sample for a sanity check
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    # BUG FIX: nr_of_cpus was used below without ever being defined in
    # this function (NameError at runtime); define it the same way the
    # sibling entry points do.
    nr_of_cpus = psutil.cpu_count(logical=True)

    ### build
    p = Pool(nr_of_cpus)

    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))
    p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files >{pickle_files}<')

    print('Building return-type dict, vocabulary and max-sequence-length')
    ret_set = set()
    vocab = set()
    seq_length = 0
    pickle_count = len(pickle_files)

    # NOTE(review): paths are built as save_dir + file, so 'save_dir' is
    # assumed to end with a path separator — confirm against the config.
    for counter, file in enumerate(pickle_files, start=1):
        print(f'File >{file}< >{counter}/{pickle_count}<', end='\r')
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            ## item[1] feeds the return-type dict, item[0] is the token string
            ret_set.add(item[1])

            ##build max-seq-length
            if len(item[0]) > seq_length:
                seq_length = len(item[0])

            ## build vocabulary
            for word in item[0].split():
                vocab.add(word)

    print(
        f"Build return-type dict and save it to >{config['return_type_dict_file']}<"
    )
    ## build ret-type-dict and save; sort first so labels are assigned
    ## deterministically (the sibling entry points also sort; iterating
    ## the raw set gave a different labeling on every run)
    ret_type_dict = {elem: label for label, elem in enumerate(sorted(ret_set))}

    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Build vocabulary and save it to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    pickle_lib.save_to_pickle_file(list(vocab), config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    ### transform dataset ret-types to ints
    print(
        f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<"
    )
    for counter, file in enumerate(pickle_files, start=1):
        print(f'Transform File >{file}< >{counter}/{pickle_count}<', end='\r')
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        # BUG FIX: the transformed list is now rebuilt per file; the
        # original accumulated across iterations, so every later
        # .tfrecord re-contained all records of the earlier files.
        trans_ds = [(item[0], ret_type_dict[item[1]]) for item in cont]

        tfrecord_lib.save_caller_callee_to_tfrecord(
            trans_ds,
            config['tfrecord_save_dir'] + file.replace('.pickle', '.tfrecord'))

    print("Splitting dataset to train,val,test")
    tfrecord_lib.split_to_train_val_test(config['tfrecord_save_dir'])

    print("Done. Run build_caller_callee_model.py now")
# Ejemplo n.º 7
# 0
def main():
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading\n')
    print()

    copy_files_to_build_dataset(config)

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')
    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ###loop through all pickle.tar.bz2 files
    for pickle_file in pickle_files:
        print(f"Untar pickle-file >{pickle_file}< to >{config['work_dir']}<")

        tarbz2_lib.untar_file_to_path(config['pickle_dir'] + pickle_file,
                                      config['work_dir'])

        ###install source-package of pickle-file-content
        pickle_file_name = os.path.basename(pickle_file)
        pickle_file_name = pickle_file_name.replace('.pickle.tar.bz2', '')

        gdb_lib.install_source_package(pickle_file_name, config)

        ###check with gdb (list cmd) if the sources are newer/older than binary
        ## warning: Source file is more recent than executable.
        ###get dir name

        dir_name = config['ubuntu_src_pkgs']
        ##dir_name = get_dirname_of_src(pickle_file_name)
        print(f'Dir with src is:{dir_name}')
        res = check_if_src_match_binary(pickle_file_name, dir_name, config)

        ##src and binary dont match, unpack the second src in the dir
        if not res:
            unpack_second_src(pickle_file_name)
            res = check_if_src_match_binary(pickle_file_name, dir_name)
            print(f'res of second src dir: {res}')
        else:
            print(f'src match binary')

        #break

        ###open the pickle
        print(
            f"Open untarred pickle file: >{config['work_dir'] + pickle_file}<")
        pickle_content = open_pickle(config['work_dir'] +
                                     pickle_file.replace('.tar.bz2', ''))

        #print(f'pickle_content >{pickle_content}<')

        #exit()

        fcn = ''
        fl = ''
        bina = ''
        gdb_func_sign = ''
        ### loop through the pickle-file and get source-code from function
        #print(f'pickle-content: {next(iter(pickle_content))}')
        for funcSign, gdb_ret_type, func_name, file_name, disas_att, disas_intel, package_name, binary in pickle_content:
            print(f'funcSign: {funcSign}')
            #print(f'gdb_ret_type: {gdb_ret_type}')
            print(f'func_name: {func_name}')
            print(f'file_name: {file_name}')
            #print(f'disas_att: {disas_att}')
            #print(f'disas_intel: {disas_intel}')
            print(f'package_name: {package_name}')
            print(f'binary: {binary}')
            fcn = func_name
            fl = file_name
            bina = binary
            gdb_func_sign = funcSign
            #break

            ### get source code of function
            pkg_name = pickle_file.replace('.pickle.tar.bz2', '')
            pkg_name = os.path.basename(pkg_name)
            print(f'pkg_name:{pkg_name}')

            #pkg_src_name = "/tmp/" + pkg_name + "/" + dir_name
            pkg_src_name = config['ubuntu_src_pkgs']

            print(f'pkg_src_name:{pkg_src_name}')

            full_path = get_full_path(pkg_src_name, fl)
            print(f'full-path:{full_path}')

            len_full_path = len(full_path)
            nr_of_empty_src_code = 0

            ### ctags does not get return-type if its located lines above func_name
            ### gdb funcSign got it, we need to check if we need more lines than ctags tells us
            for f in full_path:
                src_code = get_source_code(f, fcn, gdb_func_sign)
                if src_code:
                    print(f'src-code:{src_code}')
                else:
                    print(f'no src-code found')
                    nr_of_empty_src_code += 1

            print(
                f'nr_of_empty_src_code:{nr_of_empty_src_code}   len_full_path:{len_full_path}'
            )
            if len_full_path == nr_of_empty_src_code + 1:
                print('only found one source code, thats good')
            else:
                print('ERROR found more than one source code for a function')
                break