def proc_build(tarbz2_file, work_dir, save_dir, config):

    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)
    #untar_one_pickle_file(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)
    #pickle_file_content = get_pickle_file_content(work_dir + os.path.basename(pickle_file).replace('.tar.bz2', ''))

    binaries = set()
    functions = set()
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = list()

    ## 1. get one binary
    ## 2. get one function of this binary
    ## 3. get disassembly of this function
    ## 4. check if this disassembly calls another function
    ## 4.1 filter @plt
    ## 5. if yes: get disassembly of caller function
    ## 6. save caller, callee, nr_of_args
    ## 7. check again, if it calls another function
    ## 8. if yes: get disassembly of caller function
    ## 9. save caller, calle, func_signature
    ##10. get disassembly of next function of this binary
    ##11. check if ....
    for bin in binaries:
        for func in functions:
            ## search for bin and func
            for elem in pickle_file_content:
                ### if we found bin and func
                if elem[7] == bin and elem[2] == func:
                    ## get att disassembly
                    att_dis = elem[4]
                    #print(f'att-dis >{att_dis}<')
                    ## check every line if there is a call
                    for item in att_dis:
                        ## find call in disas
                        if disassembly_lib.find_call_in_disassembly_line(item):
                            ## if found, get callee name
                            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                                item)

                            #print(f'callee_name >{callee_name}<')

                            ## search for same bin, but callee func
                            for elem2 in pickle_file_content:
                                ### if we found it, get return type and disassembly
                                if elem2[7] == bin and elem2[2] == callee_name:

                                    #return_type_func_sign = return_type_lib.get_return_type_from_function_signature(elem2[0])
                                    #return_type = return_type_lib.get_return_type_from_gdb_ptype(elem2[1])
                                    nr_of_args = return_type_lib.get_nr_of_args_from_function_signature(
                                        elem2[0])
                                    arg_nr_we_want = 3
                                    if nr_of_args < arg_nr_we_want:
                                        #print(f'func got to less args for us')
                                        break

                                    arg_two = return_type_lib.get_arg_two_name_from_function_signature(
                                        elem2[0])

                                    result = common_stuff_lib.is_type_known(
                                        arg_two)

                                    if result == False:
                                        #print(f'arg_two not a known type')
                                        pass
                                    else:
                                        tmp_att_dis = att_dis
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        tmp_att_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            tmp_att_dis)
                                        callee_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            elem2[4])
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        #print(f'att-dis >{tmp_att_dis}<')

                                        dis1_str = ' '.join(tmp_att_dis)
                                        #dis2_str = ' '.join(elem2[4])
                                        dis2_str = ' '.join(callee_dis)

                                        dis1_str = disassembly_lib.split_disassembly(
                                            dis1_str)
                                        dis2_str = disassembly_lib.split_disassembly(
                                            dis2_str)
                                        #dis1_str = dis_split(dis1_str)
                                        #dis2_str = dis_split(dis2_str)

                                        ##the max-seq-length blows memory (>160GB ram) with model.fit() if e.g. over 6million
                                        if (len(dis1_str) > 100000) or (
                                                len(dis2_str) > 100000) or (
                                                    len(dis1_str) <
                                                    1) or (len(dis2_str) < 1):
                                            print(
                                                f'dis1_str >{len(dis1_str)}<')
                                            print(
                                                f'dis2_str >{len(dis2_str)}<')
                                            #print(f"package >{elem[2]}< bin >{elem[3]}< file >{elem[6]}< func >{elem[7]}<")
                                            #print(f"package >{elem2[2]}< bin >{elem2[3]}< file >{elem2[6]}< func >{elem2[7]}<")

                                        else:
                                            dis_str = dis1_str + dis2_str

                                            #print(f'dis_str >{dis_str}<')

                                            dataset_list.append(
                                                (dis_str, arg_two))
                                            counter += 1

                                        break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            ret_file = open(
                config['save_dir'] +
                os.path.basename(pickle_file).replace('.tar.bz2', ''), 'wb+')
            pickle_list = pickle.dump(dataset_list, ret_file)
            ret_file.close()
        else:
            ## save as tfrecord
            dis_list = list()
            ret_list = list()

            for item in dataset_list:
                dis_list.append(item[0])
                ret_list.append(item[1])

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
def proc_build(tarbz2_file, work_dir, save_dir, config):

    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)
    #untar_one_pickle_file(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)
    #pickle_file_content = get_pickle_file_content(work_dir + os.path.basename(pickle_file).replace('.tar.bz2', ''))

    binaries = set()
    functions = set()
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = list()

    ## 1. get one binary
    ## 2. get one function of this binary
    ## 3. get disassembly of this function
    ## 4. check if this disassembly calls another function
    ## 4.1 filter @plt
    ## 5. if yes: get disassembly of caller function
    ## 6. save caller, callee, func_signature
    ## 7. check again, if it calls another function
    ## 8. if yes: get disassembly of caller function
    ## 9. save caller, calle, func_signature
    ##10. get disassembly of next function of this binary
    ##11. check if ....
    for bin in binaries:
        for func in functions:
            ## search for bin and func
            for elem in pickle_file_content:
                ### if we found bin and func
                if elem[7] == bin and elem[2] == func:
                    ## get att disassembly
                    att_dis = elem[4]
                    ## check every line if there is a call
                    for item in att_dis:
                        ## find call in disas
                        if disassembly_lib.find_call_in_disassembly_line(item):
                            ## if found, get callee name
                            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                                item)

                            #print(f'callee_name >{callee_name}<')

                            ## search for same bin, but callee func
                            for elem2 in pickle_file_content:
                                ### if we found it, get return type and disassembly
                                if elem2[7] == bin and elem2[2] == callee_name:

                                    return_type_func_sign = return_type_lib.get_return_type_from_function_signature(
                                        elem2[0])
                                    return_type = return_type_lib.get_return_type_from_gdb_ptype(
                                        elem2[1])

                                    ###for debugging, what string is still unknown ?? should show nothing
                                    if return_type == 'unknown':
                                        print(
                                            f'string_before_func_name: {string_before_func_name}'
                                        )

                                    if return_type == 'unknown':
                                        #print('unknown found')
                                        #breaker = True
                                        #break
                                        pass
                                    elif return_type == 'delete':
                                        #print('delete found')
                                        ### no return type found, so delete this item
                                        pass
                                    elif return_type == 'process_further':
                                        print(f'ERRROOOORRRR---------------')
                                    else:

                                        dis1_str = ' '.join(att_dis)
                                        dis2_str = ' '.join(elem2[4])

                                        dis1_str = disassembly_lib.split_disassembly(
                                            dis1_str)
                                        dis2_str = disassembly_lib.split_disassembly(
                                            dis2_str)
                                        #dis1_str = dis_split(dis1_str)
                                        #dis2_str = dis_split(dis2_str)

                                        dis_str = dis1_str + dis2_str

                                        #print(f'dis_str >{dis_str}<')

                                        dataset_list.append(
                                            (dis_str, return_type))
                                        counter += 1
                                        break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            ret_file = open(
                config['save_dir'] +
                os.path.basename(pickle_file).replace('.tar.bz2', ''), 'wb+')
            pickle_list = pickle.dump(dataset_list, ret_file)
            ret_file.close()
        else:
            ## save as tfrecord
            dis_list = list()
            ret_list = list()

            for item in dataset_list:
                dis_list.append(item[0])
                ret_list.append(item[1])

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
    def run(self):
        self.updateProgressBar.emit(1)
        
        curr_pos = cutter.cmd('s')
        if curr_pos.strip() == '0x0':
            print('runInference not from addr 0x0')
            return
         
        self.set_new_radare2_e()
        
        ### get name of current function
        current_func_name = cutter.cmdj("afdj $F").get('name')
         
        #print(f'current_func_name >{current_func_name}<')
        
        ## find data/code references to this address with $F
        current_func_header = cutter.cmdj("axtj $F")
        
        ## get addr of callee
        caller_addr = 0
        for item_dicts in current_func_header:
            print(f'item_dicts >{item_dicts}<')
            for elem in item_dicts:
                if elem == 'from':
                    caller_addr = item_dicts[elem]
                    #print(f'address of caller >{item_dicts[elem]}<')
                 
         
        ## get disassembly of current/callee function
        address = cutter.cmd('s').strip()
        #print(f'address >{address}<')
        if address == '0x0':
            self.updateProgressBar.emit(100)
            self.resultReady.emit('')
            self.summaryReady.emit('Current address is 0x0, choose other function')
            #print(f'address kicked >{address}<')
            return
        
        disasm_callee_str = self.get_disassembly_of(address)
        #print(f'disasm_callee_str >{disasm_callee_str}<')
        
        ### get disassembly of caller function
        disasm_caller_str = self.get_disassembly_of(caller_addr)
        #print(f'disasm_caller_str >{disasm_caller_str}<')
  
        ### split disas for the tf-model     
        disasm_caller_str = disassembly_lib.split_disassembly(disasm_caller_str)
        disasm_callee_str = disassembly_lib.split_disassembly(disasm_callee_str)

        ##check if we got caller and callee disassembly
        if (len(disasm_caller_str) == 0) or (len(disasm_callee_str) == 0):
            self.updateProgressBar.emit(100)
            self.resultReady.emit('')
            self.summaryReady.emit('No caller disassembly found, choose other function')
            
            #print(f'Not found callee and caller disassembly.')
            return
         
        ### the path were we cloned git repo to
        self._userHomePath = os.path.expanduser('~')
        #print(f'userHomePath >{self._userHomePath}<')
        
        func_sign_prob_git_path = self._userHomePath + "/git/func_sign_prob/"
        
        self.updateProgressBar.emit(9)
         
        callee_addr_split = [char if char != '0' else 'null' for char in address]
        callee_addr = ' '.join(callee_addr_split)
        disasm_caller_callee_predict_str = disasm_caller_str + ' caller_callee_separator ' + callee_addr + ' ' + disasm_callee_str
        ### predict now ret-type
        ret_type_prediction_summary_str = self.get_prediction('return_type/words_100000', 
                                                                disasm_caller_callee_predict_str, 
                                                                func_sign_prob_git_path)
        
        
          
        ## store for later, will be overridden
        ret_type_model_summary_str = self.model_summary_str
        ret_type_biggest_prob = self.biggest_prob
        ret_type_biggest_prob_type = self.biggest_prob_type
        ret_type_biggest_prob_percent = 100 * ret_type_biggest_prob
        self.updateProgressBar.emit(10)
                  
        ### predict now nr_of_args
        nr_of_args_prediction_summary_str = self.get_prediction('nr_of_args/words_100000', 
                                                                disasm_caller_callee_predict_str, 
                                                                func_sign_prob_git_path)
                   
                
        ## store for later, will be overridden
        nr_of_args_model_summary_str = self.model_summary_str
        nr_of_args_biggest_prob = self.biggest_prob
        nr_of_args_biggest_prob_type = self.biggest_prob_type
        self.updateProgressBar.emit(20)
           
        ###predict now arg_one
        arg_one_prediction_summary_str = self.get_prediction('arg_one', 
                                                                disasm_caller_callee_predict_str, 
                                                                func_sign_prob_git_path)
            

        ## store for later, will be overridden
        arg_one_model_summary_str = self.model_summary_str
        arg_one_biggest_prob = self.biggest_prob
        arg_one_biggest_prob_type = self.biggest_prob_type
        arg_one_biggest_prob_percent = 100 * arg_one_biggest_prob
        
            
        #nr_of_args_biggest_prob_type = 1
        if nr_of_args_biggest_prob_type == 1:
            
            self.set_stored_radare2_e()
            
            self.updateProgressBar.emit(100)
            
            self.summaryReady.emit(f"tf return type model summary:\n \
                                        {ret_type_model_summary_str}\n \
                                        {ret_type_prediction_summary_str}\n \
                                        tf nr_of_args model summary:\n \
                                         {nr_of_args_model_summary_str}\n \
                                         {nr_of_args_prediction_summary_str}\n \
                                         tf arg_one model summary:\n \
                                         {self.model_summary_str}\n \
                                         {arg_one_prediction_summary_str}")

            self.resultReady.emit(f'{ret_type_biggest_prob_type} \
                 <span style=\"background-color:red;\">({ret_type_biggest_prob_percent:3.1f}%)</span> \
                 {current_func_name} ( \
                 {arg_one_biggest_prob_type} \
                 <span style=\"background-color:red;\">({arg_one_biggest_prob_percent:3.1f}%)</span> \
                 )')
               
            
            return
        else:
            self.updateProgressBar.emit(30)   
              
        ###if more one args
        ###predict now arg_two
        arg_two_prediction_summary_str = self.get_prediction('arg_two', 
                                                                disasm_caller_callee_predict_str, 
                                                                func_sign_prob_git_path)
           
   
        ## store for later, will be overridden
        arg_two_model_summary_str = self.model_summary_str
        arg_two_biggest_prob = self.biggest_prob
        arg_two_biggest_prob_type = self.biggest_prob_type
        arg_two_biggest_prob_percent = 100 * arg_two_biggest_prob
        
          
        #nr_of_args_biggest_prob_type = 2 
        if nr_of_args_biggest_prob_type == 2:
            
            self.updateProgressBar.emit(100)
            
            self.summaryReady.emit(f"tf return type model summary:\n \
                                        {ret_type_model_summary_str}\n \
                                        {ret_type_prediction_summary_str}\n \
                                        tf nr_of_args model summary:\n \
                                        {nr_of_args_model_summary_str}\n \
                                        {nr_of_args_prediction_summary_str}\n \
                                        tf arg_one model summary:\n \
                                        {arg_one_model_summary_str}\n \
                                        {arg_one_prediction_summary_str}\n \
                                        tf arg_two model summary:\n \
                                        {arg_two_model_summary_str}\n \
                                        {arg_two_prediction_summary_str}")

            self.resultReady.emit(f'{ret_type_biggest_prob_type} \
                <span style=\"background-color:red;\">({ret_type_biggest_prob_percent:3.1f}%)</span> \
                {current_func_name} ( \
                {arg_one_biggest_prob_type} \
                <span style=\"background-color:red;\">({arg_one_biggest_prob_percent:3.1f}%)</span> , \
                {arg_two_biggest_prob_type} \
                <span style=\"background-color:red;\">({arg_two_biggest_prob_percent:3.1f}%)</span> \
                )')
            
              
            self.set_stored_radare2_e()
            return
        else:
            self.updateProgressBar.emit(40)
         
        ###if more than two args
        ###predict now arg_three
        arg_three_prediction_summary_str = self.get_prediction('arg_three', 
                                                                disasm_caller_callee_predict_str, 
                                                                func_sign_prob_git_path)
          
  
        ## store for later, will be overridden
        arg_three_model_summary_str = self.model_summary_str
        arg_three_biggest_prob = self.biggest_prob
        arg_three_biggest_prob_type = self.biggest_prob_type
        arg_three_biggest_prob_percent = 100 * arg_three_biggest_prob
         
                                              
        #nr_of_args_biggest_prob_type = 3
        if nr_of_args_biggest_prob_type >= 3:   #hack, if more args, need more models
              
            self.updateProgressBar.emit(100)
            
            self.summaryReady.emit(f"tf return type model summary:\n \
                                        {ret_type_model_summary_str}\n \
                                        {ret_type_prediction_summary_str}\n \
                                        tf nr_of_args model summary:\n \
                                        {nr_of_args_model_summary_str}\n \
                                        {nr_of_args_prediction_summary_str}\n \
                                        tf arg_one model summary:\n \
                                        {arg_one_model_summary_str}\n \
                                        {arg_one_prediction_summary_str}\n \
                                        tf arg_two model summary:\n \
                                        {arg_two_model_summary_str}\n \
                                        {arg_two_prediction_summary_str}\n \
                                        tf arg_three model summary:\n \
                                        {arg_three_model_summary_str}\n \
                                        {arg_three_prediction_summary_str}")

            self.resultReady.emit(f'{ret_type_biggest_prob_type} \
                <span style=\"background-color:red;\">({ret_type_biggest_prob_percent:3.1f}%)</span> \
                {current_func_name} ( \
                {arg_one_biggest_prob_type} \
                <span style=\"background-color:red;\">({arg_one_biggest_prob_percent:3.1f}%)</span> , \
                {arg_two_biggest_prob_type} \
                <span style=\"background-color:red;\">({arg_two_biggest_prob_percent:3.1f}%)</span> , \
                {arg_three_biggest_prob_type} \
                <span style=\"background-color:red;\">({arg_three_biggest_prob_percent:3.1f}%)</span> \
                )')
              
            self.set_stored_radare2_e()
            return
        else:
            self.updateProgressBar.emit(50)
         
        #for debug
        print('over')
         
        self.set_stored_radare2_e()
#         /* ... here is the expensive or blocking operation ... */
        self.resultReady.emit('')
Esempio n. 4
0
def proc_build(tarbz2_file, work_dir, save_dir, config):

    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)
    #untar_one_pickle_file(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)
    #pickle_file_content = get_pickle_file_content(work_dir + os.path.basename(pickle_file).replace('.tar.bz2', ''))

    binaries = set()
    functions = set()
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = list()

    ## 1. get one binary
    ## 2. get one function of this binary
    ## 3. get disassembly of this function
    ## 4. check if this disassembly calls another function
    ## 4.1 filter @plt
    ## 5. if yes: get disassembly of caller function
    ## 6. save caller, callee, nr_of_args
    ## 7. check again, if it calls another function
    ## 8. if yes: get disassembly of caller function
    ## 9. save caller, calle, func_signature
    ##10. get disassembly of next function of this binary
    ##11. check if ....
    for bin in binaries:
        for func in functions:
            ## search for bin and func
            for elem in pickle_file_content:
                ### if we found bin and func
                if elem[7] == bin and elem[2] == func:
                    ## get att disassembly
                    att_dis = elem[4]
                    #print(f'att-dis >{att_dis}<')
                    ## check every line if there is a call
                    for item in att_dis:
                        ## find call in disas
                        if disassembly_lib.find_call_in_disassembly_line(item):
                            ## if found, get callee name
                            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                                item)

                            #print(f'callee_name >{callee_name}<')

                            ## search for same bin, but callee func
                            for elem2 in pickle_file_content:
                                ### if we found it, get return type and disassembly
                                if elem2[7] == bin and elem2[2] == callee_name:

                                    if (len(elem2[4]) >
                                        (int(config[
                                            'tokenized_disassembly_length']) /
                                         2)) or (len(att_dis) > (int(config[
                                             'tokenized_disassembly_length']) /
                                                                 2)
                                                 ) or (len(elem2[4]) < 1) or (
                                                     len(att_dis) < 1):
                                        continue

                                    #return_type_func_sign = return_type_lib.get_return_type_from_function_signature(elem2[0])
                                    #return_type = return_type_lib.get_return_type_from_gdb_ptype(elem2[1])

                                    nr_of_args = return_type_lib.get_nr_of_args_from_function_signature(
                                        elem2[0])

                                    ###for debugging, what string is still unknown ?? should show nothing
                                    #                                     if return_type == 'unknown':
                                    #                                         print(f'string_before_func_name: {return_type_func_sign}')
                                    #
                                    #                                     if return_type == 'unknown':
                                    #                                         #print('unknown found')
                                    #                                         #breaker = True
                                    #                                         #break
                                    #                                         pass
                                    #                                     elif return_type == 'delete':
                                    #                                         #print('delete found')
                                    #                                         ### no return type found, so delete this item
                                    #                                         pass
                                    #                                     elif return_type == 'process_further':
                                    #                                         print(f'ERRROOOORRRR---------------')
                                    if nr_of_args == -1:
                                        print(f'Error nr_of_args')
                                    else:
                                        print(f'nr_of_args >{nr_of_args}<',
                                              end='\r')

                                        tmp_att_dis = att_dis
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        tmp_att_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            tmp_att_dis)
                                        callee_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            elem2[4])
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        #print(f'att-dis >{tmp_att_dis}<')

                                        dis1_str = ' '.join(tmp_att_dis)
                                        #dis2_str = ' '.join(elem2[4])
                                        dis2_str = ' '.join(callee_dis)

                                        dis1_str = disassembly_lib.split_disassembly(
                                            dis1_str)
                                        dis2_str = disassembly_lib.split_disassembly(
                                            dis2_str)
                                        #dis1_str = dis_split(dis1_str)
                                        #dis2_str = dis_split(dis2_str)
                                        #print(f'dis1_str >{dis1_str}<')

                                        ##the max-seq-length blows memory (>160GB ram) with model.fit() if e.g. over 6million
                                        if (len(dis1_str) > (int(config[
                                                'tokenized_disassembly_length']
                                                                 ) / 2)
                                            ) or (len(dis2_str) > (int(config[
                                                'tokenized_disassembly_length']
                                                                       ) / 2)
                                                  ) or (len(dis1_str) < 1) or (
                                                      len(dis2_str) < 1):
                                            print(
                                                f'tokenized_disassembly_length caller >{len(dis1_str)}<'
                                            )
                                            print(
                                                f'tokenized_disassembly_length callee >{len(dis2_str)}<'
                                            )
                                            #print(f"package >{elem[2]}< bin >{elem[3]}< file >{elem[6]}< func >{elem[7]}<")
                                            #print(f"package >{elem2[2]}< bin >{elem2[3]}< file >{elem2[6]}< func >{elem2[7]}<")

                                        else:
                                            dis_str = dis1_str + dis2_str

                                            #print(f'dis_str >{dis_str}<')

                                            dataset_list.append(
                                                (dis_str, nr_of_args))
                                            counter += 1

                                        break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            ret_file = open(
                config['save_dir'] +
                os.path.basename(pickle_file).replace('.tar.bz2', ''), 'wb+')
            pickle_list = pickle.dump(dataset_list, ret_file)
            ret_file.close()
        else:
            ## save as tfrecord
            dis_list = list()
            ret_list = list()

            for item in dataset_list:
                dis_list.append(item[0])
                ret_list.append(item[1])

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter