# Assumed imports: module paths are inferred from the utils.pattern_utils
# naming convention used in __main__ below and may differ from the project's
# actual layout.
import utils.file_iterator as fi
import utils.multiprocess_utils as mu
# import utils.array_utils as au  # needed only for the commented sampling line
from utils.id_freq_dict import IdFreqDict
from config.configure import getcfg


def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    # Split the file list into 20 blocks and tokenize them in parallel;
    # each worker returns an (IdFreqDict, doc_num) pair.
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    # Merge the per-process dictionaries and document counts into one total.
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
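
# For reference, split_multi_format/multi_process above amount to a plain
# fan-out/merge over a process pool. Below is a stdlib-only sketch of the same
# pattern, assuming get_tokens(file_list) returns an (IdFreqDict, doc_num)
# pair as implied above; it is an illustration, not the project's mu wrappers.
def get_tokens_multi_pool(file_path, workers=20):
    from multiprocessing import Pool
    file_path = fi.add_sep_if_needed(file_path)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    paths = [file_path + subfile for subfile in subfiles]
    blocks = [paths[i::workers] for i in range(workers)]  # round-robin split
    with Pool(workers) as pool:
        results = pool.map(get_tokens, blocks)  # one (ifd, doc_num) per block
    merged, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in results:
        total_doc_num += doc_num
        merged.merge_freq_from(ifd)
    return merged, total_doc_num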
class IfdGetter:
    # NOTE: the head of this class is missing from the excerpt; __init__ and
    # the method signature below are a minimal reconstruction around the
    # surviving self.ifd.load_dict(ifd_file) call.
    def __init__(self, ifd_file):
        self.ifd_file, self.ifd = ifd_file, None

    def __call__(self, ifd_file=None):
        # Load the underlying IdFreqDict lazily, on first access.
        if self.ifd is None:
            ifd_file = ifd_file if ifd_file is not None else self.ifd_file
            self.ifd = IdFreqDict()
            self.ifd.load_dict(ifd_file)
        return self.ifd


# pre_dict_file = getcfg().pre_dict_file
post_dict_file = getcfg().post_dict_file
token_dict = IfdGetter(post_dict_file)
pre_list = [getcfg().pre_prop_file, getcfg().pre_comm_file, getcfg().pre_verb_file, getcfg().pre_hstg_file]
post_list = [getcfg().post_prop_file, getcfg().post_comm_file, getcfg().post_verb_file, getcfg().post_hstg_file]
# prop_dict, comm_dict, verb_dict, hstg_dict = [IfdGetter(post_file) for post_file in post_list]


if __name__ == '__main__':
    import utils.pattern_utils as pu

    def word_remove(word, freq):
        # Drop a word if it contains any of the listed punctuation characters
        # or occurs fewer than 10 times.
        if pu.search_pattern(r'[!?<>.,&\'`^*]', word) is not None or freq < 10:
            return True
        return False

    # Convert each raw (pre) dictionary into a filtered (post) dictionary.
    pre2post = dict(zip(pre_list, post_list))
    for pre, post in pre2post.items():
        ifd = IdFreqDict()
        ifd.load_dict(pre)
        pre_vocab = ifd.vocabulary_size()
        print('{} loaded, {} words'.format(pre, pre_vocab))
        # drop_words_by_condition also accepts a callable predicate, unlike
        # the int threshold used in get_tokens_multi above.
        ifd.drop_words_by_condition(word_remove)
        print('dropped {} words, {} words remain'.format(
            pre_vocab - ifd.vocabulary_size(), ifd.vocabulary_size()))
        ifd.dump_dict(post)
    print('dump over')
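
    # Illustrative sanity checks for word_remove, assuming pu.search_pattern
    # behaves like re.search (returns None on no match); the sample words are
    # hypothetical and not part of the original script.
    assert word_remove('of', 3)           # dropped: frequency below 10
    assert word_remove('wh*at', 50)       # dropped: contains punctuation
    assert not word_remove('hello', 50)   # kept: clean and frequent enough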