def chinese_word_cut_tf(input_str):
  """Segment Chinese text with the cppjieba TF op, feeding dictionaries in-memory.

  Reads the jieba dictionary, HMM model, user dictionary, IDF table and
  stop-word list from ``$MAIN_ROOT/tools/cppjieba/dict`` and passes their
  *contents* (not file paths) to the op via ``use_file=False``.

  Args:
    input_str: tensor of UTF-8 encoded strings to segment.

  Returns:
    Tensor of segmented strings produced by ``py_x_ops.jieba_cut``.

  Raises:
    KeyError: if the ``MAIN_ROOT`` environment variable is not set.
  """
  dict_dir = os.path.join(os.environ["MAIN_ROOT"], "tools/cppjieba/dict")

  def _lines(filename):
    # Load one dictionary resource as a list of text lines.
    return read_lines_from_text_file(os.path.join(dict_dir, filename))

  output_str = py_x_ops.jieba_cut(
      input_str,
      use_file=False,
      hmm=True,
      dict_lines=_lines("jieba.dict.utf8"),
      model_lines=_lines("hmm_model.utf8"),
      user_dict_lines=_lines("user.dict.utf8"),
      idf_lines=_lines("idf.utf8"),
      stop_word_lines=_lines("stop_words.utf8"))
  return output_str
def pre_process_text(input_texts, language, split_by_space, use_word):
  """Text pre-processing before tokenize.

  English input is cleaned; non-English input is either passed through
  (already space-separated), word-segmented with the jieba op, or split
  into characters.
  """
  # Guard clauses replace the original nested branching; outcomes are identical.
  if language == "english":
    return clean_english_str_tf(input_texts)
  if split_by_space:
    return input_texts
  if not use_word:
    return char_cut_tf(input_texts)

  # Word-level segmentation: hand the op on-disk dictionary paths under
  # $MAIN_ROOT/tools/cppjieba/dict.
  dict_dir = os.path.join(os.environ["MAIN_ROOT"], "tools/cppjieba/dict")
  return py_x_ops.jieba_cut(
      input_texts,
      hmm=True,
      dict_path=os.path.join(dict_dir, "jieba.dict.utf8"),
      hmm_path=os.path.join(dict_dir, "hmm_model.utf8"),
      user_dict_path=os.path.join(dict_dir, "user.dict.utf8"),
      idf_path=os.path.join(dict_dir, "idf.utf8"),
      stop_word_path=os.path.join(dict_dir, "stop_words.utf8"))
def build_op_no_file(self, sentence):
  """Build the jieba-cut graph op with dictionaries supplied in-memory.

  All five cppjieba resources are read from
  ``$MAIN_ROOT/tools/cppjieba/dict`` and their line contents are passed
  to the op (``use_file=False``) instead of file paths.
  """
  resource_dir = os.path.join(os.environ["MAIN_ROOT"], "tools/cppjieba/dict")

  def load(name):
    # Fetch one dictionary file as a list of lines.
    return read_lines_from_text_file(os.path.join(resource_dir, name))

  return py_x_ops.jieba_cut(
      sentence,
      use_file=False,
      hmm=True,
      dict_lines=load("jieba.dict.utf8"),
      model_lines=load("hmm_model.utf8"),
      user_dict_lines=load("user.dict.utf8"),
      idf_lines=load("idf.utf8"),
      stop_word_lines=load("stop_words.utf8"))
def build_op(self, sentence):
  """Build the jieba-cut graph op, pointing it at on-disk dictionary files.

  Dictionary paths are resolved relative to ``$MAIN_ROOT`` and handed to
  the op, which reads the files itself (the default ``use_file`` mode).
  """
  resource_dir = os.path.join(os.environ["MAIN_ROOT"], "tools/cppjieba/dict")
  path_of = lambda name: os.path.join(resource_dir, name)
  return py_x_ops.jieba_cut(
      sentence,
      hmm=True,
      dict_path=path_of("jieba.dict.utf8"),
      hmm_path=path_of("hmm_model.utf8"),
      user_dict_path=path_of("user.dict.utf8"),
      idf_path=path_of("idf.utf8"),
      stop_word_path=path_of("stop_words.utf8"))
def pre_process_pipeline(self, input_sentences):
  """Data pipeline function for pre-processing.

  Routing is driven by ``self.task_config``: English text is optionally
  cleaned; other languages are passed through, word-segmented via the
  jieba op, or character-split.
  """
  cfg = self.task_config
  language = cfg["language"]
  clean_english = cfg.get("clean_english", False)
  split_by_space = cfg.get("split_by_space", False)
  use_word = cfg.get("use_word", False)

  # Early returns replace the original nested if/else; branch outcomes match.
  if language == "english":
    return clean_english_str_tf(input_sentences) if clean_english else input_sentences
  if split_by_space:
    return input_sentences
  if not use_word:
    return char_cut_tf(input_sentences)

  # Word segmentation: the op reads dictionaries from disk under $MAIN_ROOT.
  dict_dir = os.path.join(os.environ["MAIN_ROOT"], "tools/cppjieba/dict")
  return py_x_ops.jieba_cut(
      input_sentences,
      hmm=True,
      dict_path=os.path.join(dict_dir, "jieba.dict.utf8"),
      hmm_path=os.path.join(dict_dir, "hmm_model.utf8"),
      user_dict_path=os.path.join(dict_dir, "user.dict.utf8"),
      idf_path=os.path.join(dict_dir, "idf.utf8"),
      stop_word_path=os.path.join(dict_dir, "stop_words.utf8"))
def chinese_word_cut_tf(input_str, use_file=False):
  """Segment Chinese text with the cppjieba TF op using its default resources.

  Unlike the path/lines variants elsewhere in this project, no dictionary
  arguments are supplied here — presumably the op falls back to its
  built-in defaults in that case (TODO: confirm against the op's kernel).

  Args:
    input_str: tensor of UTF-8 encoded strings to segment.
    use_file: forwarded to the op; selects file-based vs in-memory
      dictionary loading inside the kernel.

  Returns:
    Tensor of segmented strings produced by ``py_x_ops.jieba_cut``.
  """
  output_str = py_x_ops.jieba_cut(input_str, use_file=use_file, hmm=True)
  return output_str
def build_op_no_file(self, sentence):
  """Build the jieba-cut graph op without supplying any dictionary resources.

  The op is invoked with ``use_file=False`` and HMM enabled; no dictionary
  paths or lines are passed.
  """
  return py_x_ops.jieba_cut(sentence, use_file=False, hmm=True)