def chinese_word_cut_tf(input_str):
  """Segment Chinese text with the cppjieba op, passing dictionaries in memory."""
  main_root = os.environ["MAIN_ROOT"]
  dict_path = os.path.join(main_root, "tools/cppjieba/dict/jieba.dict.utf8")
  hmm_path = os.path.join(main_root, "tools/cppjieba/dict/hmm_model.utf8")
  user_dict_path = os.path.join(main_root, "tools/cppjieba/dict/user.dict.utf8")
  idf_path = os.path.join(main_root, "tools/cppjieba/dict/idf.utf8")
  stop_word_path = os.path.join(main_root, "tools/cppjieba/dict/stop_words.utf8")

  # Read the dictionary files here and hand their contents to the op as
  # in-memory lines (use_file=False) instead of file paths.
  dict_lines = read_lines_from_text_file(dict_path)
  model_lines = read_lines_from_text_file(hmm_path)
  user_dict_lines = read_lines_from_text_file(user_dict_path)
  idf_lines = read_lines_from_text_file(idf_path)
  stop_word_lines = read_lines_from_text_file(stop_word_path)

  output_str = py_x_ops.jieba_cut(
      input_str,
      use_file=False,
      hmm=True,
      dict_lines=dict_lines,
      model_lines=model_lines,
      user_dict_lines=user_dict_lines,
      idf_lines=idf_lines,
      stop_word_lines=stop_word_lines)
  return output_str
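# Example usage (a minimal sketch, not from the original source: it assumes the
# delta custom ops are built, MAIN_ROOT points at the repository root, and a
# TF 1.x-style graph session is available via tf.compat.v1):
#
#   import os
#   import tensorflow as tf
#
#   os.environ["MAIN_ROOT"] = "/path/to/delta"  # hypothetical path
#   sentence = tf.constant("我爱自然语言处理")
#   words = chinese_word_cut_tf(sentence)
#   with tf.compat.v1.Session() as sess:
#     print(sess.run(words))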
def build_op_no_file(self, sentence):
  ''' Build the jieba_cut op graph with dictionaries loaded from memory. '''
  main_root = os.environ["MAIN_ROOT"]
  dict_path = os.path.join(main_root, "tools/cppjieba/dict/jieba.dict.utf8")
  hmm_path = os.path.join(main_root, "tools/cppjieba/dict/hmm_model.utf8")
  user_dict_path = os.path.join(main_root, "tools/cppjieba/dict/user.dict.utf8")
  idf_path = os.path.join(main_root, "tools/cppjieba/dict/idf.utf8")
  stop_word_path = os.path.join(main_root, "tools/cppjieba/dict/stop_words.utf8")

  dict_lines = read_lines_from_text_file(dict_path)
  model_lines = read_lines_from_text_file(hmm_path)
  user_dict_lines = read_lines_from_text_file(user_dict_path)
  idf_lines = read_lines_from_text_file(idf_path)
  stop_word_lines = read_lines_from_text_file(stop_word_path)

  words = py_x_ops.jieba_cut(
      sentence,
      use_file=False,
      hmm=True,
      dict_lines=dict_lines,
      model_lines=model_lines,
      user_dict_lines=user_dict_lines,
      idf_lines=idf_lines,
      stop_word_lines=stop_word_lines)
  return words
def tokenize_sentence(texts, max_seq_len, vocab_path):
  """Tokenize sentence"""
  vocabs = read_lines_from_text_file(vocab_path)
  token_ids, _ = py_x_ops.sentence_to_ids(
      texts,
      maxlen=max_seq_len,
      use_vocab_file=False,
      vocab=vocabs,
      load_token_ids_from_vocab=True,
      pad_id=utils.PAD_IDX,
      check_tokens=False)
  return token_ids
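# Example usage (a minimal sketch; "vocab.txt" is a hypothetical
# one-token-per-line vocab file, and utils.PAD_IDX is this package's pad id):
#
#   texts = tf.constant(["i love nlp", "hello world"])
#   token_ids = tokenize_sentence(texts, max_seq_len=30, vocab_path="vocab.txt")
#   # expected: an int tensor padded/truncated to max_seq_len per sentence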
def tokenize_label(label, maxlen, label_vocab_file_path, pad_id):
  """Tokenize labels"""
  vocabs = read_lines_from_text_file(label_vocab_file_path)
  label_id, _ = py_x_ops.sentence_to_ids(
      label,
      maxlen=maxlen,
      use_vocab_file=False,
      vocab=vocabs,
      load_token_ids_from_vocab=True,
      pad_id=pad_id,
      check_tokens=False)
  return label_id
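# Example usage (a minimal sketch mirroring tokenize_sentence above;
# "label_vocab.txt" is a hypothetical one-label-per-line file):
#
#   labels = tf.constant(["positive", "negative"])
#   label_ids = tokenize_label(labels, maxlen=1,
#                              label_vocab_file_path="label_vocab.txt",
#                              pad_id=0)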
def jieba_cut(input_sentence, use_file=True, hmm=True):
  """Segment Chinese text with cppjieba.

  When use_file is True the op reads the dictionary files from disk itself;
  otherwise the files are read here and passed to the op as in-memory lines.
  """
  dict_path = os.path.join(PACKAGE_ROOT_DIR,
                           "./resources/cppjieba_dict/jieba.dict.utf8")
  hmm_path = os.path.join(PACKAGE_ROOT_DIR,
                          "./resources/cppjieba_dict/hmm_model.utf8")
  user_dict_path = os.path.join(PACKAGE_ROOT_DIR,
                                "./resources/cppjieba_dict/user.dict.utf8")
  idf_path = os.path.join(PACKAGE_ROOT_DIR,
                          "./resources/cppjieba_dict/idf.utf8")
  stop_word_path = os.path.join(PACKAGE_ROOT_DIR,
                                "./resources/cppjieba_dict/stop_words.utf8")

  if use_file:
    output_sentence = gen_x_ops.jieba_cut(
        input_sentence,
        use_file=use_file,
        hmm=hmm,
        dict_path=dict_path,
        hmm_path=hmm_path,
        user_dict_path=user_dict_path,
        idf_path=idf_path,
        stop_word_path=stop_word_path)
  else:
    dict_lines = read_lines_from_text_file(dict_path)
    model_lines = read_lines_from_text_file(hmm_path)
    user_dict_lines = read_lines_from_text_file(user_dict_path)
    idf_lines = read_lines_from_text_file(idf_path)
    stop_word_lines = read_lines_from_text_file(stop_word_path)

    output_sentence = gen_x_ops.jieba_cut(
        input_sentence,
        use_file=use_file,
        hmm=hmm,
        dict_lines=dict_lines,
        model_lines=model_lines,
        user_dict_lines=user_dict_lines,
        idf_lines=idf_lines,
        stop_word_lines=stop_word_lines)
  return output_sentence
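# Example usage (a minimal sketch, assuming the packaged gen_x_ops kernels are
# importable; both calls should yield the same segmentation and differ only in
# whether the op itself or this Python wrapper reads the dictionary files):
#
#   sentence = tf.constant("我爱北京天安门")
#   words_from_file = jieba_cut(sentence, use_file=True)
#   words_from_lines = jieba_cut(sentence, use_file=False)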