Example 1
def chinese_word_cut_tf(input_str):
    """"""
    main_root = os.environ["MAIN_ROOT"]
    dict_path = os.path.join(main_root, "tools/cppjieba/dict/jieba.dict.utf8")
    hmm_path = os.path.join(main_root, "tools/cppjieba/dict/hmm_model.utf8")
    user_dict_path = os.path.join(main_root,
                                  "tools/cppjieba/dict/user.dict.utf8")
    idf_path = os.path.join(main_root, "tools/cppjieba/dict/idf.utf8")
    stop_word_path = os.path.join(main_root,
                                  "tools/cppjieba/dict/stop_words.utf8")
    dict_lines = read_lines_from_text_file(dict_path)
    model_lines = read_lines_from_text_file(hmm_path)
    user_dict_lines = read_lines_from_text_file(user_dict_path)
    idf_lines = read_lines_from_text_file(idf_path)
    stop_word_lines = read_lines_from_text_file(stop_word_path)

    output_str = py_x_ops.jieba_cut(input_str,
                                    use_file=False,
                                    hmm=True,
                                    dict_lines=dict_lines,
                                    model_lines=model_lines,
                                    user_dict_lines=user_dict_lines,
                                    idf_lines=idf_lines,
                                    stop_word_lines=stop_word_lines)
    return output_str
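
A minimal usage sketch for chinese_word_cut_tf (my own illustration, assuming the DELTA cppjieba custom ops are built, MAIN_ROOT points at the repository root, and TensorFlow 1.x graph mode is in use; the sample sentences and the checkout path are placeholders):

import os
import tensorflow as tf

os.environ.setdefault("MAIN_ROOT", "/path/to/delta")  # hypothetical checkout path

sentences = tf.constant(["我爱北京天安门", "今天天气不错"])
segmented = chinese_word_cut_tf(sentences)  # string tensor of segmented sentences

with tf.Session() as sess:
    print(sess.run(segmented))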
Example 2
def pre_process_text(input_texts, language, split_by_space, use_word):
    """Text pre-processing before tokenize."""
    if language == "english":
        batch = clean_english_str_tf(input_texts)
    else:
        if split_by_space:
            batch = input_texts
        else:
            if use_word:
                main_root = os.environ["MAIN_ROOT"]
                dict_path = os.path.join(
                    main_root, "tools/cppjieba/dict/jieba.dict.utf8")
                hmm_path = os.path.join(main_root,
                                        "tools/cppjieba/dict/hmm_model.utf8")
                user_dict_path = os.path.join(
                    main_root, "tools/cppjieba/dict/user.dict.utf8")
                idf_path = os.path.join(main_root,
                                        "tools/cppjieba/dict/idf.utf8")
                stop_word_path = os.path.join(
                    main_root, "tools/cppjieba/dict/stop_words.utf8")
                batch = py_x_ops.jieba_cut(input_texts,
                                           hmm=True,
                                           dict_path=dict_path,
                                           hmm_path=hmm_path,
                                           user_dict_path=user_dict_path,
                                           idf_path=idf_path,
                                           stop_word_path=stop_word_path)
            else:
                batch = char_cut_tf(input_texts)
    return batch
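
A usage sketch for pre_process_text (my illustration; the "chinese" language value and the sample sentence are assumptions, and the jieba branch still needs MAIN_ROOT and the cppjieba dictionary files to be present):

import tensorflow as tf

raw = tf.constant(["这是一个测试句子"])
# Any language other than "english" falls through to the Chinese branch;
# use_word=True selects jieba word segmentation instead of char_cut_tf.
tokens = pre_process_text(raw, language="chinese",
                          split_by_space=False, use_word=True)

with tf.Session() as sess:
    print(sess.run(tokens))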
Example 3
    def build_op_no_file(self, sentence):
        ''' build graph '''
        main_root = os.environ["MAIN_ROOT"]

        dict_path = os.path.join(main_root,
                                 "tools/cppjieba/dict/jieba.dict.utf8")
        hmm_path = os.path.join(main_root,
                                "tools/cppjieba/dict/hmm_model.utf8")
        user_dict_path = os.path.join(main_root,
                                      "tools/cppjieba/dict/user.dict.utf8")
        idf_path = os.path.join(main_root, "tools/cppjieba/dict/idf.utf8")
        stop_word_path = os.path.join(main_root,
                                      "tools/cppjieba/dict/stop_words.utf8")

        dict_lines = read_lines_from_text_file(dict_path)
        model_lines = read_lines_from_text_file(hmm_path)
        user_dict_lines = read_lines_from_text_file(user_dict_path)
        idf_lines = read_lines_from_text_file(idf_path)
        stop_word_lines = read_lines_from_text_file(stop_word_path)

        words = py_x_ops.jieba_cut(sentence,
                                   use_file=False,
                                   hmm=True,
                                   dict_lines=dict_lines,
                                   model_lines=model_lines,
                                   user_dict_lines=user_dict_lines,
                                   idf_lines=idf_lines,
                                   stop_word_lines=stop_word_lines)
        return words
Example 4
    def build_op(self, sentence):
        ''' build graph '''
        main_root = os.environ["MAIN_ROOT"]
        dict_path = os.path.join(main_root,
                                 "tools/cppjieba/dict/jieba.dict.utf8")
        hmm_path = os.path.join(main_root,
                                "tools/cppjieba/dict/hmm_model.utf8")
        user_dict_path = os.path.join(main_root,
                                      "tools/cppjieba/dict/user.dict.utf8")
        idf_path = os.path.join(main_root, "tools/cppjieba/dict/idf.utf8")
        stop_word_path = os.path.join(main_root,
                                      "tools/cppjieba/dict/stop_words.utf8")

        words = py_x_ops.jieba_cut(sentence,
                                   hmm=True,
                                   dict_path=dict_path,
                                   hmm_path=hmm_path,
                                   user_dict_path=user_dict_path,
                                   idf_path=idf_path,
                                   stop_word_path=stop_word_path)
        return words
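
Examples 3 and 4 differ only in how the dictionaries reach the op: build_op_no_file reads each dictionary file into memory with read_lines_from_text_file and hands the contents to jieba_cut through the *_lines arguments with use_file=False, whereas build_op passes the file paths and lets the op load the files itself.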
Example 5
    def pre_process_pipeline(self, input_sentences):
        """Data pipeline function for pre-processing."""
        language = self.task_config["language"]
        clean_english = self.task_config.get("clean_english", False)
        split_by_space = self.task_config.get("split_by_space", False)
        use_word = self.task_config.get("use_word", False)

        if language == "english":
            if clean_english:
                batch = clean_english_str_tf(input_sentences)
            else:
                batch = input_sentences
        else:
            if split_by_space:
                batch = input_sentences
            else:
                if use_word:
                    main_root = os.environ["MAIN_ROOT"]
                    dict_path = os.path.join(
                        main_root, "tools/cppjieba/dict/jieba.dict.utf8")
                    hmm_path = os.path.join(
                        main_root, "tools/cppjieba/dict/hmm_model.utf8")
                    user_dict_path = os.path.join(
                        main_root, "tools/cppjieba/dict/user.dict.utf8")
                    idf_path = os.path.join(main_root,
                                            "tools/cppjieba/dict/idf.utf8")
                    stop_word_path = os.path.join(
                        main_root, "tools/cppjieba/dict/stop_words.utf8")
                    batch = py_x_ops.jieba_cut(input_sentences,
                                               hmm=True,
                                               dict_path=dict_path,
                                               hmm_path=hmm_path,
                                               user_dict_path=user_dict_path,
                                               idf_path=idf_path,
                                               stop_word_path=stop_word_path)
                else:
                    batch = char_cut_tf(input_sentences)
        return batch
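
For illustration, a task_config that would drive pre_process_pipeline down the jieba word-segmentation branch (the key names are taken from the lookups above; the values themselves are assumptions):

task_config = {
    "language": "chinese",    # anything other than "english" takes the Chinese branch
    "clean_english": False,   # only consulted for English input
    "split_by_space": False,  # input is not already space-tokenized
    "use_word": True,         # word-level jieba_cut instead of char_cut_tf
}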
Example 6
def chinese_word_cut_tf(input_str, use_file=False):
    """"""

    output_str = py_x_ops.jieba_cut(input_str, use_file=use_file, hmm=True)
    return output_str
Example 7
    def build_op_no_file(self, sentence):
        ''' build graph '''
        words = py_x_ops.jieba_cut(sentence, use_file=False, hmm=True)
        return words