Example #1
  def test_clean_english_str_tf(self):
    t_sentence_in = tf.placeholder(dtype=tf.string)
    t_sentence_out = clean_english_str_tf(t_sentence_in)
    with self.cached_session(use_gpu=False, force_gpu=False) as sess:
      # Scalar string input: lowercased, contraction and punctuation split off.
      sentence_out = sess.run(t_sentence_out,
                              {t_sentence_in: "I'd like to have an APPLE! "})
      logging.info(sentence_out)
      self.assertEqual("i 'd like to have an apple !",
                       sentence_out.decode("utf-8"))
      # Batched (1-D) string input: the same cleaning applies element-wise.
      sentence_out = sess.run(t_sentence_out,
                              {t_sentence_in: ["I'd like to have an APPLE! "]})
      logging.info(sentence_out)
      self.assertEqual("i 'd like to have an apple !",
                       sentence_out[0].decode("utf-8"))
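
For reference, the transformation this test asserts (lowercasing, splitting the contraction into "i 'd", detaching the trailing "!", and trimming whitespace) can be mirrored in a few lines of plain Python. The sketch below only illustrates the expected behaviour; it is not the library's clean_english_str_tf op, and the helper name is invented here.

import re

def clean_english_str_py(sentence):
  """Plain-Python sketch of the cleaning asserted above (not the TF op)."""
  s = sentence.lower()                   # "APPLE" -> "apple"
  s = re.sub(r"'(\w+)", r" '\1", s)      # "i'd" -> "i 'd" (simplified rule)
  s = re.sub(r"([!?.,])", r" \1 ", s)    # pad punctuation with spaces
  return re.sub(r"\s+", " ", s).strip()  # squeeze and trim whitespace

assert clean_english_str_py("I'd like to have an APPLE! ") == "i 'd like to have an apple !"
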
Example #2
  def pre_process_pipeline(self, input_sentences):
    """Data pipeline function for pre-processing."""
    language = self.task_config["language"]
    clean_english = self.task_config.get("clean_english", False)
    split_by_space = self.task_config.get("split_by_space", False)
    use_word = self.task_config.get("use_word", False)

    if language == "english":
      if clean_english:
        # Normalize English text (lowercase, detach punctuation and contractions).
        batch = clean_english_str_tf(input_sentences)
      else:
        batch = input_sentences
    else:
      if split_by_space:
        # Input is already whitespace-tokenized: pass through unchanged.
        batch = input_sentences
      elif use_word:
        # Word-level Chinese segmentation.
        batch = chinese_word_cut_tf(input_sentences)
      else:
        # Character-level segmentation.
        batch = char_cut_tf(input_sentences)
    return batch
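
To make the branching above concrete, here is a small self-contained sketch that mirrors the dispatch with stand-in functions. choose_branch and the stub tokenizers are hypothetical names used only for illustration; in the real pipeline these calls are TensorFlow string ops and the method returns a tensor.

def clean_english_str_tf(x):
  return ("clean_english_str_tf", x)  # stub standing in for the real TF op

def chinese_word_cut_tf(x):
  return ("chinese_word_cut_tf", x)   # stub standing in for the real TF op

def char_cut_tf(x):
  return ("char_cut_tf", x)           # stub standing in for the real TF op

def choose_branch(task_config, sentences):
  """Mirrors the dispatch in pre_process_pipeline above."""
  if task_config["language"] == "english":
    if task_config.get("clean_english", False):
      return clean_english_str_tf(sentences)
    return sentences
  if task_config.get("split_by_space", False):
    return sentences
  if task_config.get("use_word", False):
    return chinese_word_cut_tf(sentences)
  return char_cut_tf(sentences)

# English text with cleaning enabled goes through clean_english_str_tf.
assert choose_branch({"language": "english", "clean_english": True}, "Hi!")[0] == "clean_english_str_tf"
# Non-English text falls back to character-level cutting by default.
assert choose_branch({"language": "chinese"}, "你好")[0] == "char_cut_tf"
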
Example #3
    def pre_process_pipeline(self, input_sentences):
        """Data pipeline function for pre-processing."""
        language = self.task_config["language"]
        clean_english = self.task_config.get("clean_english", False)
        split_by_space = self.task_config.get("split_by_space", False)
        use_word = self.task_config.get("use_word", False)

        if language == "english":
            if clean_english:
                batch = clean_english_str_tf(input_sentences)
            else:
                batch = input_sentences
        else:
            if split_by_space:
                # Input is already whitespace-tokenized: pass through unchanged.
                batch = input_sentences
            elif use_word:
                # Word-level segmentation with the cppjieba-backed jieba_cut
                # custom op; dictionary files are resolved relative to
                # $MAIN_ROOT/tools/cppjieba/dict.
                main_root = os.environ["MAIN_ROOT"]
                dict_dir = os.path.join(main_root, "tools/cppjieba/dict")
                batch = py_x_ops.jieba_cut(
                    input_sentences,
                    hmm=True,
                    dict_path=os.path.join(dict_dir, "jieba.dict.utf8"),
                    hmm_path=os.path.join(dict_dir, "hmm_model.utf8"),
                    user_dict_path=os.path.join(dict_dir, "user.dict.utf8"),
                    idf_path=os.path.join(dict_dir, "idf.utf8"),
                    stop_word_path=os.path.join(dict_dir, "stop_words.utf8"))
            else:
                # Character-level segmentation.
                batch = char_cut_tf(input_sentences)
        return batch
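
The jieba_cut branch above depends on several cppjieba dictionary files under $MAIN_ROOT. A quick existence check before building the pipeline gives clearer errors than a failure inside the custom op; the helper below is a hypothetical sketch based only on the paths used above, not part of the library.

import os

def check_cppjieba_resources(main_root=None):
  """Verify the dictionary files the jieba_cut branch above expects."""
  main_root = main_root or os.environ["MAIN_ROOT"]
  dict_dir = os.path.join(main_root, "tools/cppjieba/dict")
  required = [
      "jieba.dict.utf8",  # main dictionary
      "hmm_model.utf8",   # HMM model, needed since hmm=True is passed
      "user.dict.utf8",   # user-defined words
      "idf.utf8",         # IDF weights
      "stop_words.utf8",  # stop-word list
  ]
  missing = [name for name in required
             if not os.path.exists(os.path.join(dict_dir, name))]
  if missing:
    raise FileNotFoundError(
        "missing cppjieba resources under {}: {}".format(dict_dir, missing))
  return dict_dir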