Ejemplo n.º 1
0
class InputEngineRnn:
    """Run word/letter prediction against a frozen TensorFlow graph.

    The imported graph exposes a word-level model ("WordModel") that is fed
    the context word ids, and a letter-level model ("LetterModel") that is
    fed the key ids of the word being typed.  When ``use_phrase`` is true the
    phrase outputs of the word model are additionally fetched and may replace
    one of the predicted words with a phrase.
    """

    def __init__(self, graph_file, vocab_path, config_name, use_phrase=False):
        """Load vocabularies, the configuration and the frozen graph.

        Args:
            graph_file: Path of the serialized ``GraphDef`` to import.
            vocab_path: Directory holding the vocabulary files and the config.
            config_name: Name handed to ``Config.get_config``.
            use_phrase: Whether phrase prediction is enabled.
        """
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")

        self.use_phrase = use_phrase
        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                                         vocab_file_in_letters=vocab_file_in_letters,
                                         vocab_file_out=vocab_file_out,
                                         vocab_file_phrase=vocab_file_phrase)
        print("in words vocabulary size = %d\nout words vocabulary size = %d\nin letters vocabulary size = %d"
              "\nphrase vocabulary size = %d" % (
                self._config.vocab_size_in, self._config.vocab_size_out, self._config.vocab_size_letter,
                self._config.vocab_size_phrase))

        # tf.import_graph_def() is called without a name, so every tensor of
        # the frozen graph lives under the "import/" scope.
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"

        # NOTE(review): the phrase tensor names carry a space after the colon;
        # they are kept verbatim - confirm the TF name lookup accepts them.
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction: 1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities: 0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction: 1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities: 0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase: 0"

        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"
        # predict_file() stops after this many input lines.
        self.max_test_line = 10000

        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)

        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def _initial_states(self):
        """Return zero-filled (word model, letter model) state arrays."""
        # NOTE(review): shape is [num_layers, 2, 1, hidden]; presumably the
        # (c, h) pair of each LSTM layer with batch size 1 - confirm.
        lm_state = np.zeros([self._config.num_layers, 2, 1, self._config.word_hidden_size], dtype=np.float32)
        kc_state = np.zeros([self._config.num_layers, 2, 1, self._config.letter_hidden_size], dtype=np.float32)
        return lm_state, kc_state

    def predict(self, sentence, k):
        """Predict the top-``k`` candidates for the word being typed.

        Args:
            sentence: Raw input handed to ``DataUtility.sentence2ids``.
            k: Value fed to the letter model's ``top_k`` tensor.

        Returns:
            A list of ``{'word': ..., 'probability': ...}`` dicts built from
            the predictions made after the last letter, or ``[]`` when no
            letter was fed.  A predicted ``'<unk>'`` is reported as the typed
            letters wrapped in angle brackets.
        """
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(sentence)

        lm_state_out, kc_state_out = self._initial_states()
        # All per-call results are plain locals; they used to be declared
        # ``global`` and leaked into the module namespace.
        words_out = []
        probability_topk = []
        phrase_logits = None
        phrase_p_top_k = []
        probability_p_topk = []
        has_context = len(inputs) > 0

        # Phase I: read contexts.
        for i, word_id in enumerate(inputs):
            feed_values = {self.lm_input_name: [[word_id]]}
            if i > 0:
                # Use previous language model's final state as language model's initial state.
                feed_values[self.lm_state_in_name] = lm_state_out
            if self.use_phrase:
                lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                    [self.lm_state_out_name, self.phrase_p_name, self.phrase_p_probability, self.phrase_logits],
                    feed_dict=feed_values)
                phrase_p_top_k = list(phrase_p_top_k[0])
                probability_p_topk = [phrase_p_prob[0][idx] for idx in phrase_p_top_k]
            else:
                lm_state_out = self._sess.run([self.lm_state_out_name], feed_dict=feed_values)[0]

        # Phase II: read letters, predict by feeding the letters one-by-one.
        # NOTE(review): predict_data() also feeds self.key_length here; confirm
        # whether this path needs it as well.
        for i, key_id in enumerate(inputs_key):
            feed_values = {self.kc_input_name: [[key_id]],
                           self.kc_top_k_name: k}
            first_letter_with_context = i == 0 and has_context
            if first_letter_with_context:
                # Seed the letter model with the language model's final state.
                feed_values[self.kc_lm_state_in_name] = lm_state_out
            else:
                # Carry the letter model's own state from the previous letter.
                feed_values[self.kc_state_in_name] = kc_state_out
            probabilities, top_k_predictions, kc_state_out = self._sess.run(
                [self.kc_output_name, self.kc_top_k_prediction_name, self.kc_state_out_name],
                feed_dict=feed_values)
            probability_topk = [probabilities[0][idx] for idx in top_k_predictions[0]]
            words_out = self._data_utility.ids2outwords(top_k_predictions[0])

            # Predict phrase (only right after the context, on the first letter).
            if self.use_phrase and first_letter_with_context:
                top_phrase = self._data_utility.get_top_phrase(phrase_logits, words_out[0])
                if top_phrase[0] is not None:
                    _, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
                    words_out, probability_topk = self.final_words_out(words_out, top_phrase, phrase_p,
                                                                       probability_topk)

        if len(words_out) == 0:
            return []
        return [{'word': word if word != '<unk>' else '<' + word_letters + '>',
                 'probability': float(probability)}
                for word, probability in zip(words_out, probability_topk)]

    def predict_data(self, sentence, k):
        """Predict every word of a test line, letter by letter.

        Args:
            sentence: One line of test data for ``DataUtility.data2ids_line``.
            k: Value fed to the letter model's ``top_k`` tensor.

        Returns:
            ``None`` when ``data2ids_line`` returns ``None``; otherwise the
            tuple ``(words_line, letters_line, out_str_list,
            probability_topk_list)`` where the two lists hold, per word, the
            predicted words and their probabilities after each letter.
        """
        sentence = sentence.rstrip()
        res = self._data_utility.data2ids_line(sentence)
        if res is None:
            return None
        words_line, letters_line, words_ids, letters_ids, _words_num, _letters_num = res

        out_str_list = []
        probability_topk_list = []
        phrase_logits = None
        phrase_p_top_k = []
        probability_p_topk = []

        lm_state_out, kc_state_out = self._initial_states()

        for i, word_id in enumerate(words_ids):
            words_out = []
            probs_out = []
            # Phase I: read contexts.
            feed_values = {self.lm_input_name: [[word_id]]}
            if i > 0:
                # Use previous language model's final state as language model's initial state.
                feed_values[self.lm_state_in_name] = lm_state_out
            if self.use_phrase:
                lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                    [self.lm_state_out_name, self.phrase_p_name, self.phrase_p_probability,
                     self.phrase_logits], feed_dict=feed_values)
                phrase_p_top_k = list(phrase_p_top_k[0])
                probability_p_topk = [phrase_p_prob[0][idx] for idx in phrase_p_top_k]
            else:
                lm_state_out = self._sess.run([self.lm_state_out_name], feed_dict=feed_values)[0]

            # No letters are available for this word: stop.
            if i == len(letters_ids):
                break

            # Phase II: read letters, predict by feeding the letters one-by-one.
            for j, letter_id in enumerate(letters_ids[i]):
                feed_values = {self.kc_input_name: [[letter_id]],
                               self.kc_top_k_name: k, self.key_length: [1]}

                # words_ids is never empty inside this loop, so the first
                # letter is always seeded from the language model state.
                if j == 0:
                    feed_values[self.kc_lm_state_in_name] = lm_state_out
                else:
                    # Carry the letter model's own state from the previous letter.
                    feed_values[self.kc_state_in_name] = kc_state_out
                probabilities, top_k_predictions, kc_state_out = self._sess.run(
                    [self.kc_output_name, self.kc_top_k_prediction_name, self.kc_state_out_name],
                    feed_dict=feed_values)
                probability_topk = [probabilities[0][idx] for idx in top_k_predictions[0]]
                words = self._data_utility.ids2outwords(top_k_predictions[0])

                # Predict phrase (first letter of every word except the first).
                if self.use_phrase and j == 0 and i > 0:
                    top_phrase = self._data_utility.get_top_phrase(phrase_logits, words[0])
                    if top_phrase[0] is not None:
                        _, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
                        words, probability_topk = self.final_words_out(words, top_phrase, phrase_p,
                                                                       probability_topk)
                words_out.append(words)
                probs_out.append(probability_topk)

            # For the first word the first prediction is replaced by three
            # empty candidates (result_print reports those as probability 0.0).
            out_str = words_out if i > 0 else [['', '', '']] + words_out[1:]
            out_str_list.append(out_str)
            probability_topk_list.append(probs_out)

        return words_line, letters_line, out_str_list, probability_topk_list

    def calculate_phrase_p(self, top_phrase, probability_p_topk, phrase_p_top_k):
        """Combine the "is a phrase" probability with the top phrase's score.

        Args:
            top_phrase: Sequence whose item 1 is the phrase's own probability.
            probability_p_topk: Probabilities aligned with ``phrase_p_top_k``.
            phrase_p_top_k: List of class ids; the entry equal to ``1``
                selects the probability that is used.

        Returns:
            ``(is_phrase_p, phrase_p)`` where ``phrase_p`` is their product.

        Raises:
            ValueError: If ``1`` does not occur in ``phrase_p_top_k``.
        """
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        phrase_p = is_phrase_p * top_phrase[1]
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        """Overwrite the first candidate the phrase is at least as likely as.

        Both ``words`` and ``probability_topk`` are modified in place (at most
        one position) and returned.
        """
        for i, probability in enumerate(probability_topk):
            if phrase_p >= probability:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        """Format candidates as ``word:prob|word:prob|...``.

        An empty word is always reported with probability ``"0.0"``.
        """
        return "|".join(word + ":" + (str(prob) if word != "" else "0.0")
                        for word, prob in zip(out_string, out_prob))

    def predict_file(self, test_file_in, test_file_out, k):
        """Run ``predict_data`` on each line of a file and record the results.

        At most ``self.max_test_line`` lines are processed.  Every record is
        printed and written to ``test_file_out``; the elapsed time in seconds
        is printed at the end.

        Args:
            test_file_in: Path of the test data, one sentence per line.
            test_file_out: Path of the result file (overwritten).
            k: Value fed to the letter model's ``top_k`` tensor.
        """
        # ``with`` guarantees both files are closed even if a prediction fails.
        with open(test_file_in, "r") as testfilein, open(test_file_out, 'w') as testfileout:
            t1 = time.time()

            for line_count, sentence in enumerate(testfilein, start=1):
                if line_count > self.max_test_line:
                    break
                result = self.predict_data(sentence.rstrip(), k)
                if result is None:
                    continue
                words_line, letters_line, out_words_list, out_prob_list = result

                for i in range(len(out_words_list)):
                    # context words |#| typed letters |#| remaining words |#| predictions
                    record = ("\t".join(words_line[:i])
                              + "|#|" + " ".join(letters_line[i])
                              + "|#|" + "\t".join(words_line[i:]) + "|#|"
                              + '\t'.join(self.result_print(out_words, out_prob)
                                          for (out_words, out_prob) in zip(out_words_list[i], out_prob_list[i]))
                              + "\n")
                    print(record)
                    testfileout.write(record)

            t2 = time.time()
            print(t2 - t1)
Ejemplo n.º 2
0
class InputEngineRnn:
    """Run word/letter prediction against a frozen TensorFlow graph.

    The imported graph exposes a word-level model ("WordModel") that is fed
    the context word ids, and a letter-level model ("LetterModel") that is
    fed the key ids of the word being typed.  ``predict`` additionally uses
    the phrase outputs of the word model.
    """

    def __init__(self, graph_file, vocab_path, config_name):
        """Load vocabularies, the configuration and the frozen graph.

        Args:
            graph_file: Path of the serialized ``GraphDef`` to import.
            vocab_path: Directory holding the vocabulary files and the config.
            config_name: Name handed to ``Config.get_config``.
        """
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")

        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            vocab_file_phrase=vocab_file_phrase)
        print(
            "in words vocabulary size = %d\nout words vocabulary size = %d\nin letters vocabulary size = %d"
            "\nphrase vocabulary size = %d" %
            (self._config.vocab_size_in, self._config.vocab_size_out,
             self._config.vocab_size_letter, self._config.vocab_size_phrase))

        # tf.import_graph_def() is called without a name, so every tensor of
        # the frozen graph lives under the "import/" scope.
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"

        # NOTE(review): the phrase tensor names carry a space after the colon;
        # they are kept verbatim - confirm the TF name lookup accepts them.
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction: 1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities: 0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction: 1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities: 0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase: 0"

        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"

        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)

        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def _initial_states(self):
        """Return zero-filled (word model, letter model) state arrays."""
        # NOTE(review): shape is [num_layers, 2, 1, hidden]; presumably the
        # (c, h) pair of each LSTM layer with batch size 1 - confirm.
        lm_state = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        return lm_state, kc_state

    def predict(self, sentence, k):
        """Predict the top-``k`` candidates for the word being typed.

        Args:
            sentence: Raw input handed to ``DataUtility.sentence2ids``.
            k: Value fed to the letter model's ``top_k`` tensor.

        Returns:
            A list of ``{'word': ..., 'probability': ...}`` dicts built from
            the predictions made after the last letter, or ``[]`` when no
            letter was fed.  A predicted ``'<unk>'`` is reported as the typed
            letters wrapped in angle brackets.
        """
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(
            sentence)
        lm_state_out, kc_state_out = self._initial_states()
        # All per-call results are plain locals; they used to be declared
        # ``global`` and leaked into the module namespace.
        words_out = []
        probability_topk = []
        phrase_logits = None
        phrase_p_top_k = []
        probability_p_topk = []
        has_context = len(inputs) > 0

        # Phase I: feed the context words, chaining the word model state.
        for i, word_id in enumerate(inputs):
            feed_values = {self.lm_input_name: [[word_id]]}
            if i > 0:
                feed_values[self.lm_state_in_name] = lm_state_out
            lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                [
                    self.lm_state_out_name, self.phrase_p_name,
                    self.phrase_p_probability, self.phrase_logits
                ],
                feed_dict=feed_values)
            phrase_p_top_k = list(phrase_p_top_k[0])
            probability_p_topk = [
                phrase_p_prob[0][idx] for idx in phrase_p_top_k
            ]

        # Phase II: feed the letters one by one.
        for i, key_id in enumerate(inputs_key):
            feed_values = {
                self.kc_input_name: [[key_id]],
                self.kc_top_k_name: k
            }
            first_letter_with_context = i == 0 and has_context
            if first_letter_with_context:
                # Seed the letter model with the word model's final state.
                feed_values[self.kc_lm_state_in_name] = lm_state_out
            else:
                # Carry the letter model's own state from the previous letter.
                feed_values[self.kc_state_in_name] = kc_state_out
            probabilities, top_k_predictions, kc_state_out = self._sess.run(
                [
                    self.kc_output_name, self.kc_top_k_prediction_name,
                    self.kc_state_out_name
                ],
                feed_dict=feed_values)
            probability_topk = [
                probabilities[0][idx] for idx in top_k_predictions[0]
            ]
            words_out = self._data_utility.ids2outwords(top_k_predictions[0])

            # Phrase prediction only applies right after the context.
            if first_letter_with_context:
                top_phrase = self._data_utility.get_top_phrase(
                    phrase_logits, words_out[0])
                if top_phrase[0] is not None:
                    _, phrase_p = self.calculate_phrase_p(
                        top_phrase, probability_p_topk, phrase_p_top_k)
                    words_out, probability_topk = self.final_words_out(
                        words_out, top_phrase, phrase_p, probability_topk)

        if len(words_out) == 0:
            return []
        return [{
            'word': word if word != '<unk>' else '<' + word_letters + '>',
            'probability': float(probability)
        } for word, probability in zip(words_out, probability_topk)]

    def calculate_phrase_p(self, top_phrase, probability_p_topk,
                           phrase_p_top_k):
        """Combine the "is a phrase" probability with the top phrase's score.

        Args:
            top_phrase: Sequence whose item 1 is the phrase's own probability.
            probability_p_topk: Probabilities aligned with ``phrase_p_top_k``.
            phrase_p_top_k: List of class ids; the entry equal to ``1``
                selects the probability that is used.

        Returns:
            ``(is_phrase_p, phrase_p)`` where ``phrase_p`` is their product.

        Raises:
            ValueError: If ``1`` does not occur in ``phrase_p_top_k``.
        """
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        phrase_p = is_phrase_p * top_phrase[1]
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        """Overwrite the first candidate the phrase is at least as likely as.

        Both ``words`` and ``probability_topk`` are modified in place (at most
        one position) and returned.
        """
        for i, probability in enumerate(probability_topk):
            if phrase_p >= probability:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        """Format candidates as ``word:prob|word:prob|...``.

        An empty word is always reported with probability ``"0.0"``.
        """
        return "|".join(word + ":" + (str(prob) if word != "" else "0.0")
                        for word, prob in zip(out_string, out_prob))

    def predict_data(self, sentence, k):
        """Predict the word being typed in one test line, letter by letter.

        Args:
            sentence: One line of test data for ``DataUtility.data2ids_line``.
            k: Value fed to the letter model's ``top_k`` tensor.

        Returns:
            ``str()`` of the list holding the predicted words after each
            letter, or ``None`` when ``data2ids_line`` returns ``None``
            (``predict_file`` reports that case as a prediction error).
        """
        sentence = sentence.rstrip()
        res = self._data_utility.data2ids_line(sentence)
        # NOTE(review): assumes data2ids_line signals an unusable line with
        # None; without this guard the unpacking below raises TypeError.
        if res is None:
            return None
        # Context word ids, key ids of the word to predict, number of context
        # words, number of letters of the word to predict.
        inputs, inputs_key, words_num, letters_num = res

        words_out = []
        lm_state, kc_state = self._initial_states()

        # Feed the context words, chaining the word model state.
        for i, word_id in enumerate(inputs):
            feed_values = {self.lm_input_name: [[word_id]]}
            if i > 0:
                feed_values[self.lm_state_in_name] = lm_state
            lm_state = self._sess.run([self.lm_state_out_name],
                                      feed_dict=feed_values)[0]

        for i, key_id in enumerate(inputs_key):
            feed_values = {
                self.kc_input_name: [[key_id]],
                self.kc_top_k_name: k
            }
            if i > 0 or len(inputs) == 0:
                # Carry the letter model's own state (zeros on the first
                # letter when there is no context).
                feed_values[self.kc_state_in_name] = kc_state
            else:
                # First letter after a context: seed from the word model.
                feed_values[self.kc_lm_state_in_name] = lm_state
            _probabilities, top_k_predictions, kc_state = self._sess.run(
                [
                    self.kc_output_name, self.kc_top_k_prediction_name,
                    self.kc_state_out_name
                ],
                feed_dict=feed_values)
            words_out.append(
                self._data_utility.ids2outwords(top_k_predictions[0]))

        # Without context words the first prediction is replaced by three
        # empty candidates.
        if words_num > 0:
            return str(words_out)
        return str([['', '', '']] + words_out[1:])

    def predict_file(self, test_file_in, test_file_out, k):
        """Run ``predict_data`` on each line of a file and record the results.

        Lines are lower-cased before prediction.  Successful predictions are
        printed and written to ``test_file_out``; failures are only printed.
        The elapsed time in seconds is printed at the end.

        Args:
            test_file_in: Path of the test data, one sentence per line.
            test_file_out: Path of the result file (overwritten).
            k: Value fed to the letter model's ``top_k`` tensor.
        """
        # ``with`` guarantees both files are closed even if a prediction fails.
        with open(test_file_in, "r") as testfilein, open(test_file_out, 'w') as testfileout:
            t1 = time.time()
            for sentence in testfilein:
                sentence = sentence.rstrip()
                out_str = self.predict_data(sentence.lower(), k)
                if out_str:
                    print(sentence + " | " + out_str)
                    testfileout.write(sentence + " | " + out_str + "\n")
                else:
                    print("predict error : " + sentence)
            t2 = time.time()
            print(t2 - t1)