def predict_one_sentence(self, sentence):
    """Translate a single source (English) sentence into the target language.

    Args:
        sentence: Raw source-language string to translate.

    Returns:
        str: Space-separated predicted target words (with a trailing space
        per word), with START/END special tokens stripped.
    """
    self.__setup_model()
    # Persisted word->id vocabularies. NOTE(review): np.load on a pickled
    # dict normally needs allow_pickle=True and .item() to get a real dict
    # back -- the later .items() call assumes dict-like behavior; confirm
    # how these files were saved.
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy')
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy')

    # Only the source-side tokenizer is needed to encode the input; the
    # original also constructed a German tokenizer that was never used.
    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    # +3 reserves room for the START/END/UNK special tokens.
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

    print(sentence)
    # Encode, pad/truncate to the fixed sequence length the model expects.
    encoded = en_tokenizer.texts_to_sequences([sentence])
    print(encoded)
    encoded = pad_sequences(encoded, maxlen=self.params['MAX_SEQ_LEN'],
                            padding='post', truncating='post')
    encoded = encoded.reshape(encoded.shape[0], encoded.shape[1])
    print(encoded)

    prediction = self.M.predict(encoded)

    # id -> word lookup for decoding the per-timestep distributions.
    reverse_word_index = dict((i, word) for word, i in self.de_word_index.items())
    predicted_sentence = ""
    for predicted_seq in prediction:  # renamed: no longer shadows `sentence`
        for token in predicted_seq:
            max_idx = np.argmax(token)
            if max_idx == 0:
                # Index 0 has no word mapping (padding id); report the
                # runner-up for debugging but emit nothing.
                print("id of max token = 0")
                print(
                    "second best prediction is ",
                    reverse_word_index[np.argmax(np.delete(token, max_idx))])
            else:
                next_word = reverse_word_index[max_idx]
                if next_word == self.END_TOKEN:
                    break
                elif next_word == self.START_TOKEN:
                    continue
                predicted_sentence += next_word + " "
    return predicted_sentence
def calculate_hiddenstate_after_encoder(self, sentence):
    """Run one sentence through the model up to the encoder layer and
    return the encoder's output (hidden state).

    Args:
        sentence: Raw source-language string.

    Returns:
        numpy.ndarray: Output of the layer named 'encoder' for the encoded
        input; exact shape depends on the encoder layer's configuration.
    """
    self.__setup_model()
    # NOTE(review): loaded .npy vocabularies are assumed to be dict-like
    # (word -> id) -- confirm they were saved with pickling enabled.
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy')
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy')

    # Only the source-side tokenizer is needed here; the original also
    # built an unused German tokenizer (removed as dead code).
    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    # +3 reserves room for the START/END/UNK special tokens.
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

    print(sentence)
    encoded = en_tokenizer.texts_to_sequences([sentence])
    print(encoded)
    encoded = pad_sequences(encoded, maxlen=self.params['MAX_SEQ_LEN'],
                            padding='post', truncating='post')
    encoded = encoded.reshape(encoded.shape[0], encoded.shape[1])
    print(encoded)

    # Truncate the full model at the encoder layer and predict on it.
    encoder_name = 'encoder'
    encoder = Model(inputs=self.M.input,
                    outputs=self.M.get_layer(encoder_name).output)
    prediction = encoder.predict(encoded, batch_size=1)
    print(prediction.shape)
    return prediction
def predict_batch(self, sentences):
    """Translate a batch of source sentences.

    Fixes the original tail-drop bug (flagged by its own TODO): when the
    number of sentences was not a multiple of the batch size, the final
    partial batch was silently skipped. The last slice is now clamped so
    every sentence is translated. Empty input returns an empty list.

    Args:
        sentences: Iterable of raw source-language strings.

    Returns:
        list[str]: One predicted sentence per input, in order.
    """
    self.__setup_model()
    # NOTE(review): loaded .npy vocabularies are assumed dict-like
    # (word -> id) -- confirm they were saved with pickling enabled.
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy')
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy')

    # Only the source-side tokenizer is needed; the original also built an
    # unused German tokenizer (removed as dead code).
    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    # +3 reserves room for the START/END/UNK special tokens.
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

    print(sentences)
    sentences = en_tokenizer.texts_to_sequences(sentences)
    print(sentences)
    sentences = pad_sequences(sentences, maxlen=self.params['MAX_SEQ_LEN'],
                              padding='post', truncating='post')
    sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])

    total = sentences.shape[0]
    batch_size = min(total, 10) if total else 0  # cap memory use at 10/step

    # id -> word lookup for decoding the per-timestep distributions.
    reverse_word_index = dict((i, word) for word, i in self.de_word_index.items())

    def _decode(predicted_seq):
        """Greedy-decode one sequence of token distributions to a string."""
        words = ""
        for token in predicted_seq:
            max_idx = np.argmax(token)
            if max_idx == 0:
                # Index 0 has no word mapping (padding id); report the
                # runner-up for debugging but emit nothing.
                print("id of max token = 0")
                print(
                    "second best prediction is ",
                    reverse_word_index[np.argmax(np.delete(token, max_idx))])
            else:
                next_word = reverse_word_index[max_idx]
                if next_word == self.END_TOKEN:
                    break
                elif next_word == self.START_TOKEN:
                    continue
                words += next_word + " "
        return words

    predicted_sentences = []
    from_idx = 0
    while from_idx < total:
        # Clamp the slice end so a final partial batch is still processed.
        to_idx = min(from_idx + batch_size, total)
        print("from_idx, to_idx, hm_sentences", from_idx, to_idx, total)
        current_batch = sentences[from_idx:to_idx]
        prediction = self.M.predict(current_batch, batch_size=batch_size)
        for predicted_seq in prediction:
            predicted_sentences.append(_decode(predicted_seq))
        from_idx = to_idx
    return predicted_sentences