def predict_one_sentence(self, sentence):
    """Translate a single source (English) sentence into the target language.

    Args:
        sentence: Raw source-language string to translate.

    Returns:
        str: Space-separated predicted target words (with a trailing space
        per word), with START/END special tokens stripped.
    """
    self.__setup_model()
    # Persisted word->id vocabularies. NOTE(review): np.load on a pickled
    # dict normally needs allow_pickle=True and .item() to get a real dict
    # back -- the later .items() call assumes dict-like behavior; confirm
    # how these files were saved.
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy')
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy')

    # Only the source-side tokenizer is needed to encode the input; the
    # original also constructed a German tokenizer that was never used.
    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    # +3 reserves room for the START/END/UNK special tokens.
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

    print(sentence)
    # Encode, pad/truncate to the fixed sequence length the model expects.
    encoded = en_tokenizer.texts_to_sequences([sentence])
    print(encoded)
    encoded = pad_sequences(encoded, maxlen=self.params['MAX_SEQ_LEN'],
                            padding='post', truncating='post')
    encoded = encoded.reshape(encoded.shape[0], encoded.shape[1])
    print(encoded)

    prediction = self.M.predict(encoded)

    # id -> word lookup for decoding the per-timestep distributions.
    reverse_word_index = dict((i, word) for word, i in self.de_word_index.items())
    predicted_sentence = ""
    for predicted_seq in prediction:  # renamed: no longer shadows `sentence`
        for token in predicted_seq:
            max_idx = np.argmax(token)
            if max_idx == 0:
                # Index 0 has no word mapping (padding id); report the
                # runner-up for debugging but emit nothing.
                print("id of max token = 0")
                print(
                    "second best prediction is ",
                    reverse_word_index[np.argmax(np.delete(token, max_idx))])
            else:
                next_word = reverse_word_index[max_idx]
                if next_word == self.END_TOKEN:
                    break
                elif next_word == self.START_TOKEN:
                    continue
                predicted_sentence += next_word + " "
    return predicted_sentence
def calculate_hiddenstate_after_encoder(self, sentence):
    """Run one sentence through the model up to the encoder layer and
    return the encoder's output (hidden state).

    Args:
        sentence: Raw source-language string.

    Returns:
        numpy.ndarray: Output of the layer named 'encoder' for the encoded
        input; exact shape depends on the encoder layer's configuration.
    """
    self.__setup_model()
    # NOTE(review): loaded .npy vocabularies are assumed to be dict-like
    # (word -> id) -- confirm they were saved with pickling enabled.
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy')
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy')

    # Only the source-side tokenizer is needed here; the original also
    # built an unused German tokenizer (removed as dead code).
    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    # +3 reserves room for the START/END/UNK special tokens.
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

    print(sentence)
    encoded = en_tokenizer.texts_to_sequences([sentence])
    print(encoded)
    encoded = pad_sequences(encoded, maxlen=self.params['MAX_SEQ_LEN'],
                            padding='post', truncating='post')
    encoded = encoded.reshape(encoded.shape[0], encoded.shape[1])
    print(encoded)

    # Truncate the full model at the encoder layer and predict on it.
    encoder_name = 'encoder'
    encoder = Model(inputs=self.M.input,
                    outputs=self.M.get_layer(encoder_name).output)
    prediction = encoder.predict(encoded, batch_size=1)
    print(prediction.shape)
    return prediction
def predict_batch(self, sentences):
    """Translate a batch of source sentences.

    Fixes the original tail-drop bug (flagged by its own TODO): when the
    number of sentences was not a multiple of the batch size, the final
    partial batch was silently skipped. The last slice is now clamped so
    every sentence is translated. Empty input returns an empty list.

    Args:
        sentences: Iterable of raw source-language strings.

    Returns:
        list[str]: One predicted sentence per input, in order.
    """
    self.__setup_model()
    # NOTE(review): loaded .npy vocabularies are assumed dict-like
    # (word -> id) -- confirm they were saved with pickling enabled.
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy')
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy')

    # Only the source-side tokenizer is needed; the original also built an
    # unused German tokenizer (removed as dead code).
    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    # +3 reserves room for the START/END/UNK special tokens.
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

    print(sentences)
    sentences = en_tokenizer.texts_to_sequences(sentences)
    print(sentences)
    sentences = pad_sequences(sentences, maxlen=self.params['MAX_SEQ_LEN'],
                              padding='post', truncating='post')
    sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])

    total = sentences.shape[0]
    batch_size = min(total, 10) if total else 0  # cap memory use at 10/step

    # id -> word lookup for decoding the per-timestep distributions.
    reverse_word_index = dict((i, word) for word, i in self.de_word_index.items())

    def _decode(predicted_seq):
        """Greedy-decode one sequence of token distributions to a string."""
        words = ""
        for token in predicted_seq:
            max_idx = np.argmax(token)
            if max_idx == 0:
                # Index 0 has no word mapping (padding id); report the
                # runner-up for debugging but emit nothing.
                print("id of max token = 0")
                print(
                    "second best prediction is ",
                    reverse_word_index[np.argmax(np.delete(token, max_idx))])
            else:
                next_word = reverse_word_index[max_idx]
                if next_word == self.END_TOKEN:
                    break
                elif next_word == self.START_TOKEN:
                    continue
                words += next_word + " "
        return words

    predicted_sentences = []
    from_idx = 0
    while from_idx < total:
        # Clamp the slice end so a final partial batch is still processed.
        to_idx = min(from_idx + batch_size, total)
        print("from_idx, to_idx, hm_sentences", from_idx, to_idx, total)
        current_batch = sentences[from_idx:to_idx]
        prediction = self.M.predict(current_batch, batch_size=batch_size)
        for predicted_seq in prediction:
            predicted_sentences.append(_decode(predicted_seq))
        from_idx = to_idx
    return predicted_sentences