Example #1
    def __create_vocab(self):
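        # English (source) side: fit the tokenizer on the combined train and
        # validation texts, then convert both splits to padded index sequences.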
        en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.fit_on_texts(self.train_input_texts + self.val_input_texts)
        self.train_input_texts = en_tokenizer.texts_to_sequences(self.train_input_texts)
        self.train_input_texts = pad_sequences(self.train_input_texts, maxlen=self.params['MAX_SEQ_LEN'],
                                               padding='post',
                                               truncating='post')
        self.__insert_valid_token_at_last_position(self.train_input_texts)

        self.val_input_texts = en_tokenizer.texts_to_sequences(self.val_input_texts)
        self.val_input_texts = pad_sequences(self.val_input_texts, maxlen=self.params['MAX_SEQ_LEN'],
                                             padding='post',
                                             truncating='post')
        self.__insert_valid_token_at_last_position(self.val_input_texts)
        self.en_word_index = en_tokenizer.word_index

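        # German (target) side: same procedure with its own tokenizer and word limit.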
        de_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.fit_on_texts(self.train_target_texts + self.val_target_texts)
        self.train_target_texts = de_tokenizer.texts_to_sequences(self.train_target_texts)
        self.train_target_texts = pad_sequences(self.train_target_texts, maxlen=self.params['MAX_SEQ_LEN'],
                                                padding='post',
                                                truncating='post')
        self.__insert_valid_token_at_last_position(self.train_target_texts)
        self.val_target_texts = de_tokenizer.texts_to_sequences(self.val_target_texts)
        self.val_target_texts = pad_sequences(self.val_target_texts, maxlen=self.params['MAX_SEQ_LEN'],
                                              padding='post',
                                              truncating='post')
        self.__insert_valid_token_at_last_position(self.val_target_texts)
        self.de_word_index = de_tokenizer.word_index

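        # Load the pre-trained GloVe vectors into a word -> vector lookup table.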
        embeddings_index = {}
        filename = self.PRETRAINED_GLOVE_FILE
        with open(filename, 'r', encoding='utf8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        print('Found %s word vectors.' % len(embeddings_index))

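        # Build the English embedding matrix; the +3 reserves rows for the start,
        # end and unknown tokens, which are assigned their own fixed vectors.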
        self.num_train_words = self.params['MAX_WORDS_EN'] + 3
        self.en_embedding_matrix = np.zeros((self.num_train_words, self.params['EMBEDDING_DIM']))
        for word, i in self.en_word_index.items():
            if i >= self.params['MAX_WORDS_EN'] + 3:
                continue
            if word == self.START_TOKEN:
                embedding_vector = self.START_TOKEN_VECTOR
            elif word == self.END_TOKEN:
                embedding_vector = self.END_TOKEN_VECTOR
            elif word == self.UNK_TOKEN:
                embedding_vector = self.UNK_TOKEN_VECTOR
            else:
                embedding_vector = embeddings_index.get(word)
            if embedding_vector is None:
                embedding_vector = self.UNK_TOKEN_VECTOR
            self.en_embedding_matrix[i] = embedding_vector
Example #2
# Exploratory script: read the English-German sentence pairs from deu.txt,
# tokenize the English side and check sequence lengths before padding.
# Tokenizer and pad_sequences are assumed to come from the project's own
# preprocessing utilities (a Keras-style Tokenizer wrapper that takes the
# start, end and unknown tokens plus num_words, and Keras's pad_sequences).
import numpy as np

input_texts = []
target_texts = []
with open('../../DataSets/Training/deu.txt', encoding='UTF-8') as f:
    lines = f.read().split('\n')
for line in lines:
    if not line:
        continue  # the trailing newline leaves an empty last element
    input_text, target_text = line.split('\t')
    input_texts.append(input_text)
    target_texts.append(target_text)
num_samples = len(input_texts)

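# Tokenize the English side and report how many sequences there are and how
# long the longest one is.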
en_tokenizer = Tokenizer("GO_", "_EOS", "_UNK", num_words=30000)
en_tokenizer.fit_on_texts(input_texts)
train_input_texts = en_tokenizer.texts_to_sequences(input_texts)
lengths = [len(text) for text in train_input_texts]
print(len(lengths))
print(np.max(np.array(lengths)))
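# Pad/truncate every sequence to a fixed length of 100 tokens.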
train_input_texts = pad_sequences(train_input_texts,
                                  maxlen=100,
                                  padding='post',
                                  truncating='post')
en_word_index = en_tokenizer.word_index

exit()