Example #1
    def __create_vocab(self):
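        # Fit the English (and, below, the German) tokenizer on train + validation
        # texts, convert them to integer sequences and pad/truncate to MAX_SEQ_LEN.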
        en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.fit_on_texts(self.train_input_texts + self.val_input_texts)
        self.train_input_texts = en_tokenizer.texts_to_sequences(self.train_input_texts)
        self.train_input_texts = pad_sequences(self.train_input_texts, maxlen=self.params['MAX_SEQ_LEN'],
                                               padding='post',
                                               truncating='post')
        self.__insert_valid_token_at_last_position(self.train_input_texts)

        self.val_input_texts = en_tokenizer.texts_to_sequences(self.val_input_texts)
        self.val_input_texts = pad_sequences(self.val_input_texts, maxlen=self.params['MAX_SEQ_LEN'],
                                             padding='post',
                                             truncating='post')
        self.__insert_valid_token_at_last_position(self.val_input_texts)
        self.en_word_index = en_tokenizer.word_index

        de_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.fit_on_texts(self.train_target_texts + self.val_target_texts)
        self.train_target_texts = de_tokenizer.texts_to_sequences(self.train_target_texts)
        self.train_target_texts = pad_sequences(self.train_target_texts, maxlen=self.params['MAX_SEQ_LEN'],
                                                padding='post',
                                                truncating='post')
        self.__insert_valid_token_at_last_position(self.train_target_texts)
        self.val_target_texts = de_tokenizer.texts_to_sequences(self.val_target_texts)
        self.val_target_texts = pad_sequences(self.val_target_texts, maxlen=self.params['MAX_SEQ_LEN'],
                                              padding='post',
                                              truncating='post')
        self.__insert_valid_token_at_last_position(self.val_target_texts)
        self.de_word_index = de_tokenizer.word_index

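        # Load the pre-trained GloVe vectors into a word -> vector lookup table.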
        embeddings_index = {}
        filename = self.PRETRAINED_GLOVE_FILE
        with open(filename, 'r', encoding='utf8') as f:
            for line in f.readlines():
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        print('Found %s word vectors.' % len(embeddings_index))

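        # Build the English embedding matrix: the special tokens get their own
        # (random) vectors, words missing from GloVe fall back to the UNK vector.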
        self.num_train_words = self.params['MAX_WORDS_EN'] + 3
        self.en_embedding_matrix = np.zeros((self.num_train_words, self.params['EMBEDDING_DIM']))
        for word, i in self.en_word_index.items():
            if i >= self.params['MAX_WORDS_EN'] + 3:
                continue
            embedding_vector = None
            if word == self.START_TOKEN:
                embedding_vector = self.START_TOKEN_VECTOR
            elif word == self.END_TOKEN:
                embedding_vector = self.END_TOKEN_VECTOR
            elif word == self.UNK_TOKEN:
                embedding_vector = self.UNK_TOKEN_VECTOR
            else:
                embedding_vector = embeddings_index.get(word)
            if embedding_vector is None:
                embedding_vector = self.UNK_TOKEN_VECTOR
            self.en_embedding_matrix[i] = embedding_vector

    def predict_one_sentence(self, sentence):
        self.__setup_model()

        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy').item()
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy').item()

        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.word_index = self.en_word_index
        en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.word_index = self.de_word_index
        de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentence)
        sentence = en_tokenizer.texts_to_sequences([sentence])
        print(sentence)
        sentence = pad_sequences(sentence,
                                 maxlen=self.params['MAX_SEQ_LEN'],
                                 padding='post',
                                 truncating='post')
        sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
        print(sentence)

        prediction = self.M.predict(sentence)

        predicted_sentence = ""
        reverse_word_index = dict(
            (i, word) for word, i in self.de_word_index.items())
        for sentence in prediction:
            for token in sentence:
                max_idx = np.argmax(token)
                if max_idx == 0:
                    print("id of max token = 0")
                    second_best = np.argmax(token[1:]) + 1
                    print("second best prediction is ",
                          reverse_word_index[second_best])
                else:
                    next_word = reverse_word_index[max_idx]
                    if next_word == self.END_TOKEN:
                        break
                    elif next_word == self.START_TOKEN:
                        continue
                    predicted_sentence += next_word + " "

        return predicted_sentence

    def calculate_hiddenstate_after_encoder(self, sentence):
        self.__setup_model()

        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy').item()
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy').item()

        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.word_index = self.en_word_index
        en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.word_index = self.de_word_index
        de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentence)
        sentence = en_tokenizer.texts_to_sequences([sentence])
        print(sentence)
        sentence = pad_sequences(sentence,
                                 maxlen=self.params['MAX_SEQ_LEN'],
                                 padding='post',
                                 truncating='post')
        sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
        print(sentence)

        encoder_name = 'encoder'

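        # Sub-model that stops at the LSTM layer named 'encoder', exposing its
        # per-timestep hidden states for the given input sentence.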
        encoder = Model(inputs=self.M.input,
                        outputs=self.M.get_layer(encoder_name).output)

        prediction = encoder.predict(sentence, batch_size=1)
        print(prediction.shape)
        return prediction
Example #4
class Seq2Seq2(BaseModel):
    def __init__(self):
        BaseModel.__init__(self)
        self.identifier = 'WordBasedSeq2Seq1000Units20EpochsGLOVE'

        self.params['batch_size'] = 64
        self.params['val_batch_size'] = 256
        self.params['epochs'] = 20
        self.params['latent_dim'] = 1000
        self.params['MAX_SEQ_LEN'] = 100
        self.params['EMBEDDING_DIM'] = 300
        self.params['MAX_WORDS_DE'] = 40000
        self.params['MAX_WORDS_EN'] = 40000
        self.params['P_DENSE_DROPOUT'] = 0.2
        self.params['VALIDATION_FREQ'] = 1

        self.BASE_DATA_DIR = "../../DataSets"
        self.BASIC_PERSISTENCE_DIR = '../../Persistence/' + self.identifier
        if not os.path.exists(self.BASIC_PERSISTENCE_DIR):
            os.makedirs(self.BASIC_PERSISTENCE_DIR)
        self.MODEL_DIR = os.path.join(self.BASIC_PERSISTENCE_DIR)
        self.GRAPH_DIR = os.path.join(self.BASIC_PERSISTENCE_DIR, 'Graph')
        self.MODEL_CHECKPOINT_DIR = os.path.join(self.BASIC_PERSISTENCE_DIR)
        self.WEIGHT_FILES = []

        checkpoint_files = os.listdir(self.MODEL_CHECKPOINT_DIR)
        for file in checkpoint_files:
            if file.endswith("hdf5"):
                self.WEIGHT_FILES.append(
                    os.path.join(self.MODEL_CHECKPOINT_DIR, file))
        self.WEIGHT_FILES.sort(
            key=lambda x: int(x.split('model.')[1].split('-')[0]))
        if len(self.WEIGHT_FILES) == 0:
            print("no weight files found")
        else:
            self.LATEST_MODELCHKPT = self.WEIGHT_FILES[len(self.WEIGHT_FILES) -
                                                       1]
        self.TRAIN_DATA_FILE = os.path.join(
            self.BASE_DATA_DIR, 'Training/DE_EN_(tatoeba)_train.txt')
        self.VAL_DATA_FILE = os.path.join(
            self.BASE_DATA_DIR, 'Validation/DE_EN_(tatoeba)_validation.txt')
        self.model_file = os.path.join(self.MODEL_DIR, 'model.h5')
        self.PRETRAINED_GLOVE_FILE = os.path.join(self.BASE_DATA_DIR,
                                                  'glove.6B.300d.txt')
        self.START_TOKEN = "_GO"
        self.END_TOKEN = "_EOS"
        self.UNK_TOKEN = "_UNK"

        self.preprocessing = False
        self.use_bleu_callback = False

    def __insert_valid_token_at_last_position(self, texts):
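        # Force the last position of every truncated sequence to the end-token
        # index (2) unless it already holds padding (0) or the end token.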
        for sent in texts:
            if not (sent[self.params['MAX_SEQ_LEN'] - 1] == 0
                    or sent[self.params['MAX_SEQ_LEN'] - 1] == 2):
                sent[self.params['MAX_SEQ_LEN'] - 1] = 2

    def __create_vocab(self):
        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.fit_on_texts(self.train_input_texts +
                                  self.val_input_texts)
        self.train_input_texts = en_tokenizer.texts_to_sequences(
            self.train_input_texts)
        self.train_input_texts = pad_sequences(
            self.train_input_texts,
            maxlen=self.params['MAX_SEQ_LEN'],
            padding='post',
            truncating='post')
        self.__insert_valid_token_at_last_position(self.train_input_texts)

        self.val_input_texts = en_tokenizer.texts_to_sequences(
            self.val_input_texts)
        self.val_input_texts = pad_sequences(self.val_input_texts,
                                             maxlen=self.params['MAX_SEQ_LEN'],
                                             padding='post',
                                             truncating='post')
        self.__insert_valid_token_at_last_position(self.val_input_texts)
        self.en_word_index = en_tokenizer.word_index

        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.fit_on_texts(self.train_target_texts +
                                  self.val_target_texts)
        self.train_target_texts = de_tokenizer.texts_to_sequences(
            self.train_target_texts)
        self.train_target_texts = pad_sequences(
            self.train_target_texts,
            maxlen=self.params['MAX_SEQ_LEN'],
            padding='post',
            truncating='post')
        self.__insert_valid_token_at_last_position(self.train_target_texts)
        self.val_target_texts = de_tokenizer.texts_to_sequences(
            self.val_target_texts)
        self.val_target_texts = pad_sequences(
            self.val_target_texts,
            maxlen=self.params['MAX_SEQ_LEN'],
            padding='post',
            truncating='post')
        self.__insert_valid_token_at_last_position(self.val_target_texts)
        self.de_word_index = de_tokenizer.word_index

        embeddings_index = {}
        filename = self.PRETRAINED_GLOVE_FILE
        with open(filename, 'r', encoding='utf8') as f:
            for line in f.readlines():
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        print('Found %s word vectors.' % len(embeddings_index))

        self.num_train_words = self.params['MAX_WORDS_EN'] + 3
        self.en_embedding_matrix = np.zeros(
            (self.num_train_words, self.params['EMBEDDING_DIM']))
        for word, i in self.en_word_index.items():
            if i >= self.params['MAX_WORDS_EN'] + 3:
                continue
            embedding_vector = None
            if word == self.START_TOKEN:
                embedding_vector = self.START_TOKEN_VECTOR
            elif word == self.END_TOKEN:
                embedding_vector = self.END_TOKEN_VECTOR
            elif word == self.UNK_TOKEN:
                embedding_vector = self.UNK_TOKEN_VECTOR
            else:
                embedding_vector = embeddings_index.get(word)
            if embedding_vector is None:
                embedding_vector = self.UNK_TOKEN_VECTOR
            self.en_embedding_matrix[i] = embedding_vector

    def start_training(self):
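        # Either preprocess the raw data and persist everything as .npy files,
        # or reload the previously persisted sequences, word indices and
        # embedding matrix.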
        if self.preprocessing is True:
            self.START_TOKEN_VECTOR = np.random.rand(
                self.params['EMBEDDING_DIM'])
            self.END_TOKEN_VECTOR = np.random.rand(
                self.params['EMBEDDING_DIM'])
            self.UNK_TOKEN_VECTOR = np.random.rand(
                self.params['EMBEDDING_DIM'])

            self.train_input_texts, self.train_target_texts = self.__split_data(
                self.TRAIN_DATA_FILE)
            self.num_train_samples = len(self.train_input_texts)
            self.val_input_texts, self.val_target_texts = self.__split_data(
                self.VAL_DATA_FILE,
                save_unpreprocessed_targets=self.use_bleu_callback)
            self.__create_vocab()
            if self.use_bleu_callback is False:
                np.save(self.BASIC_PERSISTENCE_DIR + '/val_target_texts.npy',
                        self.val_target_texts)
            np.save(self.BASIC_PERSISTENCE_DIR + '/train_target_texts.npy',
                    self.train_target_texts)
            np.save(self.BASIC_PERSISTENCE_DIR + '/train_input_texts.npy',
                    self.train_input_texts)
            np.save(self.BASIC_PERSISTENCE_DIR + '/val_input_texts.npy',
                    self.val_input_texts)
            np.save(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy',
                    self.en_word_index)
            np.save(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy',
                    self.de_word_index)
            np.save(self.BASIC_PERSISTENCE_DIR + '/en_embedding_matrix.npy',
                    self.en_embedding_matrix)

        else:
            self.train_input_texts = np.load(self.BASIC_PERSISTENCE_DIR +
                                             '/train_input_texts.npy')
            self.train_target_texts = np.load(self.BASIC_PERSISTENCE_DIR +
                                              '/train_target_texts.npy')
            self.val_input_texts = np.load(self.BASIC_PERSISTENCE_DIR +
                                           '/val_input_texts.npy')
            self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                         '/en_word_index.npy')
            self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                         '/de_word_index.npy')
            self.en_word_index = self.en_word_index.item()
            self.de_word_index = self.de_word_index.item()
            self.en_embedding_matrix = np.load(self.BASIC_PERSISTENCE_DIR +
                                               '/en_embedding_matrix.npy')

            if self.use_bleu_callback:
                self.val_target_texts_no_preprocessing = []
                lines = open(self.VAL_DATA_FILE,
                             encoding='UTF-8').read().split('\n')
                for line in lines:
                    if not line:
                        continue
                    _, target_text = line.split('\t')
                    self.val_target_texts_no_preprocessing.append(target_text)
            else:
                self.val_target_texts = np.load(self.BASIC_PERSISTENCE_DIR +
                                                '/val_target_texts.npy')

        self.num_train_samples = len(self.train_input_texts)

        self.__setup_model(mode='training')

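        # Callbacks: TensorBoard logging plus weight checkpoints; with
        # period=epochs the checkpoint is effectively written only after the
        # final epoch.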
        tbCallBack = callbacks.TensorBoard(log_dir=self.GRAPH_DIR,
                                           histogram_freq=0,
                                           write_grads=True,
                                           write_graph=True,
                                           write_images=True)
        modelCallback = callbacks.ModelCheckpoint(
            self.MODEL_CHECKPOINT_DIR + '/model.{epoch:03d}-{loss:.3f}.hdf5',
            monitor='loss',
            verbose=1,
            save_best_only=False,
            save_weights_only=True,
            mode='auto',
            period=self.params['epochs'])

        if self.use_bleu_callback:
            bleuCallback = BleuCallback(self.de_word_index, self.START_TOKEN,
                                        self.END_TOKEN, self.val_input_texts,
                                        self.val_target_texts_no_preprocessing,
                                        self.params['epochs'])
            used_callbacks = [tbCallBack, modelCallback, bleuCallback]
        else:
            evalCallback = EvalCallback(
                self.__serve_batch(self.val_input_texts,
                                   self.val_target_texts, 'val'),
                int(len(self.val_input_texts) /
                    self.params['val_batch_size']),
                self.identifier,
                frequency=self.params['VALIDATION_FREQ'])
            used_callbacks = [tbCallBack, modelCallback, evalCallback]
        self.M.fit_generator(
            self.__serve_batch(self.train_input_texts, self.train_target_texts,
                               'train'),
            int(len(self.train_input_texts) / self.params['batch_size']),
            epochs=self.params['epochs'],
            verbose=2,
            callbacks=used_callbacks,
            max_queue_size=1,
            shuffle=True)
        self.M.save(self.model_file)

    def __split_data(self, file, save_unpreprocessed_targets=False):
        """
        Reads the data from the given file.
        The two languages in the file have to be separated by a tab
        :param file: file which should be read from
        :return: (input_texts, target_texts)
        """
        input_texts = []
        target_texts = []
        with open(file, encoding='UTF-8') as f:
            lines = f.read().split('\n')
        for line in lines:
            if not line:
                continue
            input_text, target_text = line.split('\t')
            input_texts.append(input_text)
            target_texts.append(target_text)
        if save_unpreprocessed_targets is True:
            self.val_target_texts_no_preprocessing = target_texts.copy()
        assert len(input_texts) == len(target_texts)
        return input_texts, target_texts

    @staticmethod
    def load(file):
        """
        Loads the given file into a list.
        :param file: the file which should be loaded
        :return: list of data
        """
        with open(file, encoding='utf8') as f:
            data = f.readlines()
            # data = []
            # for i in range(MAX_SENTENCES):
            #    data.append(lines[i])
        print('Loaded', len(data), "lines of data.")
        return data

    def __serve_batch(self, input_texts, target_texts, mode):
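        # Endless generator for fit_generator: inputs stay integer sequences,
        # targets are expanded per timestep into one-hot vectors over the
        # German vocabulary (MAX_WORDS_DE + 3 classes).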
        batch_size = self.params['batch_size']
        if mode != 'train':
            batch_size = self.params['val_batch_size']

        counter = 0
        batch_X = np.zeros((batch_size, self.params['MAX_SEQ_LEN']),
                           dtype='int32')
        batch_Y = np.zeros((batch_size, self.params['MAX_SEQ_LEN'],
                            self.params['MAX_WORDS_DE'] + 3),
                           dtype='int16')
        while True:
            for i in range(input_texts.shape[0]):
                in_X = input_texts[i]
                out_Y = np.zeros((1, target_texts.shape[1],
                                  self.params['MAX_WORDS_DE'] + 3),
                                 dtype='int16')
                token_counter = 0
                for token in target_texts[i]:
                    out_Y[0, token_counter, :] = to_categorical(
                        token, num_classes=self.params['MAX_WORDS_DE'] + 3)
                    token_counter += 1
                batch_X[counter] = in_X
                batch_Y[counter] = out_Y
                counter += 1
                if counter == batch_size:
                    print("counter == batch_size", i, mode)
                    counter = 0
                    yield batch_X, batch_Y

    def __setup_model(self, mode=None):
        if mode not in ['predict', 'training']:
            exit("wrong mode for setup_model")

        if mode == 'predict':
            if hasattr(self, 'en_embedding_matrix') and hasattr(self, 'M'):
                return
            self.en_embedding_matrix = np.load(self.BASIC_PERSISTENCE_DIR +
                                               '/en_embedding_matrix.npy')

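        # Architecture: frozen GloVe embedding -> two stacked LSTMs (the first,
        # named 'encoder', returns sequences) -> TimeDistributed softmax over
        # the German vocabulary. Dropout is only added in training mode.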
        self.M = Sequential()
        self.M.add(
            Embedding(self.params['MAX_WORDS_EN'] + 3,
                      self.params['EMBEDDING_DIM'],
                      weights=[self.en_embedding_matrix],
                      mask_zero=True,
                      trainable=False))

        self.M.add(
            LSTM(self.params['latent_dim'],
                 return_sequences=True,
                 name='encoder'))
        if mode == 'training':
            self.M.add(Dropout(self.params['P_DENSE_DROPOUT']))

        # M.add(LSTM(self.params['latent_dim'] * int(1 / self.params['P_DENSE_DROPOUT']), return_sequences=True))
        self.M.add(LSTM(self.params['latent_dim'], return_sequences=True))
        if mode == 'training':
            self.M.add(Dropout(self.params['P_DENSE_DROPOUT']))

        self.M.add(
            TimeDistributed(
                Dense(self.params['MAX_WORDS_DE'] + 3,
                      input_shape=(None, self.params['MAX_SEQ_LEN'],
                                   self.params['MAX_WORDS_DE'] + 3),
                      activation='softmax')))

        print('compiling')

        self.M.compile(optimizer='Adam', loss='categorical_crossentropy')
        self.M.summary()

        print('compiled')

        if mode == 'predict':
            self.M.load_weights(self.LATEST_MODELCHKPT)

    def __setup_helpers(self):
        if hasattr(self, 'en_word_index'):
            return
        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy')
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy')
        self.de_word_index = self.de_word_index.item()
        self.en_word_index = self.en_word_index.item()

        self.en_tokenizer = Tokenizer(self.START_TOKEN,
                                      self.END_TOKEN,
                                      self.UNK_TOKEN,
                                      num_words=self.params['MAX_WORDS_EN'])
        self.en_tokenizer.word_index = self.en_word_index
        self.en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        self.de_tokenizer = Tokenizer(self.START_TOKEN,
                                      self.END_TOKEN,
                                      self.UNK_TOKEN,
                                      num_words=self.params['MAX_WORDS_DE'])
        self.de_tokenizer.word_index = self.de_word_index
        self.de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

    def predict_one_sentence(self, sentence):
        self.__setup_model(mode='predict')
        self.__setup_helpers()

        sentence = self.en_tokenizer.texts_to_sequences([sentence],
                                                        print_unk_warning=True)
        sentence = pad_sequences(sentence,
                                 maxlen=self.params['MAX_SEQ_LEN'],
                                 padding='post',
                                 truncating='post')
        self.__insert_valid_token_at_last_position(sentence)
        sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])

        prediction = self.M.predict(sentence)
        print(prediction.shape)
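        # Greedy decoding: take the argmax token per timestep, stop at the end
        # token and skip the start token.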
        predicted_sentence = ""
        reverse_word_index = dict(
            (i, word) for word, i in self.de_word_index.items())
        for sentence in prediction:
            for token in sentence:
                max_idx = np.argmax(token)
                if max_idx == 0:
                    print("id of max token = 0")
                    second_best = np.argmax(token[1:]) + 1
                    print("second best prediction is ",
                          reverse_word_index[second_best])
                else:
                    next_word = reverse_word_index[max_idx]
                    if next_word == self.END_TOKEN:
                        break
                    elif next_word == self.START_TOKEN:
                        continue
                    predicted_sentence += next_word + " "

        return predicted_sentence

    def predict_batch(self, sentences, all_weights=False):
        self.__setup_model(mode='predict')
        self.__setup_helpers()

        sentences = self.en_tokenizer.texts_to_sequences(
            sentences, print_unk_warning=True)
        sentences = pad_sequences(sentences,
                                  maxlen=self.params['MAX_SEQ_LEN'],
                                  padding='post',
                                  truncating='post')
        self.__insert_valid_token_at_last_position(sentences)
        sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])

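        # Decode either with every saved weight checkpoint or only with the latest one.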
        predictions_for_weights = {}
        if all_weights is True:
            for weight_file in self.WEIGHT_FILES:
                self.M.load_weights(weight_file)
                predictions_for_weights[weight_file.split('model.')[
                    1]] = self.__predict_batch_for_specific_weight(sentences)
        else:
            predictions_for_weights[self.LATEST_MODELCHKPT.split(
                'model.')[1]] = self.__predict_batch_for_specific_weight(
                    sentences)

        return predictions_for_weights

    def __predict_batch_for_specific_weight(self, sentences):
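        # Predict in slices of at most 20 sentences and greedily map each
        # timestep's argmax index back to a German word.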
        batch_size = sentences.shape[0]
        if batch_size > 20:
            batch_size = 20
        reverse_word_index = dict(
            (i, word) for word, i in self.de_word_index.items())

        predicted_sentences = []
        from_idx = 0
        to_idx = batch_size
        while True:
            print("from_idx, to_idx, hm_sentences", from_idx, to_idx,
                  sentences.shape[0])
            current_batch = sentences[from_idx:to_idx]
            prediction = self.M.predict(current_batch, batch_size=batch_size)

            for sentence in prediction:
                predicted_sent = ""
                for token in sentence:
                    max_idx = np.argmax(token)
                    if max_idx == 0:
                        print("id of max token = 0")
                        second_best = np.argmax(token[1:]) + 1
                        print("second best prediction is ",
                              reverse_word_index[second_best])
                    else:
                        next_word = reverse_word_index[max_idx]
                        if next_word == self.END_TOKEN:
                            break
                        elif next_word == self.START_TOKEN:
                            continue
                        predicted_sent += next_word + " "
                predicted_sentences.append(predicted_sent)
            from_idx += batch_size
            to_idx += batch_size

            if from_idx >= sentences.shape[0]:
                break
            if to_idx > sentences.shape[0]:
                to_idx = sentences.shape[0]
        return predicted_sentences

    def calculate_hiddenstate_after_encoder(self, sentences):
        self.__setup_model(mode='predict')
        self.__setup_helpers()

        sentences = self.en_tokenizer.texts_to_sequences(
            sentences, print_unk_warning=True)
        sentences = pad_sequences(sentences,
                                  maxlen=self.params['MAX_SEQ_LEN'],
                                  padding='post',
                                  truncating='post')
        self.__insert_valid_token_at_last_position(sentences)
        sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])
        encoder = Model(inputs=self.M.input,
                        outputs=self.M.get_layer('encoder').output)

        batch_size = sentences.shape[0]
        if batch_size > 20:
            batch_size = 20

        predicted_sentences = []
        from_idx = 0
        to_idx = batch_size

        hiddenstates = []
        while True:
            print("from_idx, to_idx, hm_sentences", from_idx, to_idx,
                  sentences.shape[0])
            current_batch = sentences[from_idx:to_idx]
            hiddenstates.append(
                encoder.predict(current_batch, batch_size=batch_size))

            from_idx += batch_size
            to_idx += batch_size

            if from_idx >= sentences.shape[0]:
                break
            if to_idx > sentences.shape[0]:
                to_idx = sentences.shape[0]
        return hiddenstates

    def calculate_every_hiddenstate_after_encoder(self, sentence):
        raise NotImplementedError()

    def calculate_every_hiddenstate(self, sentence):
        raise NotImplementedError()

    def calculate_hiddenstate_after_decoder(self, sentence):
        raise NotImplementedError()

    def setup_inference(self):
        raise NotImplementedError()
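
# Minimal usage sketch, assuming the project-specific Tokenizer, BaseModel,
# BleuCallback and EvalCallback classes are importable and that the data and
# GloVe files exist at the paths configured in __init__.
if __name__ == '__main__':
    seq2seq = Seq2Seq2()
    seq2seq.preprocessing = True  # tokenize the raw files, build the GloVe embedding matrix
    seq2seq.start_training()      # trains, checkpoints the weights and saves model.h5
    print(seq2seq.predict_one_sentence("How are you?"))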
Example #5
exit()
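# Scratch analysis (unreachable behind the exit() above): tokenize the Tatoeba
# EN/DE corpus and inspect the raw sequence lengths before padding to 100.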
input_texts = []
target_texts = []
lines = open('../../DataSets/Training/deu.txt',
             encoding='UTF-8').read().split('\n')
for line in lines:
    if not line:
        continue
    input_text, target_text = line.split('\t')
    input_texts.append(input_text)
    target_texts.append(target_text)
num_samples = len(input_texts)

en_tokenizer = Tokenizer("GO_", "_EOS", "_UNK", num_words=30000)
en_tokenizer.fit_on_texts(input_texts)
train_input_texts = en_tokenizer.texts_to_sequences(input_texts)
lengths = []
for text in train_input_texts:
    lengths.append(len(text))
print(len(lengths))
import numpy as np

print(np.max(np.array(lengths)))
train_input_texts = pad_sequences(train_input_texts,
                                  maxlen=100,
                                  padding='post',
                                  truncating='post')
en_word_index = en_tokenizer.word_index

exit()
    def predict_batch(self, sentences):
        self.__setup_model()

        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy')
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy')

        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.word_index = self.en_word_index
        en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.word_index = self.de_word_index
        de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentences)
        sentences = en_tokenizer.texts_to_sequences(sentences)
        print(sentences)
        sentences = pad_sequences(sentences,
                                  maxlen=self.params['MAX_SEQ_LEN'],
                                  padding='post',
                                  truncating='post')
        sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])

        batch_size = sentences.shape[0]
        if batch_size > 10:
            batch_size = 10

        reverse_word_index = dict(
            (i, word) for word, i in self.de_word_index.items())
        predicted_sentences = []
        from_idx = 0
        to_idx = batch_size
        while True:
            print("from_idx, to_idx, hm_sentences", from_idx, to_idx,
                  sentences.shape[0])
            current_batch = sentences[from_idx:to_idx]
            prediction = self.M.predict(current_batch, batch_size=batch_size)

            for sentence in prediction:
                predicted_sent = ""
                for token in sentence:
                    max_idx = np.argmax(token)
                    if max_idx == 0:
                        print("id of max token = 0")
                        second_best = np.argmax(token[1:]) + 1
                        print("second best prediction is ",
                              reverse_word_index[second_best])
                    else:
                        next_word = reverse_word_index[max_idx]
                        if next_word == self.END_TOKEN:
                            break
                        elif next_word == self.START_TOKEN:
                            continue
                        predicted_sent += next_word + " "
                predicted_sentences.append(predicted_sent)
            from_idx += batch_size
            to_idx += batch_size
            if to_idx > sentences.shape[0]:
                # TODO: handle sentence counts that are not a multiple of batch_size
                break
        return predicted_sentences