def preprocess_data(en_tokenizer, fr_tokenizer, en_text, fr_text, en_timesteps, fr_timesteps):
    """ Preprocessing data and getting a sequence of word indices """

    en_seq = sents2sequences(en_tokenizer, en_text, reverse=False, padding_type='pre', pad_length=en_timesteps)
    fr_seq = sents2sequences(fr_tokenizer, fr_text, pad_length=fr_timesteps)

    logger.info('Vocabulary size (English): {}'.format(np.max(en_seq) + 1))
    logger.info('Vocabulary size (French): {}'.format(np.max(fr_seq) + 1))
    logger.debug('En text shape: {}'.format(en_seq.shape))
    logger.debug('Fr text shape: {}'.format(fr_seq.shape))

    return en_seq, fr_seq
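The helper `sents2sequences` is defined elsewhere in the project. A minimal sketch of the behavior the calls above assume (a Keras `Tokenizer` maps sentences to id sequences, `pad_sequences` pads them to a fixed length, and the source sequence is optionally reversed):

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def sents2sequences(tokenizer, sentences, reverse=False, padding_type='post', pad_length=None):
    """ Convert raw sentences to a padded array of word indices """
    encoded_text = tokenizer.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=padding_type, maxlen=pad_length)
    if reverse:
        # Reversing the source sequence is a common seq2seq trick
        preproc_text = np.flip(preproc_text, axis=1)
    return preproc_text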
def infer_nmt(encoder_model, decoder_model, test_en_seq, en_vsize, fr_vsize):
    """
    Infer logic
    :param encoder_model: keras.Model
    :param decoder_model: keras.Model
    :param test_en_seq: sequence of word ids
    :param en_vsize: int
    :param fr_vsize: int
    :return:
    """
    # Seed the decoder with the 'sos' (start-of-sentence) token
    test_fr_seq = sents2sequences(fr_tokenizer, ['sos'], pad_length=1)
    test_en_onehot_seq = to_categorical(test_en_seq, num_classes=en_vsize)
    test_fr_onehot_seq = np.expand_dims(to_categorical(test_fr_seq, num_classes=fr_vsize), 1)

    # Encode once; the concatenated forward/backward states initialize the decoder
    enc_outs, enc_fwd_state, enc_back_state = encoder_model.predict(test_en_onehot_seq)
    dec_state = np.concatenate([enc_fwd_state, enc_back_state], axis=-1)
    attention_weights = []
    fr_text = ''

    # Greedy decoding: feed the argmax prediction back in at every step
    for _ in range(fr_timesteps):
        dec_out, attention, dec_state = decoder_model.predict([enc_outs, dec_state, test_fr_onehot_seq])
        dec_ind = np.argmax(dec_out, axis=-1)[0, 0]

        if dec_ind == 0:  # index 0 is reserved for padding, so stop decoding
            break

        test_fr_seq = sents2sequences(fr_tokenizer, [fr_index2word[dec_ind]], pad_length=1)
        test_fr_onehot_seq = np.expand_dims(to_categorical(test_fr_seq, num_classes=fr_vsize), 1)

        attention_weights.append((dec_ind, attention))
        fr_text += fr_index2word[dec_ind] + ' '

    return fr_text, attention_weights
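`to_categorical` turns the id sequences into the one-hot tensors both models consume. A toy illustration of the shapes involved (the ids and vocabulary size here are made up):

import numpy as np
from tensorflow.keras.utils import to_categorical

ids = np.array([[2, 0, 1]])                   # one sentence of word ids, shape (1, 3)
onehot = to_categorical(ids, num_classes=4)   # shape (1, 3, 4): batch x timesteps x vocab
print(onehot[0, 0])                           # [0. 0. 1. 0.] -> word id 2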
    fr_vsize=fr_vsize)

n_epochs = 10 if not debug else 3
train(full_model, en_seq, fr_seq, batch_size, n_epochs)

""" Save model """
if not os.path.exists(os.path.join('..', 'h5.models')):
    os.mkdir(os.path.join('..', 'h5.models'))
full_model.save(os.path.join('..', 'h5.models', 'nmt.h5'))

""" Index2word """
en_index2word = dict(zip(en_tokenizer.word_index.values(), en_tokenizer.word_index.keys()))
fr_index2word = dict(zip(fr_tokenizer.word_index.values(), fr_tokenizer.word_index.keys()))

""" Inferring with trained model """
test_en = ts_en_text[0]
logger.info('Translating: {}'.format(test_en))

test_en_seq = sents2sequences(en_tokenizer, [test_en], pad_length=en_timesteps)
test_fr, attn_weights = infer_nmt(
    encoder_model=infer_enc_model, decoder_model=infer_dec_model,
    test_en_seq=test_en_seq, en_vsize=en_vsize, fr_vsize=fr_vsize)
logger.info('\tFrench: {}'.format(test_fr))

""" Attention plotting """
plot_attention_weights(test_en_seq, attn_weights, en_index2word, fr_index2word, base_dir=base_dir)
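`plot_attention_weights` is also defined elsewhere in the project. A minimal sketch, assuming each recorded `attention` entry flattens to one weight per encoder timestep and that the output filename `attention.png` is a placeholder: the per-step attention vectors are stacked into a matrix and rendered as a heatmap, with source words on the rows and decoded words on the columns.

import os
import numpy as np
import matplotlib.pyplot as plt

def plot_attention_weights(encoder_inputs, attention_weights, en_id2word, fr_id2word, base_dir='.'):
    """ Render decoder-over-encoder attention as a heatmap """
    mats, dec_inputs = [], []
    for dec_ind, attn in attention_weights:
        mats.append(attn.reshape(-1))  # one attention weight per encoder timestep
        dec_inputs.append(dec_ind)
    attention_mat = np.transpose(np.array(mats))  # rows: source words, cols: decoded words

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(attention_mat)
    ax.set_xticks(np.arange(attention_mat.shape[1]))
    ax.set_yticks(np.arange(attention_mat.shape[0]))
    ax.set_xticklabels([fr_id2word.get(i, '<pad>') for i in dec_inputs], rotation=90)
    ax.set_yticklabels([en_id2word.get(i, '<pad>') for i in encoder_inputs.ravel()])
    fig.savefig(os.path.join(base_dir, 'attention.png'))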