Example 1
def preprocess_data(en_tokenizer, fr_tokenizer, en_text, fr_text, en_timesteps,
                    fr_timesteps):
    """ Preprocessing data and getting a sequence of word indices """

    en_seq = sents2sequences(en_tokenizer,
                             en_text,
                             reverse=False,
                             padding_type='pre',
                             pad_length=en_timesteps)
    fr_seq = sents2sequences(fr_tokenizer, fr_text, pad_length=fr_timesteps)
    logger.info('Vocabulary size (English): {}'.format(np.max(en_seq) + 1))
    logger.info('Vocabulary size (French): {}'.format(np.max(fr_seq) + 1))
    logger.debug('En text shape: {}'.format(en_seq.shape))
    logger.debug('Fr text shape: {}'.format(fr_seq.shape))

    return en_seq, fr_seq
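
Both examples call a sents2sequences helper that is not part of this listing. The following is a minimal sketch of what it plausibly does, assuming a fitted Keras Tokenizer and pad_sequences; the original project's signature and argument order may differ.

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences


def sents2sequences(tokenizer, sentences, reverse=False, padding_type='post',
                    pad_length=None):
    """Convert sentences to a padded matrix of word indices (illustrative sketch)."""
    seqs = tokenizer.texts_to_sequences(sentences)
    seqs = pad_sequences(seqs, padding=padding_type, maxlen=pad_length)
    if reverse:
        # Optionally reverse the source sequences (sometimes done for NMT inputs)
        seqs = np.flip(seqs, axis=1)
    return seqs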
Example 2
def infer_nmt(encoder_model, decoder_model, test_en_seq, en_vsize, fr_vsize):
    """
    Infer logic
    :param encoder_model: keras.Model
    :param decoder_model: keras.Model
    :param test_en_seq: sequence of word ids
    :param en_vsize: int
    :param fr_vsize: int
    :return:
    """

    # Seed the decoder with the start-of-sequence ('sos') token
    test_fr_seq = sents2sequences(fr_tokenizer, ['sos'], fr_vsize)
    # One-hot encode the English input and the French seed token
    test_en_onehot_seq = to_categorical(test_en_seq, num_classes=en_vsize)
    test_fr_onehot_seq = np.expand_dims(
        to_categorical(test_fr_seq, num_classes=fr_vsize), 1)

    enc_outs, enc_fwd_state, enc_back_state = encoder_model.predict(
        test_en_onehot_seq)
    # Initial decoder state: concatenated forward and backward encoder states
    dec_state = np.concatenate([enc_fwd_state, enc_back_state], axis=-1)
    attention_weights = []
    fr_text = ''

    for i in range(fr_timesteps):

        # One decoder step: predict the next-word distribution and attention
        dec_out, attention, dec_state = decoder_model.predict(
            [enc_outs, dec_state, test_fr_onehot_seq])
        dec_ind = np.argmax(dec_out, axis=-1)[0, 0]

        # Index 0 is the padding index, so stop decoding
        if dec_ind == 0:
            break
        # Feed the predicted word back in as the next single-step decoder input
        test_fr_seq = sents2sequences(fr_tokenizer, [fr_index2word[dec_ind]],
                                      fr_vsize)
        test_fr_onehot_seq = np.expand_dims(
            to_categorical(test_fr_seq, num_classes=fr_vsize), 1)

        attention_weights.append((dec_ind, attention))
        fr_text += fr_index2word[dec_ind] + ' '

    return fr_text, attention_weights
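
For context, an illustrative call to infer_nmt (Example 3 below shows the same flow inside the original script). The models, tokenizer, and size variables are assumed to be built elsewhere, and the input sentence is made up.

test_en_seq = sents2sequences(en_tokenizer, ['how are you'],
                              pad_length=en_timesteps)
test_fr, attn_weights = infer_nmt(encoder_model=infer_enc_model,
                                  decoder_model=infer_dec_model,
                                  test_en_seq=test_en_seq,
                                  en_vsize=en_vsize,
                                  fr_vsize=fr_vsize)
print(test_fr)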
Example 3
        fr_vsize=fr_vsize)

    n_epochs = 10 if not debug else 3
    train(full_model, en_seq, fr_seq, batch_size, n_epochs)
    """ Save model """
    if not os.path.exists(os.path.join('..', 'h5.models')):
        os.mkdir(os.path.join('..', 'h5.models'))
    full_model.save(os.path.join('..', 'h5.models', 'nmt.h5'))
    """ Index2word """
    en_index2word = dict(
        zip(en_tokenizer.word_index.values(), en_tokenizer.word_index.keys()))
    fr_index2word = dict(
        zip(fr_tokenizer.word_index.values(), fr_tokenizer.word_index.keys()))
    """ Inferring with trained model """
    test_en = ts_en_text[0]
    logger.info('Translating: {}'.format(test_en))

    test_en_seq = sents2sequences(en_tokenizer, [test_en],
                                  pad_length=en_timesteps)
    test_fr, attn_weights = infer_nmt(encoder_model=infer_enc_model,
                                      decoder_model=infer_dec_model,
                                      test_en_seq=test_en_seq,
                                      en_vsize=en_vsize,
                                      fr_vsize=fr_vsize)
    logger.info('\tFrench: {}'.format(test_fr))
    """ Attention plotting """
    plot_attention_weights(test_en_seq,
                           attn_weights,
                           en_index2word,
                           fr_index2word,
                           base_dir=base_dir)
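
plot_attention_weights is also not included in this listing. Below is a rough sketch of a compatible implementation using matplotlib, assuming each attention tensor collected during decoding can be flattened to one weight per English input position; the file name and figure size are illustrative and the original project's version may differ.

import os
import numpy as np
import matplotlib.pyplot as plt


def plot_attention_weights(encoder_inputs, attention_weights, en_id2word,
                           fr_id2word, base_dir='.', filename='attention.png'):
    """Illustrative heatmap: English input words vs. generated French words."""
    if len(attention_weights) == 0:
        return

    dec_word_ids = [dec_ind for dec_ind, _ in attention_weights]
    # Rows: English input positions, columns: generated French words
    attention_mat = np.transpose(
        np.array([attn.reshape(-1) for _, attn in attention_weights]))

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(attention_mat)

    ax.set_xticks(np.arange(attention_mat.shape[1]))
    ax.set_yticks(np.arange(attention_mat.shape[0]))
    ax.set_xticklabels([fr_id2word[i] for i in dec_word_ids], rotation=90)
    ax.set_yticklabels([en_id2word.get(i, '<pad>')
                        for i in np.ravel(encoder_inputs)])

    os.makedirs(base_dir, exist_ok=True)
    plt.savefig(os.path.join(base_dir, filename))
    plt.close(fig)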