Example #1
def test_sample(dataset):
    # Load the small bilingual sample and rebuild the tokenizers used below.
    path_to_file = os.path.join(parent_dirname, "data", "spa-eng", "spa-sample.txt")
    input_tensor, target_tensor, inp_lang, targ_lang = nmt.load_dataset(path_to_file, 124)
    vocab_inp_size = len(inp_lang.word_index) + 1
    vocab_tar_size = len(targ_lang.word_index) + 1

    encoder = nmt.Encoder(vocab_inp_size, nmt.EMBEDDING_DIM, nmt.UNITS, nmt.BATCH_SIZE)
    decoder = nmt.Decoder(vocab_tar_size, nmt.EMBEDDING_DIM, nmt.UNITS, nmt.BATCH_SIZE)

    example_input_batch, example_target_batch = next(iter(dataset))

    sample_hidden = encoder.initialize_hidden_state()
    sample_output, sample_hidden = encoder.call(example_input_batch, sample_hidden)

    # (batch, input length, units): 64 = nmt.BATCH_SIZE, 1024 = nmt.UNITS;
    # 7 is the length of the longest tokenized sentence in spa-sample.txt.
    assert sample_output.shape == (64, 7, 1024)
    assert sample_hidden.shape == (64, 1024)

    attention_layer = nmt.BahdanauAttention(units=nmt.NUM_ATTENTION_UNITS)
    attention_result, attention_weights = attention_layer.call(sample_hidden, sample_output)

    # Context vector and attention weights over the 7 input positions.
    assert attention_result.shape == (64, 1024)
    assert attention_weights.shape == (64, 7, 1)

    # A dummy token batch is enough to exercise the decoder's forward pass.
    sample_decoder_output, _, _ = decoder.call(
        tf.random.uniform((64, 1)), sample_hidden, sample_output)

    # (batch, target vocabulary size) for the sample corpus.
    assert sample_decoder_output.shape == (64, 71)
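The literal shapes above imply nmt.BATCH_SIZE is 64 and nmt.UNITS is 1024; a minimal sketch of the same checks written against those constants instead of magic numbers (7 and 71 remain properties of spa-sample.txt and are assumptions about that file):

    # Equivalent assertions tied to the module constants:
    assert sample_output.shape == (nmt.BATCH_SIZE, 7, nmt.UNITS)
    assert sample_hidden.shape == (nmt.BATCH_SIZE, nmt.UNITS)
    assert sample_decoder_output.shape == (nmt.BATCH_SIZE, 71)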
Example #2
def dataset_fixture():
    # Build a shuffled, batched tf.data.Dataset from the bilingual sample file.
    path_to_file = os.path.join(parent_dirname, "data", "spa-eng", "spa-sample.txt")
    input_tensor, target_tensor, inp_lang, targ_lang = nmt.load_dataset(path_to_file, 124)
    buffer_size = len(input_tensor)
    return tf.data.Dataset.from_tensor_slices(
        (input_tensor, target_tensor)
    ).shuffle(buffer_size).batch(nmt.BATCH_SIZE, drop_remainder=True)
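Example #1 takes a `dataset` argument, but the pytest registration that exposes this builder under that name is not shown in these snippets. A minimal sketch of the assumed glue:

import pytest

@pytest.fixture(name="dataset")
def _dataset():
    # Hypothetical wiring: expose dataset_fixture under the name "dataset"
    # so that test_sample(dataset) receives the batched tf.data.Dataset.
    return dataset_fixture()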
Example #3
def main(sentence):
    # Translate a single Spanish sentence into English with the restored
    # spa2eng checkpoint.
    path_to_file = os.path.join(DATA_DIR, "spa-eng", "spa.txt")
    if not os.path.isfile(path_to_file):
        download_data()

    input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(
        path_to_file, NUM_EXAMPLES)
    max_length_targ = target_tensor.shape[1]
    max_length_inp = input_tensor.shape[1]

    vocab_inp_size = len(inp_lang.word_index) + 1
    vocab_tar_size = len(targ_lang.word_index) + 1

    optimizer = tf.keras.optimizers.Adam()
    encoder = Encoder(vocab_inp_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)
    decoder = Decoder(vocab_tar_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)

    checkpoint_dir = f"{CHECKPOINTS_DIR}/training_checkpoints/spa2eng"

    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)

    checkpoint.restore(
        tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()

    sentence = preprocess_sentence(sentence)

    # Tokenize and pad the input; words outside the training vocabulary
    # raise a KeyError here.
    inputs = [inp_lang.word_index[i] for i in sentence.split(" ")]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding="post")
    inputs = tf.convert_to_tensor(inputs)
    result = ""

    # Encode the single-sentence batch from a zero initial state.
    hidden = [tf.zeros((1, UNITS))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index["<start>"]], 0)

    # Greedy decoding: feed the most probable token back in until <end>
    # or the maximum target length is reached.
    for _ in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        next_word = targ_lang.index_word[predicted_id]
        result += next_word + " "
        if next_word == "<end>":
            break
        dec_input = tf.expand_dims([predicted_id], 0)

    result_clean = result.replace("<start>", "").replace("<end>", "")
    return result_clean
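The snippet does not show how main is invoked; a minimal driver sketch, assuming the sentence arrives as command-line arguments (this entry point is hypothetical, not part of the original module):

import sys

if __name__ == "__main__":
    # Translate the sentence passed on the command line and print the result.
    print(main(" ".join(sys.argv[1:])))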
Example #4
def main(sentence):
    # Convert a phoneme sequence into graphemes with the restored model.
    path_to_file = os.path.join(DATA_DIR, "beatles_lyrics_combined",
                                "grapheme2phoneme.txt")

    phone_tensor, graph_tensor, phone_lang, graph_lang = load_dataset(
        path_to_file, NUM_EXAMPLES)

    max_length_phone = phone_tensor.shape[1]
    max_length_graph = graph_tensor.shape[1]

    vocab_phone_size = len(phone_lang.word_index) + 1
    vocab_graph_size = len(graph_lang.word_index) + 1

    optimizer = tf.keras.optimizers.Adam()
    encoder = Encoder(vocab_phone_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)
    decoder = Decoder(vocab_graph_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)

    # checkpoint_dir is never defined in the original snippet; an assumed
    # location, mirroring the spa2eng example above:
    checkpoint_dir = f"{CHECKPOINTS_DIR}/training_checkpoints/phoneme2grapheme"

    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)

    checkpoint.restore(
        tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()

    sentence = preprocess_sentence(sentence)

    inputs = [phone_lang.word_index[i] for i in sentence.split(" ")]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_phone, padding="post")
    inputs = tf.convert_to_tensor(inputs)
    result = ""

    hidden = [tf.zeros((1, UNITS))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([graph_lang.word_index["<start>"]], 0)

    for _ in range(max_length_graph):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        next_word = graph_lang.index_word[predicted_id]
        result += next_word + " "
        if next_word == "<end>":
            break
        dec_input = tf.expand_dims([predicted_id], 0)

    result_clean = result.replace("<start>", "").replace("<end>", "")
    return result_clean
Example #5
if __name__ == "__main__":  # guard assumed; the original snippet begins mid-block
    path_to_file = os.path.join(DATA_DIR, "beatles_lyrics_combined",
                                "grapheme2phoneme.txt")
    if not os.path.isfile(path_to_file):
        print(f"cannot find data {path_to_file}. exit")
        sys.exit(1)

    grapheme_sentence = "Baby, you can drive my car"
    phoneme_sentence = "B EY1 B IY0 Y UW1 K AE1 N D R AY1 V M AY1 K AA1 R"
    print(preprocess_sentence(grapheme_sentence))
    print(preprocess_sentence(phoneme_sentence).encode("utf-8"))

    graph, phone = create_dataset(path_to_file, NUM_EXAMPLES)
    print(graph[-1])
    print(phone[-1])

    phone_tensor, graph_tensor, phone_lang, graph_lang = load_dataset(
        path_to_file, NUM_EXAMPLES)
    max_length_graph = graph_tensor.shape[1]
    max_length_phone = phone_tensor.shape[1]
    (
        phone_tensor_train,
        phone_tensor_val,
        graph_tensor_train,
        graph_tensor_val,
    ) = train_test_split(phone_tensor, graph_tensor, test_size=0.2)

    print(
        len(phone_tensor_train),
        len(graph_tensor_train),
        len(phone_tensor_val),
        len(graph_tensor_val),
    )
Example #6
if __name__ == "__main__":
    # Sanity-check the data pipeline: download, preprocess, tokenize, split.
    path_to_file = os.path.join(DATA_DIR, "spa-eng", "spa.txt")
    if not os.path.isfile(path_to_file):
        download_data()

    en_sentence = "May I borrow this book?"
    sp_sentence = "¿Puedo tomar prestado este libro?"
    print(preprocess_sentence(en_sentence))
    print(preprocess_sentence(sp_sentence).encode("utf-8"))

    en, sp = create_dataset(path_to_file, NUM_EXAMPLES)
    print(en[-1])
    print(sp[-1])

    spa_tensor, eng_tensor, spa_lang, eng_lang = load_dataset(
        path_to_file, NUM_EXAMPLES)
    max_length_eng, max_length_spa = eng_tensor.shape[1], spa_tensor.shape[1]
    (
        spa_tensor_train,
        spa_tensor_val,
        eng_tensor_train,
        eng_tensor_val,
    ) = train_test_split(spa_tensor, eng_tensor, test_size=0.2)

    print(
        len(spa_tensor_train),
        len(eng_tensor_train),
        len(spa_tensor_val),
        len(eng_tensor_val),
    )