def test_create_dataset_phone():
    """create_dataset on the grapheme2phoneme sample returns the first five
    (grapheme-sentence, phoneme-sentence) examples, each wrapped in
    <start>/<end> markers.

    Fix: replaced the pass-through comprehension ``[_ for _ in output]`` with
    ``list(output)`` — a plain copy, identical result, idiomatic (ruff PERF402).
    """
    output = nmt.create_dataset(
        path=os.path.join(data_dir, "phonemes", "grapheme2phoneme-sample.txt"),
        num_examples=5,
    )
    assert list(output) == [
        ('<start> title set fire to that lot ! speech <end>',
         '<start> rodney burke i ve got one card here and it s on the same subject , '
         'eh , but this one says , we think the show is great and we dig the beatles '
         'the most , but we still haven t heard a word from ringo yet . <end>',
         '<start> ringo arf ! arf ! arf ! arf ! <end>',
         '<start> rodney and how about him singing ? well , what will you sing for us '
         ', ringo ? will you say a few words ? <end>',
         '<start> ringo hello , there , kiddies . i d like to sing a song for you '
         'today called matchbox . there you go <end>'),
        ('<start> t ay t ah l s eh t f ay er t uw dh ae t l aa t s p iy ch <end>',
         '<start> r aa d n iy b er k g aa t w ah n k aa r d hh iy r ah n d ih t s aa '
         'n dh ah s ey m s ah b jh eh k t eh b ah t dh ih s w ah n s eh z w iy th ih '
         'ng k dh ah sh ow ih z g r ey t ah n d w iy d ih g dh ah b iy t ah l z dh ah '
         'm ow s t b ah t w iy s t ih l aeavehnt hh er d ah w er d f r ah m r iy ng g '
         'ow y eh t <end>',
         '<start> r iy ng g ow ahrf ahrf ahrf ahrf <end>',
         '<start> r aa d n iy ah n d hh aw ah b aw t hh ih m s ih ng ih ng w eh l w '
         'ah t w ih l y uw s ih ng f ao r ah s r iy ng g ow w ih l y uw s ey ah f y '
         'uw w er d z <end>',
         '<start> r iy ng g ow hh ah l ow dh eh r k ih d iy z ih d l ay k t uw s ih '
         'ng ah s ao ng f ao r y uw t ah d ey k ao l d m ae ch b aa k s dh eh r y uw '
         'g ow <end>'),
    ]
def test_create_dataset():
    """create_dataset on the spa-eng sample returns the first five
    (target-sentence, input-sentence) examples, each wrapped in
    <start>/<end> markers.

    Fix: replaced the pass-through comprehension ``[_ for _ in output]`` with
    ``list(output)`` — a plain copy, identical result, idiomatic (ruff PERF402).
    """
    output = nmt.create_dataset(
        path=os.path.join(data_dir, "spa-eng", "spa-sample.txt"),
        num_examples=5,
    )
    assert list(output) == [
        ('<start> go . <end>',
         '<start> go . <end>',
         '<start> go . <end>',
         '<start> go . <end>',
         '<start> hi . <end>'),
        ('<start> ve . <end>',
         '<start> vete . <end>',
         '<start> vaya . <end>',
         '<start> vayase . <end>',
         '<start> hola . <end>'),
    ]
def eng_fixture():
    """Build the spa-eng sample dataset and return its input-language half.

    create_dataset yields a (target, input) pair; only the second element
    is handed to the tests that use this fixture.
    """
    sample_path = os.path.join(
        parent_dirname, "data", "spa-eng", "spa-sample.txt"
    )
    _targ_sentences, input_sentences = nmt.create_dataset(sample_path, 10)
    return input_sentences
plot_attention(attention_plot, sentence.split(" "), result.split(" ")) if __name__ == "__main__": path_to_file = os.path.join(DATA_DIR, "beatles_lyrics_combined", "grapheme2phoneme.txt") if not os.path.isfile(path_to_file): print(f"cannot find data {path_to_file}. exit") sys.exit(1) grapheme_sentence = "Baby, you can drive my car" phoneme_sentence = "B EY1 B IY0 Y UW1 K AE1 N D R AY1 V M AY1 K AA1 R" print(preprocess_sentence(grapheme_sentence)) print(preprocess_sentence(phoneme_sentence).encode("utf-8")) graph, phone = create_dataset(path_to_file, NUM_EXAMPLES) print(graph[-1]) print(phone[-1]) phone_tensor, graph_tensor, phone_lang, graph_lang = load_dataset( path_to_file, NUM_EXAMPLES) max_length_graph, max_length_phone = graph_tensor.shape[ 1], phone_tensor.shape[1] ( phone_tensor_train, phone_tensor_val, graph_tensor_train, graph_tensor_val, ) = train_test_split(phone_tensor, graph_tensor, test_size=0.2) print(
attention_plot = attention_plot[:len(result.split(" ") ), :len(sentence.split(" "))] plot_attention(attention_plot, sentence.split(" "), result.split(" ")) if __name__ == "__main__": path_to_file = os.path.join(DATA_DIR, "spa-eng", "spa.txt") if not os.path.isfile(path_to_file): download_data() en_sentence = "May I borrow this book?" sp_sentence = "¿Puedo tomar prestado este libro?" print(preprocess_sentence(en_sentence)) print(preprocess_sentence(sp_sentence).encode("utf-8")) en, sp = create_dataset(path_to_file, NUM_EXAMPLES) print(en[-1]) print(sp[-1]) spa_tensor, eng_tensor, spa_lang, eng_lang = load_dataset( path_to_file, NUM_EXAMPLES) max_length_eng, max_length_spa = eng_tensor.shape[1], spa_tensor.shape[1] ( spa_tensor_train, spa_tensor_val, eng_tensor_train, eng_tensor_val, ) = train_test_split(spa_tensor, eng_tensor, test_size=0.2) print( len(spa_tensor_train),