import time

# NOTE: load_dictionary, DatasetIterator, Embedding, LSTM, SequenceToSequence and
# load_and_convert_corpora come from the sequence-to-sequence library used in these
# examples; their imports are not shown in the original listing.

dict_file = '/home/gian/datasets/dict.sort.'
train_file = '/home/gian/datasets/fapesp/fapesp-v2.tok.train.'
valid_file = '/home/gian/datasets/fapesp/fapesp-v2.tok.dev.'

source_lang = 'en'
target_lang = 'pt'

en_v_size = 1000
pt_v_size = 1000

dim_proj = 10
batch_size = 128
n_epochs = 5
seed = 1234

# load source and target language dictionaries
en_dict = load_dictionary(dict_file + source_lang, max_words=en_v_size)
pt_dict = load_dictionary(dict_file + target_lang, max_words=pt_v_size)

print 'Initializing...'
time1 = time.time()

# define the dataset for training
train_data = DatasetIterator(train_file + source_lang,
                             train_file + target_lang,
                             en_dict,
                             pt_dict)

# define the dataset for validation
valid_data = DatasetIterator(valid_file + source_lang,
                             valid_file + target_lang,
                             en_dict,
                             pt_dict)
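The decoder listing that follows references an encoder object that is not defined in this excerpt. As a minimal sketch, assuming an encoder symmetric to the decoder and built from the same Embedding and LSTM constructors, it could be defined as follows (the layer sizes are illustrative, not necessarily those of the original experiment):

# sketch (assumption): an encoder symmetric to the decoder below;
# the exact encoder architecture is not shown in the original listing
emb1 = Embedding(en_v_size, dim_proj, seed=seed)
lstm1 = LSTM(dim_proj, dim_proj, seed=seed)
lstm2 = LSTM(dim_proj, dim_proj, seed=seed)

encoder = [emb1, lstm1, lstm2]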
# define the decoder architecture
emb2 = Embedding(pt_v_size, dim_proj, seed=seed)
lstm3 = LSTM(dim_proj, dim_proj, seed=seed)
lstm4 = LSTM(dim_proj, pt_v_size, seed=seed)

decoder = [emb2, lstm3, lstm4]

# assemble the sequence-to-sequence model
seq = SequenceToSequence(encoder=encoder,
                         decoder=decoder,
                         source_v_size=en_v_size,
                         target_v_size=pt_v_size,
                         auto_setup=False)
# set auto_setup to False to avoid initialization
# (weights will be overwritten anyway)

# load source and target language dictionaries
sr_dict = load_dictionary('/home/gian/datasets/dict.sort.en', max_words=en_v_size)
tr_dict = load_dictionary('/home/gian/datasets/dict.sort.pt', max_words=pt_v_size)

# load the corpora and convert their words to indexes
# (the corpora must already be tokenized)
sequences1 = load_and_convert_corpora('/home/gian/datasets/fapesp/fapesp-v2.tok.test-a.en', sr_dict)
sequences2 = load_and_convert_corpora('/home/gian/datasets/fapesp/fapesp-v2.tok.test-a.pt', tr_dict)

# prepare the data (add padding values to the end of each sequence so they have the same size)
seq.load_weights('/home/gian/seq_to_seq.hp5y')

seq.train(sequences1, sequences2, n_epochs=2, print_train_info=True)
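The padding step mentioned in the comment above is not shown in the listing. The sketch below illustrates the general idea with a hypothetical pad_sequences helper; the helper name, the use of 0 as the padding value, and the reliance on NumPy are assumptions, not part of the library's API.

import numpy

def pad_sequences(sequences, pad_value=0):
    # hypothetical helper: append pad_value to every sequence of word
    # indexes so that all sequences match the length of the longest one
    max_len = max(len(s) for s in sequences)
    return numpy.asarray([s + [pad_value] * (max_len - len(s)) for s in sequences])

# illustrative usage with the converted corpora loaded above:
# padded1 = pad_sequences(sequences1)
# padded2 = pad_sequences(sequences2)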