import time

# NOTE: load_dictionary, load_and_convert_corpora, DatasetIterator, Embedding,
# LSTM and SequenceToSequence are assumed to be provided by the accompanying
# seq-to-seq library; import them from wherever they live in your installation.

# path prefixes for the dictionaries and the tokenized corpora;
# the language code ('en' / 'pt') is appended below
dict_file = '/home/gian/datasets/dict.sort.'
train_file = '/home/gian/datasets/fapesp/fapesp-v2.tok.train.'
valid_file = '/home/gian/datasets/fapesp/fapesp-v2.tok.dev.'
source_lang = 'en'
target_lang = 'pt'

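# number of words kept in each vocabulary and the dimensionality used for both
# the word embeddings and the LSTM hidden layers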
en_v_size = 1000
pt_v_size = 1000
dim_proj = 10

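# training hyper-parameters: mini-batch size and number of passes over the corpus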
batch_size = 128
n_epochs = 5

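# fixed seed passed to every layer so weight initialization is reproducible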
seed = 1234

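# build the word -> index dictionaries, keeping at most en_v_size / pt_v_size
# entries from each (presumably frequency-sorted) dictionary file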
en_dict = load_dictionary(dict_file + source_lang, max_words=en_v_size)
pt_dict = load_dictionary(dict_file + target_lang, max_words=pt_v_size)

print('Initializing...')
time1 = time.time()

# define the dataset for training
train_data = DatasetIterator(train_file + source_lang,
                             train_file + target_lang,
                             en_dict,
                             pt_dict)

# define the dataset for validation
valid_data = DatasetIterator(valid_file + source_lang,
                             valid_file + target_lang,
                             en_dict,
                             pt_dict)

# define the encoder architecture (assumed to mirror the decoder below:
# an embedding layer followed by two LSTM layers)
emb1 = Embedding(en_v_size, dim_proj, seed=seed)
lstm1 = LSTM(dim_proj, dim_proj, seed=seed)
lstm2 = LSTM(dim_proj, dim_proj, seed=seed)
encoder = [emb1, lstm1, lstm2]

# define the decoder architecture: an embedding layer followed by two LSTMs,
# the second of which projects onto the target vocabulary
emb2 = Embedding(pt_v_size, dim_proj, seed=seed)
lstm3 = LSTM(dim_proj, dim_proj, seed=seed)
lstm4 = LSTM(dim_proj, pt_v_size, seed=seed)
decoder = [emb2, lstm3, lstm4]

# ensemble the sequence-to-sequence model
seq = SequenceToSequence(encoder=encoder,
                         decoder=decoder,
                         source_v_size=en_v_size,
                         target_v_size=pt_v_size,
                         auto_setup=False)  # set auto_setup to false to avoid initialization
                         # (weights will be overwritten anyway)

# load source and target language dictionaries
sr_dict = load_dictionary(dict_file + source_lang, max_words=en_v_size)
tr_dict = load_dictionary(dict_file + target_lang, max_words=pt_v_size)

# load the corpora and convert their words to dictionary indices (the corpora must already be tokenized)
sequences1 = load_and_convert_corpora('/home/gian/datasets/fapesp/fapesp-v2.tok.test-a.en', sr_dict)
sequences2 = load_and_convert_corpora('/home/gian/datasets/fapesp/fapesp-v2.tok.test-a.pt', tr_dict)

# prepare the data (add padding values to the end of each sequence so they have the same size)

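# load the previously saved weights into the model
# (this is why auto_setup was disabled when building the model above)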
seq.load_weights('/home/gian/seq_to_seq.hp5y')

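# continue training the model on the converted sequences,
# printing progress information during training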
seq.train(sequences1,
          sequences2,
          n_epochs=2,
          print_train_info=True)
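
# example only: report the total elapsed time using the timestamp captured above
print('Total time: %.2f seconds' % (time.time() - time1))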