# In[4]:
# Global settings for batching and decoding.
BATCH_SIZE = 16

# Special-token ids.  The names suggest padding / start-of-sentence /
# end-of-sentence / unknown -- TODO confirm against the vocabulary classes.
# Both spellings of the padding id are kept because both names are defined
# by the original cell; they are tied together so they cannot drift apart.
PAD_token = 0
PAD_TOKEN = PAD_token
SOS_token = 1
EOS_token = 2
UNK_token = 3

teacher_forcing_ratio = 1.0
attn_model = 'dot'

# In[5]:
train_idx_pairs = load_cpickle_gc(
    "./iwslt-vi-eng/preprocessed_no_indices_pairs_train_tokenized")

# In[37]:
input_lang = load_cpickle_gc("iwslt-vi-eng/preprocessed_no_elmo_vilang")

# In[38]:
target_lang = load_cpickle_gc("iwslt-vi-eng/preprocessed_no_elmo_englang")

# In[8]:
# Open the validation pickle in a context manager so the file handle is
# closed as soon as loading finishes (the bare open() call never closed it).
# NOTE: pickle must only be used on trusted, locally produced files.
with open("iwslt-vi-eng/preprocessed_no_indices_pairs_validation_tokenized",
          'rb') as val_file:
    val_idx_pairs = pickle.load(val_file)
torch.from_numpy(np.array(sent2_list)), torch.LongTensor(sent2_length_list)] #input_lang, target_lang, train_pairs = prepareTrainData( # "iwslt-vi-en-processed/train.tok.vi", # "iwslt-vi-en-processed/train.tok.en", # input_lang = 'vi', # target_lang = 'en') #_, _, test_pairs= prepareTrainData( # "iwslt-vi-en-processed/test.vi", # "iwslt-vi-en-processed/test.en", # input_lang = 'vi', # target_lang = 'en') input_lang = load_cpickle_gc("input_lang_vi") target_lang = load_cpickle_gc("target_lang_en") #test_idx_pairs = [] #for x in test_pairs: # indexed = list(tensorsFromPair(x, input_lang, target_lang)) # test_idx_pairs.append(indexed) train_idx_pairs = load_cpickle_gc("train_vi_en_idx_pairs") train_idx_pairs = train_idx_pairs[:-5] val_idx_pairs = load_cpickle_gc("val_idx_pairs") val_pairs = load_cpickle_gc("val_pairs") print(len(train_idx_pairs))
# Reference only: calls that presumably produced the "val_pairs" and
# "val_idx_pairs" pickles loaded below -- TODO confirm before relying on it.
# _, _, val_pairs = prepareData(
#     "iwslt-vi-en-processed/dev.vi",
#     "iwslt-vi-en-processed/dev.en",
#     input_lang = 'vi',
#     target_lang = 'en')
# val_idx_pairs = []
# for x in val_pairs:
#     indexed = list(tensorsFromPair(x, input_lang, target_lang))
#     val_idx_pairs.append(indexed)
# pickle.dump(val_pairs, open("val_pairs", "wb"))
# pickle.dump(val_idx_pairs, open("val_idx_pairs", "wb"))

# Load the language objects and the train/validation data from pickles.
input_lang = load_cpickle_gc("input_lang_vi")
target_lang = load_cpickle_gc("target_lang_en")
train_idx_pairs = load_cpickle_gc("train_vi_en_idx_pairs")
val_idx_pairs = load_cpickle_gc("val_idx_pairs")
val_pairs = load_cpickle_gc("val_pairs")

# Wrap the indexed training pairs in the project dataset class.
train_dataset = LanguagePairDataset(train_idx_pairs)
# Open question from the original author: is there anything in
# train_idx_pairs that is only 0s right now instead of padding?

# Batch the training set; batches are assembled by the project's collate
# function.  No shuffle argument is passed, so the DataLoader default applies.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=language_pair_dataset_collate_function,
)

# Model hyperparameters used by later cells.
hidden_size = 256
attn_model = 'dot'