Example #1

# In[4]:

BATCH_SIZE = 16
PAD_token = 0
PAD_TOKEN = 0  # duplicate alias of PAD_token (same padding index)
SOS_token = 1  # start-of-sentence index
EOS_token = 2  # end-of-sentence index
UNK_token = 3  # unknown-word index
teacher_forcing_ratio = 1.0  # always feed the decoder the ground-truth previous token
attn_model = 'dot'  # attention scoring method ('dot' = dot-product score)
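
# teacher_forcing_ratio = 1.0 means the decoder is always fed the ground-truth
# previous token during training. A minimal sketch of how such a ratio is usually
# applied per decoding step; `decoder`, `decoder_hidden`, and `target_tensor` are
# placeholders, not this notebook's actual objects.
import random
import torch

def decode_with_teacher_forcing(decoder, decoder_hidden, target_tensor,
                                teacher_forcing_ratio=1.0):
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    decoder_input = torch.tensor([[SOS_token]])  # every sequence starts with SOS
    outputs = []
    for t in range(target_tensor.size(0)):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        outputs.append(decoder_output)
        if use_teacher_forcing:
            decoder_input = target_tensor[t].view(1, 1)  # feed the ground-truth token
        else:
            decoder_input = decoder_output.argmax(dim=-1).detach()  # feed the model's own prediction
    return outputs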

# In[5]:

train_idx_pairs = load_cpickle_gc(
    "./iwslt-vi-eng/preprocessed_no_indices_pairs_train_tokenized")

# In[37]:

input_lang = load_cpickle_gc("iwslt-vi-eng/preprocessed_no_elmo_vilang")

# In[38]:

target_lang = load_cpickle_gc("iwslt-vi-eng/preprocessed_no_elmo_englang")

# In[8]:

val_idx_pairs = pickle.load(
    open("iwslt-vi-eng/preprocessed_no_indices_pairs_validation_tokenized",
         'rb'))


#input_lang, target_lang, train_pairs = prepareTrainData(
#    "iwslt-vi-en-processed/train.tok.vi",
#    "iwslt-vi-en-processed/train.tok.en",
#    input_lang = 'vi',
#    target_lang = 'en')

#_, _, test_pairs= prepareTrainData(
#    "iwslt-vi-en-processed/test.vi",
#    "iwslt-vi-en-processed/test.en",
#    input_lang = 'vi',
#    target_lang = 'en')
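
# prepareTrainData is defined in an earlier cell; the commented-out calls above
# show its interface: read two parallel tokenized files, build a vocabulary for
# each side, and return (input_lang, target_lang, pairs). The sketch below is an
# assumption about that behavior, using a minimal Lang-style vocabulary.
class LangSketch:
    """Minimal vocabulary: word <-> index maps seeded with the special tokens."""
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS",
                           EOS_token: "EOS", UNK_token: "UNK"}
        self.n_words = 4

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            if word not in self.word2index:
                self.word2index[word] = self.n_words
                self.index2word[self.n_words] = word
                self.n_words += 1

def prepareTrainData_sketch(src_path, tgt_path, input_lang='vi', target_lang='en'):
    src_lang, tgt_lang = LangSketch(input_lang), LangSketch(target_lang)
    pairs = []
    with open(src_path, encoding='utf-8') as fs, open(tgt_path, encoding='utf-8') as ft:
        for src_line, tgt_line in zip(fs, ft):
            pair = (src_line.strip(), tgt_line.strip())
            src_lang.add_sentence(pair[0])
            tgt_lang.add_sentence(pair[1])
            pairs.append(pair)
    return src_lang, tgt_lang, pairs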

# Reload the vocabularies from the final pickles; these supersede the loads above.
input_lang = load_cpickle_gc("input_lang_vi")
target_lang = load_cpickle_gc("target_lang_en")

#test_idx_pairs = []
#for x in test_pairs:
#    indexed = list(tensorsFromPair(x, input_lang, target_lang))
#    test_idx_pairs.append(indexed)
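
# tensorsFromPair (used in the commented-out block above) turns a
# (source_sentence, target_sentence) pair into index tensors. A minimal sketch,
# assuming a Lang-style vocabulary with a word2index dict as in LangSketch above;
# the real helper may differ.
import torch

def tensorsFromPair_sketch(pair, input_lang, target_lang):
    def indexes(lang, sentence):
        ids = [lang.word2index.get(w, UNK_token) for w in sentence.split(' ')]
        ids.append(EOS_token)  # mark the end of every sentence
        return torch.tensor(ids, dtype=torch.long)

    return indexes(input_lang, pair[0]), indexes(target_lang, pair[1])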

train_idx_pairs = load_cpickle_gc("train_vi_en_idx_pairs")
train_idx_pairs = train_idx_pairs[:-5]
val_idx_pairs = load_cpickle_gc("val_idx_pairs")
val_pairs = load_cpickle_gc("val_pairs")

print(len(train_idx_pairs))

Example #3

# _, _, val_pairs = prepareData(
#     "iwslt-vi-en-processed/dev.vi",
#     "iwslt-vi-en-processed/dev.en",
#     input_lang = 'vi',
#     target_lang = 'en')

# val_idx_pairs = []
# for x in val_pairs:
#     indexed = list(tensorsFromPair(x, input_lang, target_lang))
#     val_idx_pairs.append(indexed)

# pickle.dump(val_pairs, open("val_pairs", "wb"))
# pickle.dump(val_idx_pairs, open("val_idx_pairs", "wb"))

input_lang = load_cpickle_gc("input_lang_vi")
target_lang = load_cpickle_gc("target_lang_en")
train_idx_pairs = load_cpickle_gc("train_vi_en_idx_pairs")
val_idx_pairs = load_cpickle_gc("val_idx_pairs")
val_pairs = load_cpickle_gc("val_pairs")

train_dataset = LanguagePairDataset(train_idx_pairs)
# Open question: is any entry of train_idx_pairs all zeros (i.e. real tokens lost), as opposed to genuine padding?
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=language_pair_dataset_collate_function,
)
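
# LanguagePairDataset and language_pair_dataset_collate_function come from an
# earlier cell. A rough sketch of what they likely do (wrap the index pairs and
# pad every batch to its longest sentence with PAD_token); the details below are
# assumptions, not the notebook's actual code.
import numpy as np
import torch
from torch.utils.data import Dataset

class LanguagePairDatasetSketch(Dataset):
    """Wraps a list of (source_indices, target_indices) pairs."""
    def __init__(self, idx_pairs):
        self.idx_pairs = idx_pairs

    def __len__(self):
        return len(self.idx_pairs)

    def __getitem__(self, i):
        return self.idx_pairs[i]

def language_pair_collate_sketch(batch):
    """Pad each side of the batch with PAD_token and return tensors plus lengths."""
    def pad(seqs):
        seqs = [np.asarray(s).reshape(-1) for s in seqs]
        lengths = [len(s) for s in seqs]
        padded = np.full((len(seqs), max(lengths)), PAD_token, dtype=np.int64)
        for row, s in enumerate(seqs):
            padded[row, :len(s)] = s
        return torch.from_numpy(padded), torch.LongTensor(lengths)

    sources, targets = zip(*batch)
    src_padded, src_lengths = pad(sources)
    tgt_padded, tgt_lengths = pad(targets)
    return [src_padded, src_lengths, tgt_padded, tgt_lengths]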

hidden_size = 256  # encoder/decoder hidden-state size
attn_model = 'dot'  # dot-product attention scoring (see sketch below)
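
# attn_model = 'dot' selects dot-product attention scoring. A minimal sketch of
# that score plus the resulting context vector (not the notebook's own attention
# class, whose exact interface is not shown here):
import torch
import torch.nn.functional as F

def dot_attention(decoder_hidden, encoder_outputs):
    """decoder_hidden: (batch, hidden); encoder_outputs: (batch, src_len, hidden)."""
    # score(h_t, h_s) = h_t . h_s for every source position -> (batch, src_len)
    scores = torch.bmm(encoder_outputs, decoder_hidden.unsqueeze(2)).squeeze(2)
    weights = F.softmax(scores, dim=1)
    # weighted sum of encoder states -> (batch, hidden)
    context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)
    return weights, context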