def train(): print("Loading data...") SRC, TGT, train, val, test = generate_dataloaders() devices = [0, 1, 2, 3] pad_idx = TGT.vocab.stoi["<blank>"] print("Making model...") model = make_model(len(SRC.vocab), len(TGT.vocab), N=6) model.cuda() criterion = LabelSmoothing( size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1) criterion.cuda() BATCH_SIZE = 12000 train_iter = BatchIterator(train, batch_size=BATCH_SIZE, device=torch.device(0), repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)), batch_size_fn=batch_size_fn, train=True) valid_iter = BatchIterator(val, batch_size=BATCH_SIZE, device=torch.device(0), repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)), batch_size_fn=batch_size_fn, train=False) model_par = nn.DataParallel(model, device_ids=devices) model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)) folder = get_unique_folder("./models/", "model") if not(os.path.exists(folder)): os.mkdir(folder) for epoch in tqdm(range(10)): model_par.train() run_epoch((rebatch(pad_idx, b) for b in train_iter), model_par, MultiGPULossCompute(model.generator, criterion, devices=devices, opt=model_opt)) model_par.eval() loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter), model_par, MultiGPULossCompute(model.generator, criterion, devices=devices, opt=None)) torch.save(model.state_dict, os.path.join(folder, "model.bin." + str(epoch))) print(loss) for i, batch in enumerate(valid_iter): src = batch.src.transpose(0, 1)[:1] src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2) out = greedy_decode(model, src, src_mask, max_len=60, start_symbol=TGT.vocab.stoi["<s>"]) print("Translation:", end="\t") for i in range(1, out.size(1)): sym = TGT.vocab.itos[out[0, i]] if sym == "</s>": break print(sym, end=" ") print() print("Target:", end="\t") for i in range(1, batch.trg.size(0)): sym = TGT.vocab.itos[batch.trg.data[i, 0]] if sym == "</s>": break print(sym, end=" ") print() break
def rebatch(pad_idx, batch): "Fix order in torchtext to match ours" src, trg = batch.src.transpose(0, 1), batch.trg.transpose(0, 1) return Batch(src, trg, pad_idx) # GPUs to use devices = [0, 1, 2, 3] if True: pad_idx = TGT.vocab.stoi["<blank>"] model = make_model(len(SRC.vocab), len(TGT.vocab), N=6) model.cuda() criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1) criterion.cuda() BATCH_SIZE = 12000 train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=0, repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)), batch_size_fn=batch_size_fn, train=True) valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=0, repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)), batch_size_fn=batch_size_fn, train=False) model_par = nn.DataParallel(model, device_ids=devices) None if False: model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)) for epoch in range(10):