def __init__(self, model: TransitionModel, smoothing=0., **kw):
    super(TFActionSeqDecoder, self).__init__(**kw)
    self.model = model
    if smoothing > 0:
        # forward the configured smoothing value instead of relying on the default
        self.loss = q.SmoothedCELoss(reduction="none", ignore_index=0,
                                     mode="probs", smoothing=smoothing)
    else:
        self.loss = q.CELoss(reduction="none", ignore_index=0, mode="probs")
def test_it_3D(self):
    x = torch.randn(5, 3, 4)
    g = torch.randint(0, 4, (5, 3)).long()
    m = q.CELoss(mode="logits")
    l = m(x, g)
    print(l)
    # reference
    logprobs = torch.nn.LogSoftmax(-1)(x)
    logprobs = torch.gather(logprobs, -1, g.unsqueeze(-1))
    lref = logprobs.mean()
    print(lref)
    self.assertTrue(l.item() == -lref.item())
def test_it(self):
    m = q.SmoothedCELoss(smoothing=0.2, mode="logits")
    x = torch.randn(5, 6)
    g = torch.randint(0, 6, (5,)).long()
    l = m(x, g)
    print(l)
    uniform = torch.ones_like(x) / x.size(1)
    # KLDivLoss expects log-probabilities as input, so pass log-softmaxed logits
    kl = torch.nn.KLDivLoss(reduction="none")(
        torch.nn.LogSoftmax(-1)(x), uniform).sum(-1).mean()
    ce = q.CELoss(mode="logits")(x, g)
    print(kl, ce)
    print(kl * 0.2 + ce * 0.8)
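# The test above eyeballs that label-smoothed CE decomposes into a mixture of
# plain CE and a uniform term. Below is a minimal plain-PyTorch sketch of that
# computation; the name smoothed_ce_sketch and parameter eps are illustrative
# assumptions, not part of the q API.
import torch
import torch.nn.functional as F

def smoothed_ce_sketch(logits, gold, eps=0.2):
    # build the smoothed target: (1 - eps) on the gold class,
    # eps spread uniformly over all classes
    n_classes = logits.size(-1)
    logprobs = F.log_softmax(logits, -1)
    target = torch.full_like(logprobs, eps / n_classes)
    target.scatter_(-1, gold.unsqueeze(-1), 1. - eps + eps / n_classes)
    # cross-entropy against the smoothed target
    return -(target * logprobs).sum(-1).mean()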
def test_it_5D_nored(self):
    x = torch.randn(5, 3, 4, 5, 4)
    g = torch.randint(0, 4, (5, 3, 4, 5)).long()
    m = q.CELoss(mode="logits", reduction="none")
    l = m(x, g)
    print(l.size())
    # reference
    logprobs = torch.nn.LogSoftmax(-1)(x)
    logprobs = torch.gather(logprobs, -1, g.unsqueeze(-1)).squeeze(-1)
    print(logprobs.size())
    self.assertTrue(np.allclose(l.detach().numpy(),
                                -logprobs.detach().numpy()))
def test_equivalent_to_ce(self):
    m = q.DistillLoss(temperature=1., ignore_index=-100, mixture=0)
    probs = torch.randn(2, 3, 4)        # raw logits, despite the name
    softgold = torch.randn(2, 3, 4)
    hardgold = torch.randint(1, 4, (2, 3)).long()
    l = m(probs, (softgold, hardgold))
    print(l)
    # reference
    ce = q.CELoss(mode="logits")(probs, hardgold)
    print(ce)
    print(l.item() - ce.item())
    self.assertTrue((l - ce).norm(1).item() < 1e-6)
def test_it_with_weights(self):
    weights = torch.tensor([0.1, 0.2, 0.3, 1., 1., 1.])
    m = q.SmoothedCELoss(smoothing=0.2, mode="logits", weight=weights)
    x = torch.randn(5, 6)
    g = torch.randint(0, 6, (5,)).long()
    l = m(x, g)
    print(l)
    uniform = torch.ones_like(x) / x.size(1)
    # KLDivLoss expects log-probabilities as input, so pass log-softmaxed logits;
    # the reference mixture below ignores the class weights and is for eyeballing only
    kl = torch.nn.KLDivLoss(reduction="none")(
        torch.nn.LogSoftmax(-1)(x), uniform).sum(-1).mean()
    ce = q.CELoss(mode="logits")(x, g)
    print(kl, ce)
    print(kl * 0.2 + ce * 0.8)
def test_it_5D_withmask(self):
    x = torch.randn(5, 3, 4, 5, 4)
    g = torch.randint(0, 4, (5, 3, 4, 5)).long()
    g[:, :, :, -1] = 0
    m = q.CELoss(mode="logits", ignore_index=0)
    l = m(x, g)
    print(l)
    # reference: mask out ignored positions by adding log(0) = -inf to their
    # logits, then zero the resulting NaNs and average over valid positions
    mask = (1 - (g == 0).float())
    logprobs = torch.nn.LogSoftmax(-1)(x + torch.log(mask.unsqueeze(-1)))
    logprobs = -torch.gather(logprobs, -1, g.unsqueeze(-1)).squeeze(-1)
    logprobs = nan2zero(logprobs)
    s = logprobs.sum()
    t = mask.sum()
    lref = s / t
    print(lref)
    self.assertTrue((l - lref).norm(1).item() < 1e-6)
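# For reference, stock PyTorch already implements the ignore_index semantics
# that the test above reconstructs by hand: with reduction="mean",
# F.cross_entropy averages only over non-ignored positions. A minimal sketch
# (masked_ce_sketch is an illustrative name, not a q function):
import torch.nn.functional as F

def masked_ce_sketch(logits, gold, ignore_index=0):
    n_classes = logits.size(-1)
    # flatten all leading dims so any-rank inputs reduce to the 2D case
    return F.cross_entropy(logits.reshape(-1, n_classes),
                           gold.reshape(-1),
                           ignore_index=ignore_index, reduction="mean")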
def __init__(self, weight=None, reduction="mean", ignore_index=0,
             mode="logits", smoothing: float = 0., **kw):
    super(CELoss, self).__init__(**kw)
    self.mode = mode
    if smoothing != 0.:
        assert 0. < smoothing < 1.
        assert mode in ["logits", "logprobs"]
        self.ce = q.SmoothedCELoss(reduction=reduction,
                                   ignore_index=ignore_index,
                                   smoothing=smoothing, mode=mode,
                                   weight=weight)
    else:
        self.ce = q.CELoss(weight=weight, reduction=reduction,
                           ignore_index=ignore_index, mode=mode)
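# Hypothetical usage of the wrapper above: leaving smoothing at its default
# keeps plain cross-entropy, while any value in (0, 1) swaps in the smoothed
# variant behind the same interface.
crit_plain = CELoss(mode="logits", ignore_index=0)
crit_smooth = CELoss(mode="logits", ignore_index=0, smoothing=0.1)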
def run(lr=2.5e-4,
        edropout=0.1,
        wdropout=0.1,
        rdropout=0.1,
        adropout=0.1,
        dropout=-1.,
        numlayers=2,
        numheads=8,
        abspos=False,
        tie_wordvecs=False,
        gradnorm=0.5,
        epochs=200,
        dim=256,
        seqlen=50,
        batsize=32,
        eval_batsize=64,
        cuda=False,
        gpu=0,
        test=True,
        subsampleeval=10,
        wreg=1e-6,
        lrcycle=5,
        lrwarmup=3,
        ):
    tt = q.ticktock("script")
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)

    tt.tick("loading data")
    train_batches, valid_batches, test_batches, D = \
        load_data(batsize=batsize, eval_batsize=eval_batsize,
                  seqlen=seqlen, subsample_eval=subsampleeval)
    tt.tock("data loaded")
    print("{} batches in train".format(len(train_batches)))

    if dropout >= 0.:
        edropout, adropout, rdropout, wdropout = dropout, dropout, dropout, dropout
    relpos = not abspos

    tt.tick("creating model")
    m = TransformerLM(dim=dim, worddic=D, numlayers=numlayers,
                      numheads=numheads, activation=q.GeLU,
                      embedding_dropout=edropout, attention_dropout=adropout,
                      word_dropout=wdropout, residual_dropout=rdropout,
                      relpos=relpos, tie_wordvecs=tie_wordvecs,
                      maxlen=2 * seqlen).to(device)

    valid_m = TransformerLMCell(m)

    if test:
        for i, batch in enumerate(valid_batches):
            batch = [batch_e.to(device) for batch_e in batch]
            y = valid_m(batch[0])
            if i > 5:
                break
        for i, batch in enumerate(valid_batches):
            pass
        print(i, batsize, seqlen, valid_batches.data.size(0))
        print(y.size())

    loss = q.LossWrapper(q.CELoss(mode="logits"))
    validloss = q.LossWrapper(q.CELoss(mode="logits"))
    validlosses = [validloss, PPLfromCE(validloss)]
    testloss = q.LossWrapper(q.CELoss(mode="logits"))
    testlosses = [testloss, PPLfromCE(testloss)]
    for l in [loss] + validlosses + testlosses:   # put losses on right device
        l.loss.to(device)

    # optim = torch.optim.SGD(m.parameters(), lr=lr)
    numbats = len(train_batches)
    print("{} batches in training".format(numbats))
    optim = torch.optim.Adam(m.parameters(), lr=lr, weight_decay=wreg)
    # lrp = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, mode="min", factor=1/4, patience=0, verbose=True)
    # lrp_f = lambda: lrp.step(validloss.get_epoch_error())
    sched = q.CosineLRwithWarmup(optim, lrcycle * numbats,
                                 warmup=lrwarmup * numbats)

    train_batch_f = partial(q.train_batch,
                            on_before_optim_step=[
                                lambda: torch.nn.utils.clip_grad_norm_(
                                    m.parameters(), gradnorm),
                                lambda: sched.step()
                            ])
    train_epoch_f = partial(q.train_epoch, model=m, dataloader=train_batches,
                            optim=optim, losses=[loss], device=device,
                            _train_batch=train_batch_f)
    valid_epoch_f = partial(q.test_epoch, model=valid_m,
                            dataloader=valid_batches, losses=validlosses,
                            device=device)
    tt.tock("created model")

    tt.tick("training")
    q.run_training(train_epoch_f, valid_epoch_f, max_epochs=epochs,
                   validinter=1)
    tt.tock("trained")

    tt.tick("testing")
    testresults = q.test_epoch(model=valid_m, dataloader=test_batches,
                               losses=testlosses, device=device)
    print(testresults)
    tt.tock("tested")
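# q.CosineLRwithWarmup above is stepped once per optimizer step. A hedged
# sketch of a comparable warmup-then-cosine schedule built from stock PyTorch
# follows; the exact shape of the q scheduler may differ.
import math
import torch

def cosine_with_warmup(optim, total_steps, warmup_steps):
    def factor(step):
        if step < warmup_steps:
            # linear warmup from ~0 up to the base learning rate
            return (step + 1) / max(1, warmup_steps)
        # cosine decay from the base learning rate down to 0
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return 0.5 * (1. + math.cos(math.pi * min(1., progress)))
    return torch.optim.lr_scheduler.LambdaLR(optim, factor)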
def run(lr=20.,
        dropout=0.2,
        dropconnect=0.2,
        gradnorm=0.25,
        epochs=25,
        embdim=200,
        encdim=200,
        numlayers=2,
        tieweights=False,
        distill="glove",            # "rnnlm" or "glove"
        seqlen=35,
        batsize=20,
        eval_batsize=80,
        cuda=False,
        gpu=0,
        test=False,
        repretrain=False,           # retrain base model instead of loading it
        savepath="rnnlm.base.pt",   # where to save after training
        glovepath="../../../data/glove/glove.300d"):
    tt = q.ticktock("script")
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)

    tt.tick("loading data")
    train_batches, valid_batches, test_batches, D = \
        load_data(batsize=batsize, eval_batsize=eval_batsize,
                  seqlen=VariableSeqlen(minimum=5, maximum_offset=10,
                                        mu=seqlen, sigma=0))
    tt.tock("data loaded")
    print("{} batches in train".format(len(train_batches)))

    # region base training
    loss = q.LossWrapper(q.CELoss(mode="logits"))
    validloss = q.LossWrapper(q.CELoss(mode="logits"))
    validlosses = [validloss, PPLfromCE(validloss)]
    testloss = q.LossWrapper(q.CELoss(mode="logits"))
    testlosses = [testloss, PPLfromCE(testloss)]
    for l in [loss] + validlosses + testlosses:   # put losses on right device
        l.loss.to(device)

    if os.path.exists(savepath) and repretrain is False:
        tt.tick("reloading base model")
        with open(savepath, "rb") as f:
            m = torch.load(f)
        m.to(device)
        tt.tock("reloaded base model")
    else:
        tt.tick("preparing training base")
        dims = [embdim] + ([encdim] * numlayers)
        m = RNNLayer_LM(*dims, worddic=D, dropout=dropout,
                        tieweights=tieweights).to(device)

        if test:
            for i, batch in enumerate(train_batches):
                y = m(batch[0])
                if i > 5:
                    break
            print(y.size())

        optim = torch.optim.SGD(m.parameters(), lr=lr)
        train_batch_f = partial(q.train_batch,
                                on_before_optim_step=[
                                    lambda: torch.nn.utils.clip_grad_norm_(
                                        m.parameters(), gradnorm)
                                ])
        lrp = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, mode="min",
                                                         factor=1 / 4,
                                                         patience=0,
                                                         verbose=True)
        lrp_f = lambda: lrp.step(validloss.get_epoch_error())

        train_epoch_f = partial(q.train_epoch, model=m,
                                dataloader=train_batches, optim=optim,
                                losses=[loss], device=device,
                                _train_batch=train_batch_f)
        valid_epoch_f = partial(q.test_epoch, model=m,
                                dataloader=valid_batches, losses=validlosses,
                                device=device, on_end=[lrp_f])
        tt.tock("prepared training base")

        tt.tick("training base model")
        q.run_training(train_epoch_f, valid_epoch_f, max_epochs=epochs,
                       validinter=1)
        tt.tock("trained base model")

        with open(savepath, "wb") as f:
            torch.save(m, f)

        tt.tick("testing base model")
        testresults = q.test_epoch(model=m, dataloader=test_batches,
                                   losses=testlosses, device=device)
        print(testresults)
        tt.tock("tested base model")
    # endregion

    # region distillation
    tt.tick("preparing training student")
    dims = [embdim] + ([encdim] * numlayers)
    ms = RNNLayer_LM(*dims, worddic=D, dropout=dropout,
                     tieweights=tieweights).to(device)

    loss = q.LossWrapper(q.DistillLoss(temperature=2.))
    validloss = q.LossWrapper(q.CELoss(mode="logits"))
    validlosses = [validloss, PPLfromCE(validloss)]
    testloss = q.LossWrapper(q.CELoss(mode="logits"))
    testlosses = [testloss, PPLfromCE(testloss)]
    for l in [loss] + validlosses + testlosses:   # put losses on right device
        l.loss.to(device)

    optim = torch.optim.SGD(ms.parameters(), lr=lr)
    train_batch_f = partial(train_batch_distill,
                            on_before_optim_step=[
                                lambda: torch.nn.utils.clip_grad_norm_(
                                    ms.parameters(), gradnorm)
                            ])
    lrp = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, mode="min",
                                                     factor=1 / 4, patience=0,
                                                     verbose=True)
    lrp_f = lambda: lrp.step(validloss.get_epoch_error())

    if distill == "rnnlm":
        mbase = m
        goldgetter = None
    elif distill == "glove":
        mbase = None
        tt.tick("creating gold getter based on glove")
        goldgetter = GloveGoldGetter(glovepath, worddic=D)
        goldgetter.to(device)
        tt.tock("created gold getter")
    else:
        raise q.SumTingWongException("unknown distill mode {}".format(distill))

    train_epoch_f = partial(train_epoch_distill, model=ms,
                            dataloader=train_batches, optim=optim,
                            losses=[loss], device=device,
                            _train_batch=train_batch_f, mbase=mbase,
                            goldgetter=goldgetter)
    valid_epoch_f = partial(q.test_epoch, model=ms, dataloader=valid_batches,
                            losses=validlosses, device=device,
                            on_end=[lrp_f])
    tt.tock("prepared training student")

    tt.tick("training student model")
    q.run_training(train_epoch_f, valid_epoch_f, max_epochs=epochs,
                   validinter=1)
    tt.tock("trained student model")

    tt.tick("testing student model")
    testresults = q.test_epoch(model=ms, dataloader=test_batches,
                               losses=testlosses, device=device)
    print(testresults)
    tt.tock("tested student model")
    # endregion
def run_seq2seq_(lr=0.001,
                 batsize=32,
                 evalbatsize=256,
                 epochs=100,
                 warmup=5,
                 embdim=50,
                 encdim=100,
                 numlayers=2,
                 dropout=.0,
                 wreg=1e-6,
                 cuda=False,
                 gpu=0,
                 ):
    settings = locals().copy()
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt = q.ticktock("script")
    tt.msg("running seq2seq on LC-QuAD")

    tt.tick("loading data")
    xsm, ysm, teststart, tok2act = load_data()
    _tok2act = {ysm.RD[k]: v for k, v in tok2act.items()}

    print("Some examples:")
    for i in range(5):
        print(f"{xsm[i]}\n ->{ysm[i]}\n"
              f" -> {Node.from_transitions(' '.join(ysm[i].split()[1:]), _tok2act)}")
    print("Non-leaf tokens:")
    print({ysm.RD[k]: v for k, v in tok2act.items() if v > 0})

    devstart = teststart - 500
    trainds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[:devstart]).long(),
        torch.tensor(ysm.matrix[:devstart, :-1]).long(),
        torch.tensor(ysm.matrix[:devstart, 1:]).long())
    valds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[devstart:teststart]).long(),
        torch.tensor(ysm.matrix[devstart:teststart, :-1]).long(),
        torch.tensor(ysm.matrix[devstart:teststart, 1:]).long())
    testds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[teststart:]).long(),
        torch.tensor(ysm.matrix[teststart:, :-1]).long(),
        torch.tensor(ysm.matrix[teststart:, 1:]).long())
    tt.msg(f"Data splits: train: {len(trainds)}, valid: {len(valds)}, "
           f"test: {len(testds)}")

    tloader = torch.utils.data.DataLoader(trainds, batch_size=batsize,
                                          shuffle=True)
    vloader = torch.utils.data.DataLoader(valds, batch_size=evalbatsize,
                                          shuffle=False)
    xloader = torch.utils.data.DataLoader(testds, batch_size=evalbatsize,
                                          shuffle=False)
    tt.tock("data loaded")

    # model
    enclayers, declayers = numlayers, numlayers
    decdim = encdim
    xemb = q.WordEmb(embdim, worddic=xsm.D)
    yemb = q.WordEmb(embdim, worddic=ysm.D)
    encdims = [embdim] + [encdim // 2] * enclayers
    xenc = q.LSTMEncoder(embdim, *encdims[1:], bidir=True,
                         dropout_in_shared=dropout)
    decdims = [embdim] + [decdim] * declayers
    dec_core = torch.nn.Sequential(*[
        q.LSTMCell(decdims[i - 1], decdims[i], dropout_in=dropout,
                   dropout_rec=dropout)
        for i in range(1, len(decdims))
    ])
    yout = q.WordLinout(encdim + decdim, worddic=ysm.D)
    dec_cell = semparse.rnn.LuongCell(emb=yemb, core=dec_core, out=yout,
                                      dropout=dropout)
    decoder = q.TFDecoder(dec_cell)
    testdecoder = q.FreeDecoder(dec_cell, maxtime=100)
    m = Seq2Seq(xemb, xenc, decoder)
    testm = Seq2Seq(xemb, xenc, testdecoder, test=True)

    # test model: iterators have no .next() in Python 3, so use next(iter(...))
    tt.tick("running a batch")
    test_y = m(*next(iter(tloader))[:-1])
    q.batch_reset(m)
    test_y = testm(*next(iter(vloader))[:-1])
    q.batch_reset(m)
    tt.tock(f"ran a batch: {test_y.size()}")

    optim = torch.optim.Adam(m.parameters(), lr=lr, weight_decay=wreg)
    tlosses = [q.CELoss(mode="logits", ignore_index=0),
               q.Accuracy(ignore_index=0),
               q.SeqAccuracy(ignore_index=0)]
    xlosses = [q.CELoss(mode="logits", ignore_index=0),
               q.Accuracy(ignore_index=0),
               q.SeqAccuracy(ignore_index=0)]
    tlosses = [q.LossWrapper(l) for l in tlosses]
    # note: vlosses and xlosses wrap the same underlying loss modules;
    # each LossWrapper tracks its own epoch statistics
    vlosses = [q.LossWrapper(l) for l in xlosses]
    xlosses = [q.LossWrapper(l) for l in xlosses]

    trainloop = partial(q.train_epoch, model=m, dataloader=tloader,
                        optim=optim, losses=tlosses, device=device)
    devloop = partial(q.test_epoch, model=testm, dataloader=vloader,
                      losses=vlosses, device=device)
    testloop = partial(q.test_epoch, model=testm, dataloader=xloader,
                       losses=xlosses, device=device)

    lrplateau = q.util.ReduceLROnPlateau(optim, mode="max", factor=.1,
                                         patience=3, cooldown=1,
                                         warmup=warmup, threshold=0.,
                                         verbose=True, eps=1e-9)
    on_after_valid = [lambda: lrplateau.step(vlosses[1].get_epoch_error())]
    _devloop = partial(devloop, on_end=on_after_valid)
    stoptrain = [lambda: all([pg["lr"] <= 1e-7 for pg in optim.param_groups])]

    tt.tick("training")
    q.run_training(trainloop, _devloop, max_epochs=epochs,
                   check_stop=stoptrain)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    settings["testres"] = testres
    tt.tock("tested")

    devres = devloop()
    print(devres, vlosses[0].get_epoch_error())
    return vlosses[1].get_epoch_error()
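# The TFDecoder / FreeDecoder pair above implements the usual train/test split
# for seq2seq decoding. A minimal sketch of the two regimes; `cell` is assumed
# to be a step function (token ids) -> logits that keeps its recurrent state
# internally, which is an illustrative simplification, not the q API.
import torch

def teacher_forced(cell, gold_prev):
    # training: feed the gold previous token at every step
    outs = [cell(gold_prev[:, t]) for t in range(gold_prev.size(1))]
    return torch.stack(outs, 1)

def free_running(cell, start_ids, maxtime=100):
    # evaluation: feed the model's own argmax prediction back in
    outs, prev = [], start_ids
    for _ in range(maxtime):
        logits = cell(prev)
        outs.append(logits)
        prev = logits.argmax(-1)
    return torch.stack(outs, 1)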
def run(lr=20.,
        dropout=0.2,
        dropconnect=0.2,
        gradnorm=0.25,
        epochs=25,
        embdim=200,
        encdim=200,
        numlayers=2,
        tieweights=False,
        seqlen=35,
        batsize=20,
        eval_batsize=80,
        cuda=False,
        gpu=0,
        test=False):
    tt = q.ticktock("script")
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)

    tt.tick("loading data")
    train_batches, valid_batches, test_batches, D = \
        load_data(batsize=batsize, eval_batsize=eval_batsize,
                  seqlen=VariableSeqlen(minimum=5, maximum_offset=10,
                                        mu=seqlen, sigma=0))
    tt.tock("data loaded")
    print("{} batches in train".format(len(train_batches)))

    tt.tick("creating model")
    dims = [embdim] + ([encdim] * numlayers)
    m = RNNLayer_LM(*dims, worddic=D, dropout=dropout,
                    tieweights=tieweights).to(device)

    if test:
        for i, batch in enumerate(train_batches):
            y = m(batch[0])
            if i > 5:
                break
        print(y.size())

    loss = q.LossWrapper(q.CELoss(mode="logits"))
    validloss = q.LossWrapper(q.CELoss(mode="logits"))
    validlosses = [validloss, PPLfromCE(validloss)]
    testloss = q.LossWrapper(q.CELoss(mode="logits"))
    testlosses = [testloss, PPLfromCE(testloss)]
    for l in [loss] + validlosses + testlosses:   # put losses on right device
        l.loss.to(device)

    optim = torch.optim.SGD(m.parameters(), lr=lr)
    train_batch_f = partial(q.train_batch,
                            on_before_optim_step=[
                                lambda: torch.nn.utils.clip_grad_norm_(
                                    m.parameters(), gradnorm)
                            ])
    lrp = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, mode="min",
                                                     factor=1 / 4, patience=0,
                                                     verbose=True)
    lrp_f = lambda: lrp.step(validloss.get_epoch_error())

    train_epoch_f = partial(q.train_epoch, model=m, dataloader=train_batches,
                            optim=optim, losses=[loss], device=device,
                            _train_batch=train_batch_f)
    valid_epoch_f = partial(q.test_epoch, model=m, dataloader=valid_batches,
                            losses=validlosses, device=device,
                            on_end=[lrp_f])
    tt.tock("created model")

    tt.tick("training")
    q.run_training(train_epoch_f, valid_epoch_f, max_epochs=epochs,
                   validinter=1)
    tt.tock("trained")

    tt.tick("testing")
    testresults = q.test_epoch(model=m, dataloader=test_batches,
                               losses=testlosses, device=device)
    print(testresults)
    tt.tock("tested")
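# PPLfromCE, used throughout these scripts, presumably reports perplexity as
# the exponentiated average per-token cross-entropy; a one-line reminder of
# that relation (ppl_from_ce is an illustrative name, not the q class):
import math

def ppl_from_ce(avg_ce_nats):
    # perplexity = exp(average cross-entropy in nats)
    return math.exp(avg_ce_nats)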