Example #1
    def __init__(self, model: TransitionModel, smoothing=0., **kw):
        super(TFActionSeqDecoder, self).__init__(**kw)
        self.model = model
        if smoothing > 0:
            self.loss = q.SmoothedCELoss(reduction="none",
                                         ignore_index=0,
                                         mode="probs")
        else:
            self.loss = q.CELoss(reduction="none",
                                 ignore_index=0,
                                 mode="probs")
Example #2
    def test_it_3D(self):
        x = torch.randn(5, 3, 4)
        g = torch.randint(0, 4, (5, 3)).long()
        m = q.CELoss(mode="logits")
        l = m(x, g)
        print(l)

        # reference
        logprobs = torch.nn.LogSoftmax(-1)(x)
        logprobs = torch.gather(logprobs, -1, g.unsqueeze(-1))
        lref = logprobs.mean()
        print(lref)
        self.assertTrue(l.item() == -lref.item())
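
For a quick cross-check, the same mean cross-entropy can be obtained from torch.nn.functional.cross_entropy after folding the extra batch dimension into the first axis (a minimal sketch that does not depend on q):

import torch
import torch.nn.functional as F

x = torch.randn(5, 3, 4)
g = torch.randint(0, 4, (5, 3)).long()
# cross_entropy expects (N, C) logits, so flatten the leading dimensions.
ref = F.cross_entropy(x.reshape(-1, x.size(-1)), g.reshape(-1))
# ref equals the negated mean gold log-probability computed in the test above.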
Example #3
    def test_it(self):
        m = q.SmoothedCELoss(smoothing=0.2, mode="logits")
        x = torch.randn(5, 6)
        g = torch.randint(0, 6, (5, )).long()
        l = m(x, g)
        print(l)

        uniform = torch.ones_like(x) / x.size(1)
        # print(uniform)
        kl = torch.nn.KLDivLoss(reduction="none")(x, uniform).sum(-1).mean()
        ce = q.CELoss(mode="logits")(x, g)
        print(kl, ce)
        print(kl * 0.2 + ce * 0.8)
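
One thing to keep in mind about the printed reference: torch.nn.KLDivLoss expects log-probabilities as its first argument, so passing the raw logits x only gives a rough sanity check. A common label-smoothing reference (not necessarily the exact formulation q.SmoothedCELoss uses) mixes the one-hot target with a uniform distribution:

import torch
import torch.nn.functional as F

def smoothed_ce_reference(x, g, smoothing=0.2):
    # Generic label smoothing: blend the one-hot gold distribution with a
    # uniform one, then take the cross-entropy against the blended target.
    n_classes = x.size(-1)
    logprobs = F.log_softmax(x, dim=-1)
    one_hot = F.one_hot(g, n_classes).float()
    target = (1.0 - smoothing) * one_hot + smoothing / n_classes
    return -(target * logprobs).sum(-1).mean()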
Example #4
    def test_it_5D_nored(self):
        x = torch.randn(5, 3, 4, 5, 4)
        g = torch.randint(0, 4, (5, 3, 4, 5)).long()
        m = q.CELoss(mode="logits", reduction="none")
        l = m(x, g)
        print(l.size())

        # reference
        logprobs = torch.nn.LogSoftmax(-1)(x)
        logprobs = torch.gather(logprobs, -1, g.unsqueeze(-1)).squeeze(-1)
        print(logprobs.size())

        self.assertTrue(
            np.allclose(l.detach().numpy(), -logprobs.detach().numpy()))
Example #5
    def test_equivalent_to_ce(self):
        m = q.DistillLoss(temperature=1., ignore_index=-100, mixture=0)
        probs = torch.randn(2, 3, 4)
        softgold = torch.randn(2, 3, 4)
        hardgold = torch.randint(1, 4, (2, 3)).long()
        l = m(probs, (softgold, hardgold))
        print(l)

        # reference
        ce = q.CELoss(mode="logits")(probs, hardgold)
        print(ce)

        print(l.item() - ce.item())
        self.assertTrue((l - ce).norm(1).item() < 1e-6)
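
The test relies on mixture=0 making q.DistillLoss collapse to the hard-target cross-entropy. For context, a standard temperature-scaled knowledge-distillation objective (a sketch of the usual formulation, not necessarily q.DistillLoss's exact implementation) combines a softened teacher term with the hard cross-entropy:

import torch
import torch.nn.functional as F

def distill_reference(student_logits, teacher_logits, hard_gold,
                      temperature=2.0, mixture=0.5):
    # Soft part: KL divergence between temperature-scaled student and
    # teacher distributions, rescaled by T^2 as is conventional.
    soft = F.kl_div(F.log_softmax(student_logits / temperature, dim=-1),
                    F.softmax(teacher_logits / temperature, dim=-1),
                    reduction="batchmean") * temperature ** 2
    # Hard part: ordinary cross-entropy against the gold labels.
    hard = F.cross_entropy(student_logits.reshape(-1, student_logits.size(-1)),
                           hard_gold.reshape(-1))
    return mixture * soft + (1.0 - mixture) * hard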
Example #6
    def test_it_with_weights(self):
        weights = torch.tensor([0.1, 0.2, 0.3, 1., 1., 1.])
        m = q.SmoothedCELoss(smoothing=0.2, mode="logits", weight=weights)
        x = torch.randn(5, 6)
        g = torch.randint(0, 6, (5, )).long()
        l = m(x, g)
        print(l)

        uniform = torch.ones_like(x) / x.size(1)
        # print(uniform)
        kl = torch.nn.KLDivLoss(reduction="none")(x, uniform).sum(-1).mean()
        ce = q.CELoss(mode="logits")(x, g)
        print(kl, ce)
        print(kl * 0.2 + ce * 0.8)
Example #7
    def test_it_5D_withmask(self):
        x = torch.randn(5, 3, 4, 5, 4)
        g = torch.randint(0, 4, (5, 3, 4, 5)).long()
        g[:, :, :, -1] = 0
        m = q.CELoss(mode="logits", ignore_index=0)
        l = m(x, g)
        print(l)

        # reference
        mask = (1 - (g == 0).float())
        logprobs = torch.nn.LogSoftmax(-1)(x + torch.log(mask.unsqueeze(-1)))
        logprobs = -torch.gather(logprobs, -1, g.unsqueeze(-1)).squeeze(-1)
        logprobs = nan2zero(logprobs)
        s = logprobs.sum()
        t = mask.sum()
        lref = s / t
        print(lref)
        self.assertTrue((l - lref).norm(1).item() < 1e-6)
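
The masked reference above can also be cross-checked against torch.nn.functional.cross_entropy, whose ignore_index option averages only over the non-ignored positions (a sketch that does not depend on q):

import torch
import torch.nn.functional as F

x = torch.randn(5, 3, 4, 5, 4)
g = torch.randint(0, 4, (5, 3, 4, 5)).long()
g[:, :, :, -1] = 0
# ignore_index=0 drops the masked positions from the mean, matching the
# manual mask-and-renormalize computation in the test.
ref = F.cross_entropy(x.reshape(-1, x.size(-1)), g.reshape(-1), ignore_index=0)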
Example #8
    def __init__(self,
                 weight=None,
                 reduction="mean",
                 ignore_index=0,
                 mode="logits",
                 smoothing: float = 0.,
                 **kw):
        super(CELoss, self).__init__(**kw)
        self.mode = mode
        self.ce = q.CELoss(weight=weight,
                           reduction=reduction,
                           ignore_index=ignore_index,
                           mode=mode)
        if smoothing != 0.:
            assert (smoothing < 1. and smoothing > 0.)
            assert (mode in ["logits", "logprobs"])
            self.ce = q.SmoothedCELoss(reduction=reduction,
                                       ignore_index=ignore_index,
                                       smoothing=smoothing,
                                       mode=mode,
                                       weight=weight)
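
A hypothetical use of this wrapper: a smoothing value in (0, 1) switches the underlying loss to q.SmoothedCELoss, while the default smoothing=0. keeps plain q.CELoss.

# Hypothetical usage of the wrapper defined above.
smoothed = CELoss(ignore_index=0, mode="logits", smoothing=0.1)
plain = CELoss(ignore_index=0, mode="logits")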
Example #9
def run(
    lr=2.5e-4,
    edropout=0.1,
    wdropout=0.1,
    rdropout=0.1,
    adropout=0.1,
    dropout=-1.,
    numlayers=2,
    numheads=8,
    abspos=False,
    tie_wordvecs=False,
    gradnorm=0.5,
    epochs=200,
    dim=256,
    seqlen=50,
    batsize=32,
    eval_batsize=64,
    cuda=False,
    gpu=0,
    test=True,
    subsampleeval=10,
    wreg=1e-6,
    lrcycle=5,
    lrwarmup=3,
):
    tt = q.ticktock("script")
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)
    tt.tick("loading data")
    train_batches, valid_batches, test_batches, D = \
        load_data(batsize=batsize, eval_batsize=eval_batsize, seqlen=seqlen, subsample_eval=subsampleeval)
    tt.tock("data loaded")
    print("{} batches in train".format(len(train_batches)))
    if dropout >= 0.:
        edropout, adropout, rdropout, wdropout = dropout, dropout, dropout, dropout
    relpos = not abspos

    tt.tick("creating model")

    m = TransformerLM(dim=dim,
                      worddic=D,
                      numlayers=numlayers,
                      numheads=numheads,
                      activation=q.GeLU,
                      embedding_dropout=edropout,
                      attention_dropout=adropout,
                      word_dropout=wdropout,
                      residual_dropout=rdropout,
                      relpos=relpos,
                      tie_wordvecs=tie_wordvecs,
                      maxlen=2 * seqlen).to(device)
    valid_m = TransformerLMCell(m)

    if test:
        for i, batch in enumerate(valid_batches):
            batch = [batch_e.to(device) for batch_e in batch]
            y = valid_m(batch[0])
            if i > 5:
                break
        for i, batch in enumerate(valid_batches):
            pass
        print(i, batsize, seqlen, valid_batches.data.size(0))
        print(y.size())
        # return
    # return

    loss = q.LossWrapper(q.CELoss(mode="logits"))
    validloss = q.LossWrapper(q.CELoss(mode="logits"))
    validlosses = [validloss, PPLfromCE(validloss)]
    testloss = q.LossWrapper(q.CELoss(mode="logits"))
    testlosses = [testloss, PPLfromCE(testloss)]
    for l in [loss] + validlosses + testlosses:  # put losses on right device
        l.loss.to(device)

    # optim = torch.optim.SGD(m.parameters(), lr=lr)
    numbats = len(train_batches)
    print("{} batches in training".format(numbats))
    optim = torch.optim.Adam(m.parameters(), lr=lr, weight_decay=wreg)
    # lrp = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, mode="min", factor=1/4, patience=0, verbose=True)
    # lrp_f = lambda: lrp.step(validloss.get_epoch_error())
    sched = q.CosineLRwithWarmup(optim,
                                 lrcycle * numbats,
                                 warmup=lrwarmup * numbats)

    train_batch_f = partial(
        q.train_batch,
        on_before_optim_step=[
            lambda: torch.nn.utils.clip_grad_norm_(m.parameters(), gradnorm),
            lambda: sched.step()
        ])
    train_epoch_f = partial(q.train_epoch,
                            model=m,
                            dataloader=train_batches,
                            optim=optim,
                            losses=[loss],
                            device=device,
                            _train_batch=train_batch_f)
    valid_epoch_f = partial(q.test_epoch,
                            model=valid_m,
                            dataloader=valid_batches,
                            losses=validlosses,
                            device=device)
    tt.tock("created model")
    tt.tick("training")
    q.run_training(train_epoch_f,
                   valid_epoch_f,
                   max_epochs=epochs,
                   validinter=1)
    tt.tock("trained")

    tt.tick("testing")
    testresults = q.test_epoch(model=valid_m,
                               dataloader=test_batches,
                               losses=testlosses,
                               device=device)
    print(testresults)
    tt.tock("tested")
Example #10
def run(
        lr=20.,
        dropout=0.2,
        dropconnect=0.2,
        gradnorm=0.25,
        epochs=25,
        embdim=200,
        encdim=200,
        numlayers=2,
        tieweights=False,
        distill="glove",  # "rnnlm", "glove"
        seqlen=35,
        batsize=20,
        eval_batsize=80,
        cuda=False,
        gpu=0,
        test=False,
        repretrain=False,  # retrain base model instead of loading it
        savepath="rnnlm.base.pt",  # where to save after training
        glovepath="../../../data/glove/glove.300d"):
    tt = q.ticktock("script")
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)
    tt.tick("loading data")
    train_batches, valid_batches, test_batches, D = \
        load_data(batsize=batsize, eval_batsize=eval_batsize,
                  seqlen=VariableSeqlen(minimum=5, maximum_offset=10, mu=seqlen, sigma=0))
    tt.tock("data loaded")
    print("{} batches in train".format(len(train_batches)))

    # region base training
    loss = q.LossWrapper(q.CELoss(mode="logits"))
    validloss = q.LossWrapper(q.CELoss(mode="logits"))
    validlosses = [validloss, PPLfromCE(validloss)]
    testloss = q.LossWrapper(q.CELoss(mode="logits"))
    testlosses = [testloss, PPLfromCE(testloss)]

    for l in [loss] + validlosses + testlosses:  # put losses on right device
        l.loss.to(device)

    if os.path.exists(savepath) and repretrain is False:
        tt.tick("reloading base model")
        with open(savepath, "rb") as f:
            m = torch.load(f)
            m.to(device)
        tt.tock("reloaded base model")
    else:
        tt.tick("preparing training base")
        dims = [embdim] + ([encdim] * numlayers)

        m = RNNLayer_LM(*dims,
                        worddic=D,
                        dropout=dropout,
                        tieweights=tieweights).to(device)

        if test:
            for i, batch in enumerate(train_batches):
                y = m(batch[0])
                if i > 5:
                    break
            print(y.size())

        optim = torch.optim.SGD(m.parameters(), lr=lr)

        train_batch_f = partial(q.train_batch,
                                on_before_optim_step=[
                                    lambda: torch.nn.utils.clip_grad_norm_(
                                        m.parameters(), gradnorm)
                                ])
        lrp = torch.optim.lr_scheduler.ReduceLROnPlateau(optim,
                                                         mode="min",
                                                         factor=1 / 4,
                                                         patience=0,
                                                         verbose=True)
        lrp_f = lambda: lrp.step(validloss.get_epoch_error())

        train_epoch_f = partial(q.train_epoch,
                                model=m,
                                dataloader=train_batches,
                                optim=optim,
                                losses=[loss],
                                device=device,
                                _train_batch=train_batch_f)
        valid_epoch_f = partial(q.test_epoch,
                                model=m,
                                dataloader=valid_batches,
                                losses=validlosses,
                                device=device,
                                on_end=[lrp_f])

        tt.tock("prepared training base")
        tt.tick("training base model")
        q.run_training(train_epoch_f,
                       valid_epoch_f,
                       max_epochs=epochs,
                       validinter=1)
        tt.tock("trained base model")

        with open(savepath, "wb") as f:
            torch.save(m, f)

    tt.tick("testing base model")
    testresults = q.test_epoch(model=m,
                               dataloader=test_batches,
                               losses=testlosses,
                               device=device)
    print(testresults)
    tt.tock("tested base model")
    # endregion

    # region distillation
    tt.tick("preparing training student")
    dims = [embdim] + ([encdim] * numlayers)
    ms = RNNLayer_LM(*dims, worddic=D, dropout=dropout,
                     tieweights=tieweights).to(device)

    loss = q.LossWrapper(q.DistillLoss(temperature=2.))
    validloss = q.LossWrapper(q.CELoss(mode="logits"))
    validlosses = [validloss, PPLfromCE(validloss)]
    testloss = q.LossWrapper(q.CELoss(mode="logits"))
    testlosses = [testloss, PPLfromCE(testloss)]

    for l in [loss] + validlosses + testlosses:  # put losses on right device
        l.loss.to(device)

    optim = torch.optim.SGD(ms.parameters(), lr=lr)

    train_batch_f = partial(
        train_batch_distill,
        on_before_optim_step=[
            lambda: torch.nn.utils.clip_grad_norm_(ms.parameters(), gradnorm)
        ])
    lrp = torch.optim.lr_scheduler.ReduceLROnPlateau(optim,
                                                     mode="min",
                                                     factor=1 / 4,
                                                     patience=0,
                                                     verbose=True)
    lrp_f = lambda: lrp.step(validloss.get_epoch_error())

    if distill == "rnnlm":
        mbase = m
        goldgetter = None
    elif distill == "glove":
        mbase = None
        tt.tick("creating gold getter based on glove")
        goldgetter = GloveGoldGetter(glovepath, worddic=D)
        goldgetter.to(device)
        tt.tock("created gold getter")
    else:
        raise q.SumTingWongException("unknown distill mode {}".format(distill))

    train_epoch_f = partial(train_epoch_distill,
                            model=ms,
                            dataloader=train_batches,
                            optim=optim,
                            losses=[loss],
                            device=device,
                            _train_batch=train_batch_f,
                            mbase=mbase,
                            goldgetter=goldgetter)
    valid_epoch_f = partial(q.test_epoch,
                            model=ms,
                            dataloader=valid_batches,
                            losses=validlosses,
                            device=device,
                            on_end=[lrp_f])

    tt.tock("prepared training student")
    tt.tick("training student model")
    q.run_training(train_epoch_f,
                   valid_epoch_f,
                   max_epochs=epochs,
                   validinter=1)
    tt.tock("trained student model")

    tt.tick("testing student model")
    testresults = q.test_epoch(model=ms,
                               dataloader=test_batches,
                               losses=testlosses,
                               device=device)
    print(testresults)
    tt.tock("tested student model")
Example #11
def run_seq2seq_(
    lr=0.001,
    batsize=32,
    evalbatsize=256,
    epochs=100,
    warmup=5,
    embdim=50,
    encdim=100,
    numlayers=2,
    dropout=.0,
    wreg=1e-6,
    cuda=False,
    gpu=0,
):
    settings = locals().copy()
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt = q.ticktock("script")
    tt.msg("running seq2seq on LC-QuAD")

    tt.tick("loading data")
    xsm, ysm, teststart, tok2act = load_data()
    _tok2act = {ysm.RD[k]: v for k, v in tok2act.items()}

    print("Some examples:")
    for i in range(5):
        print(
            f"{xsm[i]}\n ->{ysm[i]}\n -> {Node.from_transitions(' '.join(ysm[i].split()[1:]), _tok2act)}"
        )

    print("Non-leaf tokens:")
    print({ysm.RD[k]: v for k, v in tok2act.items() if v > 0})

    devstart = teststart - 500
    trainds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[:devstart]).long(),
        torch.tensor(ysm.matrix[:devstart, :-1]).long(),
        torch.tensor(ysm.matrix[:devstart, 1:]).long())
    valds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[devstart:teststart]).long(),
        torch.tensor(ysm.matrix[devstart:teststart, :-1]).long(),
        torch.tensor(ysm.matrix[devstart:teststart, 1:]).long())
    testds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[teststart:]).long(),
        torch.tensor(ysm.matrix[teststart:, :-1]).long(),
        torch.tensor(ysm.matrix[teststart:, 1:]).long())
    tt.msg(
        f"Data splits: train: {len(trainds)}, valid: {len(valds)}, test: {len(testds)}"
    )

    tloader = torch.utils.data.DataLoader(trainds,
                                          batch_size=batsize,
                                          shuffle=True)
    vloader = torch.utils.data.DataLoader(valds,
                                          batch_size=evalbatsize,
                                          shuffle=False)
    xloader = torch.utils.data.DataLoader(testds,
                                          batch_size=evalbatsize,
                                          shuffle=False)
    tt.tock("data loaded")

    # model
    enclayers, declayers = numlayers, numlayers
    decdim = encdim
    xemb = q.WordEmb(embdim, worddic=xsm.D)
    yemb = q.WordEmb(embdim, worddic=ysm.D)
    encdims = [embdim] + [encdim // 2] * enclayers
    xenc = q.LSTMEncoder(embdim,
                         *encdims[1:],
                         bidir=True,
                         dropout_in_shared=dropout)
    decdims = [embdim] + [decdim] * declayers
    dec_core = torch.nn.Sequential(*[
        q.LSTMCell(decdims[i - 1],
                   decdims[i],
                   dropout_in=dropout,
                   dropout_rec=dropout) for i in range(1, len(decdims))
    ])
    yout = q.WordLinout(encdim + decdim, worddic=ysm.D)
    dec_cell = semparse.rnn.LuongCell(emb=yemb,
                                      core=dec_core,
                                      out=yout,
                                      dropout=dropout)
    decoder = q.TFDecoder(dec_cell)
    testdecoder = q.FreeDecoder(dec_cell, maxtime=100)

    m = Seq2Seq(xemb, xenc, decoder)
    testm = Seq2Seq(xemb, xenc, testdecoder, test=True)

    # test model
    tt.tick("running a batch")
    test_y = m(*next(iter(tloader))[:-1])
    q.batch_reset(m)
    test_y = testm(*next(iter(vloader))[:-1])
    q.batch_reset(m)
    tt.tock(f"ran a batch: {test_y.size()}")

    optim = torch.optim.Adam(m.parameters(), lr=lr, weight_decay=wreg)
    tlosses = [
        q.CELoss(mode="logits", ignore_index=0),
        q.Accuracy(ignore_index=0),
        q.SeqAccuracy(ignore_index=0)
    ]
    xlosses = [
        q.CELoss(mode="logits", ignore_index=0),
        q.Accuracy(ignore_index=0),
        q.SeqAccuracy(ignore_index=0)
    ]
    tlosses = [q.LossWrapper(l) for l in tlosses]
    vlosses = [q.LossWrapper(l) for l in xlosses]
    xlosses = [q.LossWrapper(l) for l in xlosses]
    trainloop = partial(q.train_epoch,
                        model=m,
                        dataloader=tloader,
                        optim=optim,
                        losses=tlosses,
                        device=device)
    devloop = partial(q.test_epoch,
                      model=testm,
                      dataloader=vloader,
                      losses=vlosses,
                      device=device)
    testloop = partial(q.test_epoch,
                       model=testm,
                       dataloader=xloader,
                       losses=xlosses,
                       device=device)

    lrplateau = q.util.ReduceLROnPlateau(optim,
                                         mode="max",
                                         factor=.1,
                                         patience=3,
                                         cooldown=1,
                                         warmup=warmup,
                                         threshold=0.,
                                         verbose=True,
                                         eps=1e-9)
    on_after_valid = [lambda: lrplateau.step(vlosses[1].get_epoch_error())]
    _devloop = partial(devloop, on_end=on_after_valid)
    stoptrain = [lambda: all([pg["lr"] <= 1e-7 for pg in optim.param_groups])]

    tt.tick("training")
    q.run_training(trainloop,
                   _devloop,
                   max_epochs=epochs,
                   check_stop=stoptrain)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    settings["testres"] = testres
    tt.tock("tested")

    devres = devloop()
    print(devres, vlosses[0].get_epoch_error())

    return vlosses[1].get_epoch_error()
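
Training uses q.TFDecoder (teacher forcing on the gold prefix), while evaluation uses q.FreeDecoder, which feeds its own predictions back for up to maxtime steps. A minimal greedy sketch of the free-running idea, assuming a step-wise cell that maps the previous token ids to logits (an assumption about the cell's interface, not q.FreeDecoder's actual code):

import torch

def greedy_decode(cell, start_ids, maxtime=100):
    # Free-running decoding: feed back the argmax prediction at each step.
    ys = [start_ids]
    for _ in range(maxtime):
        logits = cell(ys[-1])
        ys.append(logits.argmax(-1))
    return torch.stack(ys[1:], 1)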
Example #12
def run(lr=20.,
        dropout=0.2,
        dropconnect=0.2,
        gradnorm=0.25,
        epochs=25,
        embdim=200,
        encdim=200,
        numlayers=2,
        tieweights=False,
        seqlen=35,
        batsize=20,
        eval_batsize=80,
        cuda=False,
        gpu=0,
        test=False):
    tt = q.ticktock("script")
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)
    tt.tick("loading data")
    train_batches, valid_batches, test_batches, D = \
        load_data(batsize=batsize, eval_batsize=eval_batsize,
                  seqlen=VariableSeqlen(minimum=5, maximum_offset=10, mu=seqlen, sigma=0))
    tt.tock("data loaded")
    print("{} batches in train".format(len(train_batches)))

    tt.tick("creating model")
    dims = [embdim] + ([encdim] * numlayers)

    m = RNNLayer_LM(*dims, worddic=D, dropout=dropout,
                    tieweights=tieweights).to(device)

    if test:
        for i, batch in enumerate(train_batches):
            y = m(batch[0])
            if i > 5:
                break
        print(y.size())

    loss = q.LossWrapper(q.CELoss(mode="logits"))
    validloss = q.LossWrapper(q.CELoss(mode="logits"))
    validlosses = [validloss, PPLfromCE(validloss)]
    testloss = q.LossWrapper(q.CELoss(mode="logits"))
    testlosses = [testloss, PPLfromCE(testloss)]

    for l in [loss] + validlosses + testlosses:  # put losses on right device
        l.loss.to(device)

    optim = torch.optim.SGD(m.parameters(), lr=lr)

    train_batch_f = partial(
        q.train_batch,
        on_before_optim_step=[
            lambda: torch.nn.utils.clip_grad_norm_(m.parameters(), gradnorm)
        ])
    lrp = torch.optim.lr_scheduler.ReduceLROnPlateau(optim,
                                                     mode="min",
                                                     factor=1 / 4,
                                                     patience=0,
                                                     verbose=True)
    lrp_f = lambda: lrp.step(validloss.get_epoch_error())

    train_epoch_f = partial(q.train_epoch,
                            model=m,
                            dataloader=train_batches,
                            optim=optim,
                            losses=[loss],
                            device=device,
                            _train_batch=train_batch_f)
    valid_epoch_f = partial(q.test_epoch,
                            model=m,
                            dataloader=valid_batches,
                            losses=validlosses,
                            device=device,
                            on_end=[lrp_f])

    tt.tock("created model")
    tt.tick("training")
    q.run_training(train_epoch_f,
                   valid_epoch_f,
                   max_epochs=epochs,
                   validinter=1)
    tt.tock("trained")

    tt.tick("testing")
    testresults = q.test_epoch(model=m,
                               dataloader=test_batches,
                               losses=testlosses,
                               device=device)
    print(testresults)
    tt.tock("tested")