def create_model(embdim=100,
                 hdim=100,
                 dropout=0.,
                 numlayers: int = 1,
                 sentence_encoder: SentenceEncoder = None,
                 query_encoder: SentenceEncoder = None,
                 smoothing: float = 0.,
                 feedatt=False):
    inpemb = torch.nn.Embedding(sentence_encoder.vocab.number_of_ids(),
                                embdim,
                                padding_idx=0)
    inpemb = TokenEmb(inpemb,
                      rare_token_ids=sentence_encoder.vocab.rare_ids,
                      rare_id=1)
    encoder_dim = hdim
    encoder = q.LSTMEncoder(embdim,
                            *([encoder_dim // 2] * numlayers),
                            bidir=True,
                            dropout_in=dropout)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(embdim, hdim, num_layers=numlayers, bidirectional=True, batch_first=True,
    #                   dropout=dropout))
    decoder_emb = torch.nn.Embedding(query_encoder.vocab.number_of_ids(),
                                     embdim,
                                     padding_idx=0)
    decoder_emb = TokenEmb(decoder_emb,
                           rare_token_ids=query_encoder.vocab.rare_ids,
                           rare_id=1)
    dec_rnn_in_dim = embdim + (encoder_dim if feedatt else 0)
    decoder_rnn = [torch.nn.LSTMCell(dec_rnn_in_dim, hdim)]
    for i in range(numlayers - 1):
        decoder_rnn.append(torch.nn.LSTMCell(hdim, hdim))
    decoder_rnn = LSTMCellTransition(*decoder_rnn, dropout=dropout)
    decoder_out = BasicGenOutput(hdim + encoder_dim, sentence_encoder.vocab,
                                 query_encoder.vocab)
    attention = q.Attention(q.MatMulDotAttComp(hdim, encoder_dim))
    enctodec = torch.nn.Sequential(torch.nn.Linear(encoder_dim, hdim),
                                   torch.nn.Tanh())
    model = BasicPtrGenModel(inpemb,
                             encoder,
                             decoder_emb,
                             decoder_rnn,
                             decoder_out,
                             attention,
                             dropout=dropout,
                             enc_to_dec=enctodec,
                             feedatt=feedatt)
    dec = TFTokenSeqDecoder(model, smoothing=smoothing)
    return dec
Esempio n. 2
0
 def __init__(self,
              qrydim=None,
              ctxdim=None,
              encdim=None,
              dropout=0.,
              numlayers=1,
              **kw):
     super(SigmoidLSTMAttComp, self).__init__(**kw)
     encdims = [encdim] * numlayers
     self.layers = q.LSTMEncoder(qrydim + ctxdim,
                                 *encdims,
                                 bidir=False,
                                 dropout_in=dropout)
     self.lin = torch.nn.Linear(encdim, 1)
     self.act = torch.nn.Sigmoid()
Esempio n. 3
0
    def __init__(self,
                 embdim,
                 hdim,
                 numlayers: int = 1,
                 dropout=0.,
                 sentence_encoder: SequenceEncoder = None,
                 query_encoder: SequenceEncoder = None,
                 feedatt=False,
                 store_attn=True,
                 **kw):
        super(BasicGenModel, self).__init__(**kw)

        inpemb = torch.nn.Embedding(sentence_encoder.vocab.number_of_ids(),
                                    300,
                                    padding_idx=0)
        inpemb = TokenEmb(inpemb,
                          adapt_dims=(300, embdim),
                          rare_token_ids=sentence_encoder.vocab.rare_ids,
                          rare_id=1)
        _, covered_word_ids = load_pretrained_embeddings(
            inpemb.emb,
            sentence_encoder.vocab.D,
            p="../../data/glove/glove300uncased"
        )  # load glove embeddings where possible into the inner embedding class
        inpemb._do_rare(inpemb.rare_token_ids - covered_word_ids)
        self.inp_emb = inpemb

        encoder_dim = hdim
        encoder = q.LSTMEncoder(embdim,
                                *([encoder_dim // 2] * numlayers),
                                bidir=True,
                                dropout_in=dropout)
        self.inp_enc = encoder

        decoder_emb = torch.nn.Embedding(query_encoder.vocab.number_of_ids(),
                                         embdim,
                                         padding_idx=0)
        decoder_emb = TokenEmb(decoder_emb,
                               rare_token_ids=query_encoder.vocab.rare_ids,
                               rare_id=1)
        self.out_emb = decoder_emb

        dec_rnn_in_dim = embdim + (encoder_dim if feedatt else 0)
        decoder_rnn = [torch.nn.LSTMCell(dec_rnn_in_dim, hdim)]
        for i in range(numlayers - 1):
            decoder_rnn.append(torch.nn.LSTMCell(hdim, hdim))
        decoder_rnn = LSTMCellTransition(*decoder_rnn, dropout=dropout)
        self.out_rnn = decoder_rnn

        decoder_out = BasicGenOutput(hdim + encoder_dim,
                                     vocab=query_encoder.vocab)
        # decoder_out.build_copy_maps(inp_vocab=sentence_encoder.vocab)
        self.out_lin = decoder_out

        self.att = q.Attention(q.MatMulDotAttComp(hdim, encoder_dim))

        self.enc_to_dec = torch.nn.Sequential(
            torch.nn.Linear(encoder_dim, hdim), torch.nn.Tanh())

        self.feedatt = feedatt
        self.nocopy = True

        self.store_attn = store_attn
Esempio n. 4
0
def run_rerank(
    lr=0.001,
    batsize=20,
    epochs=1,
    embdim=301,  # not used
    encdim=200,
    numlayers=1,
    beamsize=5,
    dropout=.2,
    wreg=1e-10,
    cuda=False,
    gpu=0,
    minfreq=2,
    gradnorm=3.,
    cosine_restarts=1.,
    domain="restaurants",
    gensavedp="overnight_basic/run{}",
    genrunid=1,
):
    localargs = locals().copy()
    print(locals())
    gensavedrunp = gensavedp.format(genrunid)
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt.tick("loading data")
    ds = q.load_dataset(gensavedrunp)
    # ds = OvernightDataset(domain=domain, sentence_encoder=SequenceEncoder(tokenizer=split_tokenizer), min_freq=minfreq)
    print(
        f"max lens: {ds.maxlen_input} (input) and {ds.maxlen_output} (output)")
    tt.tock("data loaded")

    do_rare_stats(ds)
    # batch = next(iter(train_dl))
    # print(batch)
    # print("input graph")
    # print(batch.batched_states)

    genmodel, genargs = q.load_run(gensavedrunp)
    # BasicGenModel(embdim=embdim, hdim=encdim, dropout=dropout, numlayers=numlayers,
    #                          sentence_encoder=ds.sentence_encoder, query_encoder=ds.query_encoder, feedatt=True)

    # sentence_rare_tokens = set([ds.sentence_encoder.vocab(i) for i in model.inp_emb.rare_token_ids])
    # do_rare_stats(ds, sentence_rare_tokens=sentence_rare_tokens)

    inpenc = q.LSTMEncoder(embdim,
                           *([encdim // 2] * numlayers),
                           bidir=True,
                           dropout_in=dropout)
    outenc = q.LSTMEncoder(embdim,
                           *([encdim // 2] * numlayers),
                           bidir=True,
                           dropout_in=dropout)
    scoremodel = SimpleScoreModel(genmodel.inp_emb, genmodel.out_emb,
                                  LSTMEncoderWrapper(inpenc),
                                  LSTMEncoderWrapper(outenc), DotSimilarity())

    model = BeamReranker(genmodel, scoremodel, beamsize=beamsize, maxtime=50)

    # todo: run over whole dataset to populate beam cache
    testbatch = next(iter(ds.dataloader("train", batsize=2)))
    model(testbatch)

    sys.exit()

    tfdecoder = SeqDecoder(TFTransition(model), [
        CELoss(ignore_index=0, mode="logprobs"),
        SeqAccuracies(),
        TreeAccuracy(tensor2tree=partial(tensor2tree,
                                         D=ds.query_encoder.vocab),
                     orderless={"op:and", "SW:concat"})
    ])
    # beamdecoder = BeamActionSeqDecoder(tfdecoder.model, beamsize=beamsize, maxsteps=50)
    freedecoder = BeamDecoder(
        model,
        maxtime=50,
        beamsize=beamsize,
        eval_beam=[
            TreeAccuracy(tensor2tree=partial(tensor2tree,
                                             D=ds.query_encoder.vocab),
                         orderless={"op:and", "SW:concat"})
        ])

    # # test
    # tt.tick("doing one epoch")
    # for batch in iter(train_dl):
    #     batch = batch.to(device)
    #     ttt.tick("start batch")
    #     # with torch.no_grad():
    #     out = tfdecoder(batch)
    #     ttt.tock("end batch")
    # tt.tock("done one epoch")
    # print(out)
    # sys.exit()

    # beamdecoder(next(iter(train_dl)))

    # print(dict(tfdecoder.named_parameters()).keys())

    losses = make_array_of_metrics("loss", "seq_acc", "tree_acc")
    vlosses = make_array_of_metrics("tree_acc", "tree_acc_at3",
                                    "tree_acc_at_last")

    trainable_params = tfdecoder.named_parameters()
    exclude_params = {"model.model.inp_emb.emb.weight"
                      }  # don't train input embeddings if doing glove
    trainable_params = [
        v for k, v in trainable_params if k not in exclude_params
    ]

    # 4. define optim
    optim = torch.optim.Adam(trainable_params, lr=lr, weight_decay=wreg)
    # optim = torch.optim.SGD(tfdecoder.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max}")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(
            optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []

    # 6. define training function
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(
        tfdecoder.parameters(), gradnorm)
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainepoch = partial(q.train_epoch,
                         model=tfdecoder,
                         dataloader=ds.dataloader("train", batsize),
                         optim=optim,
                         losses=losses,
                         _train_batch=trainbatch,
                         device=device,
                         on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch,
                         model=freedecoder,
                         dataloader=ds.dataloader("valid", batsize),
                         losses=vlosses,
                         device=device)
    # validepoch = partial(q.test_epoch, model=freedecoder, dataloader=valid_dl, losses=vlosses, device=device)

    # p = q.save_run(freedecoder, localargs, filepath=__file__)
    # q.save_dataset(ds, p)
    # _freedecoder, _localargs = q.load_run(p)
    # _ds = q.load_dataset(p)
    # sys.exit()

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch,
                   run_valid_epoch=validepoch,
                   max_epochs=epochs)
    tt.tock("done training")

    # testing
    tt.tick("testing")
    testresults = q.test_epoch(model=freedecoder,
                               dataloader=ds.dataloader("test", batsize),
                               losses=vlosses,
                               device=device)
    print(testresults)
    tt.tock("tested")

    # save model?
    tosave = input(
        "Save this model? 'y(es)'=Yes, <int>=overwrite previous, otherwise=No) \n>"
    )
    if tosave.lower() == "y" or tosave.lower() == "yes" or re.match(
            "\d+", tosave.lower()):
        overwrite = int(tosave) if re.match("\d+", tosave) else None
        p = q.save_run(model,
                       localargs,
                       filepath=__file__,
                       overwrite=overwrite)
        q.save_dataset(ds, p)
        _model, _localargs = q.load_run(p)
        _ds = q.load_dataset(p)

        _freedecoder = BeamDecoder(
            _model,
            maxtime=50,
            beamsize=beamsize,
            eval_beam=[
                TreeAccuracy(tensor2tree=partial(tensor2tree,
                                                 D=ds.query_encoder.vocab),
                             orderless={"op:and", "SW:concat"})
            ])

        # testing
        tt.tick("testing reloaded")
        _testresults = q.test_epoch(model=_freedecoder,
                                    dataloader=_ds.dataloader("test", batsize),
                                    losses=vlosses,
                                    device=device)
        print(_testresults)
        assert (testresults == _testresults)
        tt.tock("tested")
Esempio n. 5
0
def run_gatedtree(
    lr=0.01,
    gradclip=5.,
    batsize=20,
    epochs=80,
    embdim=200,
    encdim=200,
    numlayer=1,
    cuda=False,
    gpu=0,
    wreg=1e-8,
    dropout=0.5,
    smoothing=0.4,
    goldsmoothing=-0.1,
    which="geo",
    relatt=False,
):
    tt = q.ticktock("script")
    tt.msg("running gated tree decoder")
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)

    # region data
    tt.tick("generating data")
    # dss, D = gen_sort_data(seqlen=seqlen, numvoc=numvoc, numex=numex, prepend_inp=False)
    dss, nlD, flD = gen_datasets(which=which)
    tloader, vloader, xloader = [
        torch.utils.data.DataLoader(ds, batch_size=batsize, shuffle=True)
        for ds in dss
    ]
    seqlen = len(dss[0][0][1])
    id2pushpop = torch.zeros(len(flD), dtype=torch.long, device=device)
    id2pushpop[flD["("]] = +1
    id2pushpop[flD[")"]] = -1

    tt.tock("data generated")
    # endregion

    # region model
    tt.tick("building model")
    # source side
    inpemb = q.WordEmb(embdim, worddic=nlD)
    encdims = [encdim] * numlayer
    encoder = q.LSTMEncoder(embdim,
                            *encdims,
                            bidir=False,
                            dropout_in_shared=dropout)

    # target side
    decemb = q.WordEmb(embdim, worddic=flD)
    decinpdim = embdim
    decdims = [decinpdim] + [encdim] * numlayer
    dec_core = \
        [GatedTreeLSTMCell(decdims[i-1], decdims[i], dropout_in=dropout) for i in range(1, len(decdims))]        ###
    dec_core = TreeRNNDecoderCellCore(*dec_core)
    if relatt:
        att = ComboAbsRelAttention(ctxdim=encdim, vecdim=encdim)
    else:
        att = BasicAttention()
    out = torch.nn.Sequential(q.WordLinout(encdim, worddic=flD),
                              # torch.nn.Softmax(-1)
                              )
    merge = q.rnn.FwdDecCellMerge(decdims[-1], encdims[-1], outdim=encdim)
    deccell = TreeRNNDecoderCell(emb=decemb,
                                 core=dec_core,
                                 att=att,
                                 out=out,
                                 merge=merge,
                                 id2pushpop=id2pushpop)
    train_dec = q.TFDecoder(deccell)
    test_dec = q.FreeDecoder(deccell, maxtime=seqlen + 10)
    train_encdec = EncDec(inpemb, encoder, train_dec)
    test_encdec = Test_EncDec(inpemb, encoder, test_dec)

    train_encdec.to(device)
    test_encdec.to(device)
    tt.tock("built model")
    # endregion

    # region training
    # losses:
    if smoothing == 0:
        ce = q.loss.CELoss(mode="logits", ignore_index=0)
    elif goldsmoothing < 0.:
        ce = q.loss.SmoothedCELoss(mode="logits",
                                   ignore_index=0,
                                   smoothing=smoothing)
    else:
        ce = q.loss.DiffSmoothedCELoss(mode="logits",
                                       ignore_index=0,
                                       alpha=goldsmoothing,
                                       beta=smoothing)
    acc = q.loss.SeqAccuracy(ignore_index=0)
    elemacc = q.loss.SeqElemAccuracy(ignore_index=0)
    treeacc = TreeAccuracyLambdaDFPar(flD=flD)
    # optim
    optim = torch.optim.RMSprop(train_encdec.parameters(),
                                lr=lr,
                                alpha=0.95,
                                weight_decay=wreg)
    clipgradnorm = lambda: torch.nn.utils.clip_grad_value_(
        train_encdec.parameters(), clip_value=gradclip)
    # lööps
    batchloop = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainloop = partial(
        q.train_epoch,
        model=train_encdec,
        dataloader=tloader,
        optim=optim,
        device=device,
        losses=[q.LossWrapper(ce),
                q.LossWrapper(elemacc),
                q.LossWrapper(acc)],
        print_every_batch=False,
        _train_batch=batchloop)
    validloop = partial(q.test_epoch,
                        model=test_encdec,
                        dataloader=vloader,
                        device=device,
                        losses=[q.LossWrapper(treeacc)],
                        print_every_batch=False)

    tt.tick("training")
    q.run_training(trainloop, validloop, max_epochs=epochs)
    tt.tock("trained")

    tt.tick("testing")
    test_results = validloop(model=test_encdec, dataloader=xloader)
    print("Test results (freerunning): {}".format(test_results))
    test_results = validloop(model=train_encdec, dataloader=xloader)
    print("Test results (TF): {}".format(test_results))
    tt.tock("tested")
    # endregion
    tt.msg("done")
Esempio n. 6
0
def run_normal(lr=0.001,
               gradclip=5.,
               batsize=20,
               epochs=150,
               embdim=100,
               encdim=200,
               numlayer=1,
               cuda=False,
               gpu=0,
               wreg=1e-8,
               dropout=0.5,
               smoothing=0.,
               goldsmoothing=-0.1,
               selfptr=False,
               which="geo"):
    tt = q.ticktock("script")
    tt.msg("running normal att")
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)

    # region data
    tt.tick("generating data")
    # dss, D = gen_sort_data(seqlen=seqlen, numvoc=numvoc, numex=numex, prepend_inp=False)
    dss, nlD, flD, rare_nl, rare_fl = gen_datasets(which=which)
    tloader, vloader, xloader = [torch.utils.data.DataLoader(ds, batch_size=batsize, shuffle=True) for ds in dss]
    seqlen = len(dss[0][0][1])
    # merge nlD into flD and make mapper
    nextflDid = max(flD.values()) + 1
    sourcemap = torch.zeros(len(nlD), dtype=torch.long, device=device)
    for k, v in nlD.items():
        if k not in flD:
            flD[k] = nextflDid
            nextflDid += 1
        sourcemap[v] = flD[k]
    tt.tock("data generated")
    # endregion

    # region model
    tt.tick("building model")
    # source side
    inpemb = q.UnkReplWordEmb(embdim, worddic=nlD, unk_tokens=rare_nl)
    encdims = [encdim] * numlayer
    encoder = q.LSTMEncoder(embdim, *encdims, bidir=True, dropout_in_shared=dropout)

    # target side
    decemb = q.UnkReplWordEmb(embdim, worddic=flD, unk_tokens=rare_fl)
    decinpdim = embdim
    decdims = [decinpdim] + [encdim] * numlayer
    dec_core = torch.nn.Sequential(
        *[q.rnn.LSTMCell(decdims[i-1], decdims[i], dropout_in=dropout) for i in range(1, len(decdims))]
    )
    att = attention.FwdAttention(decdims[-1], encdim * 2, decdims[-1])
    out = torch.nn.Sequential(
        q.UnkReplWordLinout(decdims[-1]+encdim*2, worddic=flD, unk_tokens=rare_fl),
        # torch.nn.Softmax(-1)
    )
    if selfptr:
        outgate = PointerGeneratorOutGate(decdims[-1] + encdim * 2, encdim, 3)
        out = SelfPointerGeneratorOut(out, sourcemap=sourcemap, gate=outgate)
        selfatt = attention.FwdAttention(decdims[-1], decdims[-1], decdims[-1])
        deccell = SelfPointerGeneratorCell(emb=decemb, core=dec_core, att=att, selfatt=selfatt, out=out)
    else:
        outgate = PointerGeneratorOutGate(decdims[-1] + encdim * 2, encdim, 0)
        out = PointerGeneratorOut(out, sourcemap=sourcemap, gate=outgate)
        deccell = PointerGeneratorCell(emb=decemb, core=dec_core, att=att, out=out)
    train_dec = q.TFDecoder(deccell)
    test_dec = q.FreeDecoder(deccell, maxtime=seqlen+10)
    train_encdec = EncDec(inpemb, encoder, train_dec)
    test_encdec = Test_EncDec(inpemb, encoder, test_dec)

    train_encdec.to(device)
    test_encdec.to(device)
    tt.tock("built model")
    # endregion

    # region training
    # losses:
    if smoothing == 0:
        ce = q.loss.CELoss(mode="probs", ignore_index=0)
    elif goldsmoothing < 0.:
        ce = q.loss.SmoothedCELoss(mode="probs", ignore_index=0, smoothing=smoothing)
    else:
        ce = q.loss.DiffSmoothedCELoss(mode="probs", ignore_index=0, alpha=goldsmoothing, beta=smoothing)
    acc = q.loss.SeqAccuracy(ignore_index=0)
    elemacc = q.loss.SeqElemAccuracy(ignore_index=0)
    trainmodel = TrainModel(train_encdec, [ce, elemacc, acc])
    treeacc = TreeAccuracyPrologPar(flD=flD)
    # optim
    optim = torch.optim.Adam(train_encdec.parameters(), lr=lr, weight_decay=wreg)
    clipgradnorm = lambda: torch.nn.utils.clip_grad_value_(train_encdec.parameters(), clip_value=gradclip)
    # lööps
    batchloop = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainloop = partial(q.train_epoch, model=train_encdec, dataloader=tloader, optim=optim, device=device,
                        losses=[q.LossWrapper(ce), q.LossWrapper(elemacc), q.LossWrapper(acc)],
                        print_every_batch=False, _train_batch=batchloop)
    validloop = partial(q.test_epoch, model=test_encdec, dataloader=vloader, device=device,
                        losses=[q.LossWrapper(treeacc)],
                        print_every_batch=False)

    tt.tick("training")
    q.run_training(trainloop, validloop, max_epochs=epochs)
    tt.tock("trained")

    tt.tick("testing")
    test_results = validloop(model=test_encdec, dataloader=xloader)
    print("Test results (freerunning): {}".format(test_results))
    test_results = validloop(model=train_encdec, dataloader=xloader)
    print("Test results (TF): {}".format(test_results))
    tt.tock("tested")
    # endregion
    tt.msg("done")
Esempio n. 7
0
def run_seq2seq_(
    lr=0.001,
    batsize=32,
    evalbatsize=256,
    epochs=100,
    warmup=5,
    embdim=50,
    encdim=100,
    numlayers=2,
    dropout=.0,
    wreg=1e-6,
    cuda=False,
    gpu=0,
):
    settings = locals().copy()
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt = q.ticktock("script")
    tt.msg("running seq2seq on LC-QuAD")

    tt.tick("loading data")
    xsm, ysm, teststart, tok2act = load_data()
    _tok2act = {ysm.RD[k]: v for k, v in tok2act.items()}

    print("Some examples:")
    for i in range(5):
        print(
            f"{xsm[i]}\n ->{ysm[i]}\n -> {Node.from_transitions(' '.join(ysm[i].split()[1:]), _tok2act)}"
        )

    print("Non-leaf tokens:")
    print({ysm.RD[k]: v for k, v in tok2act.items() if v > 0})

    devstart = teststart - 500
    trainds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[:devstart]).long(),
        torch.tensor(ysm.matrix[:devstart, :-1]).long(),
        torch.tensor(ysm.matrix[:devstart, 1:]).long())
    valds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[devstart:teststart]).long(),
        torch.tensor(ysm.matrix[devstart:teststart, :-1]).long(),
        torch.tensor(ysm.matrix[devstart:teststart, 1:]).long())
    testds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[teststart:]).long(),
        torch.tensor(ysm.matrix[teststart:, :-1]).long(),
        torch.tensor(ysm.matrix[teststart:, 1:]).long())
    tt.msg(
        f"Data splits: train: {len(trainds)}, valid: {len(valds)}, test: {len(testds)}"
    )

    tloader = torch.utils.data.DataLoader(trainds,
                                          batch_size=batsize,
                                          shuffle=True)
    vloader = torch.utils.data.DataLoader(valds,
                                          batch_size=evalbatsize,
                                          shuffle=False)
    xloader = torch.utils.data.DataLoader(testds,
                                          batch_size=evalbatsize,
                                          shuffle=False)
    tt.tock("data loaded")

    # model
    enclayers, declayers = numlayers, numlayers
    decdim = encdim
    xemb = q.WordEmb(embdim, worddic=xsm.D)
    yemb = q.WordEmb(embdim, worddic=ysm.D)
    encdims = [embdim] + [encdim // 2] * enclayers
    xenc = q.LSTMEncoder(embdim,
                         *encdims[1:],
                         bidir=True,
                         dropout_in_shared=dropout)
    decdims = [embdim] + [decdim] * declayers
    dec_core = torch.nn.Sequential(*[
        q.LSTMCell(decdims[i - 1],
                   decdims[i],
                   dropout_in=dropout,
                   dropout_rec=dropout) for i in range(1, len(decdims))
    ])
    yout = q.WordLinout(encdim + decdim, worddic=ysm.D)
    dec_cell = semparse.rnn.LuongCell(emb=yemb,
                                      core=dec_core,
                                      out=yout,
                                      dropout=dropout)
    decoder = q.TFDecoder(dec_cell)
    testdecoder = q.FreeDecoder(dec_cell, maxtime=100)

    m = Seq2Seq(xemb, xenc, decoder)
    testm = Seq2Seq(xemb, xenc, testdecoder, test=True)

    # test model
    tt.tick("running a batch")
    test_y = m(*iter(tloader).next()[:-1])
    q.batch_reset(m)
    test_y = testm(*iter(vloader).next()[:-1])
    q.batch_reset(m)
    tt.tock(f"ran a batch: {test_y.size()}")

    optim = torch.optim.Adam(m.parameters(), lr=lr, weight_decay=wreg)
    tlosses = [
        q.CELoss(mode="logits", ignore_index=0),
        q.Accuracy(ignore_index=0),
        q.SeqAccuracy(ignore_index=0)
    ]
    xlosses = [
        q.CELoss(mode="logits", ignore_index=0),
        q.Accuracy(ignore_index=0),
        q.SeqAccuracy(ignore_index=0)
    ]
    tlosses = [q.LossWrapper(l) for l in tlosses]
    vlosses = [q.LossWrapper(l) for l in xlosses]
    xlosses = [q.LossWrapper(l) for l in xlosses]
    trainloop = partial(q.train_epoch,
                        model=m,
                        dataloader=tloader,
                        optim=optim,
                        losses=tlosses,
                        device=device)
    devloop = partial(q.test_epoch,
                      model=testm,
                      dataloader=vloader,
                      losses=vlosses,
                      device=device)
    testloop = partial(q.test_epoch,
                       model=testm,
                       dataloader=xloader,
                       losses=xlosses,
                       device=device)

    lrplateau = q.util.ReduceLROnPlateau(optim,
                                         mode="max",
                                         factor=.1,
                                         patience=3,
                                         cooldown=1,
                                         warmup=warmup,
                                         threshold=0.,
                                         verbose=True,
                                         eps=1e-9)
    on_after_valid = [lambda: lrplateau.step(vlosses[1].get_epoch_error())]
    _devloop = partial(devloop, on_end=on_after_valid)
    stoptrain = [lambda: all([pg["lr"] <= 1e-7 for pg in optim.param_groups])]

    tt.tick("training")
    q.run_training(trainloop,
                   _devloop,
                   max_epochs=epochs,
                   check_stop=stoptrain)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    settings["testres"] = testres
    tt.tock("tested")

    devres = devloop()
    print(devres, vlosses[0].get_epoch_error())

    return vlosses[1].get_epoch_error()