def create_model(embdim=100, hdim=100, dropout=0., numlayers: int = 1, sentence_encoder: SentenceEncoder = None, query_encoder: SentenceEncoder = None, smoothing: float = 0., feedatt=False):
    """Assemble a pointer-generator seq2seq model and wrap it in a
    teacher-forcing token-sequence decoder.

    :param embdim:      embedding size for input and output tokens
    :param hdim:        hidden size of encoder and decoder RNNs
    :param dropout:     dropout rate used throughout
    :param numlayers:   number of RNN layers on both encoder and decoder side
    :param sentence_encoder: provides the input-side vocabulary
    :param query_encoder:    provides the output-side vocabulary
    :param smoothing:   label smoothing passed to the decoder
    :param feedatt:     if True, the attention summary is fed back into the decoder RNN input
    :return: a ``TFTokenSeqDecoder`` wrapping the assembled ``BasicPtrGenModel``
    """
    # input-side embeddings; rare tokens collapse onto a shared rare id (1)
    src_emb = TokenEmb(
        torch.nn.Embedding(sentence_encoder.vocab.number_of_ids(), embdim, padding_idx=0),
        rare_token_ids=sentence_encoder.vocab.rare_ids, rare_id=1)
    ctx_dim = hdim
    # bidirectional encoder: each direction contributes half of ctx_dim
    src_enc = q.LSTMEncoder(embdim, *([ctx_dim // 2] * numlayers),
                            bidir=True, dropout_in=dropout)
    # output-side embeddings, same rare-token treatment
    tgt_emb = TokenEmb(
        torch.nn.Embedding(query_encoder.vocab.number_of_ids(), embdim, padding_idx=0),
        rare_token_ids=query_encoder.vocab.rare_ids, rare_id=1)
    # the bottom decoder cell optionally also consumes the fed-back attention summary
    bottom_in = embdim + (ctx_dim if feedatt else 0)
    cells = [torch.nn.LSTMCell(bottom_in if layer == 0 else hdim, hdim)
             for layer in range(numlayers)]
    dec_rnn = LSTMCellTransition(*cells, dropout=dropout)
    gen_out = BasicGenOutput(hdim + ctx_dim, sentence_encoder.vocab, query_encoder.vocab)
    att = q.Attention(q.MatMulDotAttComp(hdim, ctx_dim))
    # bridge mapping the final encoder state into the decoder's hidden space
    bridge = torch.nn.Sequential(torch.nn.Linear(ctx_dim, hdim), torch.nn.Tanh())
    core = BasicPtrGenModel(src_emb, src_enc, tgt_emb, dec_rnn, gen_out, att,
                            dropout=dropout, enc_to_dec=bridge, feedatt=feedatt)
    return TFTokenSeqDecoder(core, smoothing=smoothing)
def __init__(self, qrydim=None, ctxdim=None, encdim=None, dropout=0., numlayers=1, **kw):
    """Attention-score component: runs the concatenated [query; context]
    through a unidirectional LSTM stack, then projects to a single scalar
    squashed by a sigmoid.

    :param qrydim:    dimension of the query vectors
    :param ctxdim:    dimension of the context vectors
    :param encdim:    hidden size of every LSTM layer
    :param dropout:   input dropout for the LSTM stack
    :param numlayers: number of stacked LSTM layers
    """
    super(SigmoidLSTMAttComp, self).__init__(**kw)
    # one LSTM layer per requested layer, all sharing the same hidden size
    self.layers = q.LSTMEncoder(qrydim + ctxdim, *([encdim] * numlayers),
                                bidir=False, dropout_in=dropout)
    self.lin = torch.nn.Linear(encdim, 1)  # project LSTM output to one score
    self.act = torch.nn.Sigmoid()          # squash the score into (0, 1)
def __init__(self, embdim, hdim, numlayers: int = 1, dropout=0., sentence_encoder: SequenceEncoder = None, query_encoder: SequenceEncoder = None, feedatt=False, store_attn=True, **kw):
    """Build an encoder-decoder generation model with GloVe-initialized input embeddings.

    :param embdim: working embedding size (input embeddings are adapted 300 -> embdim)
    :param hdim: hidden size of encoder and decoder RNNs
    :param numlayers: number of RNN layers on both sides
    :param dropout: dropout rate used throughout
    :param sentence_encoder: provides the input-side vocabulary
    :param query_encoder: provides the output-side vocabulary
    :param feedatt: if True, attention summary is fed back into the decoder RNN input
    :param store_attn: whether to keep attention weights (used elsewhere in the class)
    """
    super(BasicGenModel, self).__init__(**kw)
    # input embeddings are created at GloVe's native 300 dims, then adapted to embdim
    inpemb = torch.nn.Embedding(sentence_encoder.vocab.number_of_ids(), 300, padding_idx=0)
    inpemb = TokenEmb(inpemb, adapt_dims=(300, embdim), rare_token_ids=sentence_encoder.vocab.rare_ids, rare_id=1)
    _, covered_word_ids = load_pretrained_embeddings(
        inpemb.emb, sentence_encoder.vocab.D,
        p="../../data/glove/glove300uncased"
    )  # load glove embeddings where possible into the inner embedding class
    # only tokens that are rare AND not covered by GloVe get the rare-id treatment
    inpemb._do_rare(inpemb.rare_token_ids - covered_word_ids)
    self.inp_emb = inpemb
    encoder_dim = hdim
    # bidirectional encoder: each direction gets half of encoder_dim
    encoder = q.LSTMEncoder(embdim, *([encoder_dim // 2] * numlayers), bidir=True, dropout_in=dropout)
    self.inp_enc = encoder
    decoder_emb = torch.nn.Embedding(query_encoder.vocab.number_of_ids(), embdim, padding_idx=0)
    decoder_emb = TokenEmb(decoder_emb, rare_token_ids=query_encoder.vocab.rare_ids, rare_id=1)
    self.out_emb = decoder_emb
    # the first decoder cell optionally also consumes the fed-back attention summary
    dec_rnn_in_dim = embdim + (encoder_dim if feedatt else 0)
    decoder_rnn = [torch.nn.LSTMCell(dec_rnn_in_dim, hdim)]
    for i in range(numlayers - 1):
        decoder_rnn.append(torch.nn.LSTMCell(hdim, hdim))
    decoder_rnn = LSTMCellTransition(*decoder_rnn, dropout=dropout)
    self.out_rnn = decoder_rnn
    # output layer sees decoder state concatenated with the attention context
    decoder_out = BasicGenOutput(hdim + encoder_dim, vocab=query_encoder.vocab)
    # decoder_out.build_copy_maps(inp_vocab=sentence_encoder.vocab)
    self.out_lin = decoder_out
    self.att = q.Attention(q.MatMulDotAttComp(hdim, encoder_dim))
    # bridge mapping final encoder state into the decoder's hidden space
    self.enc_to_dec = torch.nn.Sequential(
        torch.nn.Linear(encoder_dim, hdim),
        torch.nn.Tanh())
    self.feedatt = feedatt
    self.nocopy = True  # copy mechanism disabled (see commented build_copy_maps above)
    self.store_attn = store_attn
def run_rerank(
        lr=0.001,
        batsize=20,
        epochs=1,
        embdim=301,  # sizes the reranker's input/output encoders below
        encdim=200,
        numlayers=1,
        beamsize=5,
        dropout=.2,
        wreg=1e-10,
        cuda=False,
        gpu=0,
        minfreq=2,
        gradnorm=3.,
        cosine_restarts=1.,
        domain="restaurants",
        gensavedp="overnight_basic/run{}",
        genrunid=1,
):
    """Rerank the beam of a previously-saved generation model with a score model.

    Loads the dataset and generator from the run saved at ``gensavedp.format(genrunid)``,
    builds a ``BeamReranker`` around it, and runs one test batch.

    NOTE(review): the function calls ``sys.exit()`` right after the test batch,
    so everything below that point (training/evaluation/saving) is currently
    unreachable dead code, kept as-is.
    """
    localargs = locals().copy()
    print(locals())
    gensavedrunp = gensavedp.format(genrunid)
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt.tick("loading data")
    ds = q.load_dataset(gensavedrunp)
    # ds = OvernightDataset(domain=domain, sentence_encoder=SequenceEncoder(tokenizer=split_tokenizer), min_freq=minfreq)
    print(f"max lens: {ds.maxlen_input} (input) and {ds.maxlen_output} (output)")
    tt.tock("data loaded")
    do_rare_stats(ds)

    genmodel, genargs = q.load_run(gensavedrunp)
    # BasicGenModel(embdim=embdim, hdim=encdim, dropout=dropout, numlayers=numlayers,
    #               sentence_encoder=ds.sentence_encoder, query_encoder=ds.query_encoder, feedatt=True)

    # score model: reuse the generator's embeddings, encode both sides, dot-product similarity
    inpenc = q.LSTMEncoder(embdim, *([encdim // 2] * numlayers), bidir=True, dropout_in=dropout)
    outenc = q.LSTMEncoder(embdim, *([encdim // 2] * numlayers), bidir=True, dropout_in=dropout)
    scoremodel = SimpleScoreModel(genmodel.inp_emb, genmodel.out_emb,
                                  LSTMEncoderWrapper(inpenc), LSTMEncoderWrapper(outenc),
                                  DotSimilarity())
    model = BeamReranker(genmodel, scoremodel, beamsize=beamsize, maxtime=50)

    # todo: run over whole dataset to populate beam cache
    testbatch = next(iter(ds.dataloader("train", batsize=2)))
    model(testbatch)
    sys.exit()

    # ------- unreachable below this point (sys.exit() above) -------
    tfdecoder = SeqDecoder(TFTransition(model), [
        CELoss(ignore_index=0, mode="logprobs"),
        SeqAccuracies(),
        TreeAccuracy(tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab),
                     orderless={"op:and", "SW:concat"})
    ])
    # beamdecoder = BeamActionSeqDecoder(tfdecoder.model, beamsize=beamsize, maxsteps=50)
    freedecoder = BeamDecoder(
        model,
        maxtime=50,
        beamsize=beamsize,
        eval_beam=[
            TreeAccuracy(tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab),
                         orderless={"op:and", "SW:concat"})
        ])

    losses = make_array_of_metrics("loss", "seq_acc", "tree_acc")
    vlosses = make_array_of_metrics("tree_acc", "tree_acc_at3", "tree_acc_at_last")

    trainable_params = tfdecoder.named_parameters()
    exclude_params = {"model.model.inp_emb.emb.weight"}  # don't train input embeddings if doing glove
    trainable_params = [v for k, v in trainable_params if k not in exclude_params]

    # 4. define optim
    optim = torch.optim.Adam(trainable_params, lr=lr, weight_decay=wreg)
    # optim = torch.optim.SGD(tfdecoder.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max}")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []

    # 6. define training function
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(tfdecoder.parameters(), gradnorm)
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainepoch = partial(q.train_epoch,
                         model=tfdecoder,
                         dataloader=ds.dataloader("train", batsize),
                         optim=optim,
                         losses=losses,
                         _train_batch=trainbatch,
                         device=device,
                         on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch,
                         model=freedecoder,
                         dataloader=ds.dataloader("valid", batsize),
                         losses=vlosses,
                         device=device)

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch, run_valid_epoch=validepoch, max_epochs=epochs)
    tt.tock("done training")

    # testing
    tt.tick("testing")
    testresults = q.test_epoch(model=freedecoder,
                               dataloader=ds.dataloader("test", batsize),
                               losses=vlosses,
                               device=device)
    print(testresults)
    tt.tock("tested")

    # save model?
    tosave = input("Save this model? 'y(es)'=Yes, <int>=overwrite previous, otherwise=No) \n>")
    # NOTE: raw strings for regexes — "\d" in a plain string is an invalid escape sequence
    if tosave.lower() == "y" or tosave.lower() == "yes" or re.match(r"\d+", tosave.lower()):
        overwrite = int(tosave) if re.match(r"\d+", tosave) else None
        p = q.save_run(model, localargs, filepath=__file__, overwrite=overwrite)
        q.save_dataset(ds, p)
        _model, _localargs = q.load_run(p)
        _ds = q.load_dataset(p)
        _freedecoder = BeamDecoder(
            _model,
            maxtime=50,
            beamsize=beamsize,
            eval_beam=[
                TreeAccuracy(tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab),
                             orderless={"op:and", "SW:concat"})
            ])
        # testing the reloaded model must reproduce the earlier test results
        tt.tick("testing reloaded")
        _testresults = q.test_epoch(model=_freedecoder,
                                    dataloader=_ds.dataloader("test", batsize),
                                    losses=vlosses,
                                    device=device)
        print(_testresults)
        assert (testresults == _testresults)
        tt.tock("tested")
def run_gatedtree(
        lr=0.01,
        gradclip=5.,
        batsize=20,
        epochs=80,
        embdim=200,
        encdim=200,
        numlayer=1,
        cuda=False,
        gpu=0,
        wreg=1e-8,
        dropout=0.5,
        smoothing=0.4,
        goldsmoothing=-0.1,
        which="geo",
        relatt=False,
):
    """Train and evaluate a gated tree-LSTM decoder on the dataset selected by ``which``.

    Builds an encoder-decoder with a tree decoder cell whose push/pop behavior is
    driven by "(" and ")" tokens, trains with teacher forcing, and evaluates
    free-running tree accuracy.
    """
    tt = q.ticktock("script")
    tt.msg("running gated tree decoder")
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)
    # region data
    tt.tick("generating data")
    # dss, D = gen_sort_data(seqlen=seqlen, numvoc=numvoc, numex=numex, prepend_inp=False)
    dss, nlD, flD = gen_datasets(which=which)
    tloader, vloader, xloader = [
        torch.utils.data.DataLoader(ds, batch_size=batsize, shuffle=True)
        for ds in dss
    ]
    seqlen = len(dss[0][0][1])
    # map the "(" and ")" token ids to +1/-1 push/pop signals for the tree decoder
    id2pushpop = torch.zeros(len(flD), dtype=torch.long, device=device)
    id2pushpop[flD["("]] = +1
    id2pushpop[flD[")"]] = -1
    tt.tock("data generated")
    # endregion
    # region model
    tt.tick("building model")
    # source side
    inpemb = q.WordEmb(embdim, worddic=nlD)
    encdims = [encdim] * numlayer
    encoder = q.LSTMEncoder(embdim, *encdims, bidir=False, dropout_in_shared=dropout)
    # target side
    decemb = q.WordEmb(embdim, worddic=flD)
    decinpdim = embdim
    decdims = [decinpdim] + [encdim] * numlayer
    dec_core = \
        [GatedTreeLSTMCell(decdims[i-1], decdims[i], dropout_in=dropout)
         for i in range(1, len(decdims))]
    dec_core = TreeRNNDecoderCellCore(*dec_core)
    # choose attention flavor: combined absolute/relative vs plain
    if relatt:
        att = ComboAbsRelAttention(ctxdim=encdim, vecdim=encdim)
    else:
        att = BasicAttention()
    out = torch.nn.Sequential(q.WordLinout(encdim, worddic=flD),
                              # torch.nn.Softmax(-1)
                              )
    merge = q.rnn.FwdDecCellMerge(decdims[-1], encdims[-1], outdim=encdim)
    deccell = TreeRNNDecoderCell(emb=decemb, core=dec_core, att=att, out=out,
                                 merge=merge, id2pushpop=id2pushpop)
    train_dec = q.TFDecoder(deccell)  # teacher forcing for training
    test_dec = q.FreeDecoder(deccell, maxtime=seqlen + 10)  # free-running for evaluation
    train_encdec = EncDec(inpemb, encoder, train_dec)
    test_encdec = Test_EncDec(inpemb, encoder, test_dec)
    train_encdec.to(device)
    test_encdec.to(device)
    tt.tock("built model")
    # endregion
    # region training
    # losses: pick CE variant based on smoothing configuration
    if smoothing == 0:
        ce = q.loss.CELoss(mode="logits", ignore_index=0)
    elif goldsmoothing < 0.:
        ce = q.loss.SmoothedCELoss(mode="logits", ignore_index=0, smoothing=smoothing)
    else:
        ce = q.loss.DiffSmoothedCELoss(mode="logits", ignore_index=0,
                                       alpha=goldsmoothing, beta=smoothing)
    acc = q.loss.SeqAccuracy(ignore_index=0)
    elemacc = q.loss.SeqElemAccuracy(ignore_index=0)
    treeacc = TreeAccuracyLambdaDFPar(flD=flD)
    # optim
    optim = torch.optim.RMSprop(train_encdec.parameters(), lr=lr, alpha=0.95,
                                weight_decay=wreg)
    clipgradnorm = lambda: torch.nn.utils.clip_grad_value_(
        train_encdec.parameters(), clip_value=gradclip)
    # loops
    batchloop = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainloop = partial(
        q.train_epoch,
        model=train_encdec,
        dataloader=tloader,
        optim=optim,
        device=device,
        losses=[q.LossWrapper(ce), q.LossWrapper(elemacc), q.LossWrapper(acc)],
        print_every_batch=False,
        _train_batch=batchloop)
    validloop = partial(q.test_epoch,
                        model=test_encdec,
                        dataloader=vloader,
                        device=device,
                        losses=[q.LossWrapper(treeacc)],
                        print_every_batch=False)
    tt.tick("training")
    q.run_training(trainloop, validloop, max_epochs=epochs)
    tt.tock("trained")
    tt.tick("testing")
    # evaluate both free-running and teacher-forced models on the test set
    test_results = validloop(model=test_encdec, dataloader=xloader)
    print("Test results (freerunning): {}".format(test_results))
    test_results = validloop(model=train_encdec, dataloader=xloader)
    print("Test results (TF): {}".format(test_results))
    tt.tock("tested")
    # endregion
    tt.msg("done")
def run_normal(lr=0.001, gradclip=5., batsize=20, epochs=150, embdim=100, encdim=200, numlayer=1, cuda=False, gpu=0, wreg=1e-8, dropout=0.5, smoothing=0., goldsmoothing=-0.1, selfptr=False, which="geo"):
    """Train and evaluate a pointer-generator seq2seq model with attention.

    When ``selfptr`` is True, a self-pointer variant (attending over the
    decoder's own history) is used; otherwise the plain pointer-generator cell.
    """
    tt = q.ticktock("script")
    tt.msg("running normal att")
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)
    # region data
    tt.tick("generating data")
    # dss, D = gen_sort_data(seqlen=seqlen, numvoc=numvoc, numex=numex, prepend_inp=False)
    dss, nlD, flD, rare_nl, rare_fl = gen_datasets(which=which)
    tloader, vloader, xloader = [torch.utils.data.DataLoader(ds, batch_size=batsize, shuffle=True)
                                 for ds in dss]
    seqlen = len(dss[0][0][1])
    # merge nlD into flD and make mapper: every source (nl) id gets a
    # corresponding target (fl) id so pointed tokens map into the output vocab
    nextflDid = max(flD.values()) + 1
    sourcemap = torch.zeros(len(nlD), dtype=torch.long, device=device)
    for k, v in nlD.items():
        if k not in flD:
            flD[k] = nextflDid
            nextflDid += 1
        sourcemap[v] = flD[k]
    tt.tock("data generated")
    # endregion
    # region model
    tt.tick("building model")
    # source side (bidirectional encoder, so downstream dims use encdim * 2)
    inpemb = q.UnkReplWordEmb(embdim, worddic=nlD, unk_tokens=rare_nl)
    encdims = [encdim] * numlayer
    encoder = q.LSTMEncoder(embdim, *encdims, bidir=True, dropout_in_shared=dropout)
    # target side
    decemb = q.UnkReplWordEmb(embdim, worddic=flD, unk_tokens=rare_fl)
    decinpdim = embdim
    decdims = [decinpdim] + [encdim] * numlayer
    dec_core = torch.nn.Sequential(
        *[q.rnn.LSTMCell(decdims[i-1], decdims[i], dropout_in=dropout)
          for i in range(1, len(decdims))]
    )
    att = attention.FwdAttention(decdims[-1], encdim * 2, decdims[-1])
    out = torch.nn.Sequential(
        q.UnkReplWordLinout(decdims[-1]+encdim*2, worddic=flD, unk_tokens=rare_fl),
        # torch.nn.Softmax(-1)
    )
    if selfptr:
        # self-pointer variant: additionally attends over the decoder's own outputs
        outgate = PointerGeneratorOutGate(decdims[-1] + encdim * 2, encdim, 3)
        out = SelfPointerGeneratorOut(out, sourcemap=sourcemap, gate=outgate)
        selfatt = attention.FwdAttention(decdims[-1], decdims[-1], decdims[-1])
        deccell = SelfPointerGeneratorCell(emb=decemb, core=dec_core, att=att,
                                           selfatt=selfatt, out=out)
    else:
        outgate = PointerGeneratorOutGate(decdims[-1] + encdim * 2, encdim, 0)
        out = PointerGeneratorOut(out, sourcemap=sourcemap, gate=outgate)
        deccell = PointerGeneratorCell(emb=decemb, core=dec_core, att=att, out=out)
    train_dec = q.TFDecoder(deccell)  # teacher forcing for training
    test_dec = q.FreeDecoder(deccell, maxtime=seqlen+10)  # free-running for evaluation
    train_encdec = EncDec(inpemb, encoder, train_dec)
    test_encdec = Test_EncDec(inpemb, encoder, test_dec)
    train_encdec.to(device)
    test_encdec.to(device)
    tt.tock("built model")
    # endregion
    # region training
    # losses: pick CE variant based on smoothing configuration
    if smoothing == 0:
        ce = q.loss.CELoss(mode="probs", ignore_index=0)
    elif goldsmoothing < 0.:
        ce = q.loss.SmoothedCELoss(mode="probs", ignore_index=0, smoothing=smoothing)
    else:
        ce = q.loss.DiffSmoothedCELoss(mode="probs", ignore_index=0,
                                       alpha=goldsmoothing, beta=smoothing)
    acc = q.loss.SeqAccuracy(ignore_index=0)
    elemacc = q.loss.SeqElemAccuracy(ignore_index=0)
    # NOTE(review): trainmodel does not appear to be used below (trainloop
    # uses train_encdec directly) — possibly leftover; verify before removing
    trainmodel = TrainModel(train_encdec, [ce, elemacc, acc])
    treeacc = TreeAccuracyPrologPar(flD=flD)
    # optim
    optim = torch.optim.Adam(train_encdec.parameters(), lr=lr, weight_decay=wreg)
    clipgradnorm = lambda: torch.nn.utils.clip_grad_value_(train_encdec.parameters(),
                                                           clip_value=gradclip)
    # loops
    batchloop = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainloop = partial(q.train_epoch,
                        model=train_encdec,
                        dataloader=tloader,
                        optim=optim,
                        device=device,
                        losses=[q.LossWrapper(ce), q.LossWrapper(elemacc), q.LossWrapper(acc)],
                        print_every_batch=False,
                        _train_batch=batchloop)
    validloop = partial(q.test_epoch,
                        model=test_encdec,
                        dataloader=vloader,
                        device=device,
                        losses=[q.LossWrapper(treeacc)],
                        print_every_batch=False)
    tt.tick("training")
    q.run_training(trainloop, validloop, max_epochs=epochs)
    tt.tock("trained")
    tt.tick("testing")
    # evaluate both free-running and teacher-forced models on the test set
    test_results = validloop(model=test_encdec, dataloader=xloader)
    print("Test results (freerunning): {}".format(test_results))
    test_results = validloop(model=train_encdec, dataloader=xloader)
    print("Test results (TF): {}".format(test_results))
    tt.tock("tested")
    # endregion
    tt.msg("done")
def run_seq2seq_(
        lr=0.001,
        batsize=32,
        evalbatsize=256,
        epochs=100,
        warmup=5,
        embdim=50,
        encdim=100,
        numlayers=2,
        dropout=.0,
        wreg=1e-6,
        cuda=False,
        gpu=0,
):
    """Train and evaluate a Luong-attention seq2seq model on LC-QuAD.

    Splits the data into train/valid/test (last 500 before ``teststart`` are
    validation), trains with teacher forcing, validates free-running with
    LR-plateau scheduling and early stopping, and returns the final validation
    accuracy (``vlosses[1]``).
    """
    settings = locals().copy()
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt = q.ticktock("script")
    tt.msg("running seq2seq on LC-QuAD")
    tt.tick("loading data")
    xsm, ysm, teststart, tok2act = load_data()
    _tok2act = {ysm.RD[k]: v for k, v in tok2act.items()}

    print("Some examples:")
    for i in range(5):
        print(f"{xsm[i]}\n ->{ysm[i]}\n -> {Node.from_transitions(' '.join(ysm[i].split()[1:]), _tok2act)}")
    print("Non-leaf tokens:")
    print({ysm.RD[k]: v for k, v in tok2act.items() if v > 0})

    # targets are shifted by one: inputs y[:, :-1], gold y[:, 1:]
    devstart = teststart - 500
    trainds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[:devstart]).long(),
        torch.tensor(ysm.matrix[:devstart, :-1]).long(),
        torch.tensor(ysm.matrix[:devstart, 1:]).long())
    valds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[devstart:teststart]).long(),
        torch.tensor(ysm.matrix[devstart:teststart, :-1]).long(),
        torch.tensor(ysm.matrix[devstart:teststart, 1:]).long())
    testds = torch.utils.data.TensorDataset(
        torch.tensor(xsm.matrix[teststart:]).long(),
        torch.tensor(ysm.matrix[teststart:, :-1]).long(),
        torch.tensor(ysm.matrix[teststart:, 1:]).long())
    tt.msg(f"Data splits: train: {len(trainds)}, valid: {len(valds)}, test: {len(testds)}")
    tloader = torch.utils.data.DataLoader(trainds, batch_size=batsize, shuffle=True)
    vloader = torch.utils.data.DataLoader(valds, batch_size=evalbatsize, shuffle=False)
    xloader = torch.utils.data.DataLoader(testds, batch_size=evalbatsize, shuffle=False)
    tt.tock("data loaded")

    # model
    enclayers, declayers = numlayers, numlayers
    decdim = encdim
    xemb = q.WordEmb(embdim, worddic=xsm.D)
    yemb = q.WordEmb(embdim, worddic=ysm.D)
    encdims = [embdim] + [encdim // 2] * enclayers  # bidir: half per direction
    xenc = q.LSTMEncoder(embdim, *encdims[1:], bidir=True, dropout_in_shared=dropout)
    decdims = [embdim] + [decdim] * declayers
    dec_core = torch.nn.Sequential(*[
        q.LSTMCell(decdims[i - 1], decdims[i], dropout_in=dropout, dropout_rec=dropout)
        for i in range(1, len(decdims))
    ])
    yout = q.WordLinout(encdim + decdim, worddic=ysm.D)
    dec_cell = semparse.rnn.LuongCell(emb=yemb, core=dec_core, out=yout, dropout=dropout)
    decoder = q.TFDecoder(dec_cell)
    testdecoder = q.FreeDecoder(dec_cell, maxtime=100)
    m = Seq2Seq(xemb, xenc, decoder)
    testm = Seq2Seq(xemb, xenc, testdecoder, test=True)

    # smoke-test the model on one batch each way.
    # FIX: `iter(...).next()` is Python 2 — raises AttributeError on Python 3;
    # use the builtin next() on the iterator instead.
    tt.tick("running a batch")
    test_y = m(*next(iter(tloader))[:-1])
    q.batch_reset(m)
    test_y = testm(*next(iter(vloader))[:-1])
    q.batch_reset(m)
    tt.tock(f"ran a batch: {test_y.size()}")

    optim = torch.optim.Adam(m.parameters(), lr=lr, weight_decay=wreg)
    tlosses = [
        q.CELoss(mode="logits", ignore_index=0),
        q.Accuracy(ignore_index=0),
        q.SeqAccuracy(ignore_index=0)
    ]
    xlosses = [
        q.CELoss(mode="logits", ignore_index=0),
        q.Accuracy(ignore_index=0),
        q.SeqAccuracy(ignore_index=0)
    ]
    tlosses = [q.LossWrapper(l) for l in tlosses]
    # NOTE(review): vlosses and xlosses wrap the SAME underlying loss objects
    # (both iterate the raw xlosses list) — verify this sharing is intentional
    vlosses = [q.LossWrapper(l) for l in xlosses]
    xlosses = [q.LossWrapper(l) for l in xlosses]
    trainloop = partial(q.train_epoch, model=m, dataloader=tloader, optim=optim,
                        losses=tlosses, device=device)
    devloop = partial(q.test_epoch, model=testm, dataloader=vloader,
                      losses=vlosses, device=device)
    testloop = partial(q.test_epoch, model=testm, dataloader=xloader,
                       losses=xlosses, device=device)

    # maximize validation accuracy (vlosses[1]); decay LR on plateau
    lrplateau = q.util.ReduceLROnPlateau(optim,
                                         mode="max",
                                         factor=.1,
                                         patience=3,
                                         cooldown=1,
                                         warmup=warmup,
                                         threshold=0.,
                                         verbose=True,
                                         eps=1e-9)
    on_after_valid = [lambda: lrplateau.step(vlosses[1].get_epoch_error())]
    _devloop = partial(devloop, on_end=on_after_valid)
    # stop once the LR has decayed below 1e-7 in every param group
    stoptrain = [lambda: all([pg["lr"] <= 1e-7 for pg in optim.param_groups])]

    tt.tick("training")
    q.run_training(trainloop, _devloop, max_epochs=epochs, check_stop=stoptrain)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    settings["testres"] = testres
    tt.tock("tested")
    devres = devloop()
    print(devres, vlosses[0].get_epoch_error())
    return vlosses[1].get_epoch_error()