Example #1
def run(
    lr=0.001,
    batsize=20,
    epochs=60,
    embdim=128,
    encdim=256,
    numlayers=1,
    beamsize=5,
    dropout=.25,
    wreg=1e-10,
    cuda=False,
    gpu=0,
    minfreq=2,
    gradnorm=3.,
    smoothing=0.1,
    cosine_restarts=1.,
    seed=123456,
    numcvfolds=6,
    testfold=-1,  # if not -1, must be in [0, numcvfolds); the chosen fold is used for validation
    reorder_random=False,
):
    localargs = locals().copy()
    print(locals())
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt.tick("loading data")
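    # testfold uses -1 as a sentinel: -1 disables cross-validation (cvfolds and
    # testfold both become None below); any other value selects that fold as the
    # validation split.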
    cvfolds = None if testfold == -1 else numcvfolds
    testfold = None if testfold == -1 else testfold
    ds = GeoDataset(
        sentence_encoder=SequenceEncoder(tokenizer=split_tokenizer),
        min_freq=minfreq,
        cvfolds=cvfolds,
        testfold=testfold,
        reorder_random=reorder_random)
    print(
        f"max lens: {ds.maxlen_input} (input) and {ds.maxlen_output} (output)")
    tt.tock("data loaded")

    do_rare_stats(ds)
    # batch = next(iter(train_dl))
    # print(batch)
    # print("input graph")
    # print(batch.batched_states)

    model = BasicGenModel(embdim=embdim,
                          hdim=encdim,
                          dropout=dropout,
                          numlayers=numlayers,
                          sentence_encoder=ds.sentence_encoder,
                          query_encoder=ds.query_encoder,
                          feedatt=True)

    # sentence_rare_tokens = set([ds.sentence_encoder.vocab(i) for i in model.inp_emb.rare_token_ids])
    # do_rare_stats(ds, sentence_rare_tokens=sentence_rare_tokens)

    tfdecoder = SeqDecoder(model,
                           tf_ratio=1.,
                           eval=[
                               CELoss(ignore_index=0,
                                      mode="logprobs",
                                      smoothing=smoothing),
                               SeqAccuracies(),
                               TreeAccuracy(tensor2tree=partial(
                                   tensor2tree, D=ds.query_encoder.vocab),
                                            orderless={"and"})
                           ])
    losses = make_array_of_metrics("loss", "elem_acc", "seq_acc", "tree_acc")

    freedecoder = SeqDecoder(model,
                             maxtime=100,
                             tf_ratio=0.,
                             eval=[
                                 SeqAccuracies(),
                                 TreeAccuracy(tensor2tree=partial(
                                     tensor2tree, D=ds.query_encoder.vocab),
                                              orderless={"and"})
                             ])
    vlosses = make_array_of_metrics("seq_acc", "tree_acc")

    beamdecoder = BeamDecoder(model,
                              maxtime=100,
                              beamsize=beamsize,
                              copy_deep=True,
                              eval=[SeqAccuracies()],
                              eval_beam=[
                                  TreeAccuracy(tensor2tree=partial(
                                      tensor2tree, D=ds.query_encoder.vocab),
                                               orderless={"and"})
                              ])
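    # Three decoders wrap the same underlying model: tfdecoder trains with full
    # teacher forcing (tf_ratio=1.), freedecoder decodes greedily without teacher
    # forcing (tf_ratio=0.) for validation, and beamdecoder runs beam search for
    # the final test evaluation.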
    beamlosses = make_array_of_metrics("seq_acc", "tree_acc",
                                       "tree_acc_at_last")

    # 4. define optim
    # optim = torch.optim.Adam(trainable_params, lr=lr, weight_decay=wreg)
    optim = torch.optim.Adam(tfdecoder.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max}")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(
            optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []
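    # reduce_lr is called once per epoch (via on_end below), so t_max here counts
    # epochs rather than individual optimizer updates.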

    # 6. define training function
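    # Gradient clipping is wired in as an on_before_optim_step hook, so it runs
    # after backward() and right before each optimizer step.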
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(
        tfdecoder.parameters(), gradnorm)
    # clipgradnorm = lambda: None
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])

    train_on = "train"
    valid_on = "test" if testfold is None else "valid"
    trainepoch = partial(q.train_epoch,
                         model=tfdecoder,
                         dataloader=ds.dataloader(train_on,
                                                  batsize,
                                                  shuffle=True),
                         optim=optim,
                         losses=losses,
                         _train_batch=trainbatch,
                         device=device,
                         on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch,
                         model=freedecoder,
                         dataloader=ds.dataloader(valid_on,
                                                  batsize,
                                                  shuffle=False),
                         losses=vlosses,
                         device=device)
    # validepoch = partial(q.test_epoch, model=freedecoder, dataloader=valid_dl, losses=vlosses, device=device)

    # p = q.save_run(freedecoder, localargs, filepath=__file__)
    # q.save_dataset(ds, p)
    # _freedecoder, _localargs = q.load_run(p)
    # _ds = q.load_dataset(p)
    # sys.exit()

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch,
                   run_valid_epoch=validepoch,
                   max_epochs=epochs)
    tt.tock("done training")

    if testfold is not None:
        return vlosses[1].get_epoch_error()

    # testing
    tt.tick("testing")
    testresults = q.test_epoch(model=beamdecoder,
                               dataloader=ds.dataloader("test", batsize),
                               losses=beamlosses,
                               device=device)
    print("test results: ", testresults)
    tt.tock("tested")

    # save model?
    tosave = input(
        "Save this model? 'y(es)'=Yes, <int>=overwrite previous, otherwise=No) \n>"
    )
    # if True:
    #     overwrite = None
    if tosave.lower() == "y" or tosave.lower() == "yes" or re.match(
            r"\d+", tosave.lower()):
        overwrite = int(tosave) if re.match(r"\d+", tosave) else None
        p = q.save_run(model,
                       localargs,
                       filepath=__file__,
                       overwrite=overwrite)
        q.save_dataset(ds, p)
        _model, _localargs = q.load_run(p)
        _ds = q.load_dataset(p)

        _freedecoder = BeamDecoder(_model,
                                   maxtime=100,
                                   beamsize=beamsize,
                                   copy_deep=True,
                                   eval=[SeqAccuracies()],
                                   eval_beam=[
                                       TreeAccuracy(tensor2tree=partial(
                                           tensor2tree,
                                           D=ds.query_encoder.vocab),
                                                    orderless={"and"})
                                   ])

        # testing
        tt.tick("testing reloaded")
        _testresults = q.test_epoch(model=_freedecoder,
                                    dataloader=_ds.dataloader("test", batsize),
                                    losses=beamlosses,
                                    device=device)
        print(_testresults)
        tt.tock("tested")

        # save predictions
        _, testpreds = q.eval_loop(_freedecoder,
                                   ds.dataloader("test",
                                                 batsize=batsize,
                                                 shuffle=False),
                                   device=device)
        testout = get_outputs_for_save(testpreds)
        _, trainpreds = q.eval_loop(_freedecoder,
                                    ds.dataloader("train",
                                                  batsize=batsize,
                                                  shuffle=False),
                                    device=device)
        trainout = get_outputs_for_save(trainpreds)

        with open(os.path.join(p, "trainpreds.json"), "w") as f:
            ujson.dump(trainout, f)

        with open(os.path.join(p, "testpreds.json"), "w") as f:
            ujson.dump(testout, f)
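
# A hypothetical cross-validation driver (not part of the original script), sketched
# here to illustrate the numcvfolds/testfold contract above: when testfold is set,
# run() returns the validation tree accuracy for that fold, so averaging over folds
# gives a cross-validation estimate. run_cv and its signature are assumptions.
def run_cv(numcvfolds=6, **kw):
    scores = [run(testfold=i, numcvfolds=numcvfolds, **kw) for i in range(numcvfolds)]
    return sum(scores) / len(scores)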
Example #2
def run(
    lr=0.001,
    batsize=20,
    epochs=70,
    embdim=128,
    encdim=400,
    numlayers=1,
    beamsize=5,
    dropout=.5,
    wreg=1e-10,
    cuda=False,
    gpu=0,
    minfreq=2,
    gradnorm=3.,
    smoothing=0.1,
    cosine_restarts=1.,
    seed=123456,
):
    localargs = locals().copy()
    print(locals())
    torch.manual_seed(seed)
    np.random.seed(seed)
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt.tick("loading data")
    ds = GeoDatasetRank()
    print(
        f"max lens: {ds.maxlen_input} (input) and {ds.maxlen_output} (output)")
    tt.tock("data loaded")

    # do_rare_stats(ds)

    # model = TreeRankModel(embdim=embdim, hdim=encdim, dropout=dropout, numlayers=numlayers,
    #                          sentence_encoder=ds.sentence_encoder, query_encoder=ds.query_encoder)
    #
    model = ParikhRankModel(embdim=encdim,
                            dropout=dropout,
                            sentence_encoder=ds.sentence_encoder,
                            query_encoder=ds.query_encoder)

    # sentence_rare_tokens = set([ds.sentence_encoder.vocab(i) for i in model.inp_emb.rare_token_ids])
    # do_rare_stats(ds, sentence_rare_tokens=sentence_rare_tokens)
    ranker = Ranker(model,
                    eval=[BCELoss(mode="logits", smoothing=smoothing)],
                    evalseq=[
                        SeqAccuracies(),
                        TreeAccuracy(tensor2tree=partial(
                            tensor2tree, D=ds.query_encoder.vocab),
                                     orderless={"and", "or"})
                    ])

    losses = make_array_of_metrics("loss", "seq_acc", "tree_acc")
    vlosses = make_array_of_metrics("seq_acc", "tree_acc")

    # 4. define optim
    # optim = torch.optim.Adam(trainable_params, lr=lr, weight_decay=wreg)
    optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max}")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(
            optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []

    # 6. define training function
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(
        model.parameters(), gradnorm)
    # clipgradnorm = lambda: None
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainepoch = partial(q.train_epoch,
                         model=ranker,
                         dataloader=ds.dataloader("train", batsize),
                         optim=optim,
                         losses=losses,
                         _train_batch=trainbatch,
                         device=device,
                         on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch,
                         model=ranker,
                         dataloader=ds.dataloader("test", batsize),
                         losses=vlosses,
                         device=device)

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch,
                   run_valid_epoch=validepoch,
                   max_epochs=epochs)
    tt.tock("done training")

    # testing
    tt.tick("testing")
    testresults = q.test_epoch(model=ranker,
                               dataloader=ds.dataloader("test", batsize),
                               losses=vlosses,
                               device=device)
    print("test results: ", testresults)
    tt.tock("tested")

    # save model?
    tosave = input(
        "Save this model? 'y(es)'=Yes, <int>=overwrite previous, otherwise=No) \n>"
    )
    # if True:
    #     overwrite = None
    if tosave.lower() == "y" or tosave.lower() == "yes" or re.match(
            r"\d+", tosave.lower()):
        overwrite = int(tosave) if re.match(r"\d+", tosave) else None
        p = q.save_run(model,
                       localargs,
                       filepath=__file__,
                       overwrite=overwrite)
        q.save_dataset(ds, p)
        _model, _localargs = q.load_run(p)
        _ds = q.load_dataset(p)

        _freedecoder = BeamDecoder(
            _model,
            maxtime=100,
            beamsize=beamsize,
            copy_deep=True,
            eval=[SeqAccuracies()],
            eval_beam=[
                TreeAccuracy(tensor2tree=partial(tensor2tree,
                                                 D=ds.query_encoder.vocab),
                             orderless={"op:and", "SW:concat"})
            ])

        # testing the reloaded model (metrics for the beam decoder)
        beamlosses = make_array_of_metrics("seq_acc", "tree_acc", "tree_acc_at_last")
        tt.tick("testing reloaded")
        _testresults = q.test_epoch(model=_freedecoder,
                                    dataloader=_ds.dataloader("test", batsize),
                                    losses=beamlosses,
                                    device=device)
        print(_testresults)
        tt.tock("tested")

        # save predictions
        _, testpreds = q.eval_loop(_freedecoder,
                                   ds.dataloader("test",
                                                 batsize=batsize,
                                                 shuffle=False),
                                   device=device)
        testout = get_outputs_for_save(testpreds)
        _, trainpreds = q.eval_loop(_freedecoder,
                                    ds.dataloader("train",
                                                  batsize=batsize,
                                                  shuffle=False),
                                    device=device)
        trainout = get_outputs_for_save(trainpreds)

        with open(os.path.join(p, "trainpreds.json"), "w") as f:
            ujson.dump(trainout, f)

        with open(os.path.join(p, "testpreds.json"), "w") as f:
            ujson.dump(testout, f)
Example #3
def run(lr=0.001,
        batsize=50,
        epochs=100,
        embdim=100,
        encdim=100,
        numlayers=1,
        beamsize=1,
        dropout=.2,
        wreg=1e-10,
        cuda=False,
        gpu=0,
        minfreq=3,
        gradnorm=3.,
        cosine_restarts=1.,
        seed=123456,
        ):
    localargs = locals().copy()
    print(locals())
    torch.manual_seed(seed)
    np.random.seed(seed)
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt.tick("loading data")
    ds = LCQuaDnoENTDataset(sentence_encoder=SequenceEncoder(tokenizer=split_tokenizer), min_freq=minfreq)
    print(f"max lens: {ds.maxlen_input} (input) and {ds.maxlen_output} (output)")
    tt.tock("data loaded")

    do_rare_stats(ds)
    # batch = next(iter(train_dl))
    # print(batch)
    # print("input graph")
    # print(batch.batched_states)

    model = BasicGenModel(embdim=embdim, hdim=encdim, dropout=dropout, numlayers=numlayers,
                             sentence_encoder=ds.sentence_encoder, query_encoder=ds.query_encoder, feedatt=True)

    # sentence_rare_tokens = set([ds.sentence_encoder.vocab(i) for i in model.inp_emb.rare_token_ids])
    # do_rare_stats(ds, sentence_rare_tokens=sentence_rare_tokens)

    tfdecoder = SeqDecoder(model, tf_ratio=1.,
                           eval=[CELoss(ignore_index=0, mode="logprobs"),
                            SeqAccuracies(), TreeAccuracy(tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab),
                                                          orderless={"select", "count", "ask"})])
    losses = make_array_of_metrics("loss", "elem_acc", "seq_acc", "tree_acc")
    # beamdecoder = BeamActionSeqDecoder(tfdecoder.model, beamsize=beamsize, maxsteps=50)
    if beamsize == 1:
        freedecoder = SeqDecoder(model, maxtime=40, tf_ratio=0.,
                                 eval=[SeqAccuracies(),
                                       TreeAccuracy(tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab),
                                                    orderless={"select", "count", "ask"})])
        vlosses = make_array_of_metrics("seq_acc", "tree_acc")
    else:
        freedecoder = BeamDecoder(model, maxtime=30, beamsize=beamsize,
                                  eval=[SeqAccuracies()],
                                  eval_beam=[TreeAccuracy(tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab),
                                                    orderless={"select", "count", "ask"})])
        vlosses = make_array_of_metrics("seq_acc", "tree_acc", "tree_acc_at_last")

    # # test
    # tt.tick("doing one epoch")
    # for batch in iter(train_dl):
    #     batch = batch.to(device)
    #     ttt.tick("start batch")
    #     # with torch.no_grad():
    #     out = tfdecoder(batch)
    #     ttt.tock("end batch")
    # tt.tock("done one epoch")
    # print(out)
    # sys.exit()

    # beamdecoder(next(iter(train_dl)))

    # print(dict(tfdecoder.named_parameters()).keys())

    # if beamsize >= 3:
    #     vlosses = make_loss_array("seq_acc", "tree_acc", "tree_acc_at3", "tree_acc_at_last")
    # else:
    #     vlosses = make_loss_array("seq_acc", "tree_acc", "tree_acc_at_last")

    # trainable_params = tfdecoder.named_parameters()
    # exclude_params = set()
    # exclude_params.add("model.model.inp_emb.emb.weight")   # don't train input embeddings if doing glove
    # trainable_params = [v for k, v in trainable_params if k not in exclude_params]

    # 4. define optim
    # optim = torch.optim.Adam(trainable_params, lr=lr, weight_decay=wreg)
    optim = torch.optim.Adam(tfdecoder.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max}")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []

    # 6. define training function
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(tfdecoder.parameters(), gradnorm)
    # clipgradnorm = lambda: None
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainepoch = partial(q.train_epoch, model=tfdecoder, dataloader=ds.dataloader("train", batsize), optim=optim, losses=losses,
                         _train_batch=trainbatch, device=device, on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch, model=freedecoder, dataloader=ds.dataloader("valid", batsize), losses=vlosses, device=device)
    # validepoch = partial(q.test_epoch, model=freedecoder, dataloader=valid_dl, losses=vlosses, device=device)

    # p = q.save_run(freedecoder, localargs, filepath=__file__)
    # q.save_dataset(ds, p)
    # _freedecoder, _localargs = q.load_run(p)
    # _ds = q.load_dataset(p)
    # sys.exit()

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch, run_valid_epoch=validepoch, max_epochs=epochs)
    tt.tock("done training")

    # testing
    tt.tick("testing")
    testresults = q.test_epoch(model=freedecoder, dataloader=ds.dataloader("valid", batsize), losses=vlosses, device=device)
    print("validation test results: ", testresults)
    tt.tock("tested")
    tt.tick("testing")
    testresults = q.test_epoch(model=freedecoder, dataloader=ds.dataloader("test", batsize), losses=vlosses, device=device)
    print("test results: ", testresults)
    tt.tock("tested")

    # save model?
    tosave = input("Save this model? 'y(es)'=Yes, <int>=overwrite previous, otherwise=No) \n>")
    if tosave.lower() == "y" or tosave.lower() == "yes" or re.match(r"\d+", tosave.lower()):
        overwrite = int(tosave) if re.match(r"\d+", tosave) else None
        p = q.save_run(model, localargs, filepath=__file__, overwrite=overwrite)
        q.save_dataset(ds, p)

        # region reload
        _model, _localargs = q.load_run(p)
        _ds = q.load_dataset(p)

        freedecoder.model = _model

        # testing
        tt.tick("testing reloaded")
        _testresults = q.test_epoch(model=freedecoder, dataloader=_ds.dataloader("test", batsize),
                                    losses=vlosses, device=device)
        print(_testresults)
        assert(testresults == _testresults)
        tt.tock("tested")
        # endregion

        # save predictions
        trainpreds = q.eval_loop(freedecoder, ds.dataloader("train", batsize=batsize, shuffle=False), device=device)
        validpreds = q.eval_loop(freedecoder, ds.dataloader("valid", batsize=batsize, shuffle=False), device=device)
        testpreds = q.eval_loop(freedecoder, ds.dataloader("test", batsize=batsize, shuffle=False), device=device)
        trainpreds = get_arrays_to_save(trainpreds[1])
        validpreds = get_arrays_to_save(validpreds[1])
        testpreds = get_arrays_to_save(testpreds[1])
        for fname, content in [("trainpreds.npz", trainpreds), ("validpreds.npz", validpreds), ("testpreds.npz", testpreds)]:
            np.savez(os.path.join(p, fname), **content)

    return testresults
Example #4
def run(
    lr=0.001,
    enclrmul=0.1,
    hdim=768,
    numlayers=8,
    numheads=12,
    dropout=0.1,
    wreg=0.,
    batsize=10,
    epochs=100,
    warmup=0,
    sustain=0,
    cosinelr=False,
    gradacc=1,
    gradnorm=100,
    patience=5,
    validinter=3,
    seed=87646464,
    gpu=-1,
    datamode="single",
    decodemode="single",  # "full", "ltr" (left to right), "single", "entropy-single"
    trainonvalid=False,
):
    settings = locals().copy()
    print(json.dumps(settings, indent=4))

    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    device = torch.device("cpu") if gpu < 0 else torch.device(gpu)

    tt = q.ticktock("script")
    tt.tick("loading")
    tds, vds, xds, tds_seq, vds_seq, xds_seq, nltok, flenc, orderless = load_ds(
        "restaurants", mode=datamode, trainonvalid=trainonvalid)
    tt.tock("loaded")

    tdl = DataLoader(tds,
                     batch_size=batsize,
                     shuffle=True,
                     collate_fn=collate_fn)
    vdl = DataLoader(vds,
                     batch_size=batsize,
                     shuffle=False,
                     collate_fn=collate_fn)
    xdl = DataLoader(xds,
                     batch_size=batsize,
                     shuffle=False,
                     collate_fn=collate_fn)

    tdl_seq = DataLoader(tds_seq,
                         batch_size=batsize,
                         shuffle=True,
                         collate_fn=autocollate)
    vdl_seq = DataLoader(vds_seq,
                         batch_size=batsize,
                         shuffle=False,
                         collate_fn=autocollate)
    xdl_seq = DataLoader(xds_seq,
                         batch_size=batsize,
                         shuffle=False,
                         collate_fn=autocollate)

    # model
    tagger = TransformerTagger(hdim, flenc.vocab, numlayers, numheads, dropout)
    tagmodel = TreeInsertionTaggerModel(tagger)
    decodermodel = TreeInsertionDecoder(tagger,
                                        seqenc=flenc,
                                        maxsteps=50,
                                        max_tree_size=30,
                                        mode=decodemode)
    decodermodel = TreeInsertionDecoderTrainModel(decodermodel)

    # batch = next(iter(tdl))
    # out = tagmodel(*batch)

    tmetrics = make_array_of_metrics("loss",
                                     "elemrecall",
                                     "allrecall",
                                     "entropyrecall",
                                     reduction="mean")
    vmetrics = make_array_of_metrics("loss",
                                     "elemrecall",
                                     "allrecall",
                                     "entropyrecall",
                                     reduction="mean")
    tseqmetrics = make_array_of_metrics("treeacc", reduction="mean")
    vseqmetrics = make_array_of_metrics("treeacc", reduction="mean")
    xmetrics = make_array_of_metrics("treeacc", reduction="mean")

    # region parameters
    def get_parameters(m, _lr, _enclrmul):
        bertparams = []
        otherparams = []
        for k, v in m.named_parameters():
            if "bert_model." in k:
                bertparams.append(v)
            else:
                otherparams.append(v)
        if len(bertparams) == 0:
            raise Exception("No encoder parameters found!")
        paramgroups = [{
            "params": bertparams,
            "lr": _lr * _enclrmul
        }, {
            "params": otherparams
        }]
        return paramgroups

    # endregion

    def get_optim(_m, _lr, _enclrmul, _wreg=0):
        paramgroups = get_parameters(_m, _lr=_lr, _enclrmul=_enclrmul)
        optim = torch.optim.Adam(paramgroups, lr=_lr, weight_decay=_wreg)
        return optim
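    # Example with hypothetical values: lr=0.001 and enclrmul=0.1 train the BERT
    # encoder at 1e-4 while the freshly initialized layers use the full 1e-3.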

    def clipgradnorm(_m=None, _norm=None):
        torch.nn.utils.clip_grad_norm_(_m.parameters(), _norm)

    eyt = q.EarlyStopper(vseqmetrics[-1],
                         patience=patience,
                         min_epochs=30,
                         more_is_better=True,
                         remember_f=lambda: deepcopy(tagger))
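    # The early stopper tracks validation tree accuracy (vseqmetrics[-1]) and keeps
    # a deepcopy of the best tagger via remember_f; that snapshot is restored after
    # training ("reloading best" below).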
    # def wandb_logger():
    #     d = {}
    #     for name, loss in zip(["loss", "elem_acc", "seq_acc", "tree_acc"], metrics):
    #         d["train_"+name] = loss.get_epoch_error()
    #     for name, loss in zip(["seq_acc", "tree_acc"], vmetrics):
    #         d["valid_"+name] = loss.get_epoch_error()
    #     wandb.log(d)
    t_max = epochs
    optim = get_optim(tagger, lr, enclrmul, wreg)
    print(f"Total number of updates: {t_max}.")
    if cosinelr:
        lr_schedule = q.sched.Linear(steps=warmup) >> q.sched.Cosine(
            steps=t_max - warmup) >> 0.
    else:
        lr_schedule = q.sched.Linear(steps=warmup) >> 1.
    lr_schedule = q.sched.LRSchedule(optim, lr_schedule)
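    # Schedule composition: linear warmup over `warmup` steps, then either cosine
    # decay to 0 over the remaining steps (cosinelr) or a constant multiplier of 1.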

    trainbatch = partial(
        q.train_batch,
        gradient_accumulation_steps=gradacc,
        on_before_optim_step=[lambda: clipgradnorm(_m=tagger, _norm=gradnorm)])

    trainepoch = partial(q.train_epoch,
                         model=tagmodel,
                         dataloader=tdl,
                         optim=optim,
                         losses=tmetrics,
                         device=device,
                         _train_batch=trainbatch,
                         on_end=[lambda: lr_schedule.step()])

    trainseqepoch = partial(q.test_epoch,
                            model=decodermodel,
                            losses=tseqmetrics,
                            dataloader=tdl_seq,
                            device=device)

    validepoch = partial(q.test_epoch,
                         model=decodermodel,
                         losses=vseqmetrics,
                         dataloader=vdl_seq,
                         device=device,
                         on_end=[lambda: eyt.on_epoch_end()])

    # validepoch()        # TODO: remove this after debugging

    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch,
                   run_valid_epoch=[trainseqepoch, validepoch],
                   max_epochs=epochs,
                   check_stop=[lambda: eyt.check_stop()],
                   validinter=validinter)
    tt.tock("done training")

    tt.msg("reloading best")
    if eyt.remembered is not None:
        decodermodel.model.tagger = eyt.remembered
        tagmodel.tagger = eyt.remembered

    tt.tick("running test")
    testepoch = partial(q.test_epoch,
                        model=decodermodel,
                        losses=xmetrics,
                        dataloader=xdl_seq,
                        device=device)
    print(testepoch())
    tt.tock()

    # inspect predictions
    validepoch = partial(q.test_epoch,
                         model=tagmodel,
                         losses=vmetrics,
                         dataloader=vdl,
                         device=device)
    print(validepoch())
    inps, outs = q.eval_loop(tagmodel, vdl, device=device)

    # print(outs)

    doexit = False
    for i in range(len(inps[0])):
        for j in range(len(inps[0][i])):
            ui = input("next? (ENTER for next/anything else to exit)>>>")
            if ui != "":
                doexit = True
                break
            question = " ".join(nltok.convert_ids_to_tokens(inps[0][i][j]))
            out_toks = flenc.vocab.tostr(
                inps[1][i][j].detach().cpu().numpy()).split(" ")

            iscorrect = True

            lines = []
            for k, out_tok in enumerate(out_toks):
                gold_toks_for_k = inps[3][i][j][k].detach().cpu().nonzero()[:, 0]
                if len(gold_toks_for_k) > 0:
                    gold_toks_for_k = flenc.vocab.tostr(gold_toks_for_k).split(
                        " ")
                else:
                    gold_toks_for_k = [""]

                isopen = inps[2][i][j][k]
                isopen = isopen.detach().cpu().item()

                pred_tok = outs[1][i][j][k].max(-1)[1].detach().cpu().item()
                pred_tok = flenc.vocab(pred_tok)

                pred_tok_correct = pred_tok in gold_toks_for_k or not isopen
                if not pred_tok_correct:
                    iscorrect = False

                # entropy of the predicted token distribution; probabilities are
                # clamped at 1e-6 so the log stays finite
                entropy = torch.softmax(outs[1][i][j][k], -1).clamp_min(1e-6)
                entropy = -(entropy * torch.log(entropy)).sum().item()
                lines.append(
                    f"{out_tok:25} [{isopen:1}] >> {f'{pred_tok} ({entropy:.3f})':35} {'!!' if not pred_tok_correct else '  '} [{','.join(gold_toks_for_k) if isopen else ''}]"
                )

            print(f"{question} {'!!WRONG!!' if not iscorrect else ''}")
            for line in lines:
                print(line)

        if doexit:
            break
Example #5
def run_span_borders(
    lr=DEFAULT_LR,
    dropout=.3,
    wreg=DEFAULT_WREG,
    initwreg=DEFAULT_INITWREG,
    batsize=DEFAULT_BATSIZE,
    evalbatsize=-1,
    epochs=DEFAULT_EPOCHS,
    smoothing=DEFAULT_SMOOTHING,
    dim=200,
    numlayers=1,
    cuda=False,
    gpu=0,
    savep="exp_bilstm_span_borders_",
    datafrac=1.,
    vanillaemb=False,
    embdim=300,
    sched="cos",
    warmup=0.1,
    cycles=0.5,
):
    settings = locals().copy()
    print(locals())
    if evalbatsize < 0:
        evalbatsize = batsize
    if cuda:
        device = torch.device("cuda", gpu)
    else:
        device = torch.device("cpu")
    # region data
    tt = q.ticktock("script")
    tt.msg("running span border with BiLSTM")
    tt.tick("loading data")
    data = load_data(which="span/borders", datafrac=datafrac)
    trainds, devds, testds = data
    tt.tock("data loaded")
    tt.msg("Train/Dev/Test sizes: {} {} {}".format(len(trainds), len(devds),
                                                   len(testds)))
    trainloader = DataLoader(trainds, batch_size=batsize, shuffle=True)
    devloader = DataLoader(devds, batch_size=evalbatsize, shuffle=False)
    testloader = DataLoader(testds, batch_size=evalbatsize, shuffle=False)
    evalds = TensorDataset(*testloader.dataset.tensors[:-1])
    evalloader = DataLoader(evalds, batch_size=evalbatsize, shuffle=False)
    evalds_dev = TensorDataset(*devloader.dataset.tensors[:-1])
    evalloader_dev = DataLoader(evalds_dev,
                                batch_size=evalbatsize,
                                shuffle=False)
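    # The eval loaders drop the last tensor (the gold labels), so q.eval_loop runs
    # the model purely for predictions.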
    # endregion

    # region model
    tt.tick("creating model")
    # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    bert = BertModel.from_pretrained("bert-base-uncased")
    emb = bert.embeddings.word_embeddings
    if vanillaemb:
        tt.msg("using vanilla emb of size {}".format(embdim))
        emb = torch.nn.Embedding(emb.weight.size(0), embdim)
    else:
        embdim = bert.config.hidden_size
    # inpD = tokenizer.vocab
    # q.WordEmb.masktoken = "[PAD]"
    # emb = q.WordEmb(embdim, worddic=inpD)
    bilstm = q.rnn.LSTMEncoder(embdim,
                               *([dim] * numlayers),
                               bidir=True,
                               dropout_in_shared=dropout)
    spandet = BorderSpanDetector(emb, bilstm, dim * 2, dropout=dropout)
    spandet.to(device)
    tt.tock("model created")
    # endregion

    # region training
    totalsteps = len(trainloader) * epochs
    params = spandet.parameters()
    sched = get_schedule(sched,
                         warmup=warmup,
                         t_total=totalsteps,
                         cycles=cycles)
    optim = BertAdam(params, lr=lr, weight_decay=wreg, schedule=sched)
    # optim = torch.optim.Adam(spandet.parameters(), lr=lr, weight_decay=wreg)
    losses = [
        q.SmoothedCELoss(smoothing=smoothing),
        SpanF1Borders(),
        q.SeqAccuracy()
    ]
    xlosses = [
        q.SmoothedCELoss(smoothing=smoothing),
        SpanF1Borders(),
        q.SeqAccuracy()
    ]
    trainlosses = [q.LossWrapper(l) for l in losses]
    devlosses = [q.LossWrapper(l) for l in xlosses]
    testlosses = [q.LossWrapper(l) for l in xlosses]
    trainloop = partial(q.train_epoch,
                        model=spandet,
                        dataloader=trainloader,
                        optim=optim,
                        losses=trainlosses,
                        device=device)
    devloop = partial(q.test_epoch,
                      model=spandet,
                      dataloader=devloader,
                      losses=devlosses,
                      device=device)
    testloop = partial(q.test_epoch,
                       model=spandet,
                       dataloader=testloader,
                       losses=testlosses,
                       device=device)

    tt.tick("training")
    q.run_training(trainloop, devloop, max_epochs=epochs)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    tt.tock("tested")

    if len(savep) > 0:
        tt.tick("making predictions and saving")
        i = 0
        while os.path.exists(savep + str(i)):
            i += 1
        os.mkdir(savep + str(i))
        savedir = savep + str(i)
        # save model
        # torch.save(spandet, open(os.path.join(savedir, "model.pt"), "wb"))
        # save settings
        json.dump(settings, open(os.path.join(savedir, "settings.json"), "w"))

        outlen = trainloader.dataset.tensors[0].size(1)
        spandet.outlen = outlen

        # save test predictions
        testpreds = q.eval_loop(spandet, evalloader, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "borderpreds.test.npy"), testpreds)
        # save dev predictions
        testpreds = q.eval_loop(spandet, evalloader_dev, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "borderpreds.dev.npy"), testpreds)
        tt.msg("saved in {}".format(savedir))
        tt.tock("done")
Example #6
def run_relations(
    lr=DEFAULT_LR,
    dropout=.3,
    wreg=DEFAULT_WREG,
    initwreg=DEFAULT_INITWREG,
    batsize=DEFAULT_BATSIZE,
    epochs=10,
    smoothing=DEFAULT_SMOOTHING,
    cuda=False,
    gpu=0,
    balanced=False,
    maskentity=False,
    savep="exp_bilstm_rels_",
    test=False,
    datafrac=1.,
    vanillaemb=False,
    gloveemb=True,
    embdim=300,
    dim=300,
    numlayers=2,
    warmup=0.01,
    cycles=0.5,
    sched="cos",
    evalbatsize=-1,
    classweighted=False,
):
    print(locals())
    settings = locals().copy()
    if evalbatsize < 0:
        evalbatsize = batsize
    if test:
        epochs = 0
    if cuda:
        device = torch.device("cuda", gpu)
    else:
        device = torch.device("cpu")
    # region data
    assert (not gloveemb or not vanillaemb)
    tt = q.ticktock("script")
    tt.msg("running relation classifier with BiLSTM")
    tt.tick("loading data")
    data = load_data(which="rel+borders",
                     retrelD=True,
                     datafrac=datafrac,
                     wordlevel=gloveemb,
                     rettokD=True)
    trainds, devds, testds, relD, tokD = data
    if maskentity:
        trainds, devds, testds = replace_entity_span(trainds, devds, testds)
    else:
        trainds, devds, testds = [
            TensorDataset(ds.tensors[0], ds.tensors[2])
            for ds in [trainds, devds, testds]
        ]
    relcounts = torch.zeros(max(relD.values()) + 1)
    trainrelcounts = torch.bincount(trainds.tensors[1])
    relcounts[:len(trainrelcounts)] += trainrelcounts.float()
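    # per-relation training counts, zero-padded to the full relation vocabulary
    # (used below for optional class weighting)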
    tt.tock("data loaded")
    tt.msg("Train/Dev/Test sizes: {} {} {}".format(len(trainds), len(devds),
                                                   len(testds)))
    trainloader = DataLoader(trainds, batch_size=batsize, shuffle=True)
    devloader = DataLoader(devds, batch_size=evalbatsize, shuffle=False)
    testloader = DataLoader(testds, batch_size=evalbatsize, shuffle=False)
    evalds = TensorDataset(*testloader.dataset.tensors[:1])
    evalloader = DataLoader(evalds, batch_size=evalbatsize, shuffle=False)
    evalds_dev = TensorDataset(*devloader.dataset.tensors[:1])
    evalloader_dev = DataLoader(evalds_dev,
                                batch_size=evalbatsize,
                                shuffle=False)

    if test:
        evalloader = DataLoader(TensorDataset(*evalloader.dataset[:10]),
                                batch_size=batsize,
                                shuffle=False)
        testloader = DataLoader(TensorDataset(*testloader.dataset[:10]),
                                batch_size=batsize,
                                shuffle=False)
    # endregion

    # region model
    tt.tick("making model")
    if vanillaemb:
        bert = BertModel.from_pretrained("bert-base-uncased")
        emb = bert.embeddings.word_embeddings
        tt.msg("using vanilla emb of size {}".format(embdim))
        emb = torch.nn.Embedding(emb.weight.size(0), embdim)
    elif gloveemb:
        emb = q.WordEmb.load_glove("glove.50d", selectD=tokD)
        embdim = 50  # keep the encoder input size in sync with the 50-d glove vectors
    else:
        bert = BertModel.from_pretrained("bert-base-uncased")
        emb = bert.embeddings.word_embeddings
        embdim = bert.config.hidden_size
    bilstm = q.rnn.LSTMEncoder(embdim,
                               *([dim] * numlayers),
                               bidir=True,
                               dropout_in=dropout)
    # bilstm = torch.nn.LSTM(embdim, dim, batch_first=True, num_layers=numlayers, bidirectional=True, dropout=dropout)
    m = RelationClassifier(emb=emb,
                           bilstm=bilstm,
                           dim=dim * 2,
                           relD=relD,
                           dropout=dropout)
    m.to(device)
    tt.tock("made model")
    # endregion

    # region training
    totalsteps = len(trainloader) * epochs
    params = m.parameters()
    sched = get_schedule(sched,
                         warmup=warmup,
                         t_total=totalsteps,
                         cycles=cycles)
    # optim = BertAdam(params, lr=lr, weight_decay=wreg, warmup=warmup, t_total=totalsteps, schedule=schedmap[sched])
    optim = BertAdam(params, lr=lr, weight_decay=wreg, schedule=sched)
    losses = [
        q.SmoothedCELoss(smoothing=smoothing,
                         weight=1 /
                         relcounts.clamp_min(1e-6) if classweighted else None),
        q.Accuracy()
    ]
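    # With classweighted=True, the CE loss weighs each relation inversely to its
    # training-set frequency (counts clamped at 1e-6 to avoid division by zero).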
    xlosses = [q.SmoothedCELoss(smoothing=smoothing), q.Accuracy()]
    trainlosses = [q.LossWrapper(l) for l in losses]
    devlosses = [q.LossWrapper(l) for l in xlosses]
    testlosses = [q.LossWrapper(l) for l in xlosses]
    trainloop = partial(q.train_epoch,
                        model=m,
                        dataloader=trainloader,
                        optim=optim,
                        losses=trainlosses,
                        device=device)
    devloop = partial(q.test_epoch,
                      model=m,
                      dataloader=devloader,
                      losses=devlosses,
                      device=device)
    testloop = partial(q.test_epoch,
                       model=m,
                       dataloader=testloader,
                       losses=testlosses,
                       device=device)

    tt.tick("training")
    q.run_training(trainloop, devloop, max_epochs=epochs)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    tt.tock("tested")

    if len(savep) > 0:
        tt.tick("making predictions and saving")
        i = 0
        while os.path.exists(savep + str(i)):
            i += 1
        os.mkdir(savep + str(i))
        savedir = savep + str(i)
        # save model
        # torch.save(m, open(os.path.join(savedir, "model.pt"), "wb"))
        # save settings
        json.dump(settings, open(os.path.join(savedir, "settings.json"), "w"))
        # save relation dictionary
        # json.dump(relD, open(os.path.join(savedir, "relD.json"), "w"))
        # save test predictions
        testpreds = q.eval_loop(m, evalloader, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "relpreds.test.npy"), testpreds)
        testpreds = q.eval_loop(m, evalloader_dev, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "relpreds.dev.npy"), testpreds)
        tt.msg("saved in {}".format(savedir))
        # save bert-tokenized questions
        # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # with open(os.path.join(savedir, "testquestions.txt"), "w") as f:
        #     for batch in evalloader:
        #         ques, io = batch
        #         ques = ques.numpy()
        #         for question in ques:
        #             qstr = " ".join([x for x in tokenizer.convert_ids_to_tokens(question) if x != "[PAD]"])
        #             f.write(qstr + "\n")

        tt.tock("done")
Example #7
def run_both(
    lr=DEFAULT_LR,
    dropout=.5,
    wreg=DEFAULT_WREG,
    initwreg=DEFAULT_INITWREG,
    batsize=DEFAULT_BATSIZE,
    evalbatsize=-1,
    epochs=10,
    smoothing=DEFAULT_SMOOTHING,
    cuda=False,
    gpu=0,
    balanced=False,
    maskmention=False,
    warmup=-1.,
    sched="ang",
    cycles=-1.,
    savep="exp_bert_both_",
    test=False,
    freezeemb=False,
    large=False,
    datafrac=1.,
    savemodel=False,
):
    settings = locals().copy()
    print(locals())
    tt = q.ticktock("script")
    if evalbatsize < 0:
        evalbatsize = batsize
    tt.msg("running borders and rel classifier with BERT")
    if test:
        epochs = 0
    if cuda:
        device = torch.device("cuda", gpu)
    else:
        device = torch.device("cpu")
    if cycles == -1:
        if sched == "cos":
            cycles = 0.5
        elif sched in ["cosrestart", "coshardrestart"]:
            cycles = 1.0

    # region data
    tt.tick("loading data")
    data = load_data(which="forboth", retrelD=True, datafrac=datafrac)
    trainds, devds, testds, relD = data
    tt.tock("data loaded")
    tt.msg("Train/Dev/Test sizes: {} {} {}".format(len(trainds), len(devds),
                                                   len(testds)))
    trainloader = DataLoader(trainds, batch_size=batsize, shuffle=True)
    devloader = DataLoader(devds, batch_size=evalbatsize, shuffle=False)
    testloader = DataLoader(testds, batch_size=evalbatsize, shuffle=False)
    evalds = TensorDataset(*testloader.dataset.tensors[:1])
    evalds_dev = TensorDataset(*devloader.dataset.tensors[:1])
    evalloader = DataLoader(evalds, batch_size=evalbatsize, shuffle=False)
    evalloader_dev = DataLoader(evalds_dev,
                                batch_size=evalbatsize,
                                shuffle=False)
    if test:
        evalloader = DataLoader(TensorDataset(*evalloader.dataset[:10]),
                                batch_size=batsize,
                                shuffle=False)
        testloader = DataLoader(TensorDataset(*testloader.dataset[:10]),
                                batch_size=batsize,
                                shuffle=False)
    print("number of relations: {}".format(len(relD)))
    # endregion

    # region model
    tt.tick("loading BERT")
    whichbert = "bert-base-uncased"
    if large:
        whichbert = "bert-large-uncased"
    bert = BertModel.from_pretrained(whichbert)
    m = BordersAndRelationClassifier(bert,
                                     relD,
                                     dropout=dropout,
                                     mask_entity_mention=maskmention)
    m.to(device)
    tt.tock("loaded BERT")
    # endregion

    # region training
    totalsteps = len(trainloader) * epochs
    assert (initwreg == 0.)
    initl2penalty = InitL2Penalty(bert, factor=q.hyperparam(initwreg))
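
    # parameter selection: optionally exclude BERT's word embedding matrix from
    # the optimizer when freezeemb is set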

    params = []
    for paramname, param in m.named_parameters():
        if paramname.startswith("bert.embeddings.word_embeddings"):
            if not freezeemb:
                params.append(param)
        else:
            params.append(param)
    sched = get_schedule(sched,
                         warmup=warmup,
                         t_total=totalsteps,
                         cycles=cycles)
    optim = BertAdam(params, lr=lr, weight_decay=wreg, schedule=sched)
    tmodel = BordersAndRelationLosses(m, cesmoothing=smoothing)
    # xmodel = BordersAndRelationLosses(m, cesmoothing=smoothing)
    # losses = [q.SmoothedCELoss(smoothing=smoothing), q.Accuracy()]
    # xlosses = [q.SmoothedCELoss(smoothing=smoothing), q.Accuracy()]
    tlosses = [q.SelectedLinearLoss(i) for i in range(7)]
    xlosses = [q.SelectedLinearLoss(i) for i in range(7)]
    trainlosses = [q.LossWrapper(l) for l in tlosses]
    devlosses = [q.LossWrapper(l) for l in xlosses]
    testlosses = [q.LossWrapper(l) for l in xlosses]
    trainloop = partial(q.train_epoch,
                        model=tmodel,
                        dataloader=trainloader,
                        optim=optim,
                        losses=trainlosses,
                        device=device)
    devloop = partial(q.test_epoch,
                      model=tmodel,
                      dataloader=devloader,
                      losses=devlosses,
                      device=device)
    testloop = partial(q.test_epoch,
                       model=tmodel,
                       dataloader=testloader,
                       losses=testlosses,
                       device=device)

    tt.tick("training")
    m.clip_len = True
    q.run_training(trainloop, devloop, max_epochs=epochs)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    settings["testres"] = testres
    tt.tock("tested")

    if len(savep) > 0:
        tt.tick("making predictions and saving")
        i = 0
        while os.path.exists(savep + str(i)):
            i += 1
        os.mkdir(savep + str(i))
        savedir = savep + str(i)
        print(savedir)
        # save model
        if savemodel:
            torch.save(m, open(os.path.join(savedir, "model.pt"), "wb"))
        # save settings
        json.dump(settings, open(os.path.join(savedir, "settings.json"), "w"))
        # save relation dictionary
        # json.dump(relD, open(os.path.join(savedir, "relD.json"), "w"))
        # save test predictions
        m.clip_len = False
        # TEST data
        testpreds = q.eval_loop(m, evalloader, device=device)
        borderpreds = testpreds[0].cpu().detach().numpy()
        relpreds = testpreds[1].cpu().detach().numpy()
        np.save(os.path.join(savedir, "borderpreds.test.npy"), borderpreds)
        np.save(os.path.join(savedir, "relpreds.test.npy"), relpreds)
        # DEV data
        testpreds = q.eval_loop(m, evalloader_dev, device=device)
        borderpreds = testpreds[0].cpu().detach().numpy()
        relpreds = testpreds[1].cpu().detach().numpy()
        np.save(os.path.join(savedir, "borderpreds.dev.npy"), borderpreds)
        np.save(os.path.join(savedir, "relpreds.dev.npy"), relpreds)
        # save bert-tokenized questions
        # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # with open(os.path.join(savedir, "testquestions.txt"), "w") as f:
        #     for batch in evalloader:
        #         ques, io = batch
        #         ques = ques.numpy()
        #         for question in ques:
        #             qstr = " ".join([x for x in tokenizer.convert_ids_to_tokens(question) if x != "[PAD]"])
        #             f.write(qstr + "\n")

        tt.tock("done")
Example #8
def run_relations(
    lr=DEFAULT_LR,
    dropout=.5,
    wreg=DEFAULT_WREG,
    initwreg=DEFAULT_INITWREG,
    batsize=DEFAULT_BATSIZE,
    epochs=10,
    smoothing=DEFAULT_SMOOTHING,
    cuda=False,
    gpu=0,
    balanced=False,
    maskentity=False,
    warmup=-1.,
    sched="ang",
    savep="exp_bert_rels_",
    test=False,
    freezeemb=False,
):
    settings = locals().copy()
    if test:
        epochs = 0
    print(locals())
    if cuda:
        device = torch.device("cuda", gpu)
    else:
        device = torch.device("cpu")
    # region data
    tt = q.ticktock("script")
    tt.msg("running relation classifier with BERT")
    tt.tick("loading data")
    data = load_data(which="rel+borders", retrelD=True)
    trainds, devds, testds, relD = data
    if maskentity:
        trainds, devds, testds = replace_entity_span(trainds, devds, testds)
    else:
        trainds, devds, testds = [
            TensorDataset(ds.tensors[0], ds.tensors[2])
            for ds in [trainds, devds, testds]
        ]
    tt.tock("data loaded")
    tt.msg("Train/Dev/Test sizes: {} {} {}".format(len(trainds), len(devds),
                                                   len(testds)))
    trainloader = DataLoader(trainds, batch_size=batsize, shuffle=True)
    devloader = DataLoader(devds, batch_size=batsize, shuffle=False)
    testloader = DataLoader(testds, batch_size=batsize, shuffle=False)
    evalds = TensorDataset(*testloader.dataset.tensors[:1])
    evalloader = DataLoader(evalds, batch_size=batsize, shuffle=False)
    evalds_dev = TensorDataset(*devloader.dataset.tensors[:1])
    evalloader_dev = DataLoader(evalds_dev, batch_size=batsize, shuffle=False)
    if test:
        evalloader = DataLoader(TensorDataset(*evalloader.dataset[:10]),
                                batch_size=batsize,
                                shuffle=False)
        testloader = DataLoader(TensorDataset(*testloader.dataset[:10]),
                                batch_size=batsize,
                                shuffle=False)
    # endregion

    # region model
    tt.tick("loading BERT")
    bert = BertModel.from_pretrained("bert-base-uncased")
    m = RelationClassifier(bert, relD, dropout=dropout)
    m.to(device)
    tt.tock("loaded BERT")
    # endregion

    # region training
    totalsteps = len(trainloader) * epochs

    params = []
    for paramname, param in m.named_parameters():
        if paramname.startswith("bert.embeddings.word_embeddings"):
            if not freezeemb:
                params.append(param)
        else:
            params.append(param)
    optim = BertAdam(params,
                     lr=lr,
                     weight_decay=wreg,
                     warmup=warmup,
                     t_total=totalsteps,
                     schedule=schedmap[sched],
                     init_weight_decay=initwreg)
    losses = [q.SmoothedCELoss(smoothing=smoothing), q.Accuracy()]
    xlosses = [q.SmoothedCELoss(smoothing=smoothing), q.Accuracy()]
    trainlosses = [q.LossWrapper(l) for l in losses]
    devlosses = [q.LossWrapper(l) for l in xlosses]
    testlosses = [q.LossWrapper(l) for l in xlosses]
    trainloop = partial(q.train_epoch,
                        model=m,
                        dataloader=trainloader,
                        optim=optim,
                        losses=trainlosses,
                        device=device)
    devloop = partial(q.test_epoch,
                      model=m,
                      dataloader=devloader,
                      losses=devlosses,
                      device=device)
    testloop = partial(q.test_epoch,
                       model=m,
                       dataloader=testloader,
                       losses=testlosses,
                       device=device)

    tt.tick("training")
    q.run_training(trainloop, devloop, max_epochs=epochs)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    tt.tock("tested")

    if len(savep) > 0:
        tt.tick("making predictions and saving")
        i = 0
        while os.path.exists(savep + str(i)):
            i += 1
        os.mkdir(savep + str(i))
        savedir = savep + str(i)
        # save model
        # torch.save(m, open(os.path.join(savedir, "model.pt"), "wb"))
        # save settings
        json.dump(settings, open(os.path.join(savedir, "settings.json"), "w"))
        # save relation dictionary
        # json.dump(relD, open(os.path.join(savedir, "relD.json"), "w"))
        # save test predictions
        testpreds = q.eval_loop(m, evalloader, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "relpreds.test.npy"), testpreds)
        testpreds = q.eval_loop(m, evalloader_dev, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "relpreds.dev.npy"), testpreds)
        # save bert-tokenized questions
        # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # with open(os.path.join(savedir, "testquestions.txt"), "w") as f:
        #     for batch in evalloader:
        #         ques, io = batch
        #         ques = ques.numpy()
        #         for question in ques:
        #             qstr = " ".join([x for x in tokenizer.convert_ids_to_tokens(question) if x != "[PAD]"])
        #             f.write(qstr + "\n")

        tt.tock("done")
Example #9
def run_span_borders(
    lr=DEFAULT_LR,
    dropout=.5,
    wreg=DEFAULT_WREG,
    initwreg=DEFAULT_INITWREG,
    batsize=DEFAULT_BATSIZE,
    epochs=DEFAULT_EPOCHS,
    smoothing=DEFAULT_SMOOTHING,
    cuda=False,
    gpu=0,
    balanced=False,
    warmup=-1.,
    sched="ang",
    savep="exp_bert_span_borders_",
    freezeemb=False,
):
    settings = locals().copy()
    print(locals())
    if cuda:
        device = torch.device("cuda", gpu)
    else:
        device = torch.device("cpu")
    # region data
    tt = q.ticktock("script")
    tt.msg("running span border with BERT")
    tt.tick("loading data")
    data = load_data(which="span/borders")
    trainds, devds, testds = data
    tt.tock("data loaded")
    tt.msg("Train/Dev/Test sizes: {} {} {}".format(len(trainds), len(devds),
                                                   len(testds)))
    trainloader = DataLoader(trainds, batch_size=batsize, shuffle=True)
    devloader = DataLoader(devds, batch_size=batsize, shuffle=False)
    testloader = DataLoader(testds, batch_size=batsize, shuffle=False)
    evalds = TensorDataset(*testloader.dataset.tensors[:-1])
    evalloader = DataLoader(evalds, batch_size=batsize, shuffle=False)
    evalds_dev = TensorDataset(*devloader.dataset.tensors[:-1])
    evalloader_dev = DataLoader(evalds_dev, batch_size=batsize, shuffle=False)
    # endregion

    # region model
    tt.tick("loading BERT")
    bert = BertModel.from_pretrained("bert-base-uncased")
    spandet = BorderSpanDetector(bert, dropout=dropout)
    spandet.to(device)
    tt.tock("loaded BERT")
    # endregion

    # region training
    totalsteps = len(trainloader) * epochs
    params = []
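    # collect optimizer parameters; skip BERT's word-embedding matrix when freezeemb is set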
    for paramname, param in spandet.named_parameters():
        if paramname.startswith("bert.embeddings.word_embeddings"):
            if not freezeemb:
                params.append(param)
        else:
            params.append(param)
    optim = BertAdam(params,
                     lr=lr,
                     weight_decay=wreg,
                     warmup=warmup,
                     t_total=totalsteps,
                     schedule=schedmap[sched])
    losses = [
        q.SmoothedCELoss(smoothing=smoothing),
        SpanF1Borders(reduction="none"),
        q.SeqAccuracy()
    ]
    xlosses = [
        q.SmoothedCELoss(smoothing=smoothing),
        SpanF1Borders(reduction="none"),
        q.SeqAccuracy()
    ]
    trainlosses = [q.LossWrapper(l) for l in losses]
    devlosses = [q.LossWrapper(l) for l in xlosses]
    testlosses = [q.LossWrapper(l) for l in xlosses]
    trainloop = partial(q.train_epoch,
                        model=spandet,
                        dataloader=trainloader,
                        optim=optim,
                        losses=trainlosses,
                        device=device)
    devloop = partial(q.test_epoch,
                      model=spandet,
                      dataloader=devloader,
                      losses=devlosses,
                      device=device)
    testloop = partial(q.test_epoch,
                       model=spandet,
                       dataloader=testloader,
                       losses=testlosses,
                       device=device)

    tt.tick("training")
    q.run_training(trainloop, devloop, max_epochs=epochs)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    tt.tock("tested")

    if len(savep) > 0:
        tt.tick("making predictions and saving")
        i = 0
        while os.path.exists(savep + str(i)):
            i += 1
        savedir = savep + str(i)
        os.mkdir(savedir)
        # save model
        # torch.save(spandet, open(os.path.join(savedir, "model.pt"), "wb"))
        # save settings
        with open(os.path.join(savedir, "settings.json"), "w") as f:
            json.dump(settings, f)
        # save test predictions
        testpreds = q.eval_loop(spandet, evalloader, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "borderpreds.test.npy"), testpreds)
        # save dev predictions
        testpreds = q.eval_loop(spandet, evalloader_dev, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "borderpreds.dev.npy"), testpreds)
        tt.tock("done")
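
Note how the eval loaders in this example are built: the last tensor in each TensorDataset holds the gold labels, so dropping it gives an input-only dataset that q.eval_loop can run pure forward passes over. A minimal sketch of the same idea with plain torch and toy data:

import torch
from torch.utils.data import TensorDataset, DataLoader

inputs = torch.randint(0, 100, (8, 5))  # toy token ids
labels = torch.randint(0, 3, (8,))      # toy gold labels
ds = TensorDataset(inputs, labels)

# drop the last tensor (the labels) to get an input-only eval dataset
evalds = TensorDataset(*ds.tensors[:-1])
evalloader = DataLoader(evalds, batch_size=4, shuffle=False)
for (batch_inputs,) in evalloader:
    print(batch_inputs.shape)  # torch.Size([4, 5])
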
Example #10
def run_relations(
    lr=DEFAULT_LR,
    dropout=.3,
    wreg=DEFAULT_WREG,
    initwreg=DEFAULT_INITWREG,
    batsize=DEFAULT_BATSIZE,
    epochs=10,
    smoothing=DEFAULT_SMOOTHING,
    cuda=False,
    gpu=0,
    balanced=False,
    maskentity=False,
    savep="exp_bilstm_rels_",
    test=False,
    datafrac=1.,
    glove=False,
    embdim=50,
    dim=300,
    numlayers=2,
    warmup=0.0,
    cycles=0.5,
    sched="cos",
    evalbatsize=-1,
    classweighted=False,
    fixembed=False,
):
    print(locals())
    settings = locals().copy()
    if evalbatsize < 0:
        evalbatsize = batsize
    if test:
        epochs = 0
    if cuda:
        device = torch.device("cuda", gpu)
    else:
        device = torch.device("cpu")
    # region data
    tt = q.ticktock("script")
    tt.msg("running relation classifier with BiLSTM")
    tt.tick("loading data")
    data = load_data(which="wordmat,wordborders,rels",
                     datafrac=datafrac,
                     retrelD=True)
    trainds, devds, testds, wD, relD = data
    rev_wD = {v: k for k, v in wD.items()}

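    # map a padded id sequence back to a readable string (id 0 is padding)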
    def pp(ids):
        ret = " ".join(
            [rev_wD[idse.item()] for idse in ids if idse.item() != 0])
        return ret

    print(pp(trainds.tensors[0][0]))
    print(trainds.tensors[1][0])
    if maskentity:
        trainds, devds, testds = replace_entity_span(trainds,
                                                     devds,
                                                     testds,
                                                     D=wD)
    else:
        trainds, devds, testds = [
            TensorDataset(ds.tensors[0], ds.tensors[2])
            for ds in [trainds, devds, testds]
        ]

    # print a few examples from each split as a sanity check
    for j, split in enumerate([trainds, devds, testds]):
        if j > 0:
            print()
        for i in range(10):
            print(pp(split.tensors[0][i]))

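    # count training-set occurrences of each relation (used for optional class weighting)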
    relcounts = torch.zeros(max(relD.values()) + 1)
    trainrelcounts = torch.tensor(
        np.bincount(trainds.tensors[1].detach().cpu().numpy()))
    relcounts[:len(trainrelcounts)] += trainrelcounts.float()
    tt.tock("data loaded")
    tt.msg("Train/Dev/Test sizes: {} {} {}".format(len(trainds), len(devds),
                                                   len(testds)))
    trainloader = DataLoader(trainds, batch_size=batsize, shuffle=True)
    devloader = DataLoader(devds, batch_size=evalbatsize, shuffle=False)
    testloader = DataLoader(testds, batch_size=evalbatsize, shuffle=False)
    evalds = TensorDataset(*testloader.dataset.tensors[:1])
    evalloader = DataLoader(evalds, batch_size=evalbatsize, shuffle=False)
    evalds_dev = TensorDataset(*devloader.dataset.tensors[:1])
    evalloader_dev = DataLoader(evalds_dev,
                                batch_size=evalbatsize,
                                shuffle=False)

    if test:
        evalloader = DataLoader(TensorDataset(*evalloader.dataset[:10]),
                                batch_size=batsize,
                                shuffle=False)
        testloader = DataLoader(TensorDataset(*testloader.dataset[:10]),
                                batch_size=batsize,
                                shuffle=False)
    # endregion

    # region model
    tt.tick("making model")
    emb = q.WordEmb(embdim, worddic=wD)
    if glove:
        print("using glove")
        # bind the loaded width to its own name so it does not shadow the dim hyperparameter
        stoi_, vectors_, glovedim = torch.load(
            "../../data/buboqa/data/sq_glove300d.pt")
        assert glovedim == embdim, "pretrained GloVe width must match embdim"
        # map vectors from custom glove ids to wD ids
        vectors = torch.zeros(max(wD.values()) + 1,
                              embdim,
                              device=vectors_.device,
                              dtype=vectors_.dtype)
        stoi = {}
        for k, v in stoi_.items():
            if k in wD:
                vectors[wD[k]] = vectors_[v]
                stoi[k] = wD[k]
        print("{} words in stoi that are in wD".format(len(stoi)))
        gloveemb = q.WordEmb(embdim, worddic=stoi, _weight=vectors)
        # gloveemb = q.WordEmb.load_glove("glove.{}d".format(embdim), selectD=wD)
        if fixembed:
            gloveemb.freeze()
            emb.freeze()
        emb = q.SwitchedWordEmb(emb).override(gloveemb)

    bilstm = q.rnn.LSTMEncoder(embdim,
                               *([dim] * numlayers),
                               bidir=True,
                               dropout_in=dropout)
    # bilstm = torch.nn.LSTM(embdim, dim, batch_first=True, num_layers=numlayers, bidirectional=True, dropout=dropout)
    m = RelationClassifier(emb=emb,
                           bilstm=bilstm,
                           dim=dim,
                           relD=relD,
                           dropout=dropout)
    m.to(device)

    # model = RelationPrediction(config)
    tt.tock("made model")
    # endregion

    # region training
    totalsteps = len(trainloader) * epochs
    params = [param for param in m.parameters() if param.requires_grad]
    sched = get_schedule(sched,
                         warmup=warmup,
                         t_total=totalsteps,
                         cycles=cycles)
    optim = BertAdam(params,
                     lr=lr,
                     weight_decay=wreg,
                     warmup=warmup,
                     t_total=totalsteps,
                     schedule=sched)
    # optim = torch.optim.Adam(params, lr=lr, weight_decay=wreg)
    # losses = [
    #     torch.nn.CrossEntropyLoss(size_average=True),
    #     q.Accuracy()
    # ]
    losses = [
        q.SmoothedCELoss(
            smoothing=smoothing,
            weight=(1 / relcounts.clamp_min(1e-6)) if classweighted else None),
        q.Accuracy()
    ]
    # xlosses = [
    #     torch.nn.CrossEntropyLoss(size_average=True),
    #     q.Accuracy()
    # ]
    xlosses = [q.SmoothedCELoss(smoothing=smoothing), q.Accuracy()]
    trainlosses = [q.LossWrapper(l) for l in losses]
    devlosses = [q.LossWrapper(l) for l in xlosses]
    testlosses = [q.LossWrapper(l) for l in xlosses]
    trainloop = partial(q.train_epoch,
                        model=m,
                        dataloader=trainloader,
                        optim=optim,
                        losses=trainlosses,
                        device=device)
    devloop = partial(q.test_epoch,
                      model=m,
                      dataloader=devloader,
                      losses=devlosses,
                      device=device)
    testloop = partial(q.test_epoch,
                       model=m,
                       dataloader=testloader,
                       losses=testlosses,
                       device=device)

    tt.tick("training")
    q.run_training(trainloop, devloop, max_epochs=epochs)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    tt.tock("tested")

    if len(savep) > 0:
        tt.tick("making predictions and saving")
        i = 0
        while os.path.exists(savep + str(i)):
            i += 1
        savedir = savep + str(i)
        os.mkdir(savedir)
        # save model
        # torch.save(m, open(os.path.join(savedir, "model.pt"), "wb"))
        # save settings
        with open(os.path.join(savedir, "settings.json"), "w") as f:
            json.dump(settings, f)
        # save relation dictionary
        # json.dump(relD, open(os.path.join(savedir, "relD.json"), "w"))
        # save test predictions
        testpreds = q.eval_loop(m, evalloader, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "relpreds.test.npy"), testpreds)
        testpreds = q.eval_loop(m, evalloader_dev, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "relpreds.dev.npy"), testpreds)
        tt.msg("saved in {}".format(savedir))
        # save bert-tokenized questions
        # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # with open(os.path.join(savedir, "testquestions.txt"), "w") as f:
        #     for batch in evalloader:
        #         ques, io = batch
        #         ques = ques.numpy()
        #         for question in ques:
        #             qstr = " ".join([x for x in tokenizer.convert_ids_to_tokens(question) if x != "[PAD]"])
        #             f.write(qstr + "\n")

        tt.tock("done")
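
The classweighted option in this example turns training-set relation frequencies into inverse-frequency loss weights, with clamp_min keeping relations that never occur in training from producing a division by zero. A minimal sketch of that computation on a toy label tensor:

import numpy as np
import torch

labels = torch.tensor([0, 0, 1, 2, 2, 2])  # toy training labels
numclasses = 4                             # class 3 never occurs

counts = torch.zeros(numclasses)
binc = torch.tensor(np.bincount(labels.numpy()))
counts[:len(binc)] += binc.float()
weights = 1 / counts.clamp_min(1e-6)  # rarer classes get larger weights
print(weights)  # tensor([5.0000e-01, 1.0000e+00, 3.3333e-01, 1.0000e+06])
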
Example #11
def run_span_borders(
    lr=DEFAULT_LR,
    dropout=.3,
    wreg=DEFAULT_WREG,
    initwreg=DEFAULT_INITWREG,
    batsize=DEFAULT_BATSIZE,
    evalbatsize=-1,
    epochs=DEFAULT_EPOCHS,
    smoothing=DEFAULT_SMOOTHING,
    dim=200,
    numlayers=1,
    cuda=False,
    gpu=0,
    savep="exp_bilstm_span_borders_",
    datafrac=1.,
    glove=False,
    fixembed=False,
    embdim=50,
    sched="cos",
    warmup=0.1,
    cycles=0.5,
):
    settings = locals().copy()
    print(locals())
    if evalbatsize < 0:
        evalbatsize = batsize
    if cuda:
        device = torch.device("cuda", gpu)
    else:
        device = torch.device("cpu")
    # region data
    tt = q.ticktock("script")
    tt.msg("running span border with BiLSTM")
    tt.tick("loading data")
    data = load_data(which="wordmat,wordborders", datafrac=datafrac)
    trainds, devds, testds, wD = data
    tt.tock("data loaded")
    tt.msg("Train/Dev/Test sizes: {} {} {}".format(len(trainds), len(devds),
                                                   len(testds)))
    trainloader = DataLoader(trainds, batch_size=batsize, shuffle=True)
    devloader = DataLoader(devds, batch_size=evalbatsize, shuffle=False)
    testloader = DataLoader(testds, batch_size=evalbatsize, shuffle=False)
    evalds = TensorDataset(*testloader.dataset.tensors[:1])
    evalloader = DataLoader(evalds, batch_size=evalbatsize, shuffle=False)
    evalds_dev = TensorDataset(*devloader.dataset.tensors[:1])
    evalloader_dev = DataLoader(evalds_dev,
                                batch_size=evalbatsize,
                                shuffle=False)
    # endregion

    # region model
    tt.tick("creating model")
    emb = q.WordEmb(embdim, worddic=wD)
    if glove:
        print("using glove")
        # bind the loaded width to its own name so it does not shadow the dim hyperparameter
        stoi_, vectors_, glovedim = torch.load(
            "../../data/buboqa/data/sq_glove300d.pt")
        assert glovedim == embdim, "pretrained GloVe width must match embdim"
        # map vectors from custom glove ids to wD ids
        vectors = torch.zeros(max(wD.values()) + 1,
                              embdim,
                              device=vectors_.device,
                              dtype=vectors_.dtype)
        stoi = {}
        for k, v in stoi_.items():
            if k in wD:
                vectors[wD[k]] = vectors_[v]
                stoi[k] = wD[k]
        print("{} words in stoi that are in wD".format(len(stoi)))
        gloveemb = q.WordEmb(embdim, worddic=stoi, _weight=vectors)
        # gloveemb = q.WordEmb.load_glove("glove.{}d".format(embdim), selectD=wD)
        if fixembed:
            gloveemb.freeze()
        emb = q.SwitchedWordEmb(emb).override(gloveemb)
    # inpD = tokenizer.vocab
    # q.WordEmb.masktoken = "[PAD]"
    # emb = q.WordEmb(embdim, worddic=inpD)
    bilstm = q.rnn.LSTMEncoder(embdim,
                               *([dim] * numlayers),
                               bidir=True,
                               dropout_in_shared=dropout)
    spandet = BorderSpanDetector(emb, bilstm, dim * 2, dropout=dropout)
    spandet.to(device)
    tt.tock("model created")
    # endregion

    # region training
    totalsteps = len(trainloader) * epochs
    params = spandet.parameters()
    sched = get_schedule(sched,
                         warmup=warmup,
                         t_total=totalsteps,
                         cycles=cycles)
    optim = BertAdam(params, lr=lr, weight_decay=wreg, schedule=sched)
    # optim = torch.optim.Adam(spandet.parameters(), lr=lr, weight_decay=wreg)
    losses = [
        q.SmoothedCELoss(smoothing=smoothing),
        SpanF1Borders(),
        q.SeqAccuracy()
    ]
    xlosses = [
        q.SmoothedCELoss(smoothing=smoothing),
        SpanF1Borders(),
        q.SeqAccuracy()
    ]
    trainlosses = [q.LossWrapper(l) for l in losses]
    devlosses = [q.LossWrapper(l) for l in xlosses]
    testlosses = [q.LossWrapper(l) for l in xlosses]
    trainloop = partial(q.train_epoch,
                        model=spandet,
                        dataloader=trainloader,
                        optim=optim,
                        losses=trainlosses,
                        device=device)
    devloop = partial(q.test_epoch,
                      model=spandet,
                      dataloader=devloader,
                      losses=devlosses,
                      device=device)
    testloop = partial(q.test_epoch,
                       model=spandet,
                       dataloader=testloader,
                       losses=testlosses,
                       device=device)

    tt.tick("training")
    q.run_training(trainloop, devloop, max_epochs=epochs)
    tt.tock("done training")

    tt.tick("testing")
    testres = testloop()
    print(testres)
    tt.tock("tested")

    if len(savep) > 0:
        tt.tick("making predictions and saving")
        i = 0
        while os.path.exists(savep + str(i)):
            i += 1
        savedir = savep + str(i)
        os.mkdir(savedir)
        # save model
        # torch.save(spandet, open(os.path.join(savedir, "model.pt"), "wb"))
        # save settings
        with open(os.path.join(savedir, "settings.json"), "w") as f:
            json.dump(settings, f)

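        # record the padded input length on the detector, presumably so saved border predictions are sized consistently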
        outlen = trainloader.dataset.tensors[0].size(1)
        spandet.outlen = outlen

        # save test predictions
        testpreds = q.eval_loop(spandet, evalloader, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "borderpreds.test.npy"), testpreds)
        # save dev predictions
        testpreds = q.eval_loop(spandet, evalloader_dev, device=device)
        testpreds = testpreds[0].cpu().detach().numpy()
        np.save(os.path.join(savedir, "borderpreds.dev.npy"), testpreds)
        tt.msg("saved in {}".format(savedir))
        tt.tock("done")
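
Both BiLSTM examples remap the pretrained GloVe matrix from its own string-to-id table onto the script's vocabulary before overriding the trainable embedding with it. A self-contained sketch of that remapping step (function name and toy data are ours, not from the examples):

import torch

def remap_embeddings(stoi_src, vectors_src, target_dic):
    # copy each pretrained vector into the row its word occupies in target_dic
    dim = vectors_src.size(1)
    vectors = torch.zeros(max(target_dic.values()) + 1, dim,
                          dtype=vectors_src.dtype)
    stoi = {}
    for word, src_id in stoi_src.items():
        if word in target_dic:
            vectors[target_dic[word]] = vectors_src[src_id]
            stoi[word] = target_dic[word]
    return stoi, vectors

# toy usage
stoi_src = {"cat": 0, "dog": 1}
vectors_src = torch.randn(2, 4)
target_dic = {"<pad>": 0, "dog": 1, "fish": 2}
stoi, vectors = remap_embeddings(stoi_src, vectors_src, target_dic)
print(stoi)           # {'dog': 1}
print(vectors.shape)  # torch.Size([3, 4])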