def load_data(p="../../../datasets/semparse/", which=None, devfrac=0.1, devfracrandom=False):
    tt = q.ticktock("dataloader")
    tt.tick("loading data")
    assert(which is not None)
    which = {"geo": "geoquery", "atis": "atis", "jobs": "jobs"}[which]
    trainp = os.path.join(p, which, "train.txt")
    testp = os.path.join(p, which, "test.txt")
    devp = os.path.join(p, which, "dev.txt")

    trainlines = open(trainp).readlines()
    testlines = open(testp).readlines()

    if not os.path.exists(devp):
        tt.msg("no dev file, taking {} from training data".format(devfrac))
        splitidx = round(len(trainlines) * devfrac)
        devlines = trainlines[-splitidx:]       # take the dev slice before truncating the training lines
        trainlines = trainlines[:-splitidx]
    else:
        devlines = open(devp).readlines()

    tt.msg("{} examples in training set".format(len(trainlines)))
    tt.msg("{} examples in dev set".format(len(devlines)))
    tt.msg("{} examples in test set".format(len(testlines)))

    nlsm = q.StringMatrix(freqcutoff=1)
    nlsm.tokenize = lambda x: x.strip().split()
    qlsm = q.StringMatrix(indicate_start_end=True, freqcutoff=1)
    qlsm.tokenize = lambda x: x.strip().split()

    i = 0
    for line in trainlines:
        nl, ql = line.split("\t")
        nlsm.add(nl)
        qlsm.add(ql)
        i += 1

    nlsm.unseen_mode = True
    qlsm.unseen_mode = True

    devstart = i

    for line in devlines:
        nl, ql = line.split("\t")
        nlsm.add(nl)
        qlsm.add(ql)
        i += 1

    teststart = i

    for line in testlines:
        nl, ql = line.split("\t")
        nlsm.add(nl)
        qlsm.add(ql)

    nlsm.finalize()
    qlsm.finalize()
    tt.tock("data loaded")

    return nlsm, qlsm, (devstart, teststart)
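A minimal usage sketch (hypothetical, assuming the qelos `q` package and the dataset layout above): after finalize(), the token ids live in .matrix and the vocabulary in .D, and the returned offsets split the rows into train/dev/test.

# hypothetical example, not part of the original loader
nlsm, qlsm, (devstart, teststart) = load_data(which="geo")
train_nl, dev_nl, test_nl = nlsm.matrix[:devstart], nlsm.matrix[devstart:teststart], nlsm.matrix[teststart:]
train_ql, dev_ql, test_ql = qlsm.matrix[:devstart], qlsm.matrix[devstart:teststart], qlsm.matrix[teststart:]
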
def run(p1=DATA_PATH+"valid_dialogues.json",        # both default to the valid file here;
        p2=DATA_PATH+"valid_dialogues.json",        # point p1 at the train json and p2 at the valid json so the string ids are shared
        maxwords=int(1e9), rarefreq=0):
    """
    Saves in DATA_PATH, see code for exact paths
    :param p1:          path to train json
    :param p2:          path to valid json
    :param maxwords:    maximum number of words in vocab
    :param rarefreq:    word frequency for rare words
    :return:            out_struct1 and out_struct2 (nested structures) and the finalized StringMatrix sm
    """
    sm = q.StringMatrix(topnwords=maxwords, freqcutoff=rarefreq)
    sm.tokenize = lambda x: x.split()
    out_struct1, sm, us = load_datafile(p1, sm)
    sm.unseen_mode = True
    out_struct2, sm, us2 = load_datafile(p2, sm, uniquestrings=us)
    sm.finalize()
    ## !!! dictionary is in sm.D, numpy array is in sm.matrix
    assert(us == us2)
    print("done: {} unique strings \n\n".format(len(us)))
    json.dump(out_struct1, open(DATA_PATH + "train_dialogues.struct.json", "w"))
    json.dump(out_struct2, open(DATA_PATH + "valid_dialogues.struct.json", "w"))
    json.dump(sm.D, open(DATA_PATH+"dialogues.strings.dict", "w"))
    np.save(DATA_PATH+"dialogues.strings.mat", sm.matrix)
    print("saved")
    return out_struct1, out_struct2, sm
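To load the saved files back, something like the following sketch should work (hypothetical; note that np.save appends a ".npy" suffix to the matrix path):

# hypothetical reload sketch, using the paths written by run() above
strings_dict = json.load(open(DATA_PATH + "dialogues.strings.dict"))
strings_mat = np.load(DATA_PATH + "dialogues.strings.mat.npy")
train_struct = json.load(open(DATA_PATH + "train_dialogues.struct.json"))
valid_struct = json.load(open(DATA_PATH + "valid_dialogues.struct.json"))
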
def run_toy(lr=0.001,
            seqlen=8,
            batsize=10,
            epochs=1000,
            embdim=32,
            innerdim=64,
            z_dim=32,
            noaccumulate=False,
            usebase=False,
            ):
    # generate some toy data
    N = 1000
    data, vocab = gen_toy_data(N, seqlen=seqlen, mode="copymiddlefixed")
    datasm = q.StringMatrix()
    datasm.set_dictionary(vocab)
    datasm.tokenize = lambda x: list(x)
    for data_e in data:
        datasm.add(data_e)
    datasm.finalize()

    real_data = q.dataset(datasm.matrix)
    gen_data_d = q.gan.gauss_dataset(z_dim, len(real_data))
    disc_data = q.datacat([real_data, gen_data_d], 1)

    gen_data = q.gan.gauss_dataset(z_dim)

    disc_data = q.dataload(disc_data, batch_size=batsize, shuffle=True)
    gen_data = q.dataload(gen_data, batch_size=batsize, shuffle=True)

    discriminator = Discriminator(datasm.D, embdim, innerdim)
    generator = Decoder(datasm.D, embdim, z_dim, "<START>", innerdim, maxtime=seqlen)

    SeqGAN = SeqGAN_Base if usebase else SeqGAN_DCL

    disc_model = SeqGAN(discriminator, generator, gan_mode=q.gan.GAN.DISC_TRAIN, accumulate=not noaccumulate)
    gen_model = SeqGAN(discriminator, generator, gan_mode=q.gan.GAN.GEN_TRAIN, accumulate=not noaccumulate)

    disc_optim = torch.optim.Adam(q.params_of(discriminator), lr=lr)
    gen_optim = torch.optim.Adam(q.params_of(generator), lr=lr)

    disc_trainer = q.trainer(disc_model).on(disc_data).optimizer(disc_optim).loss(q.no_losses(2))
    gen_trainer = q.trainer(gen_model).on(gen_data).optimizer(gen_optim).loss(q.no_losses(2))

    gan_trainer = q.gan.GANTrainer(disc_trainer, gen_trainer)

    gan_trainer.run(epochs, disciters=5, geniters=1, burnin=500)

    # print some predictions:
    with torch.no_grad():
        rvocab = {v: k for k, v in vocab.items()}
        q.batch_reset(generator)
        eval_z = torch.randn(50, z_dim)
        eval_y, _ = generator(eval_z)
        for i in range(len(eval_y)):
            prow = "".join([rvocab[mij] for mij in eval_y[i].numpy()])
            print(prow)

    print("done")
def run_cond_toy(lr=0.001,
                 seqlen=8,
                 batsize=10,
                 epochs=1000,
                 embdim=5,
                 innerdim=32,
                 z_dim=5,
                 usebase=False,
                 nrexamples=1000):
    data, vocab = gen_toy_data(nrexamples, seqlen=seqlen, mode="twointerleaveboth")
    datasm = q.StringMatrix()
    datasm.set_dictionary(vocab)
    datasm.tokenize = lambda x: list(x)
    for data_e in data:
        datasm.add(data_e)
    datasm.finalize()

    real_data = q.dataset(datasm.matrix)
    shuffled_datasm_matrix = datasm.matrix + 0      # "+ 0" forces a copy, leaving datasm.matrix untouched
    np.random.shuffle(shuffled_datasm_matrix)       # shuffle rows in place to get mismatched "fake" sequences
    fake_data = q.dataset(shuffled_datasm_matrix)
    disc_data = q.datacat([real_data, fake_data], 1)

    gen_data = q.dataset(datasm.matrix)

    disc_data = q.dataload(disc_data, batch_size=batsize, shuffle=True)
    gen_data = q.dataload(gen_data, batch_size=batsize, shuffle=True)

    discr = Discriminator(datasm.D, embdim, innerdim)
    decoder = Decoder_Cond(datasm.D, embdim, z_dim, "<START>", innerdim)

    disc_model = SeqGAN_Cond(discr, decoder, gan_mode=q.gan.GAN.DISC_TRAIN)
    gen_model = SeqGAN_Cond(discr, decoder, gan_mode=q.gan.GAN.GEN_TRAIN)

    disc_optim = torch.optim.Adam(q.params_of(discr), lr=lr)
    gen_optim = torch.optim.Adam(q.params_of(decoder), lr=lr)

    disc_trainer = q.trainer(disc_model).on(disc_data).optimizer(disc_optim).loss(q.no_losses(2))
    gen_trainer = q.trainer(gen_model).on(gen_data).optimizer(gen_optim).loss(q.no_losses(2))

    gan_trainer = q.gan.GANTrainer(disc_trainer, gen_trainer)

    gan_trainer.run(epochs, disciters=5, geniters=1, burnin=500)

    with torch.no_grad():
        rvocab = {v: k for k, v in vocab.items()}
        q.batch_reset(decoder)
        eval_z = torch.tensor(datasm.matrix[:50])
        eval_y, _, _, _ = decoder(eval_z)
        for i in range(len(eval_y)):
            prow = "".join([rvocab[mij] for mij in eval_y[i].numpy()])
            print(prow)

    print("done")
Example #5
def load_data(p="../../datasets/simplequestions/"):
    tt = q.ticktock("dataloader")
    tt.tick("loading")
    questions, subjects, subject_names, relations, spans, (start_valid, start_test) \
        = load_questions(p)
    generate_candidates(p)
    tt.tock("{} questions loaded".format(len(questions)))

    tt.tick("generating matrices")
    qsm = q.StringMatrix(freqcutoff=2)
    qsm.tokenize = lambda x: x.split()
    for question in tqdm.tqdm(questions[:start_valid]):
        qsm.add(question)
    qsm.unseen_mode = True
    for question in tqdm.tqdm(questions[start_valid:]):
        qsm.add(question)
    tt.msg("finalizing")
    qsm.finalize()
    print(qsm[0])
    q.embed()   # debug leftover: presumably drops into an interactive shell for inspection
    tt.tock("matrices generated")
def run_classify(lr=0.001,
                 seqlen=6,
                 numex=500,
                 epochs=25,
                 batsize=10,
                 test=True,
                 cuda=False,
                 gpu=0):
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)
    # region construct data
    colors = "red blue green magenta cyan orange yellow grey salmon pink purple teal".split(
    )
    D = dict(zip(colors, range(len(colors))))
    inpseqs = []
    targets = []
    for i in range(numex):
        inpseq = list(np.random.choice(colors, seqlen, replace=False))
        target = np.random.choice(range(len(inpseq)), 1)[0]
        target_class = D[inpseq[target]]
        inpseq[target] = "${}$".format(inpseq[target])
        inpseqs.append("".join(inpseq))
        targets.append(target_class)

    sm = q.StringMatrix()
    sm.tokenize = lambda x: list(x)

    for inpseq in inpseqs:
        sm.add(inpseq)

    sm.finalize()
    print(sm[0])
    print(sm.D)
    targets = np.asarray(targets)

    data = q.dataload(sm.matrix[:-100], targets[:-100], batch_size=batsize)
    valid_data = q.dataload(sm.matrix[-100:],
                            targets[-100:],
                            batch_size=batsize)
    # endregion

    # region model
    embdim = 20
    enc2inpdim = 45
    encdim = 20
    outdim = 20
    emb = q.WordEmb(embdim, worddic=sm.D)  # sm dictionary (characters)
    out = q.WordLinout(outdim, worddic=D)  # target dictionary
    # encoders:
    enc1 = q.RNNEncoder(embdim, encdim, bidir=True)
    enc2 = q.RNNCellEncoder(enc2inpdim, outdim // 2, bidir=True)

    # model
    class Model(torch.nn.Module):
        def __init__(self, dim, _emb, _out, _enc1, _enc2, **kw):
            super(Model, self).__init__(**kw)
            self.dim, self.emb, self.out, self.enc1, self.enc2 = dim, _emb, _out, _enc1, _enc2
            self.score = torch.nn.Sequential(
                torch.nn.Linear(dim, 1, bias=False), torch.nn.Sigmoid())
            self.emb_expander = ExpandVecs(embdim, enc2inpdim, 2)
            self.enc_expander = ExpandVecs(encdim * 2, enc2inpdim, 2)

        def forward(self, x, with_att=False):
            # embed and encode
            xemb, xmask = self.emb(x)
            xenc = self.enc1(xemb, mask=xmask)
            # compute attention
            xatt = self.score(xenc).squeeze(2) * xmask.float()[:, :xenc.size(1)]
            # encode again
            _xemb = self.emb_expander(xemb[:, :xenc.size(1)])
            _xenc = self.enc_expander(xenc)
            _, xenc2 = self.enc2(_xemb,
                                 gate=xatt,
                                 mask=xmask[:, :xenc.size(1)],
                                 ret_states=True)
            scores = self.out(xenc2.view(xenc.size(0), -1))
            if with_att:
                return scores, xatt
            else:
                return scores

    model = Model(40, emb, out, enc1, enc2)
    # endregion

    # region test
    if test:
        inps = torch.tensor(sm.matrix[0:2])
        outs = model(inps)
    # endregion

    # region train
    optimizer = torch.optim.Adam(q.params_of(model), lr=lr)
    trainer = q.trainer(model).on(data).loss(torch.nn.CrossEntropyLoss(), q.Accuracy())\
        .optimizer(optimizer).hook(q.ClipGradNorm(5.)).device(device)
    validator = q.tester(model).on(valid_data).loss(
        q.Accuracy()).device(device)
    q.train(trainer, validator).run(epochs=epochs)
    # endregion

    # region check attention    #TODO
    # feed a batch
    inpd = torch.tensor(sm.matrix[400:410])
    outd, att = model(inpd, with_att=True)
    outd = torch.max(outd, 1)[1].cpu().detach().numpy()
    inpd = inpd.cpu().detach().numpy()
    att = att.cpu().detach().numpy()
    rD = {v: k for k, v in sm.D.items()}
    roD = {v: k for k, v in D.items()}
    for i in range(len(att)):
        inpdi = "   ".join([rD[x] for x in inpd[i]])
        outdi = roD[outd[i]]
        print("input:     {}\nattention: {}\nprediction: {}".format(
            inpdi, " ".join(["{:.1f}".format(x) for x in att[i]]), outdi))
def run_words(lr=0.001,
              seqlen=8,
              batsize=50,
              epochs=1000,
              embdim=64,
              innerdim=128,
              z_dim=64,
              usebase=True,
              noaccumulate=False,
              ):
    # get some words
    N = 1000
    glove = q.PretrainedWordEmb(50, vocabsize=N+2)
    words = list(glove.D.keys())[2:]
    datasm = q.StringMatrix()
    datasm.tokenize = lambda x: list(x)
    for word in words:
        datasm.add(word)
    datasm.finalize()
    datamat = datasm.matrix[:, :seqlen]
    # replace <MASK> ids with <END> ids: where the mask id occurs, add the id difference
    datamat = datamat + (datamat == datasm.D["<MASK>"]) * (datasm.D["<END>"] - datasm.D["<MASK>"])

    real_data = q.dataset(datamat)
    gen_data_d = q.gan.gauss_dataset(z_dim, len(real_data))
    disc_data = q.datacat([real_data, gen_data_d], 1)

    gen_data = q.gan.gauss_dataset(z_dim)

    disc_data = q.dataload(disc_data, batch_size=batsize, shuffle=True)
    gen_data = q.dataload(gen_data, batch_size=batsize, shuffle=True)

    discriminator = Discriminator(datasm.D, embdim, innerdim)
    generator = Decoder(datasm.D, embdim, z_dim, "<START>", innerdim, maxtime=seqlen)

    SeqGAN = SeqGAN_Base if usebase else SeqGAN_DCL

    disc_model = SeqGAN(discriminator, generator, gan_mode=q.gan.GAN.DISC_TRAIN, accumulate=not noaccumulate)
    gen_model = SeqGAN(discriminator, generator, gan_mode=q.gan.GAN.GEN_TRAIN, accumulate=not noaccumulate)

    disc_optim = torch.optim.Adam(q.params_of(discriminator), lr=lr)
    gen_optim = torch.optim.Adam(q.params_of(generator), lr=lr)

    disc_trainer = q.trainer(disc_model).on(disc_data).optimizer(disc_optim).loss(q.no_losses(2))
    gen_trainer = q.trainer(gen_model).on(gen_data).optimizer(gen_optim).loss(q.no_losses(2))

    gan_trainer = q.gan.GANTrainer(disc_trainer, gen_trainer)

    gan_trainer.run(epochs, disciters=5, geniters=1, burnin=500)

    # print some predictions:
    with torch.no_grad():
        rvocab = {v: k for k, v in datasm.D.items()}
        q.batch_reset(generator)
        eval_z = torch.randn(50, z_dim)
        eval_y, _ = generator(eval_z)
        for i in range(len(eval_y)):
            prow = "".join([rvocab[mij] for mij in eval_y[i].numpy()])
            print(prow)

    print("done")
Example #8
def load_jsons(datap="../../../datasets/lcquad/newdata.json",
               relp="../../../datasets/lcquad/nrelations.json",
               mode="flat"):
    tt = q.ticktock("data loader")
    tt.tick("loading jsons")

    data = json.load(open(datap))
    rels = json.load(open(relp))

    tt.tock("jsons loaded")

    tt.tick("extracting data")
    questions = []
    goldchains = []
    badchains = []
    for dataitem in data:
        questions.append(dataitem["parsed-data"]["corrected_question"])
        goldchain = []
        for x in dataitem["parsed-data"]["path_id"]:
            goldchain += [x[0], int(x[1:])]
        goldchains.append(goldchain)
        badchainses = []
        goldfound = False
        for badchain in dataitem["uri"]["hop-1-properties"] + dataitem["uri"]["hop-2-properties"]:
            if goldchain == badchain:
                goldfound = True
            else:
                if len(badchain) == 2:
                    badchain += [-1, -1]
                badchainses.append(badchain)
        badchains.append(badchainses)

    tt.tock("extracted data")

    tt.msg("mode: {}".format(mode))

    if mode == "flat":
        tt.tick("flattening")

        def flatten_chain(chainspec):
            flatchainspec = []
            for x in chainspec:
                if x in ("+", "-"):
                    flatchainspec.append(x)
                elif x > -1:
                    relwords = rels[str(x)]
                    flatchainspec += relwords
                elif x == -1:
                    pass
                else:
                    raise q.SumTingWongException("unexpected symbol in chain")
            return " ".join(flatchainspec)

        goldchainids = []
        badchainsids = []

        uniquechainids = {}

        qsm = q.StringMatrix()
        csm = q.StringMatrix()
        csm.tokenize = lambda x: x.lower().strip().split()

        def get_ensure_chainid(flatchain):
            if flatchain not in uniquechainids:
                uniquechainids[flatchain] = len(uniquechainids)
                csm.add(flatchain)
                assert (len(csm) == len(uniquechainids))
            return uniquechainids[flatchain]

        eid = 0
        numchains = 0
        for question, goldchain, badchainses in zip(questions, goldchains, badchains):
            qsm.add(question)
            # flatten gold chain
            flatgoldchain = flatten_chain(goldchain)
            chainid = get_ensure_chainid(flatgoldchain)
            goldchainids.append(chainid)
            badchainsids.append([])
            numchains += 1
            for badchain in badchainses:
                flatbadchain = flatten_chain(badchain)
                chainid = get_ensure_chainid(flatbadchain)
                badchainsids[eid].append(chainid)
                numchains += 1
            eid += 1
            tt.live("{}".format(eid))

        assert (len(badchainsids) == len(questions))
        tt.stoplive()
        tt.msg("{} unique chains from {} total".format(len(csm), numchains))
        qsm.finalize()
        csm.finalize()
        tt.tock("flattened")
        csm.tokenize = None
        return qsm, csm, goldchainids, badchainsids
    else:
        raise q.SumTingWongException("unsupported mode: {}".format(mode))
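A hypothetical sketch of consuming the flat-mode output for ranking, not part of the original loader: each question row in qsm.matrix is paired with one gold chain id and several bad chain ids that index into csm.matrix.

# hypothetical usage sketch
qsm, csm, goldchainids, badchainsids = load_jsons()
pairs = []
for i, (gold, bads) in enumerate(zip(goldchainids, badchainsids)):
    pairs.append((i, gold, 1))                    # gold chain gets label 1
    pairs.extend((i, bad, 0) for bad in bads)     # candidate bad chains get label 0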