def setUp(self):
     """Auto-build an encoder and a matcher on random data.

     A ``SimpleSeq2Vec`` is auto-built on a random integer matrix of shape
     (33, 5) with values in [0, 100).  The same encoder is then used for
     both sides of a ``MatchScore`` that is auto-built on the same matrix
     twice.  Element ``[1][0]`` of each ``autobuild`` result is stored as
     ``self.o`` (encoder) and ``self.mo`` (matcher).
     """
     encoder = SimpleSeq2Vec(indim=100, inpembdim=10, innerdim=20)
     tokens = np.random.randint(0, 100, (33, 5))
     encoder_build = encoder.autobuild(tokens)
     self.o = encoder_build[1][0]
     matcher = MatchScore(encoder, encoder)
     matcher_build = matcher.autobuild(tokens, tokens)
     self.mo = matcher_build[1][0]
Exemple #2
0
 def setUp(self):
     """Auto-build an encoder and a matcher on random data.

     A ``SimpleSeq2Vec`` is auto-built on a random integer matrix of shape
     (33, 5) with values in [0, 100).  The same encoder is then used for
     both sides of a ``MatchScore`` that is auto-built on the same matrix
     twice.  Element ``[1][0]`` of each ``autobuild`` result is stored as
     ``self.o`` (encoder) and ``self.mo`` (matcher).
     """
     seq_encoder = SimpleSeq2Vec(indim=100, inpembdim=10, innerdim=20)
     sample = np.random.randint(0, 100, (33, 5))
     enc_result = seq_encoder.autobuild(sample)
     self.o = enc_result[1][0]
     match_block = MatchScore(seq_encoder, seq_encoder)
     match_result = match_block.autobuild(sample, sample)
     self.mo = match_result[1][0]
Exemple #3
0
    def test_ns_training(self):
        """Negative-sampling training should rank every index next to itself.

        A ``MatchScore`` between ``self.glove.block`` and a new
        ``VectorEmbed`` (cosine scorer) is trained on identical index pairs
        ``(i, i)`` with randomly drawn negatives.  Afterwards every index is
        scored against all ``num + 1`` indices; the position of the matching
        index in the descending score order feeds MRR, recall@10 and
        recall@1.  The test requires MRR > 0.85 and recall@10 > 0.9.
        """
        num = 2000
        dim = 50
        tot = num + 1
        self.expshape = (num, dim)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(dim, num)
        self.cemb = VectorEmbed(indim=tot, dim=dim)
        # predicting index num + 1 is expected to raise for both blocks
        for predictor in (self.glove.block.predict, self.cemb.predict):
            self.assertRaises(Exception, predictor, [num + 1])

        glove_block = self.glove.block
        m = MatchScore(glove_block, self.cemb, scorer=CosineDistance())
        mg = MatchScore(glove_block, glove_block)     # TODO factor out matchscore tests
        idxs = np.arange(tot)

        # glove against glove: the score of a vector with itself must equal
        # its squared norm
        self_scores = mg.predict([num, 100], [num, 100])
        squared_norms = [np.linalg.norm(self.glove % probe) ** 2
                         for probe in (num, 100)]
        self.assertTrue(np.allclose(self_scores, squared_norms))

        class NegIdxGen:
            """Keeps the left side and redraws the right side uniformly from [0, n)."""

            def __init__(self, num):
                self.n = num

            def __call__(self, l, r):
                return l, np.random.randint(0, self.n, r.shape)

        trainer = m.nstrain([idxs, idxs])
        trainer = trainer.negsamplegen(NegIdxGen(tot))
        trainer = trainer.negrate(5)
        trainer = trainer.adagrad(lr=0.1)
        m = trainer.train(numbats=50, epochs=50)

        print(m.predict([num, num-1, num-2, num-1], [num, num-1, num-2, num-2]))

        mrr = 0.0
        recat10 = 0.0
        recat1 = 0.0
        for a in range(tot):
            scores = list(m.predict([a] * tot, np.arange(0, tot)))
            ranked = sorted(zip(range(tot), scores),
                            key=lambda pair: pair[1], reverse=True)
            # position of the query index itself in the ranking (None if absent)
            rank = next((pos for pos, pair in enumerate(ranked)
                         if pair[0] == a), None)
            if rank is None:
                continue
            mrr += 1. / (1 + rank)
            if rank < 10:
                recat10 += 1
            if rank < 1:
                recat1 += 1

        mrr /= tot
        recat10 /= tot
        recat1 /= tot
        print("%.3f MRR,\t%.3f MR@10,\t%.3f MR@1" % (mrr, recat10, recat1))
        self.assertGreater(mrr, 0.85)
        self.assertGreater(recat10, 0.9)
Exemple #4
0
    def test_ns_training(self):
        """Negative-sampling training with external validation.

        A ``MatchScore`` between ``self.glove.block`` and a new
        ``VectorEmbed`` (cosine scorer) is trained on identical index pairs
        with random negatives and the objective ``n - p``.  An external
        validator built by ``geteval`` is attached to the trainer.  After
        training, ``geteval(m.predict, num, 1)`` is run on ``arange(num)``;
        the test requires MRR > 0.85 and recall@10 > 0.9, and requires the
        three metrics to match the tail of the last validation record.
        """
        num = 2000
        dim = 50
        tot = num + 1
        self.expshape = (num, dim)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(dim, num)
        self.cemb = VectorEmbed(indim=tot, dim=dim)
        # predicting index num + 1 is expected to raise for both blocks
        for predictor in (self.glove.block.predict, self.cemb.predict):
            self.assertRaises(Exception, predictor, [num + 1])

        glove_block = self.glove.block
        m = MatchScore(glove_block, self.cemb, scorer=CosineDistance())
        mg = MatchScore(glove_block, glove_block)  # TODO factor out matchscore tests
        idxs = np.arange(tot)

        # glove against glove: the score of a vector with itself must equal
        # its squared norm
        self_scores = mg.predict([num, 100], [num, 100])
        squared_norms = [np.linalg.norm(self.glove % probe) ** 2
                         for probe in (num, 100)]
        self.assertTrue(np.allclose(self_scores, squared_norms))

        class NegIdxGen:
            """Keeps the left side and redraws the right side uniformly from [0, n)."""

            def __init__(self, num):
                self.n = num

            def __call__(self, l, r):
                return l, np.random.randint(0, self.n, r.shape)

        vdata = np.arange(num)
        negrate = 5

        def obj(p, n):
            return n - p

        trainer = m.nstrain([idxs, idxs])
        trainer = trainer.negsamplegen(NegIdxGen(tot))
        trainer = trainer.negrate(negrate)
        trainer = trainer.adagrad(lr=0.1)
        trainer = trainer.objective(obj)
        trainer = trainer.validate_on([vdata, vdata])
        # the validator is bound to the predict method of the matcher as it
        # exists before ``m`` is rebound below
        trainer = trainer.extvalid(geteval(m.predict, num, negrate))
        trainer = trainer.validinter(30)
        #.writeresultstofile("testingresultswriter.tsv") \
        m, _err, verr, _, _ = trainer.train(numbats=50, epochs=29,
                                            returnerrors=True)

        tdata = np.arange(num)
        tt = ticktock("eval")
        tt.tick()
        evaluate = geteval(m.predict, num, 1)
        mrr, recat1, recat10 = evaluate(tdata)
        tt.tock("evaluated test data")
        print("%.4f MRR,\t%.4f MR@10,\t%.4f MR@1" % (mrr, recat10, recat1))
        self.assertGreater(mrr, 0.85)
        self.assertGreater(recat10, 0.9)
        print(verr)
        test_metrics = np.asarray([mrr, recat1, recat10])
        last_valid_metrics = np.asarray(verr[-1][1:])
        self.assertTrue(np.allclose(test_metrics, last_valid_metrics))
Exemple #5
0
def run(
    epochs=10,
    numbats=100,
    negrate=1,
    lr=0.1,
    datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
    embdim=100,
    innerdim=200,
    wreg=0.00005,
    bidir=False,
    mem=False,
    membidir=False,
    memlayers=1,
    layers=1,
    testfirst=False,
    rankingloss=False,
    rlmargin=1.,
    charlevel=False,
    pool=False,
    resultsave=False,
    resultsavep="subjdetns.res.pkl",
):
    """Train a question/entity matcher with negative sampling and print test metrics.

    Data is read with ``readdata(datap, charlevel)``.  Questions are encoded
    by a ``SimpleSeq2Vec``.  Entities are either encoded by a second
    ``SimpleSeq2Vec`` that shares the question encoder's input embedding
    (``mem=True``) or looked up in a ``VectorEmbed`` (``mem=False``).  A
    ``MatchScore`` over both sides is trained against randomly drawn entity
    ids and finally evaluated with ``SubjRankEval`` on the test split.

    Args:
        epochs: forwarded to the trainer's ``train``.
        numbats: forwarded to the trainer's ``train``.
        negrate: forwarded to the trainer's ``negrate``.
        lr: Adagrad learning rate.
        datap: path handed to ``readdata``.
        embdim: ``inpembdim`` of the encoders.
        innerdim: encoder size; halved (floor) for a bidirectional encoder.
        wreg: value passed to the trainer's ``l2``.
        bidir: whether the question encoder is bidirectional.
        mem: encode entities with a ``SimpleSeq2Vec`` instead of a
            ``VectorEmbed``.
        membidir: whether the entity encoder is bidirectional (``mem`` only).
        memlayers: number of entity encoder layers (``mem`` only).
        layers: number of question encoder layers.
        testfirst: evaluate the untrained scorer on the test split, print
            the results and exit the process via ``sys.exit()``.
        rankingloss: use ``clip(n - p + rlmargin, 0, inf)`` as objective
            instead of ``n - p``.
        rlmargin: margin used by the ranking loss.
        charlevel: forwarded to ``readdata``.
        pool: forwarded to the question encoder.
        resultsave: when true, ``resultsavep`` is passed as ``savep`` to the
            final evaluation; otherwise ``None`` is passed.
        resultsavep: see ``resultsave``.
    """
    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat = readdata(datap, charlevel)

    print(entmat.shape)
    print(" ".join(str(arr.shape)
                   for arr in (traindata, traingold, testdata, testgold)))

    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print("%d words, %d entities" % (numwords, numents))

    # question representation: encodes a question sequence to a vector.
    # A bidirectional encoder gets half the size per layer (explicit floor
    # division keeps the sizes integral).
    encinnerdim = [innerdim // 2 if bidir else innerdim] * layers
    qenc = SimpleSeq2Vec(indim=numwords,
                         inpembdim=embdim,
                         innerdim=encinnerdim,
                         maskid=-1,
                         bidir=bidir,
                         pool=pool)

    # entity representation:
    if mem:
        # encodes the entity label to a vector; the input embedding of the
        # question encoder is always shared
        meminnerdim = [innerdim // 2 if membidir else innerdim] * memlayers
        lenc = SimpleSeq2Vec(indim=numwords,
                             inpembdim=embdim,
                             inpemb=qenc.inpemb,
                             innerdim=meminnerdim,
                             maskid=-1,
                             bidir=membidir)
    else:
        # embeds the entity id to a vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)  # batched dot

    # trainer config preparation
    class PreProcf(object):
        """Replaces gold entity ids by the matching rows of ``entmat``."""

        # NOTE(review): this transform is applied regardless of ``mem``, so
        # with mem=False the VectorEmbed side also receives entmat rows
        # rather than entity ids - confirm this is intended.
        def __init__(self, entmat):
            self.em = Val(entmat)  # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):  # gold: idx^(batsize, )
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        """Keeps the data and draws int32 ids uniformly from [0, rng) with gold's shape."""

        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):  # gold: idx^(batsize,)
            sampled = np.random.randint(self.min, self.max, gold.shape)
            return datas, sampled.astype("int32")

    if testfirst:
        evaluator = SubjRankEval(scorer,
                                 worddic=worddic,
                                 entdic=entdic,
                                 metrics=[ClassAccuracy(), RecallAt(5)])
        for res in evaluator.eval(testdata, testgold,
                                  transform=PreProcf(entmat)):
            print(res)
        tt.msg("tested dummy")
        sys.exit()

    # trainer config and training
    if rankingloss:
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.inf)
    else:
        obj = lambda p, n: n - p

    # the return value of train() is not needed; ``scorer`` itself is
    # evaluated below
    (scorer.nstrain([traindata, traingold])
        .transform(PreProcf(entmat))
        .negsamplegen(NegIdxGen(numents))
        .negrate(negrate)
        .objective(obj)
        .adagrad(lr=lr)
        .l2(wreg)
        .grad_total_norm(1.0)
        .validate_on([validdata, validgold])
        .train(numbats=numbats, epochs=epochs))

    # evaluation
    evaluator = SubjRankEval(scorer,
                             worddic=worddic,
                             entdic=entdic,
                             metrics=[
                                 ClassAccuracy(),
                                 RecallAt(1),
                                 RecallAt(2),
                                 RecallAt(5),
                                 RecallAt(10),
                             ])

    evalres = evaluator.eval(testdata,
                             testgold,
                             transform=PreProcf(entmat),
                             savep=resultsavep if resultsave else None)
    for evalre in evalres:
        print(evalre)
Exemple #6
0
def run(
    epochs=10,
    numbats=100,
    negrate=1,
    lr=0.1,
    embdim=50,
    encdim=50,
    wreg=0.00005,
    marginloss=False,
    margin=1.0,
    cosine=False,
    bidir=False,
):
    """Train a character-level word encoder against Glove vectors and save it.

    The keys of ``Glove(encdim).D`` are turned into a matrix of character
    ids (-1 padded; row 0 holds a single space character).  The
    character dictionary is pickled to ``glove2c2w.chardic.pkl``.  A
    ``SimpleSeq2Vec`` over characters is trained with negative sampling so
    that its output matches ``g.block`` under the chosen distance, and is
    saved to ``glove2c2w.block``.

    Args:
        epochs: forwarded to the trainer's ``train``.
        numbats: forwarded to the trainer's ``train``.
        negrate: forwarded to the trainer's ``negrate``.
        lr: Adagrad learning rate.
        embdim: character embedding size (``inpembdim``).
        encdim: Glove dimension and encoder size; halved (floor) when
            ``bidir`` is set.
        wreg: value passed to the trainer's ``l2``.
        marginloss: use ``clip(n - p + margin, 0, inf)`` as objective
            instead of ``n - p``.
        margin: margin used by the margin loss.
        cosine: use ``CosineDistance`` instead of ``EuclideanDistance``.
        bidir: whether the character encoder is bidirectional.
    """
    tt = ticktock("script")  # created as in the original; not used further here
    # get glove words
    g = Glove(encdim)
    words = g.D.keys()
    # the leading 0 keeps the result at 0 for an empty vocabulary
    maxwordlen = max([0] + [len(word) for word in words])
    chars = set("".join(words))
    chars.add(" ")
    print("{} words, maxlen {}, {} characters in words".format(
        len(words), maxwordlen, len(chars)))
    # get char word matrix
    chardic = {char: idx for idx, char in enumerate(chars)}
    # the context manager closes the file handle, which the bare open()
    # passed to pickle.dump never did
    with open("glove2c2w.chardic.pkl", "w") as chardic_file:
        pickle.dump(chardic, chardic_file)
    charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
    charwordmat[0, 0] = chardic[" "]
    # NOTE(review): rows are filled in ``g.D.keys()`` order and later paired
    # with ids 1..len(words); presumably that order matches the Glove ids -
    # confirm.
    for i, word in enumerate(words):
        charwordmat[i + 1, :len(word)] = [chardic[x] for x in word]
    print(charwordmat[0])
    # encode characters
    cwenc = SimpleSeq2Vec(
        indim=len(chars),
        inpembdim=embdim,
        innerdim=encdim // 2 if bidir else encdim,
        maskid=-1,
        bidir=bidir,
    )
    dist = CosineDistance() if cosine else EuclideanDistance()  # DotDistance()
    print("using " + str(dist))
    scorer = MatchScore(cwenc, g.block, scorer=dist)

    class NegIdxGen(object):
        """Keeps the data and draws int32 ids uniformly from [0, rng) with gold's shape."""

        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            sampled = np.random.randint(self.min, self.max, gold.shape)
            return datas, sampled.astype("int32")

    if marginloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.inf)
    else:
        obj = lambda p, n: n - p

    # NOTE(review): gold ids span [0, len(words)] but negatives are drawn
    # from [0, len(words)) - confirm the last id is excluded on purpose.
    (scorer.nstrain([charwordmat, np.arange(len(words) + 1)])
        .negsamplegen(NegIdxGen(len(words)))
        .negrate(negrate)
        .objective(obj)
        .adagrad(lr=lr)
        .l2(wreg)
        .train(numbats=numbats, epochs=epochs))

    cwenc.save("glove2c2w.block")
Exemple #7
0
def run(
        epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        embdim=100,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        mem=False,
        membidir=False,
        memlayers=1,
        layers=1,
        testfirst=False,
        rankingloss=False,
        rlmargin=1.,
        charlevel=False,
        pool=False,
        resultsave=False,
        resultsavep="subjdetns.res.pkl",
        ):
    """Train a question/entity matcher with negative sampling and print test metrics.

    Data is read with ``readdata(datap, charlevel)``.  Questions are encoded
    by a ``SimpleSeq2Vec``.  Entities are either encoded by a second
    ``SimpleSeq2Vec`` that shares the question encoder's input embedding
    (``mem=True``) or looked up in a ``VectorEmbed`` (``mem=False``).  A
    ``MatchScore`` over both sides is trained against randomly drawn entity
    ids and finally evaluated with ``SubjRankEval`` on the test split.

    Args:
        epochs: forwarded to the trainer's ``train``.
        numbats: forwarded to the trainer's ``train``.
        negrate: forwarded to the trainer's ``negrate``.
        lr: Adagrad learning rate.
        datap: path handed to ``readdata``.
        embdim: ``inpembdim`` of the encoders.
        innerdim: encoder size; halved (floor) for a bidirectional encoder.
        wreg: value passed to the trainer's ``l2``.
        bidir: whether the question encoder is bidirectional.
        mem: encode entities with a ``SimpleSeq2Vec`` instead of a
            ``VectorEmbed``.
        membidir: whether the entity encoder is bidirectional (``mem`` only).
        memlayers: number of entity encoder layers (``mem`` only).
        layers: number of question encoder layers.
        testfirst: evaluate the untrained scorer on the test split, print
            the results and exit the process via ``sys.exit()``.
        rankingloss: use ``clip(n - p + rlmargin, 0, inf)`` as objective
            instead of ``n - p``.
        rlmargin: margin used by the ranking loss.
        charlevel: forwarded to ``readdata``.
        pool: forwarded to the question encoder.
        resultsave: when true, ``resultsavep`` is passed as ``savep`` to the
            final evaluation; otherwise ``None`` is passed.
        resultsavep: see ``resultsave``.
    """
    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat = readdata(datap, charlevel)

    print(entmat.shape)
    shapes = [traindata.shape, traingold.shape, testdata.shape, testgold.shape]
    print(" ".join(str(shape) for shape in shapes))

    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print("%d words, %d entities" % (numwords, numents))

    # question representation: encodes a question sequence to a vector.
    # A bidirectional encoder gets half the size per layer (explicit floor
    # division keeps the sizes integral).
    if bidir:
        encinnerdim = [innerdim // 2] * layers
    else:
        encinnerdim = [innerdim] * layers
    qenc = SimpleSeq2Vec(indim=numwords,
                         inpembdim=embdim,
                         innerdim=encinnerdim,
                         maskid=-1,
                         bidir=bidir,
                         pool=pool)

    # entity representation:
    if mem:
        # encodes the entity label to a vector; the input embedding of the
        # question encoder is always shared
        if membidir:
            meminnerdim = [innerdim // 2] * memlayers
        else:
            meminnerdim = [innerdim] * memlayers
        lenc = SimpleSeq2Vec(indim=numwords,
                             inpembdim=embdim,
                             inpemb=qenc.inpemb,
                             innerdim=meminnerdim,
                             maskid=-1,
                             bidir=membidir)
    else:
        # embeds the entity id to a vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)       # batched dot

    # trainer config preparation
    class PreProcf(object):
        """Replaces gold entity ids by the matching rows of ``entmat``."""

        # NOTE(review): this transform is applied regardless of ``mem``, so
        # with mem=False the VectorEmbed side also receives entmat rows
        # rather than entity ids - confirm this is intended.
        def __init__(self, entmat):
            self.em = Val(entmat)    # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):    # gold: idx^(batsize, )
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        """Keeps the data and draws int32 ids uniformly from [0, rng) with gold's shape."""

        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):    # gold: idx^(batsize,)
            drawn = np.random.randint(self.min, self.max, gold.shape)
            return datas, drawn.astype("int32")

    if testfirst:
        evaluator = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                                 metrics=[ClassAccuracy(), RecallAt(5)])
        evalres = evaluator.eval(testdata, testgold,
                                 transform=PreProcf(entmat))
        for e in evalres:
            print(e)
        tt.msg("tested dummy")
        sys.exit()

    # trainer config and training
    obj = lambda p, n: n - p
    if rankingloss:
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.inf)

    # the return value of train() is not needed; ``scorer`` itself is
    # evaluated below
    trainer = scorer.nstrain([traindata, traingold])
    trainer = trainer.transform(PreProcf(entmat))
    trainer = trainer.negsamplegen(NegIdxGen(numents))
    trainer = trainer.negrate(negrate).objective(obj)
    trainer = trainer.adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)
    trainer = trainer.validate_on([validdata, validgold])
    trainer.train(numbats=numbats, epochs=epochs)

    # evaluation
    final_metrics = [ClassAccuracy(), RecallAt(1), RecallAt(2),
                     RecallAt(5), RecallAt(10)]
    evaluator = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                             metrics=final_metrics)

    evalres = evaluator.eval(testdata, testgold, transform=PreProcf(entmat),
                             savep=resultsavep if resultsave else None)
    for evalre in evalres:
        print(evalre)
Exemple #8
0
def run(
    epochs=50,
    mode="char",  # "char" or "word" or "charword"
    numbats=1000,
    lr=0.1,
    wreg=0.000001,
    bidir=False,
    layers=1,
    encdim=200,
    decdim=200,
    embdim=100,
    negrate=1,
    margin=1.,
    hingeloss=False,
    debug=False,
    preeval=False,
    sumhingeloss=False,
    checkdata=False,  # starts interactive shell for data inspection
    printpreds=False,
    subjpred=False,
    predpred=False,
    specemb=-1,
    usetypes=False,
    evalsplits=50,
    cosine=False,
    loadmodel=False,
):
    """Train a question-to-predicate matcher with negative sampling and print ranking metrics.

    Data is read with ``readdata``; the gold labels are reduced to their
    second column minus ``relstarts``.  A ``SimpleSeq2Vec`` question encoder
    is matched against a ``VectorEmbed`` of predicates through a
    ``MatchScore``.  After training, every test question is scored against
    all ``numents - relstarts`` predicate ids; accuracy, MRR and
    recall@10/50/100 are printed, and an interactive shell is opened via
    ``embed()``.

    Local variable names are kept stable on purpose: the ``embed()`` calls
    expose them to the interactive session.

    Args:
        epochs: forwarded to the trainer's ``train``.
        mode: forwarded to ``readdata``; also selects the join character
            in the ``checkdata`` helper.
        numbats: forwarded to the trainer's ``train``.
        lr: Adagrad learning rate.
        wreg: value passed to the trainer's ``l2``.
        bidir: whether the question encoder is bidirectional.
        layers: number of encoder layers.
        encdim: encoder size; halved (floor) when ``bidir`` is set.
        decdim: predicate embedding size.
        embdim: word embedding size.
        negrate: forwarded to the trainer's ``negrate``.
        margin: margin of the hinge loss.
        hingeloss: use ``clip(n - p + margin, 0, inf)`` as objective
            instead of ``n - p``.
        debug: overrides several settings (see below) and is forwarded to
            ``readdata``.
        checkdata: open an interactive shell right after data loading.
        specemb: only used to compute the (otherwise unused) ``specids``.
        usetypes: forwarded to ``readdata``.
        cosine: use ``CosineDistance`` instead of ``DotDistance``.
        preeval, sumhingeloss, printpreds, subjpred, predpred, evalsplits,
        loadmodel: accepted but not read for any computation in this
            function (some are only overwritten in debug mode).
    """
    if debug:  # debug settings
        hingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 1
        printpreds = True
        preeval = True
        # specemb = 100
        margin = 1.
        evalsplits = 1
        # usetypes=True
        mode = "charword"
        # checkdata = True

    # load the right file
    maskid = -1
    tt = ticktock("script")
    # NOTE(review): ``specids`` is computed but readdata below is called
    # with specids=True regardless - confirm which one is intended.
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic \
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True,
                   usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")
    # transform for predpred: keep column 1 of the gold and shift it by
    # relstarts so that ids start at 0
    traingold = traingold[:, 1] - relstarts
    validgold = validgold[:, 1] - relstarts
    testgold = testgold[:, 1] - relstarts

    if checkdata:
        # reverse dictionaries and a pretty-printer for the interactive shell
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])

        embed()

    print(" ".join(str(shape) for shape in (traindata.shape, traingold.shape,
                                            testdata.shape, testgold.shape)))

    tt.tock("data loaded")

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print("%d words, %d entities" % (numwords, numents))

    # explicit floor division keeps the layer sizes integral
    if bidir:
        encinnerdim = [encdim // 2] * layers
    else:
        encinnerdim = [encdim] * layers

    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim // 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers

    emb = VectorEmbed(numwords, embdim)
    predemb = VectorEmbed(numents - relstarts + 1, decdim, init="uniform")
    inpenc = SimpleSeq2Vec(inpemb=emb,
                           inpembdim=emb.outdim,
                           innerdim=encinnerdim,
                           maskid=maskid,
                           bidir=bidir,
                           layers=layers)

    dist = DotDistance() if not cosine else CosineDistance()
    scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist}
    scorer = MatchScore(inpenc, predemb, **scorerkwargs)

    class PreProc(object):
        """Optionally maps encoder data through ``wordmat`` and gold through ``entmat``."""

        def __init__(self, entmat, wordmat=None):
            self.f = PreProcE(entmat)
            self.w = PreProcL(wordmat) if wordmat is not None else wordmat

        def __call__(self, encdata, decgold):  # gold: idx^(batsize, seqlen)
            if self.w is not None:
                encdata = self.w(encdata)[0][0]
            if self.f is not None:
                decgold = self.f(decgold)[0][0]
            return (encdata, decgold), {}

    class PreProcE(object):
        """Looks ids up in ``Val(entmat)``."""

        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    class PreProcL(object):
        """Looks ids up in ``Val(wordmat)``."""

        def __init__(self, wordmat):
            self.em = Val(wordmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    # built but not attached to the trainer below; kept for the
    # interactive session
    transf = PreProc(entmat)

    class NegIdxGen(object):
        """Keeps the data and draws one int32 id from [0, rng) per gold row."""

        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, (gold.shape[0], ))
            return datas, predrand.astype("int32")

    # embed()

    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.inf)

    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
        .negsamplegen(NegIdxGen(numents - relstarts))\
        .negrate(negrate).objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
        .validate_on([validdata, validgold]) \
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")

    # eval: score every encoded test question against every predicate id
    canids = np.arange(start=0, stop=numents - relstarts)
    predembs = predemb.predict(canids)  # (numrels, embdim)
    tt.tick("evaluating")
    predencs = inpenc.predict(testdata)  # (batsize, embdim)
    scores = np.zeros((predencs.shape[0], predembs.shape[0]))
    for i in range(predencs.shape[0]):
        scores[i, :] = \
            scorer.s.predict(np.repeat(predencs[np.newaxis, i],
                                       predembs.shape[0], axis=0),
                             predembs)
        tt.progress(i, predencs.shape[0], live=True)
    best = np.argmax(scores, axis=1)
    # per test example: (predicate id, score) pairs in descending score order
    sortedbest = [
        sorted(zip(np.arange(scores.shape[1]), list(scores[i])),
               reverse=True,
               key=lambda xy: xy[1]) for i in range(scores.shape[0])
    ]
    sortedbestmat = np.array([[x for (x, y) in z] for z in sortedbest],
                             dtype="int32")
    # MRR: column i holds the rank-(i + 1) prediction of every example
    mrr = 0.0
    for i in range(sortedbestmat.shape[1]):
        mrr += np.sum(sortedbestmat[:, i] == testgold) * 1. / (i + 1)
    mrr /= testgold.shape[0]
    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]

    # R@X
    def ratx(ratnum):
        return rat(ratnum, sortedbestmat, testgold)

    def rat(ratnum, sortedpred, gold):
        """Fraction of rows of ``gold`` found in the first ``ratnum`` columns of ``sortedpred``."""
        # Uses its own arguments for the column bound and the denominator;
        # the previous version silently read sortedbestmat / testgold from
        # the enclosing scope instead.
        acc = 0.0
        for i in range(min(ratnum, sortedpred.shape[1])):
            acc += 1.0 * np.sum(sortedpred[:, i] == gold)
        acc /= gold.shape[0]
        return acc

    print("Accuracy: {}%".format(accuracy * 100))
    print("MRR: {}".format(mrr))
    print("Recall: @10: {}%\t @50: {}%\t @100: {}%".format(
        ratx(10) * 100,
        ratx(50) * 100,
        ratx(100) * 100))
    embed()

    tt.tock("evaluated")
Exemple #9
0
def run(
    epochs=10,
    numbats=100,
    negrate=1,
    lr=0.1,
    embdim=50,
    encdim=50,
    wreg=0.00005,
    marginloss=False,
    margin=1.,
    cosine=False,
    bidir=False,
):
    """Train a character-level word encoder against Glove vectors and save it.

    The keys of ``Glove(encdim).D`` are turned into a matrix of character
    ids (-1 padded; row 0 holds a single space character).  The
    character dictionary is pickled to ``glove2c2w.chardic.pkl``.  A
    ``SimpleSeq2Vec`` over characters is trained with negative sampling so
    that its output matches ``g.block`` under the chosen distance, and is
    saved to ``glove2c2w.block``.

    Args:
        epochs: forwarded to the trainer's ``train``.
        numbats: forwarded to the trainer's ``train``.
        negrate: forwarded to the trainer's ``negrate``.
        lr: Adagrad learning rate.
        embdim: character embedding size (``inpembdim``).
        encdim: Glove dimension and encoder size; halved (floor) when
            ``bidir`` is set.
        wreg: value passed to the trainer's ``l2``.
        marginloss: use ``clip(n - p + margin, 0, inf)`` as objective
            instead of ``n - p``.
        margin: margin used by the margin loss.
        cosine: use ``CosineDistance`` instead of ``EuclideanDistance``.
        bidir: whether the character encoder is bidirectional.
    """
    tt = ticktock("script")  # created as in the original; not used further here
    # get glove words
    g = Glove(encdim)
    words = g.D.keys()
    maxwordlen = 0
    for word in words:
        maxwordlen = max(maxwordlen, len(word))
    chars = set("".join(words))
    chars.add(" ")
    print("{} words, maxlen {}, {} characters in words".format(
        len(words), maxwordlen, len(chars)))
    # get char word matrix
    chardic = dict(zip(chars, range(len(chars))))
    # the context manager closes the file handle, which the bare open()
    # passed to pickle.dump never did
    with open("glove2c2w.chardic.pkl", "w") as chardic_file:
        pickle.dump(chardic, chardic_file)
    charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
    charwordmat[0, 0] = chardic[" "]
    # NOTE(review): rows are filled in ``g.D.keys()`` order and later paired
    # with ids 1..len(words); presumably that order matches the Glove ids -
    # confirm.
    for row, word in enumerate(words, 1):
        charwordmat[row, :len(word)] = [chardic[x] for x in word]
    print(charwordmat[0])
    # encode characters
    cwenc = SimpleSeq2Vec(indim=len(chars),
                          inpembdim=embdim,
                          innerdim=encdim // 2 if bidir else encdim,
                          maskid=-1,
                          bidir=bidir)
    dist = CosineDistance() if cosine else EuclideanDistance()  # DotDistance()
    print("using " + str(dist))
    scorer = MatchScore(cwenc, g.block, scorer=dist)

    class NegIdxGen(object):
        """Keeps the data and draws int32 ids uniformly from [0, rng) with gold's shape."""

        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            return datas, np.random.randint(self.min, self.max,
                                            gold.shape).astype("int32")

    if marginloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.inf)
    else:
        obj = lambda p, n: n - p

    # NOTE(review): gold ids span [0, len(words)] but negatives are drawn
    # from [0, len(words)) - confirm the last id is excluded on purpose.
    trainer = scorer.nstrain([charwordmat, np.arange(len(words) + 1)])
    trainer = trainer.negsamplegen(NegIdxGen(len(words))).negrate(negrate)
    trainer = trainer.objective(obj).adagrad(lr=lr).l2(wreg)
    trainer.train(numbats=numbats, epochs=epochs)

    cwenc.save("glove2c2w.block")
Exemple #10
0
def run(epochs=50,
        numbats=700,
        lr=1.,
        wreg=0.000001,
        bidir=False,
        layers=1,
        embdim=200,
        encdim=400,
        decdim=400,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        checkdata=False,
        predencode=False,
        closenegsam=False,
        glove=False,
        atleastcan=0,
        wordchar=False,
        charencmode="rnn",  # rnn or cnn
        totalrandomtest=False,
        rarewords=0,
        ):
    """Train a question/predicate matcher with negative sampling and print test accuracy.

    Steps, in the order implemented below:

    1. Load the data with ``readdata(wordchar=wordchar)``.
    2. Optionally (``rarewords > 0``) build a reduced word dictionary that
       keeps only words counted more than ``rarewords`` times and maps
       ``"<RARE>"`` to 0.
    3. Build the question encoder: a ``SimpleSeq2Vec`` over words, or a
       word+character encoder when ``wordchar`` is set (CNN character
       encoder for ``charencmode == "cnn"``, ``WordCharSentEnc`` otherwise).
    4. Build the predicate side: a ``MemVec`` around a ``SimpleSeq2Vec``
       loaded with ``entmat`` when ``predencode`` is set, else a
       ``VectorEmbed``.
    5. Train a ``MatchScore`` (cosine scorer) with negative sampling and
       report how much the word embedding values changed.
    6. Score the candidates of every test example and print the accuracy.

    Args:
        epochs: forwarded to the trainer's ``train``.
        numbats: forwarded to the trainer's ``train``.
        lr: Adagrad learning rate.
        wreg: value passed to the trainer's ``l2``.
        bidir: bidirectional encoders; also halves ``encdim`` per layer.
        layers: number of encoder layers.
        embdim: word embedding size.
        encdim: question encoder size.
        decdim: predicate embedding / encoder size.
        negrate: forwarded to the trainer's ``negrate``.
        margin: margin of the hinge loss.
        hingeloss: use ``clip(n - p + margin, 0, inf)`` as objective
            instead of ``n - p``.
        debug: open an interactive shell after every scored test example.
        checkdata: open an interactive shell right after data loading.
        predencode: encode predicates instead of embedding their ids.
        closenegsam: draw negatives from ``buildsamplespace`` output where
            more than 5 are available.
        glove: use adapted ``Glove`` embeddings (not combinable with
            ``rarewords``).
        atleastcan: pad each candidate list with random ids up to this size.
        wordchar: forwarded to ``readdata`` and selects the word+char
            question encoder.
        charencmode: ``"cnn"`` selects the CNN character encoder.
        totalrandomtest: use only the gold id as candidate (before padding).
        rarewords: count threshold for the reduced dictionary; 0 disables it.

    Raises:
        Exception: if ``glove`` and ``rarewords > 0`` are combined, or if
            the embedding variable cannot be located.
    """
    maskid = -1
    tt = ticktock("predpred")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, testsubjsrels = readdata(wordchar=wordchar)

    if closenegsam:
        # only revsamplespace is used below (by NegIdxGenClose)
        revsamplespace, revind = buildsamplespace(entmat, worddic)

    tt.tock("data loaded")
    if checkdata:
        # reverse dictionaries and a pretty-printer for the interactive shell
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}
        def pp(widxs):
            print " ".join([rwd[x] if x in rwd else "" for x in widxs])
        embed()

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1

    if rarewords > 0:
        rwd = {v: k for k, v in worddic.items()}
        print "doing rare words"
        trainwordcounts = getmatrixvaluecounts(traindata, entmat)
        # word ids sorted by descending count, then cut at the threshold
        stwc = sorted(trainwordcounts.items(), key=lambda (x, y): y, reverse=True)
        fstwc = filter(lambda (x, y): y > rarewords, stwc)
        # reduced dictionary: surviving words get ids starting at 1,
        # id 0 is reserved for "<RARE>"
        redwdic = dict(zip([rwd[k] for k, v in fstwc if k != maskid and k in rwd],
                           range(1, len(fstwc)+1)))
        redwdic["<RARE>"] = 0
        #embed()
    # from here on encdim is a per-layer list
    if bidir:
        encdim = [encdim / 2] * layers
    else:
        encdim = [encdim] * layers

    # question-side model
    if glove:
        if rarewords > 0:
            raise Exception("glove with rare words currently not supported")
        wordemb = Glove(embdim).adapt(worddic)
    else:
        if rarewords > 0:
            wordemb = WordEmb(dim=embdim, worddic=redwdic).adapt(worddic)
            #embed()
        else:
            wordemb = WordEmb(dim=embdim, worddic=worddic)
    if wordchar:
        print "wordchar model"
        # numchars is set here but the literal 256 is passed to
        # WordCharSentEnc in the non-CNN branch
        numchars = 256
        if charencmode == "cnn":
            print "using CNN char encoder"
            charenc = CNNSeqEncoder(indim=numchars, inpembdim=50, innerdim=[embdim]*2,
                                    maskid=maskid, stride=1)
            wordenc = RNNSeqEncoder(inpemb=False, inpembdim=wordemb.outdim+embdim,
                                    innerdim=encdim, bidir=bidir).maskoptions(MaskMode.NONE)
            question_enc = TwoLevelEncoder(l1enc=charenc, l2emb=wordemb,
                                           l2enc=wordenc, maskid=maskid)
        else:
            question_enc = WordCharSentEnc(numchars=256, charembdim=50, charinnerdim=embdim,
                                           wordemb=wordemb, wordinnerdim=encdim, maskid=maskid,
                                           bidir=bidir)
    else:
        question_enc = SimpleSeq2Vec(inpemb=wordemb,
                                     inpembdim=wordemb.outdim,
                                     innerdim=encdim,
                                     maskid=maskid,
                                     bidir=bidir,
                                     layers=layers)

    # predicate-side model
    if predencode:
        # the predicate encoder shares wordemb with the question side
        predemb = MemVec(SimpleSeq2Vec(inpemb=wordemb,
                                inpembdim=wordemb.outdim,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=bidir,
                                layers=layers)
                         )
        predemb.load(entmat)
        # the string literal below is disabled code kept by the author; it
        # has no runtime effect
        """
        predemb = SimpleSeq2Vec(inpemb=wordemb,
                                inpembdim=wordemb.outdim,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=bidir,
                                layers=layers)

        class PreProc(object):
            def __init__(self, entmat):
                self.f = PreProcE(entmat)

            def __call__(self, encdata, decgold):
                return (encdata, self.f(decgold)[0][0]), {}

        class PreProcE(object):
            def __init__(self, entmat):
                self.em = Val(entmat)

            def __call__(self, x):
                return (self.em[x],), {}

        transf = PreProc(entmat)
        predtransf = transf.f
        """
    else:
        predemb = VectorEmbed(numents, decdim)
        """transf = None
        predtransf = None"""

    # scoring
    scorer = MatchScore(question_enc, predemb, scorer=CosineDistance())

    # negative sampler: keeps the data and draws int32 ids uniformly from
    # [0, rng) with the shape of gold
    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, gold.shape)
            return datas, predrand.astype("int32")

    # negative sampler that prefers ids from revsamsp[gold[i]] when that
    # collection has more than 5 members and falls back to a uniform draw
    # from [0, rng) otherwise
    class NegIdxGenClose(object):
        def __init__(self, revsamsp, rng):
            self.revsamsp = revsamsp
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            ret = np.zeros_like(gold)
            for i in range(gold.shape[0]):
                sampleset = self.revsamsp[gold[i]]
                if len(sampleset) > 5:
                    ret[i] = random.sample(sampleset, 1)[0]
                else:
                    ret[i] = np.random.randint(self.min, self.max)
            #embed()
            return datas, ret.astype("int32")


    # objective on (positive score p, negative score n)
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p

    if closenegsam:
        tt.msg("using close neg sampler")
        negidxgen = NegIdxGenClose(revsamplespace, numents)
    else:
        negidxgen = NegIdxGen(numents)

    # snapshot the embedding values before training so that the change can
    # be reported afterwards; the flag is hard-wired to True
    checkembschange = True
    if checkembschange:
        #embed()
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        embvals = embvar.d.get_value()
    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
                .negsamplegen(negidxgen) \
                .negrate(negrate) \
                .objective(obj) \
                .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
                .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")
    if checkembschange:
        # same lookup as before training, now reading the trained values
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        newembvals = embvar.d.get_value()
        embschanged = not np.allclose(embvals, newembvals)
        sumsqdiff = np.sum((newembvals - embvals)**2)
        print "Embeddings {}: {} sum of square diffs"\
            .format("changed" if embschanged else "did not change", sumsqdiff)

    # evaluation
    tt.tick("evaluating")
    qenc_pred = question_enc.predict(testdata)
    scores = []
    # hard-wired switch: the embed() call guarded by it below never runs
    dontembed = True
    if atleastcan > 0:
        print "ensuring at least {} cans".format(atleastcan)
    if totalrandomtest:
        print "total randomness"
    for i in range(qenc_pred.shape[0]):
        # candidate ids for test example i
        if totalrandomtest:
            cans = [testgold[i]]
        else:
            cans = testsubjsrels[i][0] #+ testsubjsrels[i][1]
        if len(cans) < atleastcan:
            # pad with distinct random ids that are not candidates yet
            extracans = list(np.random.randint(0, numents, (atleastcan+50,)))
            extracans = list(set(extracans).difference(set(cans)))
            cans = cans + extracans[:max(0, min(len(extracans), atleastcan - len(cans)))]
            #print len(cans), cans
        if not dontembed:
            embed()
        #cans = set(cans)
        #if atleastcan > 0:
        #    while len(cans) < atleastcan:
        #        rancan = np.random.randint(0, numents)
        #        if rancan not in cans:
        #            cans.add(rancan)
        #cans = list(cans)
        if len(cans) == 0:
            # no candidates: record a placeholder prediction of -1
            scores.append([(-1, -np.infty)])
            continue
        #canembs = predemb.predict.transform(predtransf)(cans)
        canembs = predemb.predict(cans)
        # score the repeated question encoding against every candidate
        scoresi = scorer.s.predict(np.repeat(qenc_pred[np.newaxis, i],
                                             canembs.shape[0], axis=0),
                                   canembs)
        scores.append(zip(cans, scoresi))
        if debug:
            embed()
        tt.progress(i, qenc_pred.shape[0], live=True)
    # per example: (candidate id, score) pairs in descending score order
    sortedbest = [sorted(cansi, key=lambda (x, y): y, reverse=True) for cansi in scores]
    best = [sortedbesti[0][0] for sortedbesti in sortedbest]
    # Accuracy
    # NOTE(review): best is a Python list; presumably testgold is a NumPy
    # array so that this comparison is elementwise - confirm.
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]


    print("Accuracy: {}%".format(accuracy * 100))
Exemple #11
0
    def test_ns_training(self):
        """Negative-sampling training should rank every index next to itself.

        A ``MatchScore`` between ``self.glove.block`` and a new
        ``VectorEmbed`` (cosine scorer) is trained on identical index pairs
        ``(i, i)`` with randomly drawn negatives.  Afterwards every index is
        scored against all ``num + 1`` indices; the position of the matching
        index in the descending score order feeds MRR, recall@10 and
        recall@1.  The test requires MRR > 0.85 and recall@10 > 0.9.
        """
        num = 2000
        width = 50
        tot = num + 1
        self.expshape = (num, width)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(width, num)
        self.cemb = VectorEmbed(indim=tot, dim=width)
        # predicting index num + 1 is expected to raise for both blocks
        self.assertRaises(Exception, self.glove.block.predict, [tot])
        self.assertRaises(Exception, self.cemb.predict, [tot])

        m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
        # TODO factor out matchscore tests
        mg = MatchScore(self.glove.block, self.glove.block)
        idxs = np.arange(tot)

        # glove against glove: the score of a vector with itself must equal
        # its squared norm
        probes = [num, 100]
        self_scores = mg.predict(probes, probes)
        squared_norms = [np.linalg.norm(self.glove % probe) ** 2
                         for probe in (num, 100)]
        self.assertTrue(np.allclose(self_scores, squared_norms))

        class NegIdxGen:
            """Keeps the left side and redraws the right side uniformly from [0, n)."""

            def __init__(self, num):
                self.n = num

            def __call__(self, l, r):
                return l, np.random.randint(0, self.n, r.shape)

        m = (m.nstrain([idxs, idxs])
             .negsamplegen(NegIdxGen(tot))
             .negrate(5)
             .adagrad(lr=0.1)
             .train(numbats=50, epochs=50))

        print(m.predict([num, num - 1, num - 2, num - 1],
                        [num, num - 1, num - 2, num - 2]))

        mrr = 0.0
        recat10 = 0.0
        recat1 = 0.0
        for a in range(tot):
            scored = zip(range(tot),
                         list(m.predict([a] * tot, np.arange(0, tot))))
            ranking = sorted(scored, key=lambda item: item[1], reverse=True)
            # walk down the ranking until the query index itself is found
            for position, item in enumerate(ranking):
                if item[0] != a:
                    continue
                mrr += 1. / (1 + position)
                if position < 10:
                    recat10 += 1
                if position < 1:
                    recat1 += 1
                break

        mrr /= tot
        recat10 /= tot
        recat1 /= tot
        print("%.3f MRR,\t%.3f MR@10,\t%.3f MR@1" % (mrr, recat10, recat1))
        self.assertGreater(mrr, 0.85)
        self.assertGreater(recat10, 0.9)