def setUp(self):
    words = "the a his monkey inception key earlgrey"
    wdic = dict(zip(words.split(), range(1, len(words.split()) + 1)))
    self.baseemb = WordEmb(dim=50, worddic=wdic, maskid=-1)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(50, 4000)
    self.emb = self.baseemb.override(self.glove)
Example #2
def do_custom_emb(inpemb, outemb, awc, embdim):
    thresh = 10
    sawc = sorted(awc.items(), key=lambda (k, v): v, reverse=True)
    rarewords = {k for (k, v) in sawc if v < thresh}
    g = Glove(embdim)
    inpemb = inpemb.override(g)
    outemb = outemb.override(g, which=rarewords)
    return inpemb, outemb
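
A minimal usage sketch for do_custom_emb (dictionary and counts below are illustrative, not from the original code): input embeddings are fully overridden with Glove, while on the output side only words seen fewer than 10 times lose their trainable vectors.

wdic = {"the": 1, "earlgrey": 2}
awc = {"the": 5000, "earlgrey": 2}   # word -> corpus count (illustrative)
inpemb = WordEmb(dim=50, worddic=wdic)
outemb = WordEmb(dim=50, worddic=wdic)
inpemb, outemb = do_custom_emb(inpemb, outemb, awc, 50)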
Example #3
def run_seq2idx(  # works after refactoring (with adagrad)
    wreg=0.0,
    epochs=75,
    numbats=20,
    lr=1.,
    statedim=100,
):
    # get words
    numchars = 27
    embdim = 50
    lm = Glove(embdim, 1000)
    words = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    #wldf = pd.DataFrame(map(word2int, words)).fillna(0)
    #data = wldf.values.astype("int32")
    data = words2ints(words)
    #embs = lm.W[map(lambda x: lm * x, words), :]
    #embed()
    wordidxs = np.arange(0, len(words))
    print wordidxs[:5]
    print data[:5]
    print words[:5]
    numwords = wordidxs.shape[0]
    print "random seq neg log prob %.3f" % math.log(numchars**data.shape[1])
    testneglogprob = 17
    print "%.2f neg log prob for a whole sequence is %.3f prob per slot" % (
        testneglogprob, math.exp(-testneglogprob * 1. / data.shape[1]))

    testpred = ["the", "of", "to", "their", "in"]
    testpred = words2ints(testpred)
    print words[:5], words[35]
    ####testpred = np.eye(numchars, numchars)[testpred, :]

    wordidxsonehot = np.eye(numwords, numwords)[wordidxs, :]

    ####data = np.eye(numchars, numchars)[data, :]

    block = seq2idx(invocsize=numchars, outvocsize=numwords, innerdim=statedim)
    '''gru = GRU(innerdim=statedim, dim=numchars)
    lin = Lin(indim=statedim, dim=numwords)
    lin2 = Lin(indim=numwords, dim=numwords)
    block = asblock(lambda x: Softmax()(lin(gru(x)[:, -1, :])))'''
    ###block = asblock(lambda x: Softmax()(lin2(x)))
    '''
    print testpred
    probepred = np.argmax(block.predict(testpred), axis=1)
    print probepred

    for p in block.output.allparams:
        print p
    '''
    block.train([data], wordidxs).cross_entropy().adagrad(lr=lr).autovalidate().accuracy().validinter(5)\
         .train(numbats=numbats, epochs=epochs)

    #embed()
    pred = block.predict(testpred)
    print pred.shape
    print np.argmax(pred, axis=1)
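
The helpers words2ints, ints2words, and shiftdata used throughout these examples are not among the excerpts; below is a minimal sketch consistent with numchars = 27 (26 letters plus a reserved 0 for padding/start). The exact mapping in the original repo is an assumption.

def words2ints(words):
    # Assumed mapping: 'a' -> 1 ... 'z' -> 26; anything else (incl. padding) -> 0.
    maxlen = max(len(w) for w in words)
    mat = np.zeros((len(words), maxlen), dtype="int32")
    for i, word in enumerate(words):
        mat[i, :len(word)] = [ord(c) - ord("a") + 1 if "a" <= c <= "z" else 0
                              for c in word]
    return mat

def ints2words(mat):
    # Inverse of words2ints; 0 decodes to nothing.
    return ["".join(chr(x + ord("a") - 1) for x in row if x > 0) for row in mat]

def shiftdata(data, startsym=0):
    # Shift gold sequences right by one, prepending the start symbol
    # (standard teacher-forcing decoder input).
    return np.concatenate([np.ones_like(data[:, 0:1]) * startsym,
                           data[:, :-1]], axis=1)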
Example #4
def __init__(self,
             indim=4000,
             outdim=100,
             embdim=50,
             embtrainfrac=0.0,
             **kw):
    super(WordEmbedPlusGlove, self).__init__(indim, outdim + embdim, **kw)
    self.glove = Glove(embdim, vocabsize=indim,
                       trainfrac=embtrainfrac).block
    self.emb = VectorEmbed(indim=indim, dim=outdim)
Example #5
    def test_ns_training(self):
        num = 2000
        self.expshape = (num, 50)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(self.expshape[1], self.expshape[0])
        self.cemb = VectorEmbed(indim=self.expshape[0] + 1,
                                dim=self.expshape[1])
        self.assertRaises(Exception, self.glove.block.predict, [num + 1])
        self.assertRaises(Exception, self.cemb.predict, [num + 1])

        m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
        mg = MatchScore(self.glove.block,
                        self.glove.block)  # TODO factor out matchscore tests
        idxs = np.arange(num + 1)

        # glove against glove
        self.assertTrue(
            np.allclose(mg.predict([num, 100], [num, 100]), [
                np.linalg.norm(self.glove % num)**2,
                np.linalg.norm(self.glove % 100)**2
            ]))

        class NegIdxGen():
            def __init__(self, num):
                self.n = num

            def __call__(self, l, r):
                return l, np.random.randint(0, self.n, r.shape)

        vdata = np.arange(num)
        negrate = 5

        def obj(p, n):
            return n - p
        m, err, verr, _, _ = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(negrate)\
            .adagrad(lr=0.1).objective(obj) \
            .validate_on([vdata, vdata]).extvalid(geteval(m.predict, num, negrate)).validinter(30) \
            .train(numbats=50, epochs=29, returnerrors=True)
        #.writeresultstofile("testingresultswriter.tsv") \

        tdata = np.arange(num)
        tt = ticktock("eval")
        tt.tick()
        mrr, recat1, recat10 = geteval(m.predict, num, 1)(tdata)
        tt.tock("evaluated test data")
        print "%.4f MRR,\t%.4f MR@10,\t%.4f MR@1" % (mrr, recat10, recat1)
        self.assertGreater(mrr, 0.85)
        self.assertGreater(recat10, 0.9)
        print verr
        self.assertTrue(
            np.allclose(np.asarray([mrr, recat1, recat10]),
                        np.asarray(verr[-1][1:])))
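
geteval is also not among these excerpts; a sketch consistent with the inlined ranking evaluation in Example #16 below (MRR, recall@1, and recall@10 over all num + 1 candidate indices) could be:

def geteval(predf, num, negrate):
    # Sketch: rank all num+1 candidate indices against each query index.
    # negrate is accepted only to match the call sites; it is unused here.
    def evaluate(data):
        mrr, recat1, recat10 = 0., 0., 0.
        cands = np.arange(0, num + 1)
        for a in data:
            scores = predf([a] * (num + 1), cands)
            ranked = sorted(zip(cands, list(scores)), key=lambda (x, y): y, reverse=True)
            for i in range(len(ranked)):
                if ranked[i][0] == a:
                    mrr += 1. / (1 + i)
                    recat10 += 1 if i < 10 else 0
                    recat1 += 1 if i < 1 else 0
                    break
        return mrr / len(data), recat1 / len(data), recat10 / len(data)
    return evaluate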
Example #6
class TestGloveOverriding(TestCase):
    def setUp(self):
        words = "the a his monkey inception key earlgrey"
        wdic = dict(zip(words.split(), range(1, len(words.split()) + 1)))
        self.baseemb = WordEmb(dim=50, worddic=wdic, maskid=-1)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(50, 4000)
        self.emb = self.baseemb.override(self.glove)

    def test_embed_masker(self):
        v = Val(np.random.randint(-1, 5, (4, 3)))
        m = self.emb(v).mask
        self.assertTrue(np.all((v.v != -1) == m.v))

    def test_sameasglove(self):
        words = "key the a his"
        pred = self.emb.predict([self.emb * x for x in words.split()])
        gpred = self.glove.predict([self.glove * x for x in words.split()])
        self.assertTrue(np.allclose(pred, gpred))

    def test_sameasbase(self):
        words = "inception monkey earlgrey"
        pred = self.emb.predict([self.emb * x for x in words.split()])
        gpred = self.baseemb.predict([self.baseemb * x for x in words.split()])
        self.assertTrue(np.allclose(pred, gpred))

    def test_notasglove(self):
        words = "inception monkey earlgrey"
        pred = self.emb.predict([self.emb * x for x in words.split()])
        gpred = self.glove.predict([self.glove * x for x in words.split()])
        self.assertFalse(np.allclose(pred, gpred))

    def test_notasbase(self):
        words = "key the a his"
        pred = self.emb.predict([self.emb * x for x in words.split()])
        gpred = self.baseemb.predict([self.baseemb * x for x in words.split()])
        self.assertFalse(np.allclose(pred, gpred))
Example #7
def __init__(self,
             numchars=220,
             numwords=4e5,
             encdim=100,
             embdim=50,
             embtrainfrac=0.0,
             maskid=0,
             glovepath=None,
             **kw):
    super(WordEncoderPlusGlove, self).__init__(**kw)
    self.glove = Glove(embdim,
                       vocabsize=numwords,
                       trainfrac=embtrainfrac,
                       path=glovepath).block
    self.enc = WordEncoder(indim=numchars, outdim=encdim, maskid=maskid)
Example #8
def run_seqdecatt(  # seems to work
    wreg=0.00001,
    epochs=50,
    numbats=50,
    lr=0.1,
    statedim=50,
    encdim=50,
    attdim=50,
    numwords=5000,
):
    # get words
    vocsize = 28
    lm = Glove(50, numwords)
    allwords = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    #embed()
    invwords = [word[::-1] for word in allwords]
    data = words2ints(allwords)
    idata = words2ints(invwords)
    startsym = 0

    golddata = data

    #golddata = idata

    print data[:10]
    print shiftdata(data, startsym)[:10]

    testwords = [
        "the", "alias", "mock", "test", "stalin", "allahuakbar", "python",
        "pythonista"
    ]
    testpred = words2ints(testwords)

    block = SimpleSeqEncDecAtt(inpvocsize=vocsize,
                               outvocsize=vocsize,
                               encdim=encdim,
                               decdim=statedim,
                               attdim=attdim,
                               inconcat=False,
                               bidir=False,
                               statetrans=None)
    block.train([data, shiftdata(golddata, startsym)], golddata).seq_cross_entropy().grad_total_norm(1.0).adagrad(lr=lr).l2(wreg) \
        .split_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(2) \
        .train(numbats=numbats, epochs=epochs)

    s = SeqEncDecSearch(block)
    pred, probs = s.decode(testpred, startsym, testpred.shape[1])
    print ints2words(pred), probs
Example #9
def run_idx2seq(  # works after refactor
    wreg=0.000001,
    epochs=150,
    numbats=10,
    lr=0.1,
    statedim=70,
    encdim=100,
):
    # get words
    numchars = 27
    embdim = 50
    lm = Glove(embdim, 1000)
    words = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    data = words2ints(words)
    sdata = shiftdata(data)
    wordidxs = np.arange(0, len(words))
    numwords = wordidxs.shape[0]
    print "random seq neg log prob %.3f" % math.log(numchars**data.shape[1])
    testneglogprob = 17
    print "%.2f neg log prob for a whole sequence is %.3f prob per slot" % (
        testneglogprob, math.exp(-testneglogprob * 1. / data.shape[1]))

    testpred = wordidxs[:15]
    testdata = data[:15]
    testsdata = sdata[:15]
    print testpred
    print testdata
    print testsdata
    #testpred = words2ints(testpred)

    block = idx2seq(encdim=encdim,
                    invocsize=numwords,
                    outvocsize=numchars,
                    innerdim=statedim,
                    seqlen=data.shape[1])
    print np.argmax(block.predict(testpred, testsdata), axis=2)
    print block.output.allparams
    block.train([wordidxs, sdata], data).seq_cross_entropy().grad_total_norm(0.5).adagrad(lr=lr).l2(wreg)\
         .autovalidate().seq_accuracy().validinter(5)\
         .train(numbats=numbats, epochs=epochs)

    pred = block.predict(testpred, testsdata)
    for word in ints2words(np.argmax(pred, axis=2)):
        print word
    embed()
Example #10
def run_attentionseqdecoder(  # seems to work
        wreg=0.00001,  # TODO: with regularization other than 0.0001, training first stagnates, then loss goes down
        epochs=100,
        numbats=20,
        lr=0.1,
        statedim=50,
        encdim=50,
        attdim=50):

    # get words
    vocsize = 27
    embdim = 50
    lm = Glove(embdim, 2000)
    allwords = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    words = allwords[:1000]
    vwords = allwords[1000:]
    data = words2ints(words)
    sdata = shiftdata(data)
    vdata = words2ints(vwords)
    svdata = shiftdata(vdata)
    print "random seq neg log prob %.3f" % math.log(vocsize**data.shape[1])
    testneglogprob = 17
    print "%.2f neg log prob for a whole sequence is %.3f prob per slot" % (
        testneglogprob, math.exp(-testneglogprob * 1. / data.shape[1]))

    testpred = [
        "the", "alias", "mock", "test", "stalin", "allahuakbar", "python",
        "pythonista", " " * (data.shape[1])
    ]
    testpred = words2ints(testpred)
    print testpred

    block = BiFwdAttSumDecoder(vocsize=vocsize,
                               outvocsize=vocsize,
                               encdim=encdim,
                               innerdim=statedim,
                               attdim=attdim)
    block.train([data, sdata], data).seq_cross_entropy().grad_total_norm(1.0).adagrad(lr=lr).l2(wreg)\
         .validate_on([vdata, svdata], vdata).seq_accuracy().validinter(4)\
         .train(numbats=numbats, epochs=epochs)

    pred = block.predict(testpred, shiftdata(testpred))
    print ints2words(np.argmax(pred, axis=2))

    embed()
Example #11
def test_gloveglove(self):
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    g1 = Glove(50, 2000)
    g2 = Glove(50, 1000)
    gloveglove = g1.augment(g2)
    pred = gloveglove.predict([1000])
    self.assertTrue(np.allclose(g1 % 1000, pred[0, :50]))
    self.assertTrue(np.allclose(g2 % 1000, pred[0, 50:]))
    pred = gloveglove.predict([1001])
    self.assertTrue(np.allclose(g1 % 1001, pred[0, :50]))
    self.assertTrue(np.allclose(pred[0, 50:], np.zeros_like(pred[0, 50:])))
    gloveglove = g2.augment(g1)
    pred = gloveglove.predict([1, 2, 3, 4, 5, 50, 500, 1000])
    self.assertTrue(np.allclose(pred[:, :50], pred[:, 50:]))
Example #12
    def setUp(self):
        wreg = 0.001
        epochs = 3
        numbats = 10
        lr = 0.1
        statedim = 70
        encdim = 70
        # get words
        numchars = 27
        embdim = 50
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        lm = Glove(embdim, 1000)
        words = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
        data = words2ints(words)
        sdata = shiftdata(data)
        wordidxs = np.arange(0, len(words))
        numwords = wordidxs.shape[0]
        print "random seq neg log prob %.3f" % math.log(numchars**
                                                        data.shape[1])
        testneglogprob = 17
        print "%.2f neg log prob for a whole sequence is %.3f prob per slot" % (
            testneglogprob, math.exp(-testneglogprob * 1. / data.shape[1]))

        testpred = wordidxs[:15]
        testdata = data[:15]
        testsdata = sdata[:15]
        print testpred
        print testdata
        print testsdata
        #testpred = words2ints(testpred)
        block = idx2seq(encdim=encdim,
                        invocsize=numwords,
                        outvocsize=numchars,
                        innerdim=statedim,
                        seqlen=data.shape[1])
        print np.argmax(block.predict(testpred, testsdata), axis=2)
        self.block_before_training_frozen = block.freeze()
        block.train([wordidxs, sdata], data).seq_cross_entropy().grad_total_norm(0.5).adagrad(lr=lr).l2(wreg)\
             .autovalidate().seq_accuracy().validinter(5)\
             .train(numbats=numbats, epochs=epochs)
        self.block_after_training_frozen = block.freeze()
        pred = block.predict(testpred, testsdata)
Example #13
def toglove(wordmat, worddic, dim=50):
    g = Glove(dim)
    gws = set(g.D.keys())
    wdws = set(worddic.keys())
    diff = wdws.difference(gws)
    # gather stats about diff
    diffcounts = {worddic[k]: 0 for k in diff}
    total = 0
    moretal = 0
    for i in range(wordmat.shape[0]):
        for j in range(wordmat.shape[1]):
            if wordmat[i, j] >= 0:
                total += 1
                if wordmat[i, j] in diffcounts:
                    diffcounts[wordmat[i, j]] += 1
                    moretal += 1
    diffcounts = sorted(diffcounts.items(), key=lambda (k, v): v, reverse=True)
    print "%d words unknown by Glove of %d total words" % (moretal, total)
    revdic = {v: k for k, v in worddic.items()}
    d2g = lambda x: g * revdic[x] if x in revdic else x
    newdic = {k: d2g(v) for k, v in worddic.items()}
    newmat = np.vectorize(d2g)(wordmat)
    revgdic = {v: k for k, v in g.D.items()}
Example #14
def run_RNNAutoEncoder(  # works after refactoring
        wreg=0.000001,
        epochs=50,
        numbats=20,
        lr=0.1,
        statedim=70,
        encdim=70):
    # get words
    vocsize = 27
    embdim = 50
    lm = Glove(embdim, 1000)
    words = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    #wldf = pd.DataFrame(map(word2int, words)).fillna(0)
    #data = wldf.values.astype("int32")
    data = words2ints(words)
    sdata = shiftdata(data)
    embs = lm.W[map(lambda x: lm * x, words), :]
    print embs.shape, data.shape
    #embed()
    print "random seq neg log prob %.3f" % math.log(vocsize**data.shape[1])
    testneglogprob = 17
    print "%.2f neg log prob for a whole sequence is %.3f prob per slot" % (
        testneglogprob, math.exp(-testneglogprob * 1. / data.shape[1]))

    testpred = ["the", "alias", "mock", "test", "stalin"]
    testpred = words2ints(testpred)

    block = RNNAutoEncoder(vocsize=vocsize,
                           encdim=70,
                           innerdim=statedim,
                           seqlen=data.shape[1])
    block.train([data, sdata], data).seq_cross_entropy().grad_total_norm(1.0).adagrad(lr=lr).l2(wreg)\
         .autovalidate().seq_accuracy().validinter(4)\
         .train(numbats=numbats, epochs=epochs)

    pred = block.predict(testpred, shiftdata(testpred))
    print ints2words(np.argmax(pred, axis=2))
Example #15
def __init__(self, indim=1000, outdim=50, trainfrac=0.0, **kw):
    super(WordEmbedGlove, self).__init__(indim, outdim, **kw)
    self.emb = Glove(outdim, vocabsize=indim, trainfrac=trainfrac).block
Example #16
    def test_ns_training(self):
        num = 2000
        self.expshape = (num, 50)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(self.expshape[1], self.expshape[0])
        self.cemb = VectorEmbed(indim=self.expshape[0] + 1,
                                dim=self.expshape[1])
        self.assertRaises(Exception, self.glove.block.predict, [num + 1])
        self.assertRaises(Exception, self.cemb.predict, [num + 1])

        m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
        mg = MatchScore(self.glove.block,
                        self.glove.block)  # TODO factor out matchscore tests
        idxs = np.arange(num + 1)

        # glove against glove
        self.assertTrue(
            np.allclose(mg.predict([num, 100], [num, 100]), [
                np.linalg.norm(self.glove % num)**2,
                np.linalg.norm(self.glove % 100)**2
            ]))

        class NegIdxGen():
            def __init__(self, num):
                self.n = num

            def __call__(self, l, r):
                return l, np.random.randint(0, self.n, r.shape)

        m = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(5)\
            .adagrad(lr=0.1)\
            .train(numbats=50, epochs=50)

        print m.predict([num, num - 1, num - 2, num - 1],
                        [num, num - 1, num - 2, num - 2])

        mrr = 0.0
        recat10 = 0.0
        recat1 = 0.0
        tot = num + 1
        for a in range(tot):
            abc = zip(range(num + 1),
                      list(m.predict([a] * (num + 1), np.arange(0, num + 1))))
            abc = sorted(abc, key=lambda (x, y): y, reverse=True)
            #print abc[:10]
            for i in range(len(abc)):
                if abc[i][0] == a:
                    #print i
                    mrr += 1. / (1 + i)
                    if i < 10:
                        recat10 += 1
                    if i < 1:
                        recat1 += 1
                    break

        mrr /= tot
        recat10 /= tot
        recat1 /= tot
        print "%.3f MRR,\t%.3f MR@10,\t%.3f MR@1" % (mrr, recat10, recat1)
        self.assertGreater(mrr, 0.85)
        self.assertGreater(recat10, 0.9)
Example #17
def setUp(self):
    self.expshape = (4001, 50)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(self.expshape[1], self.expshape[0] - 1, maskid=-1)
    print self.glove.defaultpath
Example #18
def setUp(self):
    wdic = {"the": 10, "a": 5, "his": 50, "abracadabrqmsd--qsdfmqgf-": 6}
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(50, 4000, maskid=-1).adapt(wdic)
    self.vanillaglove = Glove(50, 4000, maskid=-1)
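
A hedged sketch of what the adapted embedder is presumably expected to do; the .adapt semantics assumed here (the embedder becomes indexable by wdic's own indices) are an illustration, not confirmed by these excerpts.

def test_adapted_matches_vanilla(self):
    # Assumed .adapt semantics: index 10 is "the" in wdic, so the adapted
    # embedder should return the same vector as vanilla Glove's "the".
    pred = self.glove.predict([10])
    gpred = self.vanillaglove.predict([self.vanillaglove * "the"])
    self.assertTrue(np.allclose(pred, gpred))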
Example #19
def run(epochs=50,
        numbats=700,
        lr=1.,
        wreg=0.000001,
        bidir=False,
        layers=1,
        embdim=200,
        encdim=400,
        decdim=400,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        checkdata=False,
        predencode=False,
        closenegsam=False,
        glove=False,
        atleastcan=0,
        wordchar=False,
        charencmode="rnn",  # rnn or cnn
        totalrandomtest=False,
        rarewords=0,
        ):
    maskid = -1
    tt = ticktock("predpred")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, testsubjsrels = readdata(wordchar=wordchar)

    if closenegsam:
        revsamplespace, revind = buildsamplespace(entmat, worddic)

    tt.tock("data loaded")
    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}
        def pp(widxs):
            print " ".join([rwd[x] if x in rwd else "" for x in widxs])
        embed()

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1

    if rarewords > 0:
        rwd = {v: k for k, v in worddic.items()}
        print "doing rare words"
        trainwordcounts = getmatrixvaluecounts(traindata, entmat)
        stwc = sorted(trainwordcounts.items(), key=lambda (x, y): y, reverse=True)
        fstwc = filter(lambda (x, y): y > rarewords, stwc)
        redwdic = dict(zip([rwd[k] for k, v in fstwc if k != maskid and k in rwd],
                           range(1, len(fstwc)+1)))
        redwdic["<RARE>"] = 0
        #embed()
    if bidir:
        encdim = [encdim / 2] * layers
    else:
        encdim = [encdim] * layers

    # question-side model
    if glove:
        if rarewords > 0:
            raise Exception("glove with rare words currently not supported")
        wordemb = Glove(embdim).adapt(worddic)
    else:
        if rarewords > 0:
            wordemb = WordEmb(dim=embdim, worddic=redwdic).adapt(worddic)
            #embed()
        else:
            wordemb = WordEmb(dim=embdim, worddic=worddic)
    if wordchar:
        print "wordchar model"
        numchars = 256
        if charencmode == "cnn":
            print "using CNN char encoder"
            charenc = CNNSeqEncoder(indim=numchars, inpembdim=50, innerdim=[embdim]*2,
                                    maskid=maskid, stride=1)
            wordenc = RNNSeqEncoder(inpemb=False, inpembdim=wordemb.outdim+embdim,
                                    innerdim=encdim, bidir=bidir).maskoptions(MaskMode.NONE)
            question_enc = TwoLevelEncoder(l1enc=charenc, l2emb=wordemb,
                                           l2enc=wordenc, maskid=maskid)
        else:
            question_enc = WordCharSentEnc(numchars=256, charembdim=50, charinnerdim=embdim,
                                           wordemb=wordemb, wordinnerdim=encdim, maskid=maskid,
                                           bidir=bidir)
    else:
        question_enc = SimpleSeq2Vec(inpemb=wordemb,
                                     inpembdim=wordemb.outdim,
                                     innerdim=encdim,
                                     maskid=maskid,
                                     bidir=bidir,
                                     layers=layers)

    # predicate-side model
    if predencode:
        predemb = MemVec(SimpleSeq2Vec(inpemb=wordemb,
                                inpembdim=wordemb.outdim,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=bidir,
                                layers=layers)
                         )
        predemb.load(entmat)
        """
        predemb = SimpleSeq2Vec(inpemb=wordemb,
                                inpembdim=wordemb.outdim,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=bidir,
                                layers=layers)

        class PreProc(object):
            def __init__(self, entmat):
                self.f = PreProcE(entmat)

            def __call__(self, encdata, decgold):
                return (encdata, self.f(decgold)[0][0]), {}

        class PreProcE(object):
            def __init__(self, entmat):
                self.em = Val(entmat)

            def __call__(self, x):
                return (self.em[x],), {}

        transf = PreProc(entmat)
        predtransf = transf.f
        """
    else:
        predemb = VectorEmbed(numents, decdim)
        """transf = None
        predtransf = None"""

    # scoring
    scorer = MatchScore(question_enc, predemb, scorer=CosineDistance())

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, gold.shape)
            return datas, predrand.astype("int32")

    class NegIdxGenClose(object):
        def __init__(self, revsamsp, rng):
            self.revsamsp = revsamsp
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            ret = np.zeros_like(gold)
            for i in range(gold.shape[0]):
                sampleset = self.revsamsp[gold[i]]
                if len(sampleset) > 5:
                    ret[i] = random.sample(sampleset, 1)[0]
                else:
                    ret[i] = np.random.randint(self.min, self.max)
            #embed()
            return datas, ret.astype("int32")


    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p
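    # For intuition (illustrative numbers, not from this script): with
    # margin=1, positives p=[2.0, 2.0] and negatives n=[0.5, 1.8] give hinge
    # losses (n - p + 1).clip(0, inf) = [0.0, 0.8] -- only the negative
    # within the margin is penalized -- while the plain objective n - p
    # gives [-1.5, -0.2] and keeps pushing every pair apart.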

    if closenegsam:
        tt.msg("using close neg sampler")
        negidxgen = NegIdxGenClose(revsamplespace, numents)
    else:
        negidxgen = NegIdxGen(numents)

    checkembschange = True
    if checkembschange:
        #embed()
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        embvals = embvar.d.get_value()
    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
                .negsamplegen(negidxgen) \
                .negrate(negrate) \
                .objective(obj) \
                .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
                .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")
    if checkembschange:
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        newembvals = embvar.d.get_value()
        embschanged = not np.allclose(embvals, newembvals)
        sumsqdiff = np.sum((newembvals - embvals)**2)
        print "Embeddings {}: {} sum of square diffs"\
            .format("changed" if embschanged else "did not change", sumsqdiff)

    # evaluation
    tt.tick("evaluating")
    qenc_pred = question_enc.predict(testdata)
    scores = []
    dontembed = True
    if atleastcan > 0:
        print "ensuring at least {} cans".format(atleastcan)
    if totalrandomtest:
        print "total randomness"
    for i in range(qenc_pred.shape[0]):
        if totalrandomtest:
            cans = [testgold[i]]
        else:
            cans = testsubjsrels[i][0] #+ testsubjsrels[i][1]
        if len(cans) < atleastcan:
            extracans = list(np.random.randint(0, numents, (atleastcan+50,)))
            extracans = list(set(extracans).difference(set(cans)))
            cans = cans + extracans[:max(0, min(len(extracans), atleastcan - len(cans)))]
            #print len(cans), cans
        if not dontembed:
            embed()
        #cans = set(cans)
        #if atleastcan > 0:
        #    while len(cans) < atleastcan:
        #        rancan = np.random.randint(0, numents)
        #        if rancan not in cans:
        #            cans.add(rancan)
        #cans = list(cans)
        if len(cans) == 0:
            scores.append([(-1, -np.infty)])
            continue
        #canembs = predemb.predict.transform(predtransf)(cans)
        canembs = predemb.predict(cans)
        scoresi = scorer.s.predict(np.repeat(qenc_pred[np.newaxis, i],
                                             canembs.shape[0], axis=0),
                                   canembs)
        scores.append(zip(cans, scoresi))
        if debug:
            embed()
        tt.progress(i, qenc_pred.shape[0], live=True)
    sortedbest = [sorted(cansi, key=lambda (x, y): y, reverse=True) for cansi in scores]
    best = [sortedbesti[0][0] for sortedbesti in sortedbest]
    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]


    print("Accuracy: {}%".format(accuracy * 100))
Example #20
def run(
    epochs=10,
    numbats=100,
    negrate=1,
    lr=0.1,
    embdim=50,
    encdim=50,
    wreg=0.00005,
    marginloss=False,
    margin=1.,
    cosine=False,
    bidir=False,
):
    tt = ticktock("script")
    # get glove words
    g = Glove(encdim)
    words = g.D.keys()
    maxwordlen = 0
    for word in words:
        maxwordlen = max(maxwordlen, len(word))
    chars = set("".join(words))
    chars.add(" ")
    print "{} words, maxlen {}, {} characters in words".format(
        len(words), maxwordlen, len(chars))
    # get char word matrix
    chardic = dict(zip(chars, range(len(chars))))
    pickle.dump(chardic, open("glove2c2w.chardic.pkl", "w"))
    charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
    charwordmat[0, 0] = chardic[" "]
    for i in range(0, len(words)):
        word = words[i]
        charwordmat[i + 1, :len(word)] = [chardic[x] for x in word]
    print charwordmat[0]
    # encode characters
    cwenc = SimpleSeq2Vec(indim=len(chars),
                          inpembdim=embdim,
                          innerdim=encdim / 2 if bidir else encdim,
                          maskid=-1,
                          bidir=bidir)
    dist = CosineDistance() if cosine else EuclideanDistance()  #DotDistance()
    print "using " + str(dist)
    scorer = MatchScore(cwenc, g.block, scorer=dist)
    '''
    scorer.train([charwordmat, np.arange(len(words)+1)], np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\
        .linear_objective().adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)

    #embed()
    '''
    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            return datas, np.random.randint(self.min, self.max,
                                            gold.shape).astype("int32")

    if marginloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p

    nscorer = scorer.nstrain([charwordmat, np.arange(len(words)+1)])\
        .negsamplegen(NegIdxGen(len(words))).negrate(negrate)\
        .objective(obj).adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)

    cwenc.save("glove2c2w.block")
Example #21
def run(
    negsammode="closest",  # "close" or "random"
    usetypes=True,
    mode="concat",  # "seq" or "concat" or "multi" or "multic" or "bino"
    glove=True,
    embdim=100,
    charencdim=100,
    charembdim=50,
    encdim=400,
    bidir=False,
    layers=1,
    charenc="rnn",  # "cnn" or "rnn"
    margin=0.5,
    lr=0.1,
    numbats=700,
    epochs=15,
    gradnorm=1.0,
    wreg=0.0001,
    loadmodel="no",
    debug=False,
    debugtest=False,
    forcesubjincl=False,
    randsameval=0,
    numtestcans=5,
    multiprune=-1,
    checkdata=False,
    testnegsam=False,
    testmodel=False,
    sepcharembs=False,
):
    tt = ticktock("script")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    (subjmat, relmat), (subjdic, reldic), worddic, \
    subjinfo, (testsubjcans, relsperent) = readdata(debug=debug,
                                                    numtestcans=numtestcans if numtestcans > 0 else None)

    if usetypes:
        print "building type matrix"
        typmat = buildtypmat(subjmat, subjinfo, worddic)
        subjmat = np.concatenate([typmat, subjmat], axis=1)
        typlen = typmat.shape[1]

    relsamplespace = None
    subjsamplespace = None
    if negsammode == "closest" or negsammode == "close":
        relsamplespace, revind = buildrelsamplespace(relmat, worddic)
        subjsamplespace = loadsubjsamplespace()
    tt.tock("data loaded")

    if checkdata:
        embed()

    numwords = max(worddic.values()) + 1
    numsubjs = max(subjdic.values()) + 1
    numrels = max(reldic.values()) + 1
    maskid = -1
    numchars = 256

    nsrelsperent = relsperent if negsammode == "closest" else None

    if testnegsam:
        nig = NegIdxGen(numsubjs - 1,
                        numrels - 1,
                        relclose=relsamplespace,
                        subjclose=subjsamplespace,
                        relsperent=nsrelsperent)
        embed()

    if mode == "seq" or mode == "multi":
        decdim = encdim
    elif mode == "concat" or mode == "multic" or mode == "bino":
        decdim = encdim / 2
    else:
        raise Exception("unrecognized mode")

    print "{} mode: {} decdim".format(mode, decdim)

    # defining model
    if glove:
        wordemb = Glove(embdim).adapt(worddic)
    else:
        wordemb = WordEmb(dim=embdim, indim=numwords)

    charemb = VectorEmbed(indim=numchars, dim=charembdim)
    charemb2 = VectorEmbed(indim=numchars, dim=charembdim)
    if charenc == "cnn":
        print "using CNN char encoder"
        charenc = CNNSeqEncoder(inpemb=charemb,
                                innerdim=[charencdim] * 2,
                                maskid=maskid,
                                stride=1)
    elif charenc == "rnn":
        print "using RNN char encoder"
        charenc = RNNSeqEncoder(inpemb=charemb, innerdim=charencdim) \
            .maskoptions(maskid, MaskMode.AUTO)
    else:
        raise Exception("no other character encoding modes available")

    if bidir:
        encdim = encdim / 2

    if mode != "bino":
        if mode == "multi" or mode == "multic":
            wordenc = \
                SimpleSeq2MultiVec(inpemb=False, inpembdim=wordemb.outdim + charencdim,
                                   innerdim=encdim, bidir=bidir, numouts=2, mode="seq")
        else:
            encdim = [encdim] * layers
            wordenc = RNNSeqEncoder(inpemb=False,
                                    inpembdim=wordemb.outdim + charencdim,
                                    innerdim=encdim,
                                    bidir=bidir).maskoptions(MaskMode.NONE)

        question_encoder = TwoLevelEncoder(l1enc=charenc,
                                           l2emb=wordemb,
                                           l2enc=wordenc,
                                           maskid=maskid)

    else:
        question_encoder = BinoEncoder(charenc=charenc,
                                       wordemb=wordemb,
                                       maskid=maskid,
                                       scadim=100,
                                       encdim=encdim / 2,
                                       bidir=bidir,
                                       enclayers=layers,
                                       outdim=decdim,
                                       scabidir=True)

    # encode predicate on word level
    predemb = SimpleSeq2Vec(inpemb=wordemb,
                            innerdim=decdim,
                            maskid=maskid,
                            bidir=False,
                            layers=1)

    #predemb.load(relmat)

    scharemb = charemb2 if sepcharembs else charemb
    if usetypes:
        # encode subj type on word level
        subjtypemb = SimpleSeq2Vec(inpemb=wordemb,
                                   innerdim=int(np.ceil(decdim * 1. / 2)),
                                   maskid=maskid,
                                   bidir=False,
                                   layers=1)
        # encode subject on character level
        charbidir = True
        charencinnerdim = int(np.floor(decdim * 1. / 2))
        charenclayers = 1
        if charbidir:
            charencinnerdim /= 2
            charenclayers = 2
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=charencinnerdim,
                                maskid=maskid,
                                bidir=charbidir,
                                layers=charenclayers)
        subjemb = TypedSubjBlock(typlen, subjemb, subjtypemb)
    else:
        # encode subject on character level
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=False,
                                layers=1)
    #subjemb.load(subjmat)
    if testmodel:
        embed()
    # package
    if mode == "seq":
        lb = SeqLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "concat":
        lb = ConcatLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "multi" or mode == "multic":
        lb = MultiLeftBlock(question_encoder, mode)
        rb = RightBlock(subjemb, predemb)
    elif mode == "bino":
        lb = question_encoder
        rb = RightBlock(subjemb, predemb)
    else:
        raise Exception("unrecognized mode")
    scorer = SeqMatchScore(lb,
                           rb,
                           scorer=CosineDistance(),
                           aggregator=lambda x: x,
                           argproc=lambda x, y, z: ((x, ), (y, z)))

    obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    class PreProc(object):
        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, data, gold):  # gold: idxs-(batsize, 2)
            st = self.ef(gold[:, 0])[0][0]
            rt = self.rf(gold[:, 1])[0][0]
            return (data, st, rt), {}

    class PreProcE(object):
        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, x):
            subjslice = self.ef(x[:, 0])[0][0]
            relslice = self.rf(x[:, 1])[0][0]
            return (subjslice, relslice), {}

    class PreProcEnt(object):
        def __init__(self, mat):
            self.entmat = Val(mat)

        def __call__(self, x):
            return (self.entmat[x], ), {}

    transf = PreProc(subjmat, relmat)

    if debug:
        embed()

    if epochs > 0 and loadmodel == "no":
        tt.tick("training")
        saveid = "".join([str(np.random.randint(0, 10)) for i in range(4)])
        print("CHECKPOINTING AS: {}".format(saveid))
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numsubjs-1, numrels-1,
                                    relclose=relsamplespace,
                                    subjclose=subjsamplespace,
                                    relsperent=nsrelsperent)) \
            .objective(obj).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm) \
            .validate_on([validdata, validgold]) \
            .autosavethis(scorer, "fullrank{}.model".format(saveid)) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained").tick()

        # saving
        #scorer.save("fullrank{}.model".format(saveid))
        print("SAVED AS: {}".format(saveid))

    if loadmodel is not "no":
        tt.tick("loading model")
        m = SeqMatchScore.load("fullrank{}.model".format(loadmodel))
        #embed()
        lb = m.l
        subjemb = m.r.subjenc
        predemb = m.r.predenc
        tt.tock("loaded model")

    # evaluation
    predictor = CustomPredictor(
        questionencoder=lb,
        entityencoder=subjemb,
        relationencoder=predemb,
        #mode=mode,
        enttrans=transf.ef,
        reltrans=transf.rf,
        debug=debugtest,
        subjinfo=subjinfo)

    tt.tick("predicting")
    if forcesubjincl:  # forces the intended subject entity to be among candidates
        for i in range(len(testsubjcans)):
            if testgold[i, 0] not in testsubjcans[i]:
                testsubjcans[i].append(testgold[i, 0])

    if randsameval > 0:  # generate random sampling eval data
        testsubjcans = np.random.randint(0, numsubjs,
                                         (testgold.shape[0], randsameval))
        testrelcans = np.random.randint(0, numrels,
                                        (testgold.shape[0], randsameval))
        testsubjcans = np.concatenate([testgold[:, 0:1], testsubjcans], axis=1)
        testrelcans = np.concatenate([testgold[:, 1:2], testrelcans], axis=1)
        testsubjcans = testsubjcans.tolist()
        testrelcans = testrelcans.tolist()
        prediction = predictor.predict(testdata,
                                       entcans=testsubjcans,
                                       relcans=testrelcans)
    else:
        prediction = predictor.predict(testdata,
                                       entcans=testsubjcans,
                                       relsperent=relsperent,
                                       multiprune=multiprune)
    tt.tock("predicted")
    tt.tick("evaluating")
    evalmat = prediction == testgold
    subjacc = np.sum(evalmat[:, 0]) * 1. / evalmat.shape[0]
    predacc = np.sum(evalmat[:, 1]) * 1. / evalmat.shape[0]
    totalacc = np.sum(np.sum(evalmat, axis=1) == 2) * 1. / evalmat.shape[0]
    print "Test results ::::::::::::::::"
    print "Total Acc: \t {}".format(totalacc)
    print "Subj Acc: \t {}".format(subjacc)
    print "Pred Acc: \t {}".format(predacc)
    tt.tock("evaluated")

    def subjinspect(subjrank, gold):
        ret = [
            (("GOLD - " if gold == x else "       ") + subjinfo[x][0] + " (" +
             " ".join(subjinfo[x][1]) + ")" + str(subjinfo[x][3]) + " rels",
             y) if x in subjinfo else (x, y) for x, y in subjrank
        ]
        return ret

    def inspectboth(hidecorrect=False, hidenotincan=False):
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            predx = testgold[i, 1]
            subjrank = predictor.subjranks[i]
            predrank = predictor.relranks[i]
            if hidecorrect and subjx == subjrank[0][0] and predrank[0][
                    0] == predx:
                continue
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue

    def inspectsubjs(hidecorrect=False,
                     hidenotincan=False,
                     shownotincan=False):
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            subjrank = predictor.subjranks[i]
            if subjx == subjrank[0][0] and hidecorrect:  # only look for errors
                continue
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue
            if shownotincan and subjx in [k for k, v in subjrank]:
                continue
            print "test question {}: {} \t GOLD: {}".format(
                i, wordids2string(
                    testdata[i, :, 0], rwd), "{} ({}) - {} rels --- {}".format(
                        *([
                            subjinfo[subjx][0], subjinfo[subjx][1],
                            subjinfo[subjx][3], subjinfo[subjx][2]
                        ] if subjx in
                          subjinfo else ["<UNK>", "<UNK>", "<UNK>", "<UNK>"])))
            inspres = subjinspect(subjrank, subjx)
            i = 1
            for inspre in inspres:
                print "{}:\t{}\t{}".format(i, inspre[1], inspre[0])
                if i % 50 == 0:
                    inp()
                i += 1
            inp()

    def inspectpreds(hidecorrect=False):
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.relranks)):
            relx = testgold[i, 1]
            subjx = testgold[i, 0]
            relrank = predictor.relranks[i]
            if relx == relrank[0][0] and hidecorrect:
                continue
            print "test question {}: {} \t GOLD: {}".format(
                i, wordids2string(testdata[i, :, 0], rwd),
                wordids2string(relmat[relx, :], rwd))
            inspres = [(("GOLD - " if relx == x else "        ") +
                        wordids2string(relmat[x], rwd), y) for x, y in relrank]
            i = 1
            for inspre in inspres:
                print "{}:\t{}\t{}".format(i, inspre[1], inspre[0])
                if i % 50 == 0:
                    inp()
                i += 1
            inp()

    embed()
Example #22
def getdic2glove(worddic, dim=50):
    g = Glove(dim)
    revdic = {v: k for k, v in worddic.items()}
    d2g = lambda x: g * revdic[x] if x in revdic else x
    newdic = {k: d2g(v) for k, v in worddic.items()}
    return d2g, newdic
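
Illustrative usage of getdic2glove (hypothetical dictionary; mirrors the np.vectorize(d2g) pattern in toglove above):

worddic = {"the": 1, "monkey": 2}
d2g, newdic = getdic2glove(worddic)
wordmat = np.asarray([[1, 2]], dtype="int32")
glovemat = np.vectorize(d2g)(wordmat)  # same shape, now in Glove's index space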
Example #23
        return subjinf

    if cachep is not None:
        if os.path.isfile(cachep):  # load
            tt.tick("loading cached subject info")
            subjinfo = pickle.load(open(cachep))
            tt.tock("loaded cached subject info")
        else:  # make  and dump
            subjinfo = make()
            tt.tick("dumping subject info in cache")
            pickle.dump(subjinfo, open(cachep, "w"))
            tt.tock("dumped subject info in cache")
    else:  # just make
        subjinfo = make()
    return subjinfo


if __name__ == "__main__":
    x = np.random.randint(0, 50, (5, 4, 3))
    x = np.concatenate([np.random.randint(0, 1000, (5, 4, 1)), x], axis=2)
    x = np.concatenate([x, np.zeros_like(x)], axis=1)
    print x, x.shape
    m = WordCharSentEnc(numchars=50,
                        charembdim=10,
                        charinnerdim=20,
                        wordemb=Glove(50, 1000),
                        wordinnerdim=3,
                        maskid=0,
                        returnall=True)
    pred = m.predict(x)
    print pred, pred.shape