def setUp(self):
    """Build a tiny WordEmb and override it with a miniature Glove.

    self.emb answers with Glove vectors for words Glove knows and with
    the base embedding's vectors otherwise.
    """
    tokens = "the a his monkey inception key earlgrey".split()
    # ids start at 1; -1 is reserved as the mask id
    wdic = {w: i + 1 for i, w in enumerate(tokens)}
    self.baseemb = WordEmb(dim=50, worddic=wdic, maskid=-1)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(50, 4000)
    self.emb = self.baseemb.override(self.glove)
def do_custom_emb(inpemb, outemb, awc, embdim):
    """Override embeddings with Glove vectors.

    The input embedding is overridden for every word Glove knows; the
    output embedding only for "rare" words, i.e. those with a count in
    `awc` (word -> count mapping) below `thresh`.

    Returns the pair (inpemb, outemb) with the overrides applied.
    """
    thresh = 10
    # Filter directly instead of sorting first: the sort's result was only
    # fed into a set comprehension, so ordering was irrelevant. This also
    # drops the tuple-unpacking lambda `lambda (k, v): v`, which is
    # Python-2-only syntax (removed by PEP 3113).
    rarewords = {k for (k, v) in awc.items() if v < thresh}
    g = Glove(embdim)
    inpemb = inpemb.override(g)
    outemb = outemb.override(g, which=rarewords)
    return inpemb, outemb
def run_seq2idx(  # works after refactoring (with adagrad)
        wreg=0.0,
        epochs=75,
        numbats=20,
        lr=1.,
        statedim=100,
    ):
    """Train a seq2idx block: character sequence in, word index out.

    Uses Glove's vocabulary as the word list; lowercase-only words are
    encoded as int sequences and the model learns to map each character
    sequence back to its word index.
    """
    # get words
    numchars = 27  # a-z plus one padding/extra slot — assumed, confirm against words2ints
    embdim = 50
    lm = Glove(embdim, 1000)
    # keep only purely-alphabetic lowercase words
    words = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    #wldf = pd.DataFrame(map(word2int, words)).fillna(0)
    #data = wldf.values.astype("int32")
    data = words2ints(words)
    #embs = lm.W[map(lambda x: lm * x, words), :]
    #embed()
    wordidxs = np.arange(0, len(words))
    print wordidxs[:5]
    print data[:5]
    print words[:5]
    numwords = wordidxs.shape[0]
    # baseline: negative log probability of a uniformly random sequence
    print "random seq neg log prob %.3f" % math.log(numchars**data.shape[1])
    testneglogprob = 17
    print "%.2f neg log prob for a whole sequence is %.3f prob per slot" % (
        testneglogprob, math.exp(-testneglogprob * 1. / data.shape[1]))
    testpred = ["the", "of", "to", "their", "in"]
    testpred = words2ints(testpred)
    print words[:5], words[35]
    ####testpred = np.eye(numchars, numchars)[testpred, :]
    # NOTE(review): wordidxsonehot is computed but never used below
    wordidxsonehot = np.eye(numwords, numwords)[wordidxs, :]
    ####data = np.eye(numchars, numchars)[data, :]
    block = seq2idx(invocsize=numchars, outvocsize=numwords, innerdim=statedim)
    '''gru = GRU(innerdim=statedim, dim=numchars)
    lin = Lin(indim=statedim, dim=numwords)
    lin2 = Lin(indim=numwords, dim=numwords)
    block = asblock(lambda x: Softmax()(lin(gru(x)[:, -1, :])))'''
    ###block = asblock(lambda x: Softmax()(lin2(x)))
    '''
    print testpred
    probepred = np.argmax(block.predict(testpred), axis=1)
    print probepred
    for p in block.output.allparams:
        print p
    '''
    # cross-entropy training with adagrad; accuracy on an auto-split validation set
    block.train([data], wordidxs).cross_entropy().adagrad(lr=lr).autovalidate().accuracy().validinter(5)\
        .train(numbats=numbats, epochs=epochs)
    #embed()
    pred = block.predict(testpred)
    print pred.shape
    print np.argmax(pred, axis=1)
def __init__(self, indim=4000, outdim=100, embdim=50, embtrainfrac=0.0, **kw):
    """Embedding that pairs a trainable part with a Glove-backed part.

    The combined output width reported to the superclass is the sum of
    the trainable width (outdim) and the Glove width (embdim).
    """
    combined = outdim + embdim
    super(WordEmbedPlusGlove, self).__init__(indim, combined, **kw)
    # Glove-backed block over the same vocabulary size
    self.glove = Glove(embdim, vocabsize=indim, trainfrac=embtrainfrac).block
    # freely trainable embedding part
    self.emb = VectorEmbed(indim=indim, dim=outdim)
def test_ns_training(self):
    """Negative-sampling training of a Glove-vs-VectorEmbed MatchScore,
    with external validation (MRR / recall@k) and returned error history."""
    num = 2000
    self.expshape = (num, 50)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(self.expshape[1], self.expshape[0])
    self.cemb = VectorEmbed(indim=self.expshape[0] + 1, dim=self.expshape[1])
    # out-of-vocabulary index must be rejected by the Glove block only
    self.assertRaises(Exception, self.glove.block.predict, [num + 1])
    self.assertRaises(Exception, self.cemb.predict, [num + 1])
    m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
    mg = MatchScore(self.glove.block, self.glove.block)  # TODO factor out matchscore tests
    idxs = np.arange(num + 1)
    # glove against glove: self-match score should be the squared norm
    self.assertTrue(
        np.allclose(mg.predict([num, 100], [num, 100]), [
            np.linalg.norm(self.glove % num)**2,
            np.linalg.norm(self.glove % 100)**2
        ]))

    class NegIdxGen():
        # draws random indices in [0, n) with the same shape as the gold
        def __init__(self, num):
            self.n = num

        def __call__(self, l, r):
            return l, np.random.randint(0, self.n, r.shape)

    vdata = np.arange(num)
    negrate = 5

    def obj(p, n):
        # ranking objective: negative score minus positive score
        return n - p

    # train with negative sampling; returnerrors=True also yields the
    # training/validation error histories
    m, err, verr, _, _ = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(negrate)\
        .adagrad(lr=0.1).objective(obj) \
        .validate_on([vdata, vdata]).extvalid(geteval(m.predict, num, negrate)).validinter(30) \
        .train(numbats=50, epochs=29, returnerrors=True)
    #.writeresultstofile("testingresultswriter.tsv") \
    tdata = np.arange(num)
    tt = ticktock("eval")
    tt.tick()
    mrr, recat1, recat10 = geteval(m.predict, num, 1)(tdata)
    tt.tock("evaluated test data")
    print "%.4f MRR,\t%.4f MR@10,\t%.4f MR@1" % (mrr, recat10, recat1)
    self.assertGreater(mrr, 0.85)
    self.assertGreater(recat10, 0.9)
    print verr
    # final validation entry must reproduce the test-time metrics
    self.assertTrue(
        np.allclose(
            np.asarray([mrr, recat1, recat10]),
            np.asarray(verr[-1][1:])))
class TestGloveOverriding(TestCase):
    """WordEmb.override(Glove): Glove-known words take Glove vectors,
    unknown words fall back to the base embedding."""

    def setUp(self):
        # tiny base vocabulary; ids start at 1, -1 is the mask id
        words = "the a his monkey inception key earlgrey"
        wdic = dict(zip(words.split(), range(1, len(words.split()) + 1)))
        self.baseemb = WordEmb(dim=50, worddic=wdic, maskid=-1)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(50, 4000)
        self.emb = self.baseemb.override(self.glove)

    def test_embed_masker(self):
        # mask is True exactly where the input differs from maskid (-1)
        v = Val(np.random.randint(-1, 5, (4, 3)))
        m = self.emb(v).mask
        self.assertTrue(np.all((v.v != -1) == m.v))

    def test_sameasglove(self):
        # words assumed present in miniglove -> overridden embedding matches Glove
        words = "key the a his"
        pred = self.emb.predict([self.emb * x for x in words.split()])
        gpred = self.glove.predict([self.glove * x for x in words.split()])
        self.assertTrue(np.allclose(pred, gpred))

    def test_sameasbase(self):
        # words assumed absent from miniglove -> falls back to base embedding
        words = "inception monkey earlgrey"
        pred = self.emb.predict([self.emb * x for x in words.split()])
        gpred = self.baseemb.predict([self.baseemb * x for x in words.split()])
        self.assertTrue(np.allclose(pred, gpred))

    def test_notasglove(self):
        # fallback words must not coincide with Glove's vectors
        words = "inception monkey earlgrey"
        pred = self.emb.predict([self.emb * x for x in words.split()])
        gpred = self.glove.predict([self.glove * x for x in words.split()])
        self.assertFalse(np.allclose(pred, gpred))

    def test_notasbase(self):
        # overridden words must not coincide with the base embedding's vectors
        words = "key the a his"
        pred = self.emb.predict([self.emb * x for x in words.split()])
        gpred = self.baseemb.predict([self.baseemb * x for x in words.split()])
        self.assertFalse(np.allclose(pred, gpred))
def __init__(self, numchars=220, numwords=4e5, encdim=100, embdim=50,
             embtrainfrac=0.0, maskid=0, glovepath=None, **kw):
    """Character-level word encoder combined with a Glove word embedding.

    NOTE(review): numwords defaults to the float 4e5; Glove presumably
    accepts/coerces a float vocabsize — confirm.
    """
    super(WordEncoderPlusGlove, self).__init__(**kw)
    # Glove-backed block; path override allows a non-default vector file
    self.glove = Glove(embdim, vocabsize=numwords, trainfrac=embtrainfrac,
                       path=glovepath).block
    # character-level encoder producing encdim-wide word representations
    self.enc = WordEncoder(indim=numchars, outdim=encdim, maskid=maskid)
def run_seqdecatt(  # seems to work
        wreg=0.00001,
        epochs=50,
        numbats=50,
        lr=0.1,
        statedim=50,
        encdim=50,
        attdim=50,
        numwords=5000,
    ):
    """Train an attention-based seq2seq encoder-decoder to copy words
    (character sequence -> same character sequence), then greedily decode
    a few test words."""
    # get words
    vocsize = 28  # character vocabulary size — assumed a-z + extras, confirm against words2ints
    lm = Glove(50, numwords)
    allwords = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    #embed()
    # reversed words were an alternative target (currently unused)
    invwords = [word[::-1] for word in allwords]
    data = words2ints(allwords)
    idata = words2ints(invwords)
    startsym = 0
    golddata = data
    #golddata = idata
    print data[:10]
    print shiftdata(data, startsym)[:10]
    testwords = [
        "the", "alias", "mock", "test", "stalin", "allahuakbar",
        "python", "pythonista"
    ]
    testpred = words2ints(testwords)
    block = SimpleSeqEncDecAtt(inpvocsize=vocsize,
                               outvocsize=vocsize,
                               encdim=encdim,
                               decdim=statedim,
                               attdim=attdim,
                               inconcat=False,
                               bidir=False,
                               statetrans=None)
    # teacher forcing: decoder input is the gold shifted right by startsym
    block.train([data, shiftdata(golddata, startsym)], golddata).seq_cross_entropy().grad_total_norm(1.0).adagrad(lr=lr).l2(wreg) \
        .split_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(2) \
        .train(numbats=numbats, epochs=epochs)
    s = SeqEncDecSearch(block)
    pred, probs = s.decode(testpred, startsym, testpred.shape[1])
    print ints2words(pred), probs
def run_idx2seq(  # works after refactor
        wreg=0.000001,
        epochs=150,
        numbats=10,
        lr=0.1,
        statedim=70,
        encdim=100,
    ):
    """Train an idx2seq block: word index in, character sequence out."""
    # get words
    numchars = 27  # assumed a-z + padding — confirm against words2ints
    embdim = 50
    lm = Glove(embdim, 1000)
    words = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    data = words2ints(words)
    sdata = shiftdata(data)  # decoder input: gold shifted right
    wordidxs = np.arange(0, len(words))
    numwords = wordidxs.shape[0]
    # baseline: neg log prob of a uniformly random character sequence
    print "random seq neg log prob %.3f" % math.log(numchars**data.shape[1])
    testneglogprob = 17
    print "%.2f neg log prob for a whole sequence is %.3f prob per slot" % (
        testneglogprob, math.exp(-testneglogprob * 1. / data.shape[1]))
    testpred = wordidxs[:15]
    testdata = data[:15]
    testsdata = sdata[:15]
    print testpred
    print testdata
    print testsdata
    #testpred = words2ints(testpred)
    block = idx2seq(encdim=encdim, invocsize=numwords, outvocsize=numchars,
                    innerdim=statedim, seqlen=data.shape[1])
    # sanity check: untrained predictions
    print np.argmax(block.predict(testpred, testsdata), axis=2)
    print block.output.allparams
    block.train([wordidxs, sdata], data).seq_cross_entropy().grad_total_norm(0.5).adagrad(lr=lr).l2(wreg)\
        .autovalidate().seq_accuracy().validinter(5)\
        .train(numbats=numbats, epochs=epochs)
    pred = block.predict(testpred, testsdata)
    for word in ints2words(np.argmax(pred, axis=2)):
        print word
    embed()
def run_attentionseqdecoder(  # seems to work
        wreg=0.00001,  # TODO: regularization other than 0.0001 first stagnates, then goes down
        epochs=100,
        numbats=20,
        lr=0.1,
        statedim=50,
        encdim=50,
        attdim=50):
    """Train a bidirectional attention decoder to reproduce character
    sequences, validating on a held-out word split."""
    # get words
    vocsize = 27  # assumed a-z + padding — confirm against words2ints
    embdim = 50
    lm = Glove(embdim, 2000)
    allwords = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    # first 1000 words train, remainder validate
    words = allwords[:1000]
    vwords = allwords[1000:]
    data = words2ints(words)
    sdata = shiftdata(data)
    vdata = words2ints(vwords)
    svdata = shiftdata(vdata)
    print "random seq neg log prob %.3f" % math.log(vocsize**data.shape[1])
    testneglogprob = 17
    print "%.2f neg log prob for a whole sequence is %.3f prob per slot" % (
        testneglogprob, math.exp(-testneglogprob * 1. / data.shape[1]))
    # last test entry is an all-blank sequence of full length
    testpred = [
        "the", "alias", "mock", "test", "stalin", "allahuakbar",
        "python", "pythonista", " " * (data.shape[1])
    ]
    testpred = words2ints(testpred)
    print testpred
    block = BiFwdAttSumDecoder(vocsize=vocsize, outvocsize=vocsize,
                               encdim=encdim, innerdim=statedim,
                               attdim=attdim)
    block.train([data, sdata], data).seq_cross_entropy().grad_total_norm(1.0).adagrad(lr=lr).l2(wreg)\
        .validate_on([vdata, svdata], vdata).seq_accuracy().validinter(4)\
        .train(numbats=numbats, epochs=epochs)
    pred = block.predict(testpred, shiftdata(testpred))
    print ints2words(np.argmax(pred, axis=2))
    embed()
def test_gloveglove(self):
    """Augmenting one Glove with another concatenates their vectors;
    indices the smaller Glove lacks get zeros in its half."""
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    dim = 50
    big = Glove(dim, 2000)
    small = Glove(dim, 1000)

    combo = big.augment(small)
    # index 1000: first half from the big Glove, second half from the small one
    out = combo.predict([1000])
    self.assertTrue(np.allclose(big % 1000, out[0, :dim]))
    self.assertTrue(np.allclose(small % 1000, out[0, dim:]))
    # index 1001 is outside the small Glove: its half must be all zeros
    out = combo.predict([1001])
    self.assertTrue(np.allclose(big % 1001, out[0, :dim]))
    self.assertTrue(np.allclose(out[0, dim:], np.zeros_like(out[0, dim:])))

    # augmenting the other way round: both halves agree on shared indices
    combo = small.augment(big)
    out = combo.predict([1, 2, 3, 4, 5, 50, 500, 1000])
    self.assertTrue(np.allclose(out[:, :dim], out[:, dim:]))
def setUp(self):
    """Train a small idx2seq model for a few epochs and keep frozen
    snapshots of the block before and after training for the tests."""
    wreg = 0.001
    epochs = 3
    numbats = 10
    lr = 0.1
    statedim = 70
    encdim = 70
    # get words
    numchars = 27  # assumed a-z + padding — confirm against words2ints
    embdim = 50
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    lm = Glove(embdim, 1000)
    words = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    data = words2ints(words)
    sdata = shiftdata(data)
    wordidxs = np.arange(0, len(words))
    numwords = wordidxs.shape[0]
    print "random seq neg log prob %.3f" % math.log(numchars**data.shape[1])
    testneglogprob = 17
    print "%.2f neg log prob for a whole sequence is %.3f prob per slot" % (
        testneglogprob, math.exp(-testneglogprob * 1. / data.shape[1]))
    testpred = wordidxs[:15]
    testdata = data[:15]
    testsdata = sdata[:15]
    print testpred
    print testdata
    print testsdata
    #testpred = words2ints(testpred)
    block = idx2seq(encdim=encdim, invocsize=numwords, outvocsize=numchars,
                    innerdim=statedim, seqlen=data.shape[1])
    print np.argmax(block.predict(testpred, testsdata), axis=2)
    # snapshot of the untrained parameters
    self.block_before_training_frozen = block.freeze()
    block.train([wordidxs, sdata], data).seq_cross_entropy().grad_total_norm(0.5).adagrad(lr=lr).l2(wreg)\
        .autovalidate().seq_accuracy().validinter(5)\
        .train(numbats=numbats, epochs=epochs)
    # snapshot of the trained parameters
    self.block_after_training_frozen = block.freeze()
    pred = block.predict(testpred, testsdata)
def toglove(wordmat, worddic, dim=50):
    """Report Glove coverage of a word-id matrix and remap ids to Glove ids.

    NOTE(review): newdic, newmat and revgdic are computed but never
    returned — the function currently returns None; looks unfinished,
    confirm intended return value.
    """
    g = Glove(dim)
    gws = set(g.D.keys())
    wdws = set(worddic.keys())
    # words in worddic that Glove does not know
    diff = wdws.difference(gws)
    # gather states about diff
    diffcounts = {worddic[k]: 0 for k in diff}
    total = 0
    moretal = 0
    # count occurrences of Glove-unknown ids; negative cells are treated
    # as padding/mask and skipped
    for i in range(wordmat.shape[0]):
        for j in range(wordmat.shape[1]):
            if wordmat[i, j] >= 0:
                total += 1
                if wordmat[i, j] in diffcounts:
                    diffcounts[wordmat[i, j]] += 1
                    moretal += 1
    diffcounts = sorted(diffcounts.items(), key=lambda (k, v): v, reverse=True)
    print "%d words unknown by Glove of %d total words" % (moretal, total)
    revdic = {v: k for k, v in worddic.items()}
    # map a worddic id to its Glove id; ids not in revdic pass through
    d2g = lambda x: g * revdic[x] if x in revdic else x
    newdic = {k: d2g(v) for k, v in worddic.items()}
    newmat = np.vectorize(d2g)(wordmat)
    revgdic = {v: k for k, v in g.D.items()}
def run_RNNAutoEncoder( # works after refactoring wreg=0.000001, epochs=50, numbats=20, lr=0.1, statedim=70, encdim=70): # get words vocsize = 27 embdim = 50 lm = Glove(embdim, 1000) words = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys()) #wldf = pd.DataFrame(map(word2int, words)).fillna(0) #data = wldf.values.astype("int32") data = words2ints(words) sdata = shiftdata(data) embs = lm.W[map(lambda x: lm * x, words), :] print embs.shape, data.shape #embed() print "random seq neg log prob %.3f" % math.log(vocsize**data.shape[1]) testneglogprob = 17 print "%.2f neg log prob for a whole sequence is %.3f prob per slot" % ( testneglogprob, math.exp(-testneglogprob * 1. / data.shape[1])) testpred = ["the", "alias", "mock", "test", "stalin"] testpred = words2ints(testpred) block = RNNAutoEncoder(vocsize=vocsize, encdim=70, innerdim=statedim, seqlen=data.shape[1]) block.train([data, sdata], data).seq_cross_entropy().grad_total_norm(1.0).adagrad(lr=lr).l2(wreg)\ .autovalidate().seq_accuracy().validinter(4)\ .train(numbats=numbats, epochs=epochs) pred = block.predict(testpred, shiftdata(testpred)) print ints2words(np.argmax(pred, axis=2))
def __init__(self, indim=1000, outdim=50, trainfrac=0.0, **kw):
    """Word embedding backed entirely by a Glove block.

    trainfrac controls how much of the Glove vectors stay trainable
    (0.0 keeps them frozen) — presumably; confirm against Glove.
    """
    super(WordEmbedGlove, self).__init__(indim, outdim, **kw)
    gloveblock = Glove(outdim, vocabsize=indim, trainfrac=trainfrac).block
    self.emb = gloveblock
def test_ns_training(self):
    """Negative-sampling training of a Glove-vs-VectorEmbed MatchScore,
    then exhaustive ranking to compute MRR / recall@10 / recall@1."""
    num = 2000
    self.expshape = (num, 50)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(self.expshape[1], self.expshape[0])
    self.cemb = VectorEmbed(indim=self.expshape[0] + 1, dim=self.expshape[1])
    # out-of-vocabulary index must be rejected
    self.assertRaises(Exception, self.glove.block.predict, [num + 1])
    self.assertRaises(Exception, self.cemb.predict, [num + 1])
    m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
    mg = MatchScore(self.glove.block, self.glove.block)  # TODO factor out matchscore tests
    idxs = np.arange(num + 1)
    # glove against glove: self-match score equals the squared norm
    self.assertTrue(
        np.allclose(mg.predict([num, 100], [num, 100]), [
            np.linalg.norm(self.glove % num)**2,
            np.linalg.norm(self.glove % 100)**2
        ]))

    class NegIdxGen():
        # draws random negative indices in [0, n) shaped like the gold
        def __init__(self, num):
            self.n = num

        def __call__(self, l, r):
            return l, np.random.randint(0, self.n, r.shape)

    m = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(5)\
        .adagrad(lr=0.1)\
        .train(numbats=50, epochs=50)
    print m.predict([num, num - 1, num - 2, num - 1],
                    [num, num - 1, num - 2, num - 2])
    # rank every candidate for every query and accumulate ranking metrics
    mrr = 0.0
    recat10 = 0.0
    recat1 = 0.0
    tot = num + 1
    for a in range(tot):
        abc = zip(range(num + 1),
                  list(m.predict([a] * (num + 1), np.arange(0, num + 1))))
        abc = sorted(abc, key=lambda (x, y): y, reverse=True)
        #print abc[:10]
        for i in range(len(abc)):
            if abc[i][0] == a:
                #print i
                mrr += 1. / (1 + i)
                if i < 10:
                    recat10 += 1
                if i < 1:
                    recat1 += 1
                break
    mrr /= tot
    recat10 /= tot
    recat1 /= tot
    print "%.3f MRR,\t%.3f MR@10,\t%.3f MR@1" % (mrr, recat10, recat1)
    self.assertGreater(mrr, 0.85)
    self.assertGreater(recat10, 0.9)
def setUp(self): self.expshape = (4001, 50) Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt" self.glove = Glove(self.expshape[1], self.expshape[0]-1, maskid=-1) print self.glove.defaultpath
def setUp(self):
    """Adapt a miniature Glove to a custom word dictionary and keep an
    unadapted copy for comparison."""
    # mixes words Glove should know with one it certainly does not
    wdic = {"the": 10, "a": 5, "his": 50, "abracadabrqmsd--qsdfmqgf-": 6}
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(50, 4000, maskid=-1).adapt(wdic)
    self.vanillaglove = Glove(50, 4000, maskid=-1)
def run(epochs=50,
        numbats=700,
        lr=1.,
        wreg=0.000001,
        bidir=False,
        layers=1,
        embdim=200,
        encdim=400,
        decdim=400,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        checkdata=False,
        predencode=False,
        closenegsam=False,
        glove=False,
        atleastcan=0,
        wordchar=False,
        charencmode="rnn",  # rnn or cnn
        totalrandomtest=False,
        rarewords=0,
        ):
    """Train and evaluate a question/predicate matching model with
    negative sampling, then rank candidate entities per test question."""
    maskid = -1
    tt = ticktock("predpred")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat, testsubjsrels = readdata(wordchar=wordchar)
    if closenegsam:
        # sample space for "close" negatives (entities sharing words)
        revsamplespace, revind = buildsamplespace(entmat, worddic)
    tt.tock("data loaded")
    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def pp(widxs):
            # pretty-print a sequence of word ids
            print " ".join([rwd[x] if x in rwd else "" for x in widxs])
        embed()
    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    if rarewords > 0:
        # build a reduced dictionary keeping only words seen more than
        # `rarewords` times in training; everything else maps to <RARE>
        rwd = {v: k for k, v in worddic.items()}
        print "doing rare words"
        trainwordcounts = getmatrixvaluecounts(traindata, entmat)
        stwc = sorted(trainwordcounts.items(), key=lambda (x, y): y, reverse=True)
        fstwc = filter(lambda (x, y): y > rarewords, stwc)
        redwdic = dict(zip([rwd[k] for k, v in fstwc if k != maskid and k in rwd],
                           range(1, len(fstwc)+1)))
        redwdic["<RARE>"] = 0
        #embed()
    if bidir:
        # halve per-direction width so total stays encdim
        encdim = [encdim / 2] * layers
    else:
        encdim = [encdim] * layers
    # question-side model
    if glove:
        if rarewords > 0:
            raise Exception("glove with rare words currently not supported")
        wordemb = Glove(embdim).adapt(worddic)
    else:
        if rarewords > 0:
            wordemb = WordEmb(dim=embdim, worddic=redwdic).adapt(worddic)
            #embed()
        else:
            wordemb = WordEmb(dim=embdim, worddic=worddic)
    if wordchar:
        print "wordchar model"
        numchars = 256
        if charencmode == "cnn":
            print "using CNN char encoder"
            charenc = CNNSeqEncoder(indim=numchars, inpembdim=50,
                                    innerdim=[embdim]*2, maskid=maskid,
                                    stride=1)
            wordenc = RNNSeqEncoder(inpemb=False,
                                    inpembdim=wordemb.outdim+embdim,
                                    innerdim=encdim,
                                    bidir=bidir).maskoptions(MaskMode.NONE)
            question_enc = TwoLevelEncoder(l1enc=charenc, l2emb=wordemb,
                                           l2enc=wordenc, maskid=maskid)
        else:
            question_enc = WordCharSentEnc(numchars=256, charembdim=50,
                                           charinnerdim=embdim,
                                           wordemb=wordemb,
                                           wordinnerdim=encdim,
                                           maskid=maskid, bidir=bidir)
    else:
        question_enc = SimpleSeq2Vec(inpemb=wordemb,
                                     inpembdim=wordemb.outdim,
                                     innerdim=encdim,
                                     maskid=maskid,
                                     bidir=bidir,
                                     layers=layers)
    # predicate-side model
    if predencode:
        # encode predicates from their word sequences (entmat rows)
        predemb = MemVec(SimpleSeq2Vec(inpemb=wordemb,
                                       inpembdim=wordemb.outdim,
                                       innerdim=decdim,
                                       maskid=maskid,
                                       bidir=bidir,
                                       layers=layers))
        predemb.load(entmat)
        """
        predemb = SimpleSeq2Vec(inpemb=wordemb,
                                inpembdim=wordemb.outdim,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=bidir,
                                layers=layers)

        class PreProc(object):
            def __init__(self, entmat):
                self.f = PreProcE(entmat)

            def __call__(self, encdata, decgold):
                return (encdata, self.f(decgold)[0][0]), {}

        class PreProcE(object):
            def __init__(self, entmat):
                self.em = Val(entmat)

            def __call__(self, x):
                return (self.em[x],), {}

        transf = PreProc(entmat)
        predtransf = transf.f
        """
    else:
        # plain trainable entity embedding
        predemb = VectorEmbed(numents, decdim)
        """transf = None
        predtransf = None"""
    # scoring
    scorer = MatchScore(question_enc, predemb, scorer=CosineDistance())

    class NegIdxGen(object):
        # uniform random negative entity indices
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, gold.shape)
            return datas, predrand.astype("int32")

    class NegIdxGenClose(object):
        # prefer negatives from the "close" sample space; fall back to
        # uniform when a gold entity has too few close candidates
        def __init__(self, revsamsp, rng):
            self.revsamsp = revsamsp
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            ret = np.zeros_like(gold)
            for i in range(gold.shape[0]):
                sampleset = self.revsamsp[gold[i]]
                if len(sampleset) > 5:
                    ret[i] = random.sample(sampleset, 1)[0]
                else:
                    ret[i] = np.random.randint(self.min, self.max)
            #embed()
            return datas, ret.astype("int32")

    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p
    if closenegsam:
        tt.msg("using close neg sampler")
        negidxgen = NegIdxGenClose(revsamplespace, numents)
    else:
        negidxgen = NegIdxGen(numents)
    checkembschange = True
    if checkembschange:
        #embed()
        # locate the raw embedding variable to compare before/after training
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        embvals = embvar.d.get_value()
    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
        .negsamplegen(negidxgen) \
        .negrate(negrate) \
        .objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")
    if checkembschange:
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        newembvals = embvar.d.get_value()
        embschanged = not np.allclose(embvals, newembvals)
        sumsqdiff = np.sum((newembvals - embvals)**2)
        print "Embeddings {}: {} sum of square diffs"\
            .format("changed" if embschanged else "did not change", sumsqdiff)
    # evaluation
    tt.tick("evaluating")
    qenc_pred = question_enc.predict(testdata)
    scores = []
    dontembed = True
    if atleastcan > 0:
        print "ensuring at least {} cans".format(atleastcan)
    if totalrandomtest:
        print "total randomness"
    for i in range(qenc_pred.shape[0]):
        if totalrandomtest:
            cans = [testgold[i]]
        else:
            cans = testsubjsrels[i][0]  #+ testsubjsrels[i][1]
        if len(cans) < atleastcan:
            # pad candidate list with random non-duplicate entities
            extracans = list(np.random.randint(0, numents, (atleastcan+50,)))
            extracans = list(set(extracans).difference(set(cans)))
            cans = cans + extracans[:max(0, min(len(extracans), atleastcan - len(cans)))]
            #print len(cans), cans
            # dead in practice: dontembed is always True above
            if not dontembed:
                embed()
        #cans = set(cans)
        #if atleastcan > 0:
        #    while len(cans) < atleastcan:
        #        rancan = np.random.randint(0, numents)
        #        if rancan not in cans:
        #            cans.add(rancan)
        #cans = list(cans)
        if len(cans) == 0:
            # no candidates: emit a sentinel so ranking still works
            scores.append([(-1, -np.infty)])
            continue
        #canembs = predemb.predict.transform(predtransf)(cans)
        canembs = predemb.predict(cans)
        # score the question against every candidate embedding
        scoresi = scorer.s.predict(np.repeat(qenc_pred[np.newaxis, i],
                                             canembs.shape[0], axis=0),
                                   canembs)
        scores.append(zip(cans, scoresi))
        if debug:
            embed()
        tt.progress(i, qenc_pred.shape[0], live=True)
    sortedbest = [sorted(cansi, key=lambda (x, y): y, reverse=True)
                  for cansi in scores]
    best = [sortedbesti[0][0] for sortedbesti in sortedbest]
    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]
    print("Accuracy: {}%".format(accuracy * 100))
def run( epochs=10, numbats=100, negrate=1, lr=0.1, embdim=50, encdim=50, wreg=0.00005, marginloss=False, margin=1., cosine=False, bidir=False, ): tt = ticktock("script") # get glove words g = Glove(encdim) words = g.D.keys() maxwordlen = 0 for word in words: maxwordlen = max(maxwordlen, len(word)) chars = set("".join(words)) chars.add(" ") print "{} words, maxlen {}, {} characters in words".format( len(words), maxwordlen, len(chars)) # get char word matrix chardic = dict(zip(chars, range(len(chars)))) pickle.dump(chardic, open("glove2c2w.chardic.pkl", "w")) charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32") charwordmat[0, 0] = chardic[" "] for i in range(0, len(words)): word = words[i] charwordmat[i + 1, :len(word)] = [chardic[x] for x in word] print charwordmat[0] # encode characters cwenc = SimpleSeq2Vec(indim=len(chars), inpembdim=embdim, innerdim=encdim / 2 if bidir else encdim, maskid=-1, bidir=bidir) dist = CosineDistance() if cosine else EuclideanDistance() #DotDistance() print "using " + str(dist) scorer = MatchScore(cwenc, g.block, scorer=dist) ''' scorer.train([charwordmat, np.arange(len(words)+1)], np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\ .linear_objective().adagrad(lr=lr).l2(wreg)\ .train(numbats=numbats, epochs=epochs) #embed() ''' class NegIdxGen(object): def __init__(self, rng): self.min = 0 self.max = rng def __call__(self, datas, gold): return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32") if marginloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) else: obj = lambda p, n: n - p nscorer = scorer.nstrain([charwordmat, np.arange(len(words)+1)])\ .negsamplegen(NegIdxGen(len(words))).negrate(negrate)\ .objective(obj).adagrad(lr=lr).l2(wreg)\ .train(numbats=numbats, epochs=epochs) cwenc.save("glove2c2w.block")
def run(negsammode="closest",  # "close" or "random"
        usetypes=True,
        mode="concat",  # "seq" or "concat" or "multi" or "multic" or "bino"
        glove=True,
        embdim=100,
        charencdim=100,
        charembdim=50,
        encdim=400,
        bidir=False,
        layers=1,
        charenc="rnn",  # "cnn" or "rnn"
        margin=0.5,
        lr=0.1,
        numbats=700,
        epochs=15,
        gradnorm=1.0,
        wreg=0.0001,
        loadmodel="no",
        debug=False,
        debugtest=False,
        forcesubjincl=False,
        randsameval=0,
        numtestcans=5,
        multiprune=-1,
        checkdata=False,
        testnegsam=False,
        testmodel=False,
        sepcharembs=False,
        ):
    """Full subject+relation ranking pipeline: build question/subject/
    predicate encoders, train a SeqMatchScore with negative sampling,
    then evaluate candidate ranking on the test set. Nested inspect*
    helpers are interactive debugging aids over the predictor's ranks."""
    tt = ticktock("script")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        (subjmat, relmat), (subjdic, reldic), worddic, \
        subjinfo, (testsubjcans, relsperent) = readdata(debug=debug,
                                                        numtestcans=numtestcans if numtestcans > 0 else None)
    if usetypes:
        # prepend entity-type word columns to the subject matrix
        print "building type matrix"
        typmat = buildtypmat(subjmat, subjinfo, worddic)
        subjmat = np.concatenate([typmat, subjmat], axis=1)
        typlen = typmat.shape[1]
    relsamplespace = None
    subjsamplespace = None
    if negsammode == "closest" or negsammode == "close":
        relsamplespace, revind = buildrelsamplespace(relmat, worddic)
        subjsamplespace = loadsubjsamplespace()
    tt.tock("data loaded")
    if checkdata:
        embed()
    numwords = max(worddic.values()) + 1
    numsubjs = max(subjdic.values()) + 1
    numrels = max(reldic.values()) + 1
    maskid = -1
    numchars = 256
    nsrelsperent = relsperent if negsammode == "closest" else None
    if testnegsam:
        # interactive check of the negative sampler
        nig = NegIdxGen(numsubjs - 1, numrels - 1,
                        relclose=relsamplespace,
                        subjclose=subjsamplespace,
                        relsperent=nsrelsperent)
        embed()
    if mode == "seq" or mode == "multi":
        decdim = encdim
    elif mode == "concat" or mode == "multic" or mode == "bino":
        # question vector is split in two halves (subject/predicate)
        decdim = encdim / 2
    else:
        raise Exception("unrecognized mode")
    print "{} mode: {} decdim".format(mode, decdim)
    # defining model
    if glove:
        wordemb = Glove(embdim).adapt(worddic)
    else:
        wordemb = WordEmb(dim=embdim, indim=numwords)
    charemb = VectorEmbed(indim=numchars, dim=charembdim)
    # second char embedding, used only when sepcharembs is set
    charemb2 = VectorEmbed(indim=numchars, dim=charembdim)
    if charenc == "cnn":
        print "using CNN char encoder"
        charenc = CNNSeqEncoder(inpemb=charemb,
                                innerdim=[charencdim] * 2,
                                maskid=maskid,
                                stride=1)
    elif charenc == "rnn":
        print "using RNN char encoder"
        charenc = RNNSeqEncoder(inpemb=charemb, innerdim=charencdim) \
            .maskoptions(maskid, MaskMode.AUTO)
    else:
        raise Exception("no other character encoding modes available")
    if bidir:
        encdim = encdim / 2
    if mode != "bino":
        if mode == "multi" or mode == "multic":
            wordenc = \
                SimpleSeq2MultiVec(inpemb=False,
                                   inpembdim=wordemb.outdim + charencdim,
                                   innerdim=encdim, bidir=bidir,
                                   numouts=2, mode="seq")
        else:
            encdim = [encdim] * layers
            wordenc = RNNSeqEncoder(inpemb=False,
                                    inpembdim=wordemb.outdim + charencdim,
                                    innerdim=encdim,
                                    bidir=bidir).maskoptions(MaskMode.NONE)
        question_encoder = TwoLevelEncoder(l1enc=charenc,
                                           l2emb=wordemb,
                                           l2enc=wordenc,
                                           maskid=maskid)
    else:
        question_encoder = BinoEncoder(charenc=charenc, wordemb=wordemb,
                                       maskid=maskid, scadim=100,
                                       encdim=encdim / 2, bidir=bidir,
                                       enclayers=layers, outdim=decdim,
                                       scabidir=True)
    # encode predicate on word level
    predemb = SimpleSeq2Vec(inpemb=wordemb,
                            innerdim=decdim,
                            maskid=maskid,
                            bidir=False,
                            layers=1)
    #predemb.load(relmat)
    scharemb = charemb2 if sepcharembs else charemb
    if usetypes:
        # encode subj type on word level
        subjtypemb = SimpleSeq2Vec(inpemb=wordemb,
                                   innerdim=int(np.ceil(decdim * 1. / 2)),
                                   maskid=maskid,
                                   bidir=False,
                                   layers=1)
        # encode subject on character level
        charbidir = True
        charencinnerdim = int(np.floor(decdim * 1. / 2))
        charenclayers = 1
        if charbidir:
            charencinnerdim /= 2
            charenclayers = 2
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=charencinnerdim,
                                maskid=maskid,
                                bidir=charbidir,
                                layers=charenclayers)
        subjemb = TypedSubjBlock(typlen, subjemb, subjtypemb)
    else:
        # encode subject on character level
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=False,
                                layers=1)
    #subjemb.load(subjmat)
    if testmodel:
        embed()
    # package
    if mode == "seq":
        lb = SeqLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "concat":
        lb = ConcatLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "multi" or mode == "multic":
        lb = MultiLeftBlock(question_encoder, mode)
        rb = RightBlock(subjemb, predemb)
    elif mode == "bino":
        lb = question_encoder
        rb = RightBlock(subjemb, predemb)
    else:
        raise Exception("unrecognized mode")
    scorer = SeqMatchScore(lb, rb, scorer=CosineDistance(),
                           aggregator=lambda x: x,
                           argproc=lambda x, y, z: ((x, ), (y, z)))

    # per-element hinge loss, summed over the (subject, relation) pair
    obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    class PreProc(object):
        # maps gold (subj, rel) index pairs to their word-id rows
        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, data, gold):  # gold: idxs-(batsize, 2)
            st = self.ef(gold[:, 0])[0][0]
            rt = self.rf(gold[:, 1])[0][0]
            return (data, st, rt), {}

    class PreProcE(object):
        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, x):
            subjslice = self.ef(x[:, 0])[0][0]
            relslice = self.rf(x[:, 1])[0][0]
            return (subjslice, relslice), {}

    class PreProcEnt(object):
        def __init__(self, mat):
            self.entmat = Val(mat)

        def __call__(self, x):
            return (self.entmat[x], ), {}

    transf = PreProc(subjmat, relmat)
    if debug:
        embed()
    if epochs > 0 and loadmodel == "no":
        tt.tick("training")
        # random 4-digit checkpoint id
        saveid = "".join([str(np.random.randint(0, 10)) for i in range(4)])
        print("CHECKPOINTING AS: {}".format(saveid))
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numsubjs-1, numrels-1,
                                    relclose=relsamplespace,
                                    subjclose=subjsamplespace,
                                    relsperent=nsrelsperent)) \
            .objective(obj).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm) \
            .validate_on([validdata, validgold]) \
            .autosavethis(scorer, "fullrank{}.model".format(saveid)) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained").tick()
        # saving
        #scorer.save("fullrank{}.model".format(saveid))
        print("SAVED AS: {}".format(saveid))

    # NOTE(review): `is not "no"` is identity comparison on a string
    # literal — works only by CPython interning accident; should be `!=`
    if loadmodel is not "no":
        tt.tick("loading model")
        m = SeqMatchScore.load("fullrank{}.model".format(loadmodel))
        #embed()
        lb = m.l
        subjemb = m.r.subjenc
        predemb = m.r.predenc
        tt.tock("loaded model")

    # evaluation
    predictor = CustomPredictor(questionencoder=lb,
                                entityencoder=subjemb,
                                relationencoder=predemb,
                                #mode=mode,
                                enttrans=transf.ef,
                                reltrans=transf.rf,
                                debug=debugtest,
                                subjinfo=subjinfo)
    tt.tick("predicting")
    if forcesubjincl:  # forces the intended subject entity to be among candidates
        for i in range(len(testsubjcans)):
            if testgold[i, 0] not in testsubjcans[i]:
                testsubjcans[i].append(testgold[i, 0])
    if randsameval > 0:  # generate random sampling eval data
        testsubjcans = np.random.randint(0, numsubjs, (testgold.shape[0], randsameval))
        testrelcans = np.random.randint(0, numrels, (testgold.shape[0], randsameval))
        # gold is always prepended so it is rankable
        testsubjcans = np.concatenate([testgold[:, 0:1], testsubjcans], axis=1)
        testrelcans = np.concatenate([testgold[:, 1:2], testrelcans], axis=1)
        testsubjcans = testsubjcans.tolist()
        testrelcans = testrelcans.tolist()
        prediction = predictor.predict(testdata, entcans=testsubjcans,
                                       relcans=testrelcans)
    else:
        prediction = predictor.predict(testdata, entcans=testsubjcans,
                                       relsperent=relsperent,
                                       multiprune=multiprune)
    tt.tock("predicted")
    tt.tick("evaluating")
    evalmat = prediction == testgold
    subjacc = np.sum(evalmat[:, 0]) * 1. / evalmat.shape[0]
    predacc = np.sum(evalmat[:, 1]) * 1. / evalmat.shape[0]
    totalacc = np.sum(np.sum(evalmat, axis=1) == 2) * 1. / evalmat.shape[0]
    print "Test results ::::::::::::::::"
    print "Total Acc: \t {}".format(totalacc)
    print "Subj Acc: \t {}".format(subjacc)
    print "Pred Acc: \t {}".format(predacc)
    tt.tock("evaluated")

    def subjinspect(subjrank, gold):
        # annotate a subject ranking with names/types; marks the gold row
        ret = [
            (("GOLD - " if gold == x else " ") + subjinfo[x][0] + " (" +
             " ".join(subjinfo[x][1]) + ")" + str(subjinfo[x][3]) + " rels",
             y) if x in subjinfo else (x, y)
            for x, y in subjrank
        ]
        return ret

    def inspectboth(hidecorrect=False, hidenotincan=False):
        # NOTE(review): body appears unfinished — it filters but prints nothing
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            predx = testgold[i, 1]
            subjrank = predictor.subjranks[i]
            predrank = predictor.relranks[i]
            if hidecorrect and subjx == subjrank[0][0] and predrank[0][
                    0] == predx:
                continue
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue

    def inspectsubjs(hidecorrect=False, hidenotincan=False, shownotincan=False):
        # interactively page through subject rankings per test question
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            subjrank = predictor.subjranks[i]
            if subjx == subjrank[0][0] and hidecorrect:  # only look for errors
                continue
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue
            if shownotincan and subjx in [k for k, v in subjrank]:
                continue
            print "test question {}: {} \t GOLD: {}".format(
                i,
                wordids2string(testdata[i, :, 0], rwd),
                "{} ({}) - {} rels --- {}".format(
                    *([
                        subjinfo[subjx][0], subjinfo[subjx][1],
                        subjinfo[subjx][3], subjinfo[subjx][2]
                    ] if subjx in subjinfo else
                      ["<UNK>", "<UNK>", "<UNK>", "<UNK>"])))
            inspres = subjinspect(subjrank, subjx)
            # NOTE(review): reuses loop variable i as the print counter
            i = 1
            for inspre in inspres:
                print "{}:\t{}\t{}".format(i, inspre[1], inspre[0])
                if i % 50 == 0:
                    inp()
                i += 1
            inp()

    def inspectpreds(hidecorrect=False):
        # interactively page through relation rankings per test question
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.relranks)):
            relx = testgold[i, 1]
            subjx = testgold[i, 0]
            relrank = predictor.relranks[i]
            if relx == relrank[0][0] and hidecorrect:
                continue
            print "test question {}: {} \t GOLD: {}".format(
                i,
                wordids2string(testdata[i, :, 0], rwd),
                wordids2string(relmat[relx, :], rwd))
            inspres = [(("GOLD - " if relx == x else " ") +
                        wordids2string(relmat[x], rwd), y)
                       for x, y in relrank]
            # NOTE(review): reuses loop variable i as the print counter
            i = 1
            for inspre in inspres:
                print "{}:\t{}\t{}".format(i, inspre[1], inspre[0])
                if i % 50 == 0:
                    inp()
                i += 1
            inp()

    embed()
def getdic2glove(worddic, dim=50):
    """Build a worddic-id -> Glove-id mapper and the remapped dictionary.

    Returns (d2g, newdic): d2g translates a worddic index into the Glove
    index of the same word (unknown indices pass through unchanged), and
    newdic maps each word to its translated index.
    """
    g = Glove(dim)
    # invert worddic: id -> word
    idx2word = dict((idx, word) for word, idx in worddic.items())

    def d2g(x):
        # known index -> look the word up in Glove; unknown -> identity
        return g * idx2word[x] if x in idx2word else x

    translated = {word: d2g(idx) for word, idx in worddic.items()}
    return d2g, translated
return subjinf if cachep is not None: if os.path.isfile(cachep): # load tt.tick("loading cached subject info") subjinfo = pickle.load(open(cachep)) tt.tock("loaded cached subject info") else: # make and dump subjinfo = make() tt.tick("dumping subject info in cache") pickle.dump(subjinfo, open(cachep, "w")) tt.tock("dumped subject info in cache") else: # just make subjinfo = make() return subjinfo if __name__ == "__main__": x = np.random.randint(0, 50, (5, 4, 3)) x = np.concatenate([np.random.randint(0, 1000, (5, 4, 1)), x], axis=2) x = np.concatenate([x, np.zeros_like(x)], axis=1) print x, x.shape m = WordCharSentEnc(numchars=50, charembdim=10, charinnerdim=20, wordemb=Glove(50, 1000), wordinnerdim=3, maskid=0, returnall=True) pred = m.predict(x) print pred, pred.shape