def setUp(self):
    """Build a small seq2vec encoder and a MatchScore over it; keep both
    built outputs on the test instance for the other tests to use."""
    encoder = SimpleSeq2Vec(indim=100, inpembdim=10, innerdim=20)
    sampledata = np.random.randint(0, 100, (33, 5))
    # autobuild returns a structure whose [1][0] element is the built output
    # (presumably the symbolic output var — TODO confirm against autobuild docs)
    encbuilt = encoder.autobuild(sampledata)
    self.o = encbuilt[1][0]
    matcher = MatchScore(encoder, encoder)
    matchbuilt = matcher.autobuild(sampledata, sampledata)
    self.mo = matchbuilt[1][0]
def test_ns_training(self):
    """Train a MatchScore (glove block vs. fresh VectorEmbed) with negative
    sampling, then check retrieval quality (MRR / recall@10) over all ids.

    NOTE(review): reconstructed indentation from collapsed source; tokens
    are unchanged.
    """
    num = 2000
    self.expshape = (num, 50)  # (vocabulary size, embedding dim)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(self.expshape[1], self.expshape[0])
    self.cemb = VectorEmbed(indim=self.expshape[0]+1, dim=self.expshape[1])
    # out-of-range ids must raise
    self.assertRaises(Exception, self.glove.block.predict, [num+1])
    self.assertRaises(Exception, self.cemb.predict, [num+1])
    m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
    mg = MatchScore(self.glove.block, self.glove.block)
    # TODO factor out matchscore tests
    idxs = np.arange(num+1)
    # glove against glove: score of a vector with itself is its squared norm
    self.assertTrue(np.allclose(
        mg.predict([num, 100], [num, 100]),
        [np.linalg.norm(self.glove % num)**2,
         np.linalg.norm(self.glove % 100)**2]))

    class NegIdxGen():
        """Uniformly random negative right-hand-side ids."""
        def __init__(self, num):
            self.n = num

        def __call__(self, l, r):
            return l, np.random.randint(0, self.n, r.shape)

    # negative-sampling training: align cemb with glove embeddings
    m = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(5)\
        .adagrad(lr=0.1)\
        .train(numbats=50, epochs=50)
    print m.predict([num, num-1, num-2, num-1], [num, num-1, num-2, num-2])
    # rank every id against every id and accumulate MRR / recall@k
    mrr = 0.0
    recat10 = 0.0
    recat1 = 0.0
    tot = num + 1
    for a in range(tot):
        abc = zip(range(num+1),
                  list(m.predict([a]*(num+1), np.arange(0, num+1))))
        abc = sorted(abc, key=lambda (x, y): y, reverse=True)
        #print abc[:10]
        for i in range(len(abc)):
            if abc[i][0] == a:  # found the true id at rank i
                #print i
                mrr += 1./(1+i)
                if i < 10:
                    recat10 += 1
                if i < 1:
                    recat1 += 1
                break
    mrr /= tot
    recat10 /= tot
    recat1 /= tot
    print "%.3f MRR,\t%.3f MR@10,\t%.3f MR@1" % (mrr, recat10, recat1)
    self.assertGreater(mrr, 0.85)
    self.assertGreater(recat10, 0.9)
def test_ns_training(self):
    """Negative-sampling training with external validation: uses `geteval`
    (defined elsewhere in this file) as the validation metric and checks
    the last recorded validation errors match a fresh test-time evaluation.

    NOTE(review): reconstructed indentation from collapsed source; tokens
    are unchanged.
    """
    num = 2000
    self.expshape = (num, 50)  # (vocabulary size, embedding dim)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(self.expshape[1], self.expshape[0])
    self.cemb = VectorEmbed(indim=self.expshape[0] + 1, dim=self.expshape[1])
    # out-of-range ids must raise
    self.assertRaises(Exception, self.glove.block.predict, [num + 1])
    self.assertRaises(Exception, self.cemb.predict, [num + 1])
    m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
    mg = MatchScore(self.glove.block, self.glove.block)
    # TODO factor out matchscore tests
    idxs = np.arange(num + 1)
    # glove against glove: score of a vector with itself is its squared norm
    self.assertTrue(
        np.allclose(mg.predict([num, 100], [num, 100]), [
            np.linalg.norm(self.glove % num)**2,
            np.linalg.norm(self.glove % 100)**2
        ]))

    class NegIdxGen():
        """Uniformly random negative right-hand-side ids."""
        def __init__(self, num):
            self.n = num

        def __call__(self, l, r):
            return l, np.random.randint(0, self.n, r.shape)

    vdata = np.arange(num)
    negrate = 5

    def obj(p, n):
        # ranking objective: negative score minus positive score
        return n - p

    # train with validation every 30 epochs; returnerrors gives back
    # (model, train errors, validation errors, ...)
    m, err, verr, _, _ = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(negrate)\
        .adagrad(lr=0.1).objective(obj) \
        .validate_on([vdata, vdata]).extvalid(geteval(m.predict, num, negrate)).validinter(30) \
        .train(numbats=50, epochs=29, returnerrors=True)
    #.writeresultstofile("testingresultswriter.tsv") \
    tdata = np.arange(num)
    tt = ticktock("eval")
    tt.tick()
    mrr, recat1, recat10 = geteval(m.predict, num, 1)(tdata)
    tt.tock("evaluated test data")
    print "%.4f MRR,\t%.4f MR@10,\t%.4f MR@1" % (mrr, recat10, recat1)
    self.assertGreater(mrr, 0.85)
    self.assertGreater(recat10, 0.9)
    print verr
    # test-time eval must agree with the last validation record
    self.assertTrue(
        np.allclose(np.asarray([mrr, recat1, recat10]),
                    np.asarray(verr[-1][1:])))
def run(
        epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        embdim=100,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        mem=False,
        membidir=False,
        memlayers=1,
        layers=1,
        testfirst=False,
        rankingloss=False,
        rlmargin=1.,
        charlevel=False,
        pool=False,
        resultsave=False,
        resultsavep="subjdetns.res.pkl",
):
    """Subject-detection trainer for SimpleQuestions: encodes the question
    with a seq2vec RNN, represents entities either by encoding their labels
    (`mem=True`) or by id embedding, and trains a MatchScore with negative
    sampling, then evaluates ranking metrics on the test split.

    NOTE(review): reconstructed indentation from collapsed source; tokens
    are unchanged.
    """
    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat\
        = readdata(datap, charlevel)
    print entmat.shape
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape
    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    # split inner dim over directions when bidirectional
    if bidir:
        encinnerdim = [innerdim / 2] * layers
    else:
        encinnerdim = [innerdim] * layers

    # question representation:
    # encodes question sequence to vector
    # let's try to embed chars too <--
    embdim = None if charlevel else embdim
    qenc = SimpleSeq2Vec(indim=numwords,
                         inpembdim=embdim,
                         innerdim=encinnerdim,
                         maskid=-1,
                         bidir=bidir,
                         pool=pool)

    # entity representation:
    if mem:
        # encodes label to vector
        if membidir:
            innerdim = [innerdim / 2] * memlayers
        else:
            innerdim = [innerdim] * memlayers
        memembdim = embdim
        #embed chars too <--
        meminpemb = None if charlevel else qenc.inpemb  # share embeddings
        #memembdim = None if charlevel else memembdim
        meminpemb = qenc.inpemb  # also chars are embedded and embeddings are always shared
        lenc = SimpleSeq2Vec(indim=numwords,
                             inpembdim=memembdim,
                             inpemb=meminpemb,
                             innerdim=innerdim,
                             maskid=-1,
                             bidir=membidir)
    else:
        # embeds entity id to vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)  # batched dot

    # trainer config preparation
    class PreProcf(object):
        """Maps gold entity ids to their label word-id rows before scoring."""
        def __init__(self, entmat):
            self.em = Val(entmat)  # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):  # gold: idx^(batsize, )
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        """Uniformly random negative entity ids in [0, rng)."""
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):  # gold: idx^(batsize,)
            return datas, np.random.randint(self.min, self.max,
                                            gold.shape).astype("int32")

    if testfirst:
        # dry-run evaluation on the untrained model, then exit
        eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                            metrics=[ClassAccuracy(), RecallAt(5)])
        evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat))
        for e in evalres:
            print e
        tt.msg("tested dummy")
        sys.exit()
    #embed()
    # trainer config and training
    obj = lambda p, n: n - p
    if rankingloss:
        # hinge variant with margin rlmargin
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.infty)
    nscorer = scorer.nstrain([traindata, traingold]).transform(PreProcf(entmat))\
        .negsamplegen(NegIdxGen(numents)).negrate(negrate).objective(obj)\
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)

    # evaluation
    eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic, metrics=[
        ClassAccuracy(),
        RecallAt(1),
        RecallAt(2),
        RecallAt(5),
        RecallAt(10)
    ])

    evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat),
                        savep=None if not resultsave else resultsavep)
    for evalre in evalres:
        print evalre
def run( epochs=10, numbats=100, negrate=1, lr=0.1, embdim=50, encdim=50, wreg=0.00005, marginloss=False, margin=1.0, cosine=False, bidir=False, ): tt = ticktock("script") # get glove words g = Glove(encdim) words = g.D.keys() maxwordlen = 0 for word in words: maxwordlen = max(maxwordlen, len(word)) chars = set("".join(words)) chars.add(" ") print "{} words, maxlen {}, {} characters in words".format(len(words), maxwordlen, len(chars)) # get char word matrix chardic = dict(zip(chars, range(len(chars)))) pickle.dump(chardic, open("glove2c2w.chardic.pkl", "w")) charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32") charwordmat[0, 0] = chardic[" "] for i in range(0, len(words)): word = words[i] charwordmat[i + 1, : len(word)] = [chardic[x] for x in word] print charwordmat[0] # encode characters cwenc = SimpleSeq2Vec( indim=len(chars), inpembdim=embdim, innerdim=encdim / 2 if bidir else encdim, maskid=-1, bidir=bidir ) dist = CosineDistance() if cosine else EuclideanDistance() # DotDistance() print "using " + str(dist) scorer = MatchScore(cwenc, g.block, scorer=dist) """ scorer.train([charwordmat, np.arange(len(words)+1)], np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\ .linear_objective().adagrad(lr=lr).l2(wreg)\ .train(numbats=numbats, epochs=epochs) #embed() """ class NegIdxGen(object): def __init__(self, rng): self.min = 0 self.max = rng def __call__(self, datas, gold): return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32") if marginloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) else: obj = lambda p, n: n - p nscorer = ( scorer.nstrain([charwordmat, np.arange(len(words) + 1)]) .negsamplegen(NegIdxGen(len(words))) .negrate(negrate) .objective(obj) .adagrad(lr=lr) .l2(wreg) .train(numbats=numbats, epochs=epochs) ) cwenc.save("glove2c2w.block")
def run(
        epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        embdim=100,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        mem=False,
        membidir=False,
        memlayers=1,
        layers=1,
        testfirst=False,
        rankingloss=False,
        rlmargin=1.,
        charlevel=False,
        pool=False,
        resultsave=False,
        resultsavep="subjdetns.res.pkl",
):
    """Subject-detection trainer for SimpleQuestions (variant copy of the
    same script elsewhere in this file): question seq2vec encoder vs. entity
    representation, trained as a MatchScore with negative sampling, then
    evaluated with ranking metrics.

    NOTE(review): reconstructed indentation from collapsed source; tokens
    are unchanged.
    """
    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat\
        = readdata(datap, charlevel)
    print entmat.shape
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape
    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    # split inner dim over directions when bidirectional
    if bidir:
        encinnerdim = [innerdim/2]*layers
    else:
        encinnerdim = [innerdim]*layers

    # question representation:
    # encodes question sequence to vector
    # let's try to embed chars too <--
    embdim = None if charlevel else embdim
    qenc = SimpleSeq2Vec(indim=numwords,
                         inpembdim=embdim,
                         innerdim=encinnerdim,
                         maskid=-1,
                         bidir=bidir,
                         pool=pool)

    # entity representation:
    if mem:
        # encodes label to vector
        if membidir:
            innerdim = [innerdim/2]*memlayers
        else:
            innerdim = [innerdim]*memlayers
        memembdim = embdim
        #embed chars too <--
        meminpemb = None if charlevel else qenc.inpemb  # share embeddings
        #memembdim = None if charlevel else memembdim
        meminpemb = qenc.inpemb  # also chars are embedded and embeddings are always shared
        lenc = SimpleSeq2Vec(indim=numwords,
                             inpembdim=memembdim,
                             inpemb=meminpemb,
                             innerdim=innerdim,
                             maskid=-1,
                             bidir=membidir)
    else:
        # embeds entity id to vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)  # batched dot

    # trainer config preparation
    class PreProcf(object):
        """Maps gold entity ids to their label word-id rows before scoring."""
        def __init__(self, entmat):
            self.em = Val(entmat)  # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):  # gold: idx^(batsize, )
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        """Uniformly random negative entity ids in [0, rng)."""
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):  # gold: idx^(batsize,)
            return datas, np.random.randint(self.min, self.max,
                                            gold.shape).astype("int32")

    if testfirst:
        # dry-run evaluation on the untrained model, then exit
        eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                            metrics=[ClassAccuracy(), RecallAt(5)])
        evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat))
        for e in evalres:
            print e
        tt.msg("tested dummy")
        sys.exit()
    #embed()
    # trainer config and training
    obj = lambda p, n: n - p
    if rankingloss:
        # hinge variant with margin rlmargin
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.infty)
    nscorer = scorer.nstrain([traindata, traingold]).transform(PreProcf(entmat))\
        .negsamplegen(NegIdxGen(numents)).negrate(negrate).objective(obj)\
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)

    # evaluation
    eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                        metrics=[ClassAccuracy(), RecallAt(1), RecallAt(2),
                                 RecallAt(5), RecallAt(10)])

    evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat),
                        savep=None if not resultsave else resultsavep)
    for evalre in evalres:
        print evalre
def run(
        epochs=50,
        mode="char",  # "char" or "word" or "charword"
        numbats=1000,
        lr=0.1,
        wreg=0.000001,
        bidir=False,
        layers=1,
        encdim=200,
        decdim=200,
        embdim=100,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        preeval=False,
        sumhingeloss=False,
        checkdata=False,  # starts interactive shell for data inspection
        printpreds=False,
        subjpred=False,
        predpred=False,
        specemb=-1,
        usetypes=False,
        evalsplits=50,
        cosine=False,
        loadmodel=False,
):
    """Predicate-prediction trainer: encodes the question with a seq2vec RNN
    and embeds relation ids, trains the pair as a MatchScore with negative
    sampling, then ranks all relations per test question and reports
    accuracy, MRR and recall@k.

    NOTE(review): reconstructed indentation from collapsed source; tokens
    are unchanged.
    """
    if debug:  # debug settings
        hingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 1
        printpreds = True
        preeval = True
        # specemb = 100
        margin = 1.
        evalsplits = 1
        # usetypes=True
        mode = "charword"
        # checkdata = True
    # load the right file
    maskid = -1
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat, relstarts, canids, wordmat, chardic \
        = readdata(mode, testcans="testcans.pkl", debug=debug,
                   specids=True, usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")

    # transform for predpred: keep only the relation column, re-based so
    # relation ids start at 0 (relstarts is the first relation id)
    traingold = traingold[:, 1] - relstarts
    validgold = validgold[:, 1] - relstarts
    testgold = testgold[:, 1] - relstarts

    if checkdata:
        # reverse dictionaries for interactive inspection via embed()
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])
        embed()

    print traindata.shape, traingold.shape, testdata.shape, testgold.shape
    tt.tock("data loaded")

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    # split inner dim over directions when bidirectional
    if bidir:
        encinnerdim = [encdim / 2] * layers
    else:
        encinnerdim = [encdim] * layers

    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim / 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers

    emb = VectorEmbed(numwords, embdim)
    # relation id embedding (+1 — presumably for an out-of-range slot; confirm)
    predemb = VectorEmbed(numents - relstarts + 1, decdim, init="uniform")

    inpenc = SimpleSeq2Vec(inpemb=emb,
                           inpembdim=emb.outdim,
                           innerdim=encinnerdim,
                           maskid=maskid,
                           bidir=bidir,
                           layers=layers)
    dist = DotDistance() if not cosine else CosineDistance()
    scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist}
    scorer = MatchScore(inpenc, predemb, **scorerkwargs)

    class PreProc(object):
        """Optional entmat/wordmat index-to-row lookups for both sides."""
        def __init__(self, entmat, wordmat=None):
            self.f = PreProcE(entmat)
            self.w = PreProcL(wordmat) if wordmat is not None else wordmat

        def __call__(self, encdata, decgold):  # gold: idx^(batsize, seqlen)
            if self.w is not None:
                encdata = self.w(encdata)[0][0]
            if self.f is not None:
                decgold = self.f(decgold)[0][0]
            return (encdata, decgold), {}

    class PreProcE(object):
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    class PreProcL(object):
        def __init__(self, wordmat):
            self.em = Val(wordmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    transf = PreProc(entmat)

    class NegIdxGen(object):
        """Uniformly random negative relation ids in [0, rng)."""
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max,
                                         (gold.shape[0], ))
            return datas, predrand.astype("int32")

    # embed()
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)

    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
        .negsamplegen(NegIdxGen(numents - relstarts))\
        .negrate(negrate).objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
        .validate_on([validdata, validgold]) \
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")

    # eval: score every test question against every relation embedding
    canids = np.arange(start=0, stop=numents - relstarts)
    predembs = predemb.predict(canids)  # (numrels, embdim)
    tt.tick("evaluating")
    predencs = inpenc.predict(testdata)  # (batsize, embdim)
    scores = np.zeros((predencs.shape[0], predembs.shape[0]))
    for i in range(predencs.shape[0]):
        # repeat the question encoding once per relation and score in batch
        scores[i, :] = \
            scorer.s.predict(np.repeat(predencs[np.newaxis, i],
                                       predembs.shape[0], axis=0),
                             predembs)
        tt.progress(i, predencs.shape[0], live=True)
    best = np.argmax(scores, axis=1)
    sortedbest = [
        sorted(zip(np.arange(scores.shape[1]), list(scores[i])),
               reverse=True, key=lambda (x, y): y)
        for i in range(scores.shape[0])
    ]
    sortedbestmat = np.array([[x for (x, y) in z] for z in sortedbest],
                             dtype="int32")
    # MRR
    mrr = 0.0
    for i in range(sortedbestmat.shape[1]):
        mrr += np.sum(sortedbestmat[:, i] == testgold) * 1. / (i + 1)
    mrr /= testgold.shape[0]
    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]

    # R@X
    def ratx(ratnum):
        return rat(ratnum, sortedbestmat, testgold)

    def rat(ratnum, sortedpred, gold):
        # fraction of questions whose gold relation is in the top ratnum
        acc = 0.0
        for i in range(min(ratnum, sortedbestmat.shape[1])):
            acc += 1.0 * np.sum(sortedpred[:, i] == gold)
        acc /= testgold.shape[0]
        return acc

    print "Accuracy: {}%".format(accuracy * 100)
    print "MRR: {}".format(mrr)
    print "Recall: @10: {}%\t @50: {}%\t @100: {}%".format(
        ratx(10) * 100, ratx(50) * 100, ratx(100) * 100)
    embed()
    tt.tock("evaluated")
def run( epochs=10, numbats=100, negrate=1, lr=0.1, embdim=50, encdim=50, wreg=0.00005, marginloss=False, margin=1., cosine=False, bidir=False, ): tt = ticktock("script") # get glove words g = Glove(encdim) words = g.D.keys() maxwordlen = 0 for word in words: maxwordlen = max(maxwordlen, len(word)) chars = set("".join(words)) chars.add(" ") print "{} words, maxlen {}, {} characters in words".format( len(words), maxwordlen, len(chars)) # get char word matrix chardic = dict(zip(chars, range(len(chars)))) pickle.dump(chardic, open("glove2c2w.chardic.pkl", "w")) charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32") charwordmat[0, 0] = chardic[" "] for i in range(0, len(words)): word = words[i] charwordmat[i + 1, :len(word)] = [chardic[x] for x in word] print charwordmat[0] # encode characters cwenc = SimpleSeq2Vec(indim=len(chars), inpembdim=embdim, innerdim=encdim / 2 if bidir else encdim, maskid=-1, bidir=bidir) dist = CosineDistance() if cosine else EuclideanDistance() #DotDistance() print "using " + str(dist) scorer = MatchScore(cwenc, g.block, scorer=dist) ''' scorer.train([charwordmat, np.arange(len(words)+1)], np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\ .linear_objective().adagrad(lr=lr).l2(wreg)\ .train(numbats=numbats, epochs=epochs) #embed() ''' class NegIdxGen(object): def __init__(self, rng): self.min = 0 self.max = rng def __call__(self, datas, gold): return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32") if marginloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) else: obj = lambda p, n: n - p nscorer = scorer.nstrain([charwordmat, np.arange(len(words)+1)])\ .negsamplegen(NegIdxGen(len(words))).negrate(negrate)\ .objective(obj).adagrad(lr=lr).l2(wreg)\ .train(numbats=numbats, epochs=epochs) cwenc.save("glove2c2w.block")
def run(epochs=50,
        numbats=700,
        lr=1.,
        wreg=0.000001,
        bidir=False,
        layers=1,
        embdim=200,
        encdim=400,
        decdim=400,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        checkdata=False,
        predencode=False,
        closenegsam=False,
        glove=False,
        atleastcan=0,
        wordchar=False,
        charencmode="rnn",  # rnn or cnn
        totalrandomtest=False,
        rarewords=0,
        ):
    """Predicate-prediction trainer with optional word-char question
    encoders, Glove/rare-word embeddings, close negative sampling, and
    candidate-restricted evaluation over testsubjsrels.

    NOTE(review): reconstructed indentation from collapsed source; tokens
    are unchanged. One re-indentation was ambiguous and is flagged below.
    """
    maskid = -1
    tt = ticktock("predpred")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat, testsubjsrels = readdata(wordchar=wordchar)
    if closenegsam:
        # sample space of "close" negatives sharing label words
        revsamplespace, revind = buildsamplespace(entmat, worddic)
    tt.tock("data loaded")
    if checkdata:
        # reverse dictionaries for interactive inspection via embed()
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def pp(widxs):
            print " ".join([rwd[x] if x in rwd else "" for x in widxs])
        embed()

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1

    if rarewords > 0:
        # build a reduced dictionary keeping only words seen more than
        # `rarewords` times in training; everything else maps to <RARE> (0)
        rwd = {v: k for k, v in worddic.items()}
        print "doing rare words"
        trainwordcounts = getmatrixvaluecounts(traindata, entmat)
        stwc = sorted(trainwordcounts.items(), key=lambda (x, y): y,
                      reverse=True)
        fstwc = filter(lambda (x, y): y > rarewords, stwc)
        redwdic = dict(zip([rwd[k] for k, v in fstwc
                            if k != maskid and k in rwd],
                           range(1, len(fstwc)+1)))
        redwdic["<RARE>"] = 0
        #embed()

    # split inner dim over directions when bidirectional
    if bidir:
        encdim = [encdim / 2] * layers
    else:
        encdim = [encdim] * layers

    # question-side model
    if glove:
        if rarewords > 0:
            raise Exception("glove with rare words currently not supported")
        wordemb = Glove(embdim).adapt(worddic)
    else:
        if rarewords > 0:
            wordemb = WordEmb(dim=embdim, worddic=redwdic).adapt(worddic)
            #embed()
        else:
            wordemb = WordEmb(dim=embdim, worddic=worddic)
    if wordchar:
        print "wordchar model"
        numchars = 256
        if charencmode == "cnn":
            print "using CNN char encoder"
            charenc = CNNSeqEncoder(indim=numchars, inpembdim=50,
                                    innerdim=[embdim]*2, maskid=maskid,
                                    stride=1)
            wordenc = RNNSeqEncoder(inpemb=False,
                                    inpembdim=wordemb.outdim+embdim,
                                    innerdim=encdim,
                                    bidir=bidir).maskoptions(MaskMode.NONE)
            question_enc = TwoLevelEncoder(l1enc=charenc, l2emb=wordemb,
                                           l2enc=wordenc, maskid=maskid)
        else:
            question_enc = WordCharSentEnc(numchars=256, charembdim=50,
                                           charinnerdim=embdim,
                                           wordemb=wordemb,
                                           wordinnerdim=encdim,
                                           maskid=maskid,
                                           bidir=bidir)
    else:
        question_enc = SimpleSeq2Vec(inpemb=wordemb,
                                     inpembdim=wordemb.outdim,
                                     innerdim=encdim, maskid=maskid,
                                     bidir=bidir, layers=layers)

    # predicate-side model
    if predencode:
        # encode predicate label words into a memory of vectors
        predemb = MemVec(SimpleSeq2Vec(inpemb=wordemb,
                                       inpembdim=wordemb.outdim,
                                       innerdim=decdim, maskid=maskid,
                                       bidir=bidir, layers=layers)
                         )
        predemb.load(entmat)
        """
        predemb = SimpleSeq2Vec(inpemb=wordemb, inpembdim=wordemb.outdim,
                                innerdim=decdim, maskid=maskid, bidir=bidir,
                                layers=layers)

        class PreProc(object):
            def __init__(self, entmat):
                self.f = PreProcE(entmat)

            def __call__(self, encdata, decgold):
                return (encdata, self.f(decgold)[0][0]), {}

        class PreProcE(object):
            def __init__(self, entmat):
                self.em = Val(entmat)

            def __call__(self, x):
                return (self.em[x],), {}

        transf = PreProc(entmat)
        predtransf = transf.f
        """
    else:
        # plain id embedding per predicate
        predemb = VectorEmbed(numents, decdim)
        """transf = None
        predtransf = None"""

    # scoring
    scorer = MatchScore(question_enc, predemb, scorer=CosineDistance())

    class NegIdxGen(object):
        """Uniformly random negative predicate ids in [0, rng)."""
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, gold.shape)
            return datas, predrand.astype("int32")

    class NegIdxGenClose(object):
        """Prefer negatives from the gold's close sample set; fall back to
        uniform sampling when the set is too small (<= 5 entries)."""
        def __init__(self, revsamsp, rng):
            self.revsamsp = revsamsp
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            ret = np.zeros_like(gold)
            for i in range(gold.shape[0]):
                sampleset = self.revsamsp[gold[i]]
                if len(sampleset) > 5:
                    ret[i] = random.sample(sampleset, 1)[0]
                else:
                    ret[i] = np.random.randint(self.min, self.max)
            #embed()
            return datas, ret.astype("int32")

    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p
    if closenegsam:
        tt.msg("using close neg sampler")
        negidxgen = NegIdxGenClose(revsamplespace, numents)
    else:
        negidxgen = NegIdxGen(numents)

    checkembschange = True
    if checkembschange:
        #embed()
        # snapshot word-embedding values before training
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        embvals = embvar.d.get_value()

    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
        .negsamplegen(negidxgen) \
        .negrate(negrate) \
        .objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")

    if checkembschange:
        # compare embeddings after training against the snapshot
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        newembvals = embvar.d.get_value()
        embschanged = not np.allclose(embvals, newembvals)
        sumsqdiff = np.sum((newembvals - embvals)**2)
        print "Embeddings {}: {} sum of square diffs"\
            .format("changed" if embschanged else "did not change", sumsqdiff)

    # evaluation: rank candidate predicates per test question
    tt.tick("evaluating")
    qenc_pred = question_enc.predict(testdata)
    scores = []
    dontembed = True
    if atleastcan > 0:
        print "ensuring at least {} cans".format(atleastcan)
    if totalrandomtest:
        print "total randomness"
    for i in range(qenc_pred.shape[0]):
        if totalrandomtest:
            cans = [testgold[i]]
        else:
            cans = testsubjsrels[i][0]  #+ testsubjsrels[i][1]
        # NOTE(review): collapsed source made the nesting of this padding
        # block ambiguous; placed at loop level so `totalrandomtest` also
        # gets random distractors (a candidate list of just the gold would
        # make accuracy trivially 100%) — confirm against original repo.
        if len(cans) < atleastcan:
            extracans = list(np.random.randint(0, numents, (atleastcan+50,)))
            extracans = list(set(extracans).difference(set(cans)))
            cans = cans + extracans[:max(0, min(len(extracans),
                                                atleastcan - len(cans)))]
            #print len(cans), cans
        if not dontembed:
            embed()
        #cans = set(cans)
        #if atleastcan > 0:
        #    while len(cans) < atleastcan:
        #        rancan = np.random.randint(0, numents)
        #        if rancan not in cans:
        #            cans.add(rancan)
        #cans = list(cans)
        if len(cans) == 0:
            # no candidates: emit a sentinel so indexing below stays valid
            scores.append([(-1, -np.infty)])
            continue
        #canembs = predemb.predict.transform(predtransf)(cans)
        canembs = predemb.predict(cans)
        scoresi = scorer.s.predict(np.repeat(qenc_pred[np.newaxis, i],
                                             canembs.shape[0], axis=0),
                                   canembs)
        scores.append(zip(cans, scoresi))
        if debug:
            embed()
        tt.progress(i, qenc_pred.shape[0], live=True)
    sortedbest = [sorted(cansi, key=lambda (x, y): y, reverse=True)
                  for cansi in scores]
    best = [sortedbesti[0][0] for sortedbesti in sortedbest]
    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]
    print("Accuracy: {}%".format(accuracy * 100))
def test_ns_training(self):
    """Negative-sampling training of a MatchScore (variant copy of the same
    test elsewhere in this file): trains cemb to match glove vectors, then
    checks MRR / recall@10 by ranking all ids against each other.

    NOTE(review): reconstructed indentation from collapsed source; tokens
    are unchanged.
    """
    num = 2000
    self.expshape = (num, 50)  # (vocabulary size, embedding dim)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(self.expshape[1], self.expshape[0])
    self.cemb = VectorEmbed(indim=self.expshape[0] + 1, dim=self.expshape[1])
    # out-of-range ids must raise
    self.assertRaises(Exception, self.glove.block.predict, [num + 1])
    self.assertRaises(Exception, self.cemb.predict, [num + 1])
    m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
    mg = MatchScore(self.glove.block, self.glove.block)
    # TODO factor out matchscore tests
    idxs = np.arange(num + 1)
    # glove against glove: score of a vector with itself is its squared norm
    self.assertTrue(
        np.allclose(mg.predict([num, 100], [num, 100]), [
            np.linalg.norm(self.glove % num)**2,
            np.linalg.norm(self.glove % 100)**2
        ]))

    class NegIdxGen():
        """Uniformly random negative right-hand-side ids."""
        def __init__(self, num):
            self.n = num

        def __call__(self, l, r):
            return l, np.random.randint(0, self.n, r.shape)

    # negative-sampling training: align cemb with glove embeddings
    m = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(5)\
        .adagrad(lr=0.1)\
        .train(numbats=50, epochs=50)
    print m.predict([num, num - 1, num - 2, num - 1],
                    [num, num - 1, num - 2, num - 2])
    # rank every id against every id and accumulate MRR / recall@k
    mrr = 0.0
    recat10 = 0.0
    recat1 = 0.0
    tot = num + 1
    for a in range(tot):
        abc = zip(range(num + 1),
                  list(m.predict([a] * (num + 1), np.arange(0, num + 1))))
        abc = sorted(abc, key=lambda (x, y): y, reverse=True)
        #print abc[:10]
        for i in range(len(abc)):
            if abc[i][0] == a:  # found the true id at rank i
                #print i
                mrr += 1. / (1 + i)
                if i < 10:
                    recat10 += 1
                if i < 1:
                    recat1 += 1
                break
    mrr /= tot
    recat10 /= tot
    recat1 /= tot
    print "%.3f MRR,\t%.3f MR@10,\t%.3f MR@1" % (mrr, recat10, recat1)
    self.assertGreater(mrr, 0.85)
    self.assertGreater(recat10, 0.9)