def setUp(self):
    # build a simple sequence encoder on random int sequences and wrap two
    # copies of it in a MatchScore; keep the autobuild output vars for the tests
    enc = SimpleSeq2Vec(indim=100, inpembdim=10, innerdim=20)
    x = np.random.randint(0, 100, (33, 5))      # 33 sequences of length 5
    o = enc.autobuild(x)
    self.o = o[1][0]        # first output var from autobuild
    m = MatchScore(enc, enc)
    mo = m.autobuild(x, x)
    self.mo = mo[1][0]      # first output var from autobuild
def test_ns_training(self):
    num = 2000
    self.expshape = (num, 50)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(self.expshape[1], self.expshape[0])
    self.cemb = VectorEmbed(indim=self.expshape[0]+1, dim=self.expshape[1])
    # out-of-range indices must be rejected by both embedding blocks
    self.assertRaises(Exception, self.glove.block.predict, [num+1])
    self.assertRaises(Exception, self.cemb.predict, [num+1])
    m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
    mg = MatchScore(self.glove.block, self.glove.block)     # TODO factor out matchscore tests
    idxs = np.arange(num+1)

    # glove against glove: the dot product of a vector with itself
    # equals its squared norm
    self.assertTrue(np.allclose(
        mg.predict([num, 100], [num, 100]),
        [np.linalg.norm(self.glove % num)**2, np.linalg.norm(self.glove % 100)**2]))

    class NegIdxGen(object):    # corrupts the right side with random indices
        def __init__(self, num):
            self.n = num

        def __call__(self, l, r):
            return l, np.random.randint(0, self.n, r.shape)

    # train the trainable embedding against the fixed glove vectors
    # with negative sampling
    m = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(5)\
         .adagrad(lr=0.1)\
         .train(numbats=50, epochs=50)

    print m.predict([num, num-1, num-2, num-1], [num, num-1, num-2, num-2])

    # rank every candidate for every query and accumulate MRR and recall@k
    mrr = 0.0
    recat10 = 0.0
    recat1 = 0.0
    tot = num + 1
    for a in range(tot):
        abc = zip(range(num+1), list(m.predict([a]*(num+1), np.arange(0, num+1))))
        abc = sorted(abc, key=lambda (x, y): y, reverse=True)
        #print abc[:10]
        for i in range(len(abc)):
            if abc[i][0] == a:
                #print i
                mrr += 1. / (1 + i)
                if i < 10:
                    recat10 += 1
                if i < 1:
                    recat1 += 1
                break
    mrr /= tot
    recat10 /= tot
    recat1 /= tot
    print "%.3f MRR,\t%.3f recall@10,\t%.3f recall@1" % (mrr, recat10, recat1)
    self.assertGreater(mrr, 0.85)
    self.assertGreater(recat10, 0.9)
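# A vectorized sketch of the same MRR / recall@k computation as in the ranking
# loop above; rank_metrics is a hypothetical helper (not part of the library),
# assuming a precomputed score matrix with scores[a, b] = score of candidate b
# for query a, and the correct candidate sitting on the diagonal.
import numpy as np

def rank_metrics(scores):
    diag = np.diagonal(scores)
    # 0-based rank of the correct candidate: how many candidates score strictly higher
    ranks = (scores > diag[:, None]).sum(axis=1)
    mrr = (1.0 / (1 + ranks)).mean()
    recat10 = (ranks < 10).mean()
    recat1 = (ranks < 1).mean()
    return mrr, recat10, recat1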
def run(epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        embdim=50,
        encdim=50,
        wreg=0.00005,
        marginloss=False,
        margin=1.0,
        cosine=False,
        bidir=False,
        ):
    tt = ticktock("script")
    # get glove words
    g = Glove(encdim)
    words = g.D.keys()
    maxwordlen = 0
    for word in words:
        maxwordlen = max(maxwordlen, len(word))
    chars = set("".join(words))
    chars.add(" ")
    print "{} words, maxlen {}, {} characters in words".format(len(words), maxwordlen, len(chars))
    # get char word matrix: row 0 is a dummy (single space), row i+1 spells
    # word i as character ids, -1-padded to maxwordlen
    chardic = dict(zip(chars, range(len(chars))))
    pickle.dump(chardic, open("glove2c2w.chardic.pkl", "w"))
    charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
    charwordmat[0, 0] = chardic[" "]
    for i in range(0, len(words)):
        word = words[i]
        charwordmat[i + 1, :len(word)] = [chardic[x] for x in word]
    print charwordmat[0]
    # encode characters
    cwenc = SimpleSeq2Vec(indim=len(chars),
                          inpembdim=embdim,
                          innerdim=encdim / 2 if bidir else encdim,
                          maskid=-1,
                          bidir=bidir)
    dist = CosineDistance() if cosine else EuclideanDistance()  # DotDistance()
    print "using " + str(dist)
    scorer = MatchScore(cwenc, g.block, scorer=dist)

    """
    scorer.train([charwordmat, np.arange(len(words)+1)],
                 np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\
        .linear_objective().adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)
    #embed()
    """

    class NegIdxGen(object):    # draws random word indices as negatives
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32")

    if marginloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p

    nscorer = (
        scorer.nstrain([charwordmat, np.arange(len(words) + 1)])
        .negsamplegen(NegIdxGen(len(words)))
        .negrate(negrate)
        .objective(obj)
        .adagrad(lr=lr)
        .l2(wreg)
        .train(numbats=numbats, epochs=epochs)
    )
    cwenc.save("glove2c2w.block")
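# Hypothetical usage sketch (not in the original script): once trained, the
# character-level encoder should produce a glove-like vector for any string,
# assuming SimpleSeq2Vec exposes the same .predict(...) as the other blocks
# used in these scripts, and reusing chardic/maxwordlen from run().
def encode_word(word, cwenc, chardic, maxwordlen):
    import numpy as np
    charmat = -np.ones((1, maxwordlen), dtype="int32")      # -1 is the mask id
    charmat[0, :len(word)] = [chardic[c] for c in word]
    return cwenc.predict(charmat)       # one encdim-sized vector per row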
def run(epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        embdim=100,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        mem=False,
        membidir=False,
        memlayers=1,
        layers=1,
        testfirst=False,
        rankingloss=False,
        rlmargin=1.,
        charlevel=False,
        pool=False,
        resultsave=False,
        resultsavep="subjdetns.res.pkl",
        ):
    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat \
        = readdata(datap, charlevel)
    print entmat.shape
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape
    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexed according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [innerdim / 2] * layers
    else:
        encinnerdim = [innerdim] * layers

    # question representation:
    # encodes question sequence to vector
    # let's try to embed chars too <--
    embdim = None if charlevel else embdim
    qenc = SimpleSeq2Vec(indim=numwords, inpembdim=embdim, innerdim=encinnerdim,
                         maskid=-1, bidir=bidir, pool=pool)

    # entity representation:
    if mem:
        # encodes label to vector
        if membidir:
            innerdim = [innerdim / 2] * memlayers
        else:
            innerdim = [innerdim] * memlayers
        memembdim = embdim
        #embed chars too <--
        meminpemb = None if charlevel else qenc.inpemb      # share embeddings
        #memembdim = None if charlevel else memembdim
        meminpemb = qenc.inpemb     # also chars are embedded and embeddings are always shared
        lenc = SimpleSeq2Vec(indim=numwords, inpembdim=memembdim, inpemb=meminpemb,
                             innerdim=innerdim, maskid=-1, bidir=membidir)
    else:
        # embeds entity id to vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)     # batched dot

    # trainer config preparation
    class PreProcf(object):
        def __init__(self, entmat):
            self.em = Val(entmat)       # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):    # gold: idx^(batsize,)
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):    # gold: idx^(batsize,)
            return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32")

    if testfirst:
        eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                            metrics=[ClassAccuracy(), RecallAt(5)])
        evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat))
        for e in evalres:
            print e
        tt.msg("tested dummy")
        sys.exit()
    #embed()

    # trainer config and training
    obj = lambda p, n: n - p
    if rankingloss:
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.infty)
    nscorer = scorer.nstrain([traindata, traingold]).transform(PreProcf(entmat))\
        .negsamplegen(NegIdxGen(numents)).negrate(negrate).objective(obj)\
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)

    # evaluation
    eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                        metrics=[ClassAccuracy(), RecallAt(1), RecallAt(2),
                                 RecallAt(5), RecallAt(10)])
    evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat),
                        savep=None if not resultsave else resultsavep)
    for evalre in evalres:
        print evalre
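# Minimal sketch (hypothetical, plain numpy) of what one training batch looks
# like after the transform/negsamplegen steps above: NegIdxGen swaps gold
# entity ids for random ones, and PreProcf (which wraps entmat in a theano Val)
# replaces entity ids by their label word-id rows, so the scorer always
# compares question words against entity label words.
def demo_batch(entmat, traindata, traingold):
    import numpy as np
    negidx = np.random.randint(0, entmat.shape[0], traingold.shape).astype("int32")
    posbatch = (traindata, entmat[traingold, :])    # positive pairs
    negbatch = (traindata, entmat[negidx, :])       # sampled negative pairs
    return posbatch, negbatch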