def test_shape(self): batsize = 10 seqlen = 3 ldim = 5 rdim = 5 l = np.random.random((batsize, ldim)) r = np.random.random((batsize, seqlen, rdim)) b = CosineDistance() pred, extra = b.predict(l, r, _extra_outs=["lnorms", "rnorms"]) print extra print pred self.assertEqual(pred.shape, (batsize, seqlen)) self.assertTrue(np.all((pred - np.ones_like(pred)) < 0))
def test_shapes(self):
    """AttGen over a cosine distance returns (batsize, seqlen) attention
    weights that are normalized to sum to one per example."""
    batsize, seqlen = 100, 7
    criterionshape = (batsize, 10)
    datashape = (batsize, seqlen, 10)
    attgen = AttGen(CosineDistance())
    # generate random criterion/data of compatible dimensions
    criterion = np.random.random(criterionshape)
    data = np.random.random(datashape)
    # predict and check shape + row-normalization
    weights = attgen.predict(criterion, data)
    self.assertEqual(weights.shape, (batsize, seqlen))
    self.assertTrue(np.allclose(weights.sum(axis=1),
                                np.ones((weights.shape[0],))))
def test_ns_training(self):
    # Trains a MatchScore between Glove vectors and a fresh VectorEmbed with
    # negative sampling, then checks ranking quality (MRR / recall@k) and that
    # the last external-validation record matches the final evaluation.
    num = 2000
    self.expshape = (num, 50)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(self.expshape[1], self.expshape[0])
    self.cemb = VectorEmbed(indim=self.expshape[0] + 1, dim=self.expshape[1])
    # out-of-vocabulary indices must raise
    self.assertRaises(Exception, self.glove.block.predict, [num + 1])
    self.assertRaises(Exception, self.cemb.predict, [num + 1])
    m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
    mg = MatchScore(self.glove.block, self.glove.block)  # TODO factor out matchscore tests
    idxs = np.arange(num + 1)

    # glove against glove: dot of a vector with itself is its squared norm
    self.assertTrue(
        np.allclose(mg.predict([num, 100], [num, 100]), [
            np.linalg.norm(self.glove % num)**2,
            np.linalg.norm(self.glove % 100)**2
        ]))

    class NegIdxGen():
        # draws random negative right-hand indices with the gold's shape
        def __init__(self, num):
            self.n = num

        def __call__(self, l, r):
            return l, np.random.randint(0, self.n, r.shape)

    vdata = np.arange(num)
    negrate = 5

    def obj(p, n):
        # raw ranking objective: negative score minus positive score
        return n - p

    m, err, verr, _, _ = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(negrate)\
        .adagrad(lr=0.1).objective(obj) \
        .validate_on([vdata, vdata]).extvalid(geteval(m.predict, num, negrate)).validinter(30) \
        .train(numbats=50, epochs=29, returnerrors=True)
    #.writeresultstofile("testingresultswriter.tsv") \
    tdata = np.arange(num)
    tt = ticktock("eval")
    tt.tick()
    mrr, recat1, recat10 = geteval(m.predict, num, 1)(tdata)
    tt.tock("evaluated test data")
    print "%.4f MRR,\t%.4f MR@10,\t%.4f MR@1" % (mrr, recat10, recat1)
    self.assertGreater(mrr, 0.85)
    self.assertGreater(recat10, 0.9)
    print verr
    # the final validation entry should agree with the post-hoc evaluation
    self.assertTrue(
        np.allclose(np.asarray([mrr, recat1, recat10]),
                    np.asarray(verr[-1][1:])))
def test_seq_scoring(self):
    """Matching a sequence against itself gives cosine similarity 1 at every
    position; the aggregated score therefore equals seqlen."""
    vocsize, dim, numsam, seqlen = 100, 10, 17, 5
    emb = VectorEmbed(vocsize, dim)
    # same embedder on both sides so identical inputs embed identically
    matcher = SeqMatchScore(SeqUnroll(emb), SeqUnroll(emb),
                            scorer=CosineDistance())
    seqs = np.random.randint(0, vocsize, (numsam, seqlen))
    scores = matcher.predict(seqs, seqs)
    self.assertTrue(np.allclose(np.ones_like(scores) * seqlen * 1., scores))
def test_mask(self):
    """Masked positions must receive exactly zero attention weight."""
    batsize, seqlen = 100, 7
    attgen = AttGen(CosineDistance())
    # random criterion/data of compatible dimensions
    criterion = np.random.random((batsize, 10))
    data = np.random.random((batsize, seqlen, 10))
    # build a per-example mask: a random cutoff >= 2, zeros from there on
    mask = np.ones((batsize, seqlen))
    cutoffs = np.random.randint(2, seqlen + 1, (batsize,))
    for row, cut in enumerate(cutoffs):
        mask[row, cut:] = 0
    weights = attgen.predict(criterion, data, mask)
    # the nonzero pattern of the output must coincide with the mask
    self.assertTrue(np.all(np.not_equal(weights, 0) == mask))
def __init__(self, inpvocsize=None, inpembdim=None, inpemb=None,
             inpencinnerdim=None, bidir=False, maskid=None,
             dropout=False, rnu=GRU, inpencoder=None,
             memvocsize=None, memembdim=None, memembmat=None,
             memencinnerdim=None, memencoder=None,
             inp_att_dist=CosineDistance(), mem_att_dist=CosineDistance(),
             inp_attention=None, mem_attention=None,
             coredims=None, corernu=GRU, core=None, explicit_interface=False,
             scalaraggdim=None, write_value_dim=None, nsteps=100,
             posvecdim=None, mem_pos_repr=None, inp_pos_repr=None,
             inp_addr_extractor=None, mem_addr_extractor=None,
             write_addr_extractor=None, write_addr_generator=None,
             write_addr_dist=CosineDistance(),
             write_value_generator=None, write_value_extractor=None,
             mem_erase_generator=None, mem_change_generator=None,
             memsampler=None, memsamplemethod=None, memsampletemp=0.3,
             **kw):
    """Assemble a SimpleBulkNN: input/memory encoders, a core RNN, attentions
    over input and memory, and the state-interface extractors/generators.

    Every component argument may be passed pre-built; anything left at None is
    constructed here from the corresponding dimension arguments.
    NOTE(review): the CosineDistance() defaults are shared instances across
    calls; presumed stateless — confirm before relying on per-instance state.
    """
    # INPUT ENCODING
    if inpencoder is None:
        inpencoder = SeqEncoder.RNN(indim=inpvocsize, inpembdim=inpembdim,
                                    inpemb=inpemb, innerdim=inpencinnerdim,
                                    bidir=bidir, maskid=maskid,
                                    dropout_in=dropout, dropout_h=dropout,
                                    rnu=rnu).all_outputs()
        lastinpdim = inpencinnerdim if not issequence(
            inpencinnerdim) else inpencinnerdim[-1]
    else:
        lastinpdim = inpencoder.block.layers[-1].innerdim

    # MEMORY ENCODING
    if memembmat is None:
        memembmat = param((memvocsize, memembdim),
                          name="memembmat").glorotuniform()
    if memencoder is None:
        memencoder = SeqEncoder.RNN(inpemb=False, innerdim=memencinnerdim,
                                    bidir=bidir, dropout_in=dropout,
                                    dropout_h=dropout, rnu=rnu,
                                    inpembdim=memembdim).all_outputs()
        lastmemdim = memencinnerdim if not issequence(
            memencinnerdim) else memencinnerdim[-1]
    else:
        lastmemdim = memencoder.block.layers[-1].innerdim

    # POSITION VECTORS (optional, shared dim for input and memory sides)
    if posvecdim is not None and inp_pos_repr is None:
        inp_pos_repr = RNNWithoutInput(posvecdim, dropout=dropout)
    if posvecdim is not None and mem_pos_repr is None:
        mem_pos_repr = RNNWithoutInput(posvecdim, dropout=dropout)
    xtra_dim = posvecdim if posvecdim is not None else 0

    # CORE RNN - THE THINKER
    if core is None:
        # core consumes input summary + memory summary (+ both position vecs)
        corelayers, _ = MakeRNU.fromdims(
            [lastinpdim + lastmemdim + xtra_dim * 2] + coredims,
            rnu=corernu, dropout_in=dropout, dropout_h=dropout,
            param_init_states=True)
        core = RecStack(*corelayers)
    lastcoredim = core.get_statespec()[-1][0][1][0]

    # ATTENTIONS
    if mem_attention is None:
        mem_attention = Attention(mem_att_dist)
    if inp_attention is None:
        inp_attention = Attention(inp_att_dist)
    if write_addr_generator is None:
        write_addr_generator = AttGen(write_addr_dist)

    # WRITE VALUE
    if write_value_generator is None:
        write_value_generator = WriteValGenerator(write_value_dim, memvocsize,
                                                  dropout=dropout)

    # MEMORY SAMPLER: a sampler and a sample method are mutually exclusive
    if memsampler is not None:
        assert (memsamplemethod is None)
    if memsamplemethod is not None:
        assert (memsampler is None)
        memsampler = GumbelSoftmax(temperature=memsampletemp)

    ################ STATE INTERFACES #################
    if not explicit_interface:
        if inp_addr_extractor is None:
            inp_addr_extractor = Forward(lastcoredim, lastinpdim + xtra_dim,
                                         dropout=dropout)
        if mem_addr_extractor is None:
            # BUGFIX: this used to assign to inp_addr_extractor (copy-paste),
            # leaving mem_addr_extractor None and clobbering the input one
            mem_addr_extractor = Forward(lastcoredim, lastmemdim + xtra_dim,
                                         dropout=dropout)

        # WRITE INTERFACE
        if write_addr_extractor is None:
            write_addr_extractor = Forward(lastcoredim, lastmemdim + xtra_dim,
                                           dropout=dropout)
        if write_value_extractor is None:
            write_value_extractor = Forward(lastcoredim, write_value_dim,
                                            dropout=dropout)

        # MEM UPDATE INTERFACE
        if mem_erase_generator is None:
            mem_erase_generator = StateToScalar(lastcoredim, scalaraggdim)
        if mem_change_generator is None:
            mem_change_generator = StateToScalar(lastcoredim, scalaraggdim)
    else:
        # explicit interface: slice all interface vectors out of the core state
        inp_addr_extractor, mem_addr_extractor, write_addr_extractor, \
        write_value_extractor, mem_erase_generator, mem_change_generator = \
            make_vector_slicers(0, lastinpdim + xtra_dim,
                                lastmemdim + xtra_dim,
                                lastmemdim + xtra_dim,
                                write_value_dim, 1, 1)

    super(SimpleBulkNN, self).__init__(
        inpencoder=inpencoder, memembmat=memembmat,
        memencoder=memencoder, inp_attention=inp_attention,
        mem_attention=mem_attention, core=core,
        memsampler=memsampler, nsteps=nsteps,
        inp_addr_extractor=inp_addr_extractor,
        mem_addr_extractor=mem_addr_extractor,
        write_addr_extractor=write_addr_extractor,
        write_addr_generator=write_addr_generator,
        mem_erase_generator=mem_erase_generator,
        mem_change_generator=mem_change_generator,
        write_value_generator=write_value_generator,
        write_value_extractor=write_value_extractor,
        inp_pos_repr=inp_pos_repr,
        mem_pos_repr=mem_pos_repr, **kw)
def run(
        epochs=50,
        mode="char",  # "char" or "word" or "charword"
        numbats=1000,
        lr=0.1,
        wreg=0.000001,
        bidir=False,
        layers=1,
        encdim=200,
        decdim=200,
        embdim=100,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        preeval=False,
        sumhingeloss=False,
        checkdata=False,  # starts interactive shell for data inspection
        printpreds=False,
        subjpred=False,
        predpred=False,
        specemb=-1,
        usetypes=False,
        evalsplits=50,
        cosine=False,
        loadmodel=False,
):
    # Full-rank subject+predicate prediction: trains a SeqMatchScore between a
    # question encoder (CustomSeq2Pair) and an entity/relation encoder with
    # negative sampling, then ranks all candidates and writes results to disk.
    if debug:  # debug settings
        sumhingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 10
        printpreds = True
        whatpred = "all"
        if whatpred == "pred":
            predpred = True
        elif whatpred == "subj":
            subjpred = True
        preeval = True
        #specemb = 100
        margin = 1.
        evalsplits = 1
        #usetypes=True
        #mode = "charword"
        #checkdata = True
    # load the right file
    maskid = -1
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic\
        = readdata(mode, testcans="testcans.pkl", debug=debug,
                   specids=True, usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")
    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            # pretty-print a row of word ids (space-joined in word mode)
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])
        embed()
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape
    tt.tock("data loaded")
    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)
    # bidirectional encoders split the dimension between the two directions
    if bidir:
        encinnerdim = [encdim / 2] * layers
    else:
        encinnerdim = [encdim] * layers
    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim / 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers
    emb = VectorEmbed(numwords, embdim)
    subjenc = EntEnc(
        SimpleSeq2Vec(invocsize=numwords, inpembdim=embdim,
                      innerdim=decinnerdim, maskid=maskid, bidir=membidir))
    numentembs = len(np.unique(entmat[:, 0]))
    repsplit = entmat[relstarts, 0]
    if specids:  # include vectorembedder
        subjenc = EntEmbEnc(subjenc, numentembs, specemb)
    predenc = VectorEmbed(indim=numents - relstarts + 1, dim=subjenc.outdim,
                          init="zero")
    entenc = CustomEntEnc(subjenc, predenc, repsplit)
    inpenc = CustomSeq2Pair(inpemb=emb, encdim=encinnerdim, scadim=encinnerdim,
                            enclayers=layers, scalayers=layers,
                            bidir=bidir, maskid=maskid, outdim=subjenc.outdim)
    # adjust params for enc/dec construction
    # encinnerdim[-1] += specemb
    # innerdim[-1] += specemb
    dist = DotDistance() if not cosine else CosineDistance()
    scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist}
    if sumhingeloss:
        scorerkwargs["aggregator"] = lambda x: x  # no aggregation of scores
    scorer = SeqMatchScore(inpenc, entenc, **scorerkwargs)

    class PreProc(object):
        # maps id batches through the entity matrix / word matrix before scoring
        def __init__(self, entmat, wordmat=None):
            self.f = PreProcE(entmat)
            self.w = PreProcL(wordmat) if wordmat is not None else wordmat

        def __call__(self, encdata, decgold):  # gold: idx^(batsize, seqlen)
            if self.w is not None:
                encdata = self.w(encdata)[0][0]
            if self.f is not None:
                decgold = self.f(decgold)[0][0]
            return (encdata, decgold), {}

    class PreProcE(object):
        # entity id -> row of the entity matrix
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    class PreProcL(object):
        # word id -> row of the word matrix
        def __init__(self, wordmat):
            self.em = Val(wordmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    transf = PreProc(entmat)

    class NegIdxGen(object):
        # samples a random entity id below midsplit and a random relation id
        # above it, yielding one (entity, relation) negative per example
        def __init__(self, rng, midsplit):
            self.min = 0
            self.max = rng
            self.midsplit = midsplit

        def __call__(self, datas, gold):
            entrand = np.random.randint(self.min, self.midsplit,
                                        (gold.shape[0], 1))
            relrand = np.random.randint(self.midsplit, self.max,
                                        (gold.shape[0], 1))
            ret = np.concatenate([entrand, relrand], axis=1)
            return datas, ret.astype("int32")

    #embed()
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    if sumhingeloss:  #
        obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)
    # embed()
    # eval
    if preeval:
        # evaluate the untrained model first as a baseline
        tt.tick("pre-evaluating")
        s = CustomRankSearch(inpenc, entenc, scorer.s, scorer.agg,
                             relstarts=relstarts)
        eval = FullRankEval()
        pred, scores = s.search(testdata, testgold.shape[1],
                                candata=entmat, canids=canids,
                                split=evalsplits, transform=transf.f,
                                debug=printpreds)
        evalres = eval.eval(pred, testgold, debug=debug)
        for k, evalre in evalres.items():
            print("{}:\t{}".format(k, evalre))
        tt.tock("pre-evaluated")
    if not loadmodel:
        tt.tick("training")
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numents, relstarts)).negrate(negrate).objective(obj) \
            .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
            .validate_on([validdata, validgold]) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained")
        scorer.save("customfullrank.scorer.save")
    else:
        scorer = SeqMatchScore.load("customfullrank.scorer.save")
    # eval
    tt.tick("evaluating")
    s = CustomRankSearch(inpenc, entenc, scorer.s, scorer.agg,
                         relstarts=relstarts)
    eval = FullRankEval()
    pred, scores = s.search(testdata, testgold.shape[1],
                            candata=entmat, canids=canids,
                            split=evalsplits, transform=transf.f,
                            debug=printpreds)
    if printpreds:
        print pred
    debugarg = "subj" if subjpred else "pred" if predpred else False
    evalres = eval.eval(pred, testgold, debug=debugarg)
    for k, evalre in evalres.items():
        print("{}:\t{}".format(k, evalre))
    tt.tock("evaluated")
    # save results under <scriptname>.results/<i>.res, first free slot
    basename = os.path.splitext(os.path.basename(__file__))[0]
    dirname = basename + ".results"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    savenamegen = lambda i: "{}/{}.res".format(dirname, i)
    savename = None
    for i in xrange(1000):
        savename = savenamegen(i)
        if not os.path.exists(savename):
            break
        savename = None
    if savename is None:
        raise Exception("exceeded number of saved results")
    with open(savename, "w") as f:
        f.write("{}\n".format(" ".join(sys.argv)))
        for k, evalre in evalres.items():
            f.write("{}:\t{}\n".format(k, evalre))
def run(
        epochs=50,
        mode="char",  # "char" or "word" or "charword"
        numbats=1000,
        lr=0.1,
        wreg=0.000001,
        bidir=False,
        layers=1,
        encdim=200,
        decdim=200,
        embdim=100,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        preeval=False,
        sumhingeloss=False,
        checkdata=False,  # starts interactive shell for data inspection
        printpreds=False,
        subjpred=False,
        predpred=False,
        specemb=-1,
        usetypes=False,
        evalsplits=50,
        cosine=False,
        loadmodel=False,
):
    # Predicate-only prediction: trains a MatchScore between a question encoder
    # and a relation embedding with negative sampling, then ranks all relations
    # for each test question and reports accuracy / MRR / recall@k.
    if debug:  # debug settings
        hingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 1
        printpreds = True
        preeval = True
        # specemb = 100
        margin = 1.
        evalsplits = 1
        # usetypes=True
        mode = "charword"
        # checkdata = True
    # load the right file
    maskid = -1
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic \
        = readdata(mode, testcans="testcans.pkl", debug=debug,
                   specids=True, usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")
    # transform for predpred: keep only the relation column, re-based at 0
    traingold = traingold[:, 1] - relstarts
    validgold = validgold[:, 1] - relstarts
    testgold = testgold[:, 1] - relstarts
    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            # pretty-print a row of word ids (space-joined in word mode)
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])
        embed()
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape
    tt.tock("data loaded")
    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)
    # bidirectional encoders split the dimension between the two directions
    if bidir:
        encinnerdim = [encdim / 2] * layers
    else:
        encinnerdim = [encdim] * layers
    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim / 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers
    emb = VectorEmbed(numwords, embdim)
    predemb = VectorEmbed(numents - relstarts + 1, decdim, init="uniform")
    inpenc = SimpleSeq2Vec(inpemb=emb, inpembdim=emb.outdim,
                           innerdim=encinnerdim,
                           maskid=maskid, bidir=bidir, layers=layers)
    dist = DotDistance() if not cosine else CosineDistance()
    scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist}
    scorer = MatchScore(inpenc, predemb, **scorerkwargs)

    class PreProc(object):
        # maps id batches through the entity matrix / word matrix before scoring
        def __init__(self, entmat, wordmat=None):
            self.f = PreProcE(entmat)
            self.w = PreProcL(wordmat) if wordmat is not None else wordmat

        def __call__(self, encdata, decgold):  # gold: idx^(batsize, seqlen)
            if self.w is not None:
                encdata = self.w(encdata)[0][0]
            if self.f is not None:
                decgold = self.f(decgold)[0][0]
            return (encdata, decgold), {}

    class PreProcE(object):
        # entity id -> row of the entity matrix
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    class PreProcL(object):
        # word id -> row of the word matrix
        def __init__(self, wordmat):
            self.em = Val(wordmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    transf = PreProc(entmat)

    class NegIdxGen(object):
        # one uniformly random negative relation id per example
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, (gold.shape[0], ))
            return datas, predrand.astype("int32")

    # embed()
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
        .negsamplegen(NegIdxGen(numents - relstarts))\
        .negrate(negrate).objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
        .validate_on([validdata, validgold]) \
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")
    # eval: score every test question against every relation embedding
    canids = np.arange(start=0, stop=numents - relstarts)
    predembs = predemb.predict(canids)  # (numrels, embdim)
    tt.tick("evaluating")
    predencs = inpenc.predict(testdata)  # (batsize, embdim)
    scores = np.zeros((predencs.shape[0], predembs.shape[0]))
    for i in range(predencs.shape[0]):
        # repeat the i-th question encoding against all relation embeddings
        scores[i, :] = \
            scorer.s.predict(np.repeat(predencs[np.newaxis, i],
                                       predembs.shape[0], axis=0),
                             predembs)
        tt.progress(i, predencs.shape[0], live=True)
    best = np.argmax(scores, axis=1)
    # per-question candidate ids sorted by descending score
    sortedbest = [
        sorted(zip(np.arange(scores.shape[1]), list(scores[i])),
               reverse=True, key=lambda (x, y): y)
        for i in range(scores.shape[0])
    ]
    sortedbestmat = np.array([[x for (x, y) in z] for z in sortedbest],
                             dtype="int32")
    # MRR
    mrr = 0.0
    for i in range(sortedbestmat.shape[1]):
        mrr += np.sum(sortedbestmat[:, i] == testgold) * 1. / (i + 1)
    mrr /= testgold.shape[0]
    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]

    # R@X
    def ratx(ratnum):
        return rat(ratnum, sortedbestmat, testgold)

    def rat(ratnum, sortedpred, gold):
        # fraction of questions whose gold relation is within the top ratnum
        acc = 0.0
        for i in range(min(ratnum, sortedbestmat.shape[1])):
            acc += 1.0 * np.sum(sortedpred[:, i] == gold)
        acc /= testgold.shape[0]
        return acc

    print "Accuracy: {}%".format(accuracy * 100)
    print "MRR: {}".format(mrr)
    print "Recall: @10: {}%\t @50: {}%\t @100: {}%".format(
        ratx(10) * 100, ratx(50) * 100, ratx(100) * 100)
    embed()
    tt.tock("evaluated")
def run( epochs=10, numbats=100, negrate=1, lr=0.1, embdim=50, encdim=50, wreg=0.00005, marginloss=False, margin=1., cosine=False, bidir=False, ): tt = ticktock("script") # get glove words g = Glove(encdim) words = g.D.keys() maxwordlen = 0 for word in words: maxwordlen = max(maxwordlen, len(word)) chars = set("".join(words)) chars.add(" ") print "{} words, maxlen {}, {} characters in words".format( len(words), maxwordlen, len(chars)) # get char word matrix chardic = dict(zip(chars, range(len(chars)))) pickle.dump(chardic, open("glove2c2w.chardic.pkl", "w")) charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32") charwordmat[0, 0] = chardic[" "] for i in range(0, len(words)): word = words[i] charwordmat[i + 1, :len(word)] = [chardic[x] for x in word] print charwordmat[0] # encode characters cwenc = SimpleSeq2Vec(indim=len(chars), inpembdim=embdim, innerdim=encdim / 2 if bidir else encdim, maskid=-1, bidir=bidir) dist = CosineDistance() if cosine else EuclideanDistance() #DotDistance() print "using " + str(dist) scorer = MatchScore(cwenc, g.block, scorer=dist) ''' scorer.train([charwordmat, np.arange(len(words)+1)], np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\ .linear_objective().adagrad(lr=lr).l2(wreg)\ .train(numbats=numbats, epochs=epochs) #embed() ''' class NegIdxGen(object): def __init__(self, rng): self.min = 0 self.max = rng def __call__(self, datas, gold): return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32") if marginloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) else: obj = lambda p, n: n - p nscorer = scorer.nstrain([charwordmat, np.arange(len(words)+1)])\ .negsamplegen(NegIdxGen(len(words))).negrate(negrate)\ .objective(obj).adagrad(lr=lr).l2(wreg)\ .train(numbats=numbats, epochs=epochs) cwenc.save("glove2c2w.block")
def run(epochs=50,
        numbats=700,
        lr=1.,
        wreg=0.000001,
        bidir=False,
        layers=1,
        embdim=200,
        encdim=400,
        decdim=400,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        checkdata=False,
        predencode=False,
        closenegsam=False,
        glove=False,
        atleastcan=0,
        wordchar=False,
        charencmode="rnn",  # rnn or cnn
        totalrandomtest=False,
        rarewords=0,
        ):
    # Predicate prediction with configurable question encoders (word-level,
    # word+char two-level, CNN or RNN chars) and predicate representations
    # (encoded from words or plain embeddings); trains with negative sampling
    # and evaluates accuracy over per-question candidate sets.
    maskid = -1
    tt = ticktock("predpred")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, testsubjsrels = readdata(wordchar=wordchar)
    if closenegsam:
        # sample space of "close" negatives sharing surface words
        revsamplespace, revind = buildsamplespace(entmat, worddic)
    tt.tock("data loaded")
    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def pp(widxs):
            print " ".join([rwd[x] if x in rwd else "" for x in widxs])
        embed()
    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    if rarewords > 0:
        # collapse words occurring <= rarewords times into a shared <RARE> id
        rwd = {v: k for k, v in worddic.items()}
        print "doing rare words"
        trainwordcounts = getmatrixvaluecounts(traindata, entmat)
        stwc = sorted(trainwordcounts.items(), key=lambda (x, y): y,
                      reverse=True)
        fstwc = filter(lambda (x, y): y > rarewords, stwc)
        redwdic = dict(zip([rwd[k] for k, v in fstwc
                            if k != maskid and k in rwd],
                           range(1, len(fstwc)+1)))
        redwdic["<RARE>"] = 0
        #embed()
    # bidirectional encoders split the dimension between the two directions
    if bidir:
        encdim = [encdim / 2] * layers
    else:
        encdim = [encdim] * layers
    # question-side model
    if glove:
        if rarewords > 0:
            raise Exception("glove with rare words currently not supported")
        wordemb = Glove(embdim).adapt(worddic)
    else:
        if rarewords > 0:
            wordemb = WordEmb(dim=embdim, worddic=redwdic).adapt(worddic)
            #embed()
        else:
            wordemb = WordEmb(dim=embdim, worddic=worddic)
    if wordchar:
        print "wordchar model"
        numchars = 256
        if charencmode == "cnn":
            print "using CNN char encoder"
            charenc = CNNSeqEncoder(indim=numchars, inpembdim=50,
                                    innerdim=[embdim]*2, maskid=maskid,
                                    stride=1)
            wordenc = RNNSeqEncoder(inpemb=False,
                                    inpembdim=wordemb.outdim+embdim,
                                    innerdim=encdim,
                                    bidir=bidir).maskoptions(MaskMode.NONE)
            question_enc = TwoLevelEncoder(l1enc=charenc, l2emb=wordemb,
                                           l2enc=wordenc, maskid=maskid)
        else:
            question_enc = WordCharSentEnc(numchars=256, charembdim=50,
                                           charinnerdim=embdim,
                                           wordemb=wordemb,
                                           wordinnerdim=encdim,
                                           maskid=maskid, bidir=bidir)
    else:
        question_enc = SimpleSeq2Vec(inpemb=wordemb,
                                     inpembdim=wordemb.outdim,
                                     innerdim=encdim,
                                     maskid=maskid,
                                     bidir=bidir,
                                     layers=layers)
    # predicate-side model
    if predencode:
        # encode predicates from their surface words, stored as a MemVec
        predemb = MemVec(SimpleSeq2Vec(inpemb=wordemb,
                                       inpembdim=wordemb.outdim,
                                       innerdim=decdim,
                                       maskid=maskid,
                                       bidir=bidir,
                                       layers=layers)
                         )
        predemb.load(entmat)
        """
        predemb = SimpleSeq2Vec(inpemb=wordemb,
                                inpembdim=wordemb.outdim,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=bidir,
                                layers=layers)

        class PreProc(object):
            def __init__(self, entmat):
                self.f = PreProcE(entmat)

            def __call__(self, encdata, decgold):
                return (encdata, self.f(decgold)[0][0]), {}

        class PreProcE(object):
            def __init__(self, entmat):
                self.em = Val(entmat)

            def __call__(self, x):
                return (self.em[x],), {}

        transf = PreProc(entmat)
        predtransf = transf.f
        """
    else:
        predemb = VectorEmbed(numents, decdim)
        """transf = None
        predtransf = None"""
    # scoring
    scorer = MatchScore(question_enc, predemb, scorer=CosineDistance())

    class NegIdxGen(object):
        # uniformly random negative predicate ids
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, gold.shape)
            return datas, predrand.astype("int32")

    class NegIdxGenClose(object):
        # prefers negatives from the gold's "close" sample set when it is
        # large enough, falling back to uniform sampling otherwise
        def __init__(self, revsamsp, rng):
            self.revsamsp = revsamsp
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            ret = np.zeros_like(gold)
            for i in range(gold.shape[0]):
                sampleset = self.revsamsp[gold[i]]
                if len(sampleset) > 5:
                    ret[i] = random.sample(sampleset, 1)[0]
                else:
                    ret[i] = np.random.randint(self.min, self.max)
            #embed()
            return datas, ret.astype("int32")

    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p
    if closenegsam:
        tt.msg("using close neg sampler")
        negidxgen = NegIdxGenClose(revsamplespace, numents)
    else:
        negidxgen = NegIdxGen(numents)
    checkembschange = True
    if checkembschange:
        # snapshot embedding values before training to verify they update;
        # adapted embeddings hide the weight under .inner
        #embed()
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        embvals = embvar.d.get_value()
    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
        .negsamplegen(negidxgen) \
        .negrate(negrate) \
        .objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")
    if checkembschange:
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        newembvals = embvar.d.get_value()
        embschanged = not np.allclose(embvals, newembvals)
        sumsqdiff = np.sum((newembvals - embvals)**2)
        print "Embeddings {}: {} sum of square diffs"\
            .format("changed" if embschanged else "did not change", sumsqdiff)
    # evaluation: score each question against its candidate predicates
    tt.tick("evaluating")
    qenc_pred = question_enc.predict(testdata)
    scores = []
    dontembed = True
    if atleastcan > 0:
        print "ensuring at least {} cans".format(atleastcan)
    if totalrandomtest:
        print "total randomness"
    for i in range(qenc_pred.shape[0]):
        if totalrandomtest:
            cans = [testgold[i]]
        else:
            cans = testsubjsrels[i][0]  #+ testsubjsrels[i][1]
            if len(cans) < atleastcan:
                # pad the candidate set with random distinct extra candidates
                extracans = list(np.random.randint(0, numents,
                                                   (atleastcan+50,)))
                extracans = list(set(extracans).difference(set(cans)))
                cans = cans + extracans[:max(0, min(len(extracans),
                                                    atleastcan - len(cans)))]
                #print len(cans), cans
                if not dontembed:
                    embed()
        #cans = set(cans)
        #if atleastcan > 0:
        #    while len(cans) < atleastcan:
        #        rancan = np.random.randint(0, numents)
        #        if rancan not in cans:
        #            cans.add(rancan)
        #cans = list(cans)
        if len(cans) == 0:
            # no candidates: emit a sentinel so ranking still works
            scores.append([(-1, -np.infty)])
            continue
        #canembs = predemb.predict.transform(predtransf)(cans)
        canembs = predemb.predict(cans)
        scoresi = scorer.s.predict(np.repeat(qenc_pred[np.newaxis, i],
                                             canembs.shape[0], axis=0),
                                   canembs)
        scores.append(zip(cans, scoresi))
        if debug:
            embed()
        tt.progress(i, qenc_pred.shape[0], live=True)
    sortedbest = [sorted(cansi, key=lambda (x, y): y, reverse=True)
                  for cansi in scores]
    best = [sortedbesti[0][0] for sortedbesti in sortedbest]
    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]
    print("Accuracy: {}%".format(accuracy * 100))
def test_ns_training(self):
    # Trains Glove-vs-VectorEmbed matching with negative sampling, then
    # computes MRR / recall@k by ranking every index against every other
    # and asserting the model ranks each index's own pairing near the top.
    num = 2000
    self.expshape = (num, 50)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(self.expshape[1], self.expshape[0])
    self.cemb = VectorEmbed(indim=self.expshape[0] + 1, dim=self.expshape[1])
    # out-of-vocabulary indices must raise
    self.assertRaises(Exception, self.glove.block.predict, [num + 1])
    self.assertRaises(Exception, self.cemb.predict, [num + 1])
    m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
    mg = MatchScore(self.glove.block, self.glove.block)  # TODO factor out matchscore tests
    idxs = np.arange(num + 1)

    # glove against glove: dot of a vector with itself is its squared norm
    self.assertTrue(
        np.allclose(mg.predict([num, 100], [num, 100]), [
            np.linalg.norm(self.glove % num)**2,
            np.linalg.norm(self.glove % 100)**2
        ]))

    class NegIdxGen():
        # draws random negative right-hand indices with the gold's shape
        def __init__(self, num):
            self.n = num

        def __call__(self, l, r):
            return l, np.random.randint(0, self.n, r.shape)

    m = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(5)\
        .adagrad(lr=0.1)\
        .train(numbats=50, epochs=50)

    print m.predict([num, num - 1, num - 2, num - 1],
                    [num, num - 1, num - 2, num - 2])
    mrr = 0.0
    recat10 = 0.0
    recat1 = 0.0
    tot = num + 1
    for a in range(tot):
        # rank all right-hand candidates for left index a, best first
        abc = zip(range(num + 1),
                  list(m.predict([a] * (num + 1), np.arange(0, num + 1))))
        abc = sorted(abc, key=lambda (x, y): y, reverse=True)
        #print abc[:10]
        for i in range(len(abc)):
            if abc[i][0] == a:
                # found a's own pairing at rank i (0-based)
                #print i
                mrr += 1. / (1 + i)
                if i < 10:
                    recat10 += 1
                if i < 1:
                    recat1 += 1
                break
    mrr /= tot
    recat10 /= tot
    recat1 /= tot
    print "%.3f MRR,\t%.3f MR@10,\t%.3f MR@1" % (mrr, recat10, recat1)
    self.assertGreater(mrr, 0.85)
    self.assertGreater(recat10, 0.9)
def __init__(self, inpvocsize=400, inpembdim=None, inpemb=None,
             outvocsize=100, outembdim=None, outemb=None,
             encdim=100, decdim=100, bidir=False, rnu=GRU,
             statetrans=None, vecout=None,
             inconcat=True, outconcat=False, maskid=-1,
             dropout=False, attdist=CosineDistance(), sepatt=False,
             encoder=None, decoder=None, attention=None,
             **kw):
    # Assembles an attention-based sequence encoder-decoder: builds encoder,
    # attention, decoder and state transfer unless pre-built ones are given.
    # NOTE(review): the attdist=CosineDistance() default is a shared instance
    # across calls; presumed stateless — confirm before storing state on it.
    self.encinnerdim = [encdim] if not issequence(encdim) else encdim
    self.decinnerdim = [decdim] if not issequence(decdim) else decdim
    self.dropout = dropout
    # encoder
    if encoder is None:
        if sepatt:
            # separate attention encoder variant
            enc = self._getencoder_sepatt(indim=inpvocsize,
                                          inpembdim=inpembdim,
                                          inpemb=inpemb,
                                          innerdim=self.encinnerdim,
                                          bidir=bidir, maskid=maskid,
                                          dropout_in=dropout,
                                          dropout_h=dropout, rnu=rnu)
        else:
            enc = self._getencoder(indim=inpvocsize, inpembdim=inpembdim,
                                   inpemb=inpemb, innerdim=self.encinnerdim,
                                   bidir=bidir, maskid=maskid,
                                   dropout_in=dropout, dropout_h=dropout,
                                   rnu=rnu)
    else:
        enc = encoder
    if attention is None:
        attention = self._getattention(attdist, sepatt=sepatt)
    self.lastencinnerdim = enc.outdim
    if decoder is None:
        dec = self._getdecoder(outvocsize=outvocsize, outembdim=outembdim,
                               outemb=outemb, maskid=maskid,
                               attention=attention,
                               lastencinnerdim=self.lastencinnerdim,
                               decinnerdim=self.decinnerdim,
                               inconcat=inconcat, outconcat=outconcat,
                               softmaxout=vecout, dropout=dropout,
                               rnu=rnu)
    else:
        dec = decoder
    self.lastdecinnerdim = self.decinnerdim[-1]
    self.statetrans_setting = statetrans
    # build the encoder-state -> decoder-init-state transfer from the setting
    statetrans = self._build_state_trans(self.statetrans_setting)
    super(SimpleSeqEncDecAtt, self).__init__(enc, dec,
                                             statetrans=statetrans, **kw)
def run( negsammode="closest", # "close" or "random" usetypes=True, mode="concat", # "seq" or "concat" or "multi" or "multic" or "bino" glove=True, embdim=100, charencdim=100, charembdim=50, encdim=400, bidir=False, layers=1, charenc="rnn", # "cnn" or "rnn" margin=0.5, lr=0.1, numbats=700, epochs=15, gradnorm=1.0, wreg=0.0001, loadmodel="no", debug=False, debugtest=False, forcesubjincl=False, randsameval=0, numtestcans=5, multiprune=-1, checkdata=False, testnegsam=False, testmodel=False, sepcharembs=False, ): tt = ticktock("script") tt.tick("loading data") (traindata, traingold), (validdata, validgold), (testdata, testgold), \ (subjmat, relmat), (subjdic, reldic), worddic, \ subjinfo, (testsubjcans, relsperent) = readdata(debug=debug, numtestcans=numtestcans if numtestcans > 0 else None) if usetypes: print "building type matrix" typmat = buildtypmat(subjmat, subjinfo, worddic) subjmat = np.concatenate([typmat, subjmat], axis=1) typlen = typmat.shape[1] relsamplespace = None subjsamplespace = None if negsammode == "closest" or negsammode == "close": relsamplespace, revind = buildrelsamplespace(relmat, worddic) subjsamplespace = loadsubjsamplespace() tt.tock("data loaded") if checkdata: embed() numwords = max(worddic.values()) + 1 numsubjs = max(subjdic.values()) + 1 numrels = max(reldic.values()) + 1 maskid = -1 numchars = 256 nsrelsperent = relsperent if negsammode == "closest" else None if testnegsam: nig = NegIdxGen(numsubjs - 1, numrels - 1, relclose=relsamplespace, subjclose=subjsamplespace, relsperent=nsrelsperent) embed() if mode == "seq" or mode == "multi": decdim = encdim elif mode == "concat" or mode == "multic" or mode == "bino": decdim = encdim / 2 else: raise Exception("unrecognized mode") print "{} mode: {} decdim".format(mode, decdim) # defining model if glove: wordemb = Glove(embdim).adapt(worddic) else: wordemb = WordEmb(dim=embdim, indim=numwords) charemb = VectorEmbed(indim=numchars, dim=charembdim) charemb2 = VectorEmbed(indim=numchars, 
dim=charembdim) if charenc == "cnn": print "using CNN char encoder" charenc = CNNSeqEncoder(inpemb=charemb, innerdim=[charencdim] * 2, maskid=maskid, stride=1) elif charenc == "rnn": print "using RNN char encoder" charenc = RNNSeqEncoder(inpemb=charemb, innerdim=charencdim) \ .maskoptions(maskid, MaskMode.AUTO) else: raise Exception("no other character encoding modes available") if bidir: encdim = encdim / 2 if mode != "bino": if mode == "multi" or mode == "multic": wordenc = \ SimpleSeq2MultiVec(inpemb=False, inpembdim=wordemb.outdim + charencdim, innerdim=encdim, bidir=bidir, numouts=2, mode="seq") else: encdim = [encdim] * layers wordenc = RNNSeqEncoder(inpemb=False, inpembdim=wordemb.outdim + charencdim, innerdim=encdim, bidir=bidir).maskoptions(MaskMode.NONE) question_encoder = TwoLevelEncoder(l1enc=charenc, l2emb=wordemb, l2enc=wordenc, maskid=maskid) else: question_encoder = BinoEncoder(charenc=charenc, wordemb=wordemb, maskid=maskid, scadim=100, encdim=encdim / 2, bidir=bidir, enclayers=layers, outdim=decdim, scabidir=True) # encode predicate on word level predemb = SimpleSeq2Vec(inpemb=wordemb, innerdim=decdim, maskid=maskid, bidir=False, layers=1) #predemb.load(relmat) scharemb = charemb2 if sepcharembs else charemb if usetypes: # encode subj type on word level subjtypemb = SimpleSeq2Vec(inpemb=wordemb, innerdim=int(np.ceil(decdim * 1. / 2)), maskid=maskid, bidir=False, layers=1) # encode subject on character level charbidir = True charencinnerdim = int(np.floor(decdim * 1. 
/ 2)) charenclayers = 1 if charbidir: charencinnerdim /= 2 charenclayers = 2 subjemb = SimpleSeq2Vec(inpemb=scharemb, innerdim=charencinnerdim, maskid=maskid, bidir=charbidir, layers=charenclayers) subjemb = TypedSubjBlock(typlen, subjemb, subjtypemb) else: # encode subject on character level subjemb = SimpleSeq2Vec(inpemb=scharemb, innerdim=decdim, maskid=maskid, bidir=False, layers=1) #subjemb.load(subjmat) if testmodel: embed() # package if mode == "seq": lb = SeqLeftBlock(question_encoder) rb = RightBlock(subjemb, predemb) elif mode == "concat": lb = ConcatLeftBlock(question_encoder) rb = RightBlock(subjemb, predemb) elif mode == "multi" or mode == "multic": lb = MultiLeftBlock(question_encoder, mode) rb = RightBlock(subjemb, predemb) elif mode == "bino": lb = question_encoder rb = RightBlock(subjemb, predemb) else: raise Exception("unrecognized mode") scorer = SeqMatchScore(lb, rb, scorer=CosineDistance(), aggregator=lambda x: x, argproc=lambda x, y, z: ((x, ), (y, z))) obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1) class PreProc(object): def __init__(self, subjmat, relmat): self.ef = PreProcEnt(subjmat) self.rf = PreProcEnt(relmat) def __call__(self, data, gold): # gold: idxs-(batsize, 2) st = self.ef(gold[:, 0])[0][0] rt = self.rf(gold[:, 1])[0][0] return (data, st, rt), {} class PreProcE(object): def __init__(self, subjmat, relmat): self.ef = PreProcEnt(subjmat) self.rf = PreProcEnt(relmat) def __call__(self, x): subjslice = self.ef(x[:, 0])[0][0] relslice = self.rf(x[:, 1])[0][0] return (subjslice, relslice), {} class PreProcEnt(object): def __init__(self, mat): self.entmat = Val(mat) def __call__(self, x): return (self.entmat[x], ), {} transf = PreProc(subjmat, relmat) if debug: embed() if epochs > 0 and loadmodel == "no": tt.tick("training") saveid = "".join([str(np.random.randint(0, 10)) for i in range(4)]) print("CHECKPOINTING AS: {}".format(saveid)) nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \ 
.negsamplegen(NegIdxGen(numsubjs-1, numrels-1, relclose=relsamplespace, subjclose=subjsamplespace, relsperent=nsrelsperent)) \ .objective(obj).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm) \ .validate_on([validdata, validgold]) \ .autosavethis(scorer, "fullrank{}.model".format(saveid)) \ .train(numbats=numbats, epochs=epochs) tt.tock("trained").tick() # saving #scorer.save("fullrank{}.model".format(saveid)) print("SAVED AS: {}".format(saveid)) if loadmodel is not "no": tt.tick("loading model") m = SeqMatchScore.load("fullrank{}.model".format(loadmodel)) #embed() lb = m.l subjemb = m.r.subjenc predemb = m.r.predenc tt.tock("loaded model") # evaluation predictor = CustomPredictor( questionencoder=lb, entityencoder=subjemb, relationencoder=predemb, #mode=mode, enttrans=transf.ef, reltrans=transf.rf, debug=debugtest, subjinfo=subjinfo) tt.tick("predicting") if forcesubjincl: # forces the intended subject entity to be among candidates for i in range(len(testsubjcans)): if testgold[i, 0] not in testsubjcans[i]: testsubjcans[i].append(testgold[i, 0]) if randsameval > 0: # generate random sampling eval data testsubjcans = np.random.randint(0, numsubjs, (testgold.shape[0], randsameval)) testrelcans = np.random.randint(0, numrels, (testgold.shape[0], randsameval)) testsubjcans = np.concatenate([testgold[:, 0:1], testsubjcans], axis=1) testrelcans = np.concatenate([testgold[:, 1:2], testrelcans], axis=1) testsubjcans = testsubjcans.tolist() testrelcans = testrelcans.tolist() prediction = predictor.predict(testdata, entcans=testsubjcans, relcans=testrelcans) else: prediction = predictor.predict(testdata, entcans=testsubjcans, relsperent=relsperent, multiprune=multiprune) tt.tock("predicted") tt.tick("evaluating") evalmat = prediction == testgold subjacc = np.sum(evalmat[:, 0]) * 1. / evalmat.shape[0] predacc = np.sum(evalmat[:, 1]) * 1. / evalmat.shape[0] totalacc = np.sum(np.sum(evalmat, axis=1) == 2) * 1. 
/ evalmat.shape[0] print "Test results ::::::::::::::::" print "Total Acc: \t {}".format(totalacc) print "Subj Acc: \t {}".format(subjacc) print "Pred Acc: \t {}".format(predacc) tt.tock("evaluated") def subjinspect(subjrank, gold): ret = [ (("GOLD - " if gold == x else " ") + subjinfo[x][0] + " (" + " ".join(subjinfo[x][1]) + ")" + str(subjinfo[x][3]) + " rels", y) if x in subjinfo else (x, y) for x, y in subjrank ] return ret def inspectboth(hidecorrect=False, hidenotincan=False): rwd = {v: k for k, v in worddic.items()} for i in range(len(predictor.subjranks)): subjx = testgold[i, 0] predx = testgold[i, 1] subjrank = predictor.subjranks[i] predrank = predictor.relranks[i] if hidecorrect and subjx == subjrank[0][0] and predrank[0][ 0] == predx: continue if subjx not in [k for k, v in subjrank]: if hidenotincan: continue def inspectsubjs(hidecorrect=False, hidenotincan=False, shownotincan=False): rwd = {v: k for k, v in worddic.items()} for i in range(len(predictor.subjranks)): subjx = testgold[i, 0] subjrank = predictor.subjranks[i] if subjx == subjrank[0][0] and hidecorrect: # only look for errors continue if subjx not in [k for k, v in subjrank]: if hidenotincan: continue if shownotincan and subjx in [k for k, v in subjrank]: continue print "test question {}: {} \t GOLD: {}".format( i, wordids2string( testdata[i, :, 0], rwd), "{} ({}) - {} rels --- {}".format( *([ subjinfo[subjx][0], subjinfo[subjx][1], subjinfo[subjx][3], subjinfo[subjx][2] ] if subjx in subjinfo else ["<UNK>", "<UNK>", "<UNK>", "<UNK>"]))) inspres = subjinspect(subjrank, subjx) i = 1 for inspre in inspres: print "{}:\t{}\t{}".format(i, inspre[1], inspre[0]) if i % 50 == 0: inp() i += 1 inp() def inspectpreds(hidecorrect=False): rwd = {v: k for k, v in worddic.items()} for i in range(len(predictor.relranks)): relx = testgold[i, 1] subjx = testgold[i, 0] relrank = predictor.relranks[i] if relx == relrank[0][0] and hidecorrect: continue print "test question {}: {} \t GOLD: {}".format( i, 
wordids2string(testdata[i, :, 0], rwd), wordids2string(relmat[relx, :], rwd)) inspres = [(("GOLD - " if relx == x else " ") + wordids2string(relmat[x], rwd), y) for x, y in relrank] i = 1 for inspre in inspres: print "{}:\t{}\t{}".format(i, inspre[1], inspre[0]) if i % 50 == 0: inp() i += 1 inp() embed()