def __init__(self, inpemb, encdim=100, scadim=100, maskid=0, bidir=False,
             scalayers=1, enclayers=1, outdim=100, **kw):
    super(CustomSeq2Pair, self).__init__(**kw)
    self.tosca = SimpleSeq2Sca(inpemb=inpemb, inpembdim=inpemb.outdim,
                               innerdim=scadim, maskid=maskid, bidir=bidir,
                               layers=scalayers)
    self.subjenc = SimpleSeq2Vec(inpemb=inpemb, inpembdim=inpemb.outdim,
                                 innerdim=encdim, maskid=maskid, bidir=bidir,
                                 layers=enclayers)
    self.predenc = SimpleSeq2Vec(inpemb=inpemb, inpembdim=inpemb.outdim,
                                 innerdim=encdim, maskid=maskid, bidir=bidir,
                                 layers=enclayers)
    self.subjmd = MatDot(self.subjenc.outdim, outdim)
    self.predmd = MatDot(self.predenc.outdim, outdim)
def setUp(self):
    enc = SimpleSeq2Vec(indim=100, inpembdim=10, innerdim=20)
    x = np.random.randint(0, 100, (33, 5))
    o = enc.autobuild(x)
    self.o = o[1][0]
    m = MatchScore(enc, enc)
    mo = m.autobuild(x, x)
    self.mo = mo[1][0]
def test_mask(self):
    np.random.seed(1337)
    enc = SimpleSeq2Vec(indim=100, inpembdim=10, innerdim=4,
                        maskid=-1, layers=2).all_outputs()
    x = np.random.randint(0, 100, (33, 5))
    maskr = np.random.randint(1, x.shape[1], (x.shape[0],))
    for i in range(x.shape[0]):
        x[i, maskr[i]:] = -1
    pred = enc.predict(x)
    print maskr
    print x
    print pred
    print pred.shape
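# Illustrative sketch (not from the original tests): how the -1 mask id marks
# padded tail positions, as exercised by test_mask above. Pure numpy, no
# teafacto dependency; the names `x`, `cutoffs`, `mask` are hypothetical.
import numpy as np

np.random.seed(1337)
x = np.random.randint(0, 100, (4, 5))
cutoffs = np.random.randint(1, x.shape[1], (x.shape[0],))
for i in range(x.shape[0]):
    x[i, cutoffs[i]:] = -1          # everything after the cutoff is masked
mask = x != -1                      # the boolean mask an encoder would derive
print x
print mask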
def test_auto_mask_within_seq2vec(self):
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    batsize = 11
    seqlen = 3
    seqblank = 2
    wordlen = 3
    wordblank = 2
    numchars = 20
    numwords = 100
    encdim = 4
    embdim = 50
    innerdim = 2
    worddata = np.random.randint(0, numwords, (batsize, seqlen, 1))
    worddatablank = np.zeros((batsize, seqblank, 1)).astype("int32") - 1
    worddata = np.concatenate([worddata, worddatablank], axis=1)
    chardata = np.random.randint(0, numchars, (batsize, seqlen, wordlen))
    charblank = np.zeros((batsize, seqlen, wordblank)).astype("int32") - 1
    chardata = np.concatenate([chardata, charblank], axis=2)
    charblankblank = np.zeros((batsize, seqblank, wordlen + wordblank)).astype("int32") - 1
    chardata = np.concatenate([chardata, charblankblank], axis=1)
    data = np.concatenate([worddata, chardata], axis=2)
    wordemb = WordEncoderPlusGlove(numchars=numchars, numwords=numwords,
                                   encdim=encdim, embdim=embdim, maskid=-1,
                                   embtrainfrac=0)
    rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim, innerdim, bidir=False)
    enc = Seq2Vec(wordemb, rnn, maskid=-1)
    enc.enc.with_outputs()
    finalpred, pred = enc.predict(data)
    #print pred.shape, finalpred.shape
    #print pred[0], finalpred[0]
    # once padding starts (i >= seqlen), consecutive output states must be identical
    i = 1
    while i < pred.shape[1]:
        self.assertEqual(np.allclose(pred[:, i - 1, :], pred[:, i, :]), i >= seqlen)
        i += 1
def run(epochs=50,
        numbats=25,
        lr=0.1,
        layers=1,
        embdim=100,
        encdim=200,
        bidir=False,
        mode="wordchar",        # "char" or "word" or "wordchar"
        maxlen=75,
        maxwordlen=15,
        ):
    maskid = -1
    (traindata, traingold), (testdata, testgold), dic = \
        readdata("../../../data/hatespeech/train.csv",
                 "../../../data/hatespeech/test.csv",
                 masksym=maskid, mode=mode, maxlen=maxlen)

    # data stats (multiply by 100 so the "%" in the message is accurate)
    print "class distribution in train: {}% positive".format(
        np.sum(traingold) * 100. / np.sum(np.ones_like(traingold)))
    print "class distribution in test: {}% positive".format(
        np.sum(testgold) * 100. / np.sum(np.ones_like(testgold)))

    inpemb = VectorEmbed(indim=len(dic), dim=embdim)
    encdim = [encdim] * layers
    if mode == "wordchar":
        enc = WordCharSentEnc(charemb=inpemb, charinnerdim=embdim,
                              wordemb=False, wordinnerdim=encdim,
                              maskid=maskid, bidir=bidir)
    else:
        enc = SimpleSeq2Vec(inpemb=inpemb, innerdim=encdim,
                            maskid=maskid, bidir=bidir)
    m = SMOWrap(enc, outdim=2, nobias=True)
    #print enc.predict(traindata[:5, :])
    m = m.train([traindata], traingold)\
        .adadelta(lr=lr).grad_total_norm(1.0)\
        .cross_entropy().split_validate(6, random=True).cross_entropy().accuracy()\
        .train(numbats=numbats, epochs=epochs)
    m.save("hatemodel.{}.Emb{}D.Enc{}D.{}L.model".format(mode, embdim, encdim, layers))
def run( epochs=10, numbats=100, numsam=10000, lr=0.1, datap="../../../data/simplequestions/datamat.char.pkl", innerdim=200, wreg=0.00005, bidir=False, keepmincount=5, mem=False, sameenc=False, memaddr="dot", memattdim=100, membidir=False, memlayers=1, memmaxwords=5, memmaxchars=20, layers=1, ): (traindata, traingold), (validdata, validgold), (testdata, testgold), chardic, entdic\ = readdata(datap) if mem: memdata = getcharmemdata(entdic, chardic, maxwords=memmaxwords, maxchar=memmaxchars) print traindata.shape, testdata.shape numchars = max(chardic.values()) + 1 numrels = max(entdic.values()) + 1 print numchars, numrels if bidir: encinnerdim = [innerdim / 2] * layers else: encinnerdim = [innerdim] * layers enc = SimpleSeq2Vec(indim=numchars, inpembdim=None, innerdim=encinnerdim, maskid=-1, bidir=bidir) if mem: if membidir: innerdim = [innerdim / 2] * memlayers else: innerdim = [innerdim] * memlayers memindim = numchars memenc = SimpleSeq2Vec(indim=memindim, inpembdim=None, innerdim=innerdim, maskid=-1, bidir=membidir) if memaddr is None or memaddr == "dot": memaddr = DotMemAddr elif memaddr == "lin": memaddr = LinearGateMemAddr dec = MemVec2Idx(memenc, memdata, memdim=innerdim, memaddr=memaddr, memattdim=memattdim) else: dec = SimpleVec2Idx(indim=innerdim, outdim=numrels) m = Seq2Idx(enc, dec) m = m.train([traindata], traingold).adagrad(lr=lr).l2(wreg).grad_total_norm(1.0).cross_entropy()\ .validate_on([validdata], validgold).accuracy().cross_entropy().takebest()\ .train(numbats=numbats, epochs=epochs) pred = m.predict(testdata) print pred.shape evalres = evaluate(np.argmax(pred, axis=1), testgold) print str(evalres) + "%"
def run(epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        embdim=100,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        mem=False,
        membidir=False,
        memlayers=1,
        layers=1,
        testfirst=False,
        rankingloss=False,
        rlmargin=1.,
        charlevel=False,
        pool=False,
        resultsave=False,
        resultsavep="subjdetns.res.pkl",
        ):
    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat \
        = readdata(datap, charlevel)
    print entmat.shape
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape
    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [innerdim / 2] * layers
    else:
        encinnerdim = [innerdim] * layers

    # question representation:
    # encodes question sequence to vector
    # let's try to embed chars too <--
    embdim = None if charlevel else embdim
    qenc = SimpleSeq2Vec(indim=numwords, inpembdim=embdim, innerdim=encinnerdim,
                         maskid=-1, bidir=bidir, pool=pool)

    # entity representation:
    if mem:
        # encodes label to vector
        if membidir:
            innerdim = [innerdim / 2] * memlayers
        else:
            innerdim = [innerdim] * memlayers
        memembdim = embdim
        #embed chars too <--
        meminpemb = None if charlevel else qenc.inpemb      # share embeddings
        #memembdim = None if charlevel else memembdim
        meminpemb = qenc.inpemb     # also chars are embedded and embeddings are always shared
        lenc = SimpleSeq2Vec(indim=numwords, inpembdim=memembdim, inpemb=meminpemb,
                             innerdim=innerdim, maskid=-1, bidir=membidir)
    else:
        # embeds entity id to vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)     # batched dot

    # trainer config preparation
    class PreProcf(object):
        def __init__(self, entmat):
            self.em = Val(entmat)       # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):    # gold: idx^(batsize, )
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):    # gold: idx^(batsize,)
            return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32")

    if testfirst:
        eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                            metrics=[ClassAccuracy(), RecallAt(5)])
        evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat))
        for e in evalres:
            print e
        tt.msg("tested dummy")
        sys.exit()
    #embed()

    # trainer config and training
    obj = lambda p, n: n - p
    if rankingloss:
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.infty)

    nscorer = scorer.nstrain([traindata, traingold]).transform(PreProcf(entmat))\
        .negsamplegen(NegIdxGen(numents)).negrate(negrate).objective(obj)\
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)

    # evaluation
    eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                        metrics=[ClassAccuracy(), RecallAt(1), RecallAt(2),
                                 RecallAt(5), RecallAt(10)])
    evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat),
                        savep=None if not resultsave else resultsavep)
    for evalre in evalres:
        print evalre
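# Illustrative sketch (not from the original script): the two ranking
# objectives passed to .objective() above, rendered on plain numpy arrays.
# `pos`/`neg` stand in for scores of gold and negatively sampled pairs and
# are hypothetical names.
import numpy as np

pos = np.array([0.9, 0.2, 0.6])     # scores of (question, gold entity)
neg = np.array([0.1, 0.5, 0.7])     # scores of (question, sampled entity)
rlmargin = 1.

plain = neg - pos                                   # obj = lambda p, n: n - p
ranking = (neg - pos + rlmargin).clip(0, np.infty)  # margin ranking loss
print plain.mean(), ranking.mean()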
def run(epochs=10,
        numbats=100,
        numsam=10000,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.wordchar.pkl",
        embdim=50,
        encdim=50,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        keepmincount=5,
        sameenc=False,
        memaddr="dot",
        memattdim=100,
        layers=1,
        embtrainfrac=0.0,
        mem=False,
        membidir=False,
        memlayers=1,
        sharedwordenc=False):
    """ Memory match-based glove-based word-level relation classification """
    (traindata, traingold), (validdata, validgold), (testdata, testgold), worddic, chardic, entdic \
        = readdata(datap)

    # get words from relation names, update word dic
    memdata = getmemdata(entdic, worddic, chardic)

    # get glove and transform word mats to glove index space
    d2g, newdic, glove = getdic2glove(worddic, dim=embdim, trainfrac=embtrainfrac)
    traindata, validdata, testdata, memdata = \
        [np.concatenate([np.vectorize(d2g)(x[..., 0]).reshape(x.shape[:2] + (1,)),
                         x[..., 1:]], axis=2)
         for x in [traindata, validdata, testdata, memdata]]

    print traindata.shape, testdata.shape
    #embed()

    numwords = max(worddic.values()) + 1    # don't use this, use glove
    numchars = max(chardic.values()) + 1
    numrels = max(entdic.values()) + 1

    if bidir:
        encinnerdim = [innerdim / 2] * layers
    else:
        encinnerdim = [innerdim] * layers

    wordemb = WordEncoderPlusGlove(numchars=numchars, encdim=encdim, embdim=embdim,
                                   maskid=-1, embtrainfrac=embtrainfrac)
    rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim, encinnerdim, bidir=bidir)
    enc = Seq2Vec(wordemb, rnn, maskid=-1)

    if mem:
        memembdim = embdim
        memencdim = encdim
        if membidir:
            innerdim = [innerdim / 2] * memlayers
        else:
            innerdim = [innerdim] * memlayers
        if not sharedwordenc:
            memwordemb = WordEncoderPlusGlove(numchars=numchars, encdim=encdim,
                                              embdim=embdim, maskid=-1,
                                              embtrainfrac=embtrainfrac)
        else:
            memwordemb = wordemb
        memrnn, memlastdim = SimpleSeq2Vec.makernu(memembdim + memencdim,
                                                   innerdim, bidir=membidir)
        memenc = Seq2Vec(memwordemb, memrnn, maskid=-1)
        if memaddr is None or memaddr == "dot":
            memaddr = DotMemAddr
        elif memaddr == "lin":
            memaddr = LinearGateMemAddr
        dec = MemVec2Idx(memenc, memdata, memdim=innerdim, memaddr=memaddr,
                         memattdim=memattdim)
    else:
        dec = SimpleVec2Idx(indim=innerdim, outdim=numrels)

    m = Seq2Idx(enc, dec)

    m = m.train([traindata], traingold).adagrad(lr=lr).l2(wreg).grad_total_norm(1.0).cross_entropy()\
        .validate_on([validdata], validgold).accuracy().cross_entropy().takebest()\
        .train(numbats=numbats, epochs=epochs)

    pred = m.predict(testdata)
    print pred.shape
    evalres = evaluate(np.argmax(pred, axis=1), testgold)
    print str(evalres) + "%"
def run( epochs=50, mode="char", # "char" or "word" or "charword" numbats=1000, lr=0.1, wreg=0.000001, bidir=False, layers=1, encdim=200, decdim=200, embdim=100, negrate=1, margin=1., hingeloss=False, debug=False, preeval=False, sumhingeloss=False, checkdata=False, # starts interactive shell for data inspection printpreds=False, subjpred=False, predpred=False, specemb=-1, usetypes=False, evalsplits=50, cosine=False, loadmodel=False, ): if debug: # debug settings sumhingeloss = True numbats = 10 lr = 0.02 epochs = 10 printpreds = True whatpred = "all" if whatpred == "pred": predpred = True elif whatpred == "subj": subjpred = True preeval = True #specemb = 100 margin = 1. evalsplits = 1 #usetypes=True #mode = "charword" #checkdata = True # load the right file maskid = -1 tt = ticktock("script") specids = specemb > 0 tt.tick() (traindata, traingold), (validdata, validgold), (testdata, testgold), \ worddic, entdic, entmat, relstarts, canids, wordmat, chardic\ = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True, usetypes=usetypes, maskid=maskid) entmat = entmat.astype("int32") if checkdata: rwd = {v: k for k, v in worddic.items()} red = {v: k for k, v in entdic.items()} def p(xids): return (" " if mode == "word" else "").join( [rwd[xid] if xid > -1 else "" for xid in xids]) embed() print traindata.shape, traingold.shape, testdata.shape, testgold.shape tt.tock("data loaded") numwords = max(worddic.values()) + 1 numents = max(entdic.values()) + 1 print "%d words, %d entities" % (numwords, numents) if bidir: encinnerdim = [encdim / 2] * layers else: encinnerdim = [encdim] * layers memembdim = embdim memlayers = layers membidir = bidir if membidir: decinnerdim = [decdim / 2] * memlayers else: decinnerdim = [decdim] * memlayers emb = VectorEmbed(numwords, embdim) subjenc = EntEnc( SimpleSeq2Vec(invocsize=numwords, inpembdim=embdim, innerdim=decinnerdim, maskid=maskid, bidir=membidir)) numentembs = len(np.unique(entmat[:, 0])) repsplit = entmat[relstarts, 0] if specids: # include vectorembedder subjenc = EntEmbEnc(subjenc, numentembs, specemb) predenc = VectorEmbed(indim=numents - relstarts + 1, dim=subjenc.outdim, init="zero") entenc = CustomEntEnc(subjenc, predenc, repsplit) inpenc = CustomSeq2Pair(inpemb=emb, encdim=encinnerdim, scadim=encinnerdim, enclayers=layers, scalayers=layers, bidir=bidir, maskid=maskid, outdim=subjenc.outdim) # adjust params for enc/dec construction # encinnerdim[-1] += specemb # innerdim[-1] += specemb dist = DotDistance() if not cosine else CosineDistance() scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist} if sumhingeloss: scorerkwargs["aggregator"] = lambda x: x # no aggregation of scores scorer = SeqMatchScore(inpenc, entenc, **scorerkwargs) class PreProc(object): def __init__(self, entmat, wordmat=None): self.f = PreProcE(entmat) self.w = PreProcL(wordmat) if wordmat is not None else wordmat def __call__(self, encdata, decgold): # gold: idx^(batsize, seqlen) if self.w is not None: encdata = self.w(encdata)[0][0] if self.f is not None: decgold = self.f(decgold)[0][0] return (encdata, decgold), {} class PreProcE(object): def __init__(self, entmat): self.em = Val(entmat) def __call__(self, x): ret = self.em[x] return (ret, ), {} class PreProcL(object): def __init__(self, wordmat): self.em = Val(wordmat) def __call__(self, x): ret = self.em[x] return (ret, ), {} transf = PreProc(entmat) class NegIdxGen(object): def __init__(self, rng, midsplit): self.min = 0 self.max = rng self.midsplit = midsplit def __call__(self, datas, gold): 
entrand = np.random.randint(self.min, self.midsplit, (gold.shape[0], 1)) relrand = np.random.randint(self.midsplit, self.max, (gold.shape[0], 1)) ret = np.concatenate([entrand, relrand], axis=1) return datas, ret.astype("int32") #embed() obj = lambda p, n: n - p if hingeloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) if sumhingeloss: # obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1) # embed() # eval if preeval: tt.tick("pre-evaluating") s = CustomRankSearch(inpenc, entenc, scorer.s, scorer.agg, relstarts=relstarts) eval = FullRankEval() pred, scores = s.search(testdata, testgold.shape[1], candata=entmat, canids=canids, split=evalsplits, transform=transf.f, debug=printpreds) evalres = eval.eval(pred, testgold, debug=debug) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("pre-evaluated") if not loadmodel: tt.tick("training") nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \ .negsamplegen(NegIdxGen(numents, relstarts)).negrate(negrate).objective(obj) \ .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \ .validate_on([validdata, validgold]) \ .train(numbats=numbats, epochs=epochs) tt.tock("trained") scorer.save("customfullrank.scorer.save") else: scorer = SeqMatchScore.load("customfullrank.scorer.save") # eval tt.tick("evaluating") s = CustomRankSearch(inpenc, entenc, scorer.s, scorer.agg, relstarts=relstarts) eval = FullRankEval() pred, scores = s.search(testdata, testgold.shape[1], candata=entmat, canids=canids, split=evalsplits, transform=transf.f, debug=printpreds) if printpreds: print pred debugarg = "subj" if subjpred else "pred" if predpred else False evalres = eval.eval(pred, testgold, debug=debugarg) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("evaluated") # save basename = os.path.splitext(os.path.basename(__file__))[0] dirname = basename + ".results" if not os.path.exists(dirname): os.makedirs(dirname) savenamegen = lambda i: "{}/{}.res".format(dirname, i) savename = None for i in xrange(1000): savename = savenamegen(i) if not os.path.exists(savename): break savename = None if savename is None: raise Exception("exceeded number of saved results") with open(savename, "w") as f: f.write("{}\n".format(" ".join(sys.argv))) for k, evalre in evalres.items(): f.write("{}:\t{}\n".format(k, evalre))
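# Illustrative sketch (not from the original script): what NegIdxGen above
# produces when entity ids occupy [0, midsplit) and relation ids occupy
# [midsplit, numents). Toy numbers; pure numpy; all names hypothetical.
import numpy as np

midsplit, numents, batsize = 5, 10, 4
gold = np.random.randint(0, numents, (batsize, 2))
entrand = np.random.randint(0, midsplit, (batsize, 1))          # corrupted subject
relrand = np.random.randint(midsplit, numents, (batsize, 1))    # corrupted relation
negpairs = np.concatenate([entrand, relrand], axis=1).astype("int32")
print negpairs      # one corrupted (subject, relation) pair per gold pair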
def run( epochs=50, mode="char", # "char" or "word" or "charword" numbats=1000, lr=0.1, wreg=0.000001, bidir=False, layers=1, encdim=200, decdim=200, embdim=100, negrate=1, margin=1., hingeloss=False, debug=False, preeval=False, sumhingeloss=False, checkdata=False, # starts interactive shell for data inspection printpreds=False, subjpred=False, predpred=False, specemb=-1, usetypes=False, evalsplits=50, cosine=False, loadmodel=False, ): if debug: # debug settings hingeloss = True numbats = 10 lr = 0.02 epochs = 1 printpreds = True preeval = True # specemb = 100 margin = 1. evalsplits = 1 # usetypes=True mode = "charword" # checkdata = True # load the right file maskid = -1 tt = ticktock("script") specids = specemb > 0 tt.tick() (traindata, traingold), (validdata, validgold), (testdata, testgold), \ worddic, entdic, entmat, relstarts, canids, wordmat, chardic \ = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True, usetypes=usetypes, maskid=maskid) entmat = entmat.astype("int32") # transform for predpred traingold = traingold[:, 1] - relstarts validgold = validgold[:, 1] - relstarts testgold = testgold[:, 1] - relstarts if checkdata: rwd = {v: k for k, v in worddic.items()} red = {v: k for k, v in entdic.items()} def p(xids): return (" " if mode == "word" else "").join( [rwd[xid] if xid > -1 else "" for xid in xids]) embed() print traindata.shape, traingold.shape, testdata.shape, testgold.shape tt.tock("data loaded") numwords = max(worddic.values()) + 1 numents = max(entdic.values()) + 1 print "%d words, %d entities" % (numwords, numents) if bidir: encinnerdim = [encdim / 2] * layers else: encinnerdim = [encdim] * layers memembdim = embdim memlayers = layers membidir = bidir if membidir: decinnerdim = [decdim / 2] * memlayers else: decinnerdim = [decdim] * memlayers emb = VectorEmbed(numwords, embdim) predemb = VectorEmbed(numents - relstarts + 1, decdim, init="uniform") inpenc = SimpleSeq2Vec(inpemb=emb, inpembdim=emb.outdim, innerdim=encinnerdim, maskid=maskid, bidir=bidir, layers=layers) dist = DotDistance() if not cosine else CosineDistance() scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist} scorer = MatchScore(inpenc, predemb, **scorerkwargs) class PreProc(object): def __init__(self, entmat, wordmat=None): self.f = PreProcE(entmat) self.w = PreProcL(wordmat) if wordmat is not None else wordmat def __call__(self, encdata, decgold): # gold: idx^(batsize, seqlen) if self.w is not None: encdata = self.w(encdata)[0][0] if self.f is not None: decgold = self.f(decgold)[0][0] return (encdata, decgold), {} class PreProcE(object): def __init__(self, entmat): self.em = Val(entmat) def __call__(self, x): ret = self.em[x] return (ret, ), {} class PreProcL(object): def __init__(self, wordmat): self.em = Val(wordmat) def __call__(self, x): ret = self.em[x] return (ret, ), {} transf = PreProc(entmat) class NegIdxGen(object): def __init__(self, rng): self.min = 0 self.max = rng def __call__(self, datas, gold): predrand = np.random.randint(self.min, self.max, (gold.shape[0], )) return datas, predrand.astype("int32") # embed() obj = lambda p, n: n - p if hingeloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) tt.tick("training") nscorer = scorer.nstrain([traindata, traingold]) \ .negsamplegen(NegIdxGen(numents - relstarts))\ .negrate(negrate).objective(obj) \ .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \ .validate_on([validdata, validgold]) \ .train(numbats=numbats, epochs=epochs) tt.tock("trained") # eval canids = np.arange(start=0, stop=numents - relstarts) 
predembs = predemb.predict(canids) # (numrels, embdim) tt.tick("evaluating") predencs = inpenc.predict(testdata) # (batsize, embdim) scores = np.zeros((predencs.shape[0], predembs.shape[0])) for i in range(predencs.shape[0]): scores[i, :] = \ scorer.s.predict(np.repeat(predencs[np.newaxis, i], predembs.shape[0], axis=0), predembs) tt.progress(i, predencs.shape[0], live=True) best = np.argmax(scores, axis=1) sortedbest = [ sorted(zip(np.arange(scores.shape[1]), list(scores[i])), reverse=True, key=lambda (x, y): y) for i in range(scores.shape[0]) ] sortedbestmat = np.array([[x for (x, y) in z] for z in sortedbest], dtype="int32") # MRR mrr = 0.0 for i in range(sortedbestmat.shape[1]): mrr += np.sum(sortedbestmat[:, i] == testgold) * 1. / (i + 1) mrr /= testgold.shape[0] # Accuracy accuracy = np.sum(best == testgold) * 1. / testgold.shape[0] # R@X def ratx(ratnum): return rat(ratnum, sortedbestmat, testgold) def rat(ratnum, sortedpred, gold): acc = 0.0 for i in range(min(ratnum, sortedbestmat.shape[1])): acc += 1.0 * np.sum(sortedpred[:, i] == gold) acc /= testgold.shape[0] return acc print "Accuracy: {}%".format(accuracy * 100) print "MRR: {}".format(mrr) print "Recall: @10: {}%\t @50: {}%\t @100: {}%".format( ratx(10) * 100, ratx(50) * 100, ratx(100) * 100) embed() tt.tock("evaluated")
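# Illustrative sketch (not from the original script): the MRR and R@X
# computations above on a toy score matrix, so the column-wise counting trick
# is easier to follow. Pure numpy; all names hypothetical.
import numpy as np

scores = np.array([[0.1, 0.9, 0.3],     # gold = 1, ranked 1st -> RR = 1
                   [0.8, 0.2, 0.5]])    # gold = 1, ranked 3rd -> RR = 1/3
gold = np.array([1, 1])
sortedbestmat = np.argsort(-scores, axis=1)     # candidate ids, best first

mrr = 0.0
for i in range(sortedbestmat.shape[1]):
    mrr += np.sum(sortedbestmat[:, i] == gold) * 1. / (i + 1)
mrr /= gold.shape[0]

recall_at_2 = np.sum(sortedbestmat[:, :2] == gold[:, np.newaxis]) * 1. / gold.shape[0]
print mrr, recall_at_2      # 0.666..., 0.5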
def run(epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        embdim=50,
        encdim=50,
        wreg=0.00005,
        marginloss=False,
        margin=1.,
        cosine=False,
        bidir=False,
        ):
    tt = ticktock("script")

    # get glove words
    g = Glove(encdim)
    words = g.D.keys()
    maxwordlen = 0
    for word in words:
        maxwordlen = max(maxwordlen, len(word))
    chars = set("".join(words))
    chars.add(" ")
    print "{} words, maxlen {}, {} characters in words".format(
        len(words), maxwordlen, len(chars))

    # get char word matrix
    chardic = dict(zip(chars, range(len(chars))))
    pickle.dump(chardic, open("glove2c2w.chardic.pkl", "w"))
    charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
    charwordmat[0, 0] = chardic[" "]
    for i in range(0, len(words)):
        word = words[i]
        charwordmat[i + 1, :len(word)] = [chardic[x] for x in word]
    print charwordmat[0]

    # encode characters
    cwenc = SimpleSeq2Vec(indim=len(chars), inpembdim=embdim,
                          innerdim=encdim / 2 if bidir else encdim,
                          maskid=-1, bidir=bidir)
    dist = CosineDistance() if cosine else EuclideanDistance()  # DotDistance()
    print "using " + str(dist)
    scorer = MatchScore(cwenc, g.block, scorer=dist)

    '''
    scorer.train([charwordmat, np.arange(len(words) + 1)],
                 np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\
        .linear_objective().adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)
    #embed()
    '''

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            return datas, np.random.randint(self.min, self.max,
                                            gold.shape).astype("int32")

    if marginloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p

    nscorer = scorer.nstrain([charwordmat, np.arange(len(words) + 1)])\
        .negsamplegen(NegIdxGen(len(words))).negrate(negrate)\
        .objective(obj).adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)

    cwenc.save("glove2c2w.block")
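# Illustrative sketch (not from the original script): the padded char-index
# matrix built above, on three toy words. -1 is the mask filler and row 0
# holds the space character. Pure numpy; all names hypothetical.
import numpy as np

words = ["cat", "door", "a"]
chars = sorted(set("".join(words)) | {" "})
chardic = dict(zip(chars, range(len(chars))))
maxwordlen = max(len(w) for w in words)

charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
charwordmat[0, 0] = chardic[" "]
for i, word in enumerate(words):
    charwordmat[i + 1, :len(word)] = [chardic[c] for c in word]
print charwordmat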
def run(epochs=50,
        numbats=700,
        lr=1.,
        wreg=0.000001,
        bidir=False,
        layers=1,
        embdim=200,
        encdim=400,
        decdim=400,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        checkdata=False,
        predencode=False,
        closenegsam=False,
        glove=False,
        atleastcan=0,
        wordchar=False,
        charencmode="rnn",      # "rnn" or "cnn"
        totalrandomtest=False,
        rarewords=0,
        ):
    maskid = -1
    tt = ticktock("predpred")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat, testsubjsrels = readdata(wordchar=wordchar)
    if closenegsam:
        revsamplespace, revind = buildsamplespace(entmat, worddic)
    tt.tock("data loaded")

    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def pp(widxs):
            print " ".join([rwd[x] if x in rwd else "" for x in widxs])
        embed()

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1

    if rarewords > 0:
        rwd = {v: k for k, v in worddic.items()}
        print "doing rare words"
        trainwordcounts = getmatrixvaluecounts(traindata, entmat)
        stwc = sorted(trainwordcounts.items(), key=lambda (x, y): y, reverse=True)
        fstwc = filter(lambda (x, y): y > rarewords, stwc)
        redwdic = dict(zip([rwd[k] for k, v in fstwc if k != maskid and k in rwd],
                           range(1, len(fstwc) + 1)))
        redwdic["<RARE>"] = 0
        #embed()

    if bidir:
        encdim = [encdim / 2] * layers
    else:
        encdim = [encdim] * layers

    # question-side model
    if glove:
        if rarewords > 0:
            raise Exception("glove with rare words currently not supported")
        wordemb = Glove(embdim).adapt(worddic)
    else:
        if rarewords > 0:
            wordemb = WordEmb(dim=embdim, worddic=redwdic).adapt(worddic)
            #embed()
        else:
            wordemb = WordEmb(dim=embdim, worddic=worddic)

    if wordchar:
        print "wordchar model"
        numchars = 256
        if charencmode == "cnn":
            print "using CNN char encoder"
            charenc = CNNSeqEncoder(indim=numchars, inpembdim=50,
                                    innerdim=[embdim] * 2, maskid=maskid,
                                    stride=1)
            wordenc = RNNSeqEncoder(inpemb=False,
                                    inpembdim=wordemb.outdim + embdim,
                                    innerdim=encdim,
                                    bidir=bidir).maskoptions(MaskMode.NONE)
            question_enc = TwoLevelEncoder(l1enc=charenc, l2emb=wordemb,
                                           l2enc=wordenc, maskid=maskid)
        else:
            question_enc = WordCharSentEnc(numchars=256, charembdim=50,
                                           charinnerdim=embdim, wordemb=wordemb,
                                           wordinnerdim=encdim, maskid=maskid,
                                           bidir=bidir)
    else:
        question_enc = SimpleSeq2Vec(inpemb=wordemb, inpembdim=wordemb.outdim,
                                     innerdim=encdim, maskid=maskid,
                                     bidir=bidir, layers=layers)

    # predicate-side model
    if predencode:
        predemb = MemVec(SimpleSeq2Vec(inpemb=wordemb, inpembdim=wordemb.outdim,
                                       innerdim=decdim, maskid=maskid,
                                       bidir=bidir, layers=layers))
        predemb.load(entmat)
        """
        predemb = SimpleSeq2Vec(inpemb=wordemb, inpembdim=wordemb.outdim,
                                innerdim=decdim, maskid=maskid,
                                bidir=bidir, layers=layers)

        class PreProc(object):
            def __init__(self, entmat):
                self.f = PreProcE(entmat)

            def __call__(self, encdata, decgold):
                return (encdata, self.f(decgold)[0][0]), {}

        class PreProcE(object):
            def __init__(self, entmat):
                self.em = Val(entmat)

            def __call__(self, x):
                return (self.em[x],), {}

        transf = PreProc(entmat)
        predtransf = transf.f
        """
    else:
        predemb = VectorEmbed(numents, decdim)
        """transf = None
        predtransf = None"""

    # scoring
    scorer = MatchScore(question_enc, predemb, scorer=CosineDistance())

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            predrand = np.random.randint(self.min, self.max, gold.shape)
            return datas, predrand.astype("int32")

    class NegIdxGenClose(object):
        def __init__(self, revsamsp, rng):
            self.revsamsp = revsamsp
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            # sample a "close" negative from the gold's sample set when it is
            # big enough, otherwise fall back to a uniformly random id
            ret = np.zeros_like(gold)
            for i in range(gold.shape[0]):
                sampleset = self.revsamsp[gold[i]]
                if len(sampleset) > 5:
                    ret[i] = random.sample(sampleset, 1)[0]
                else:
                    ret[i] = np.random.randint(self.min, self.max)
            #embed()
            return datas, ret.astype("int32")

    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p

    if closenegsam:
        tt.msg("using close neg sampler")
        negidxgen = NegIdxGenClose(revsamplespace, numents)
    else:
        negidxgen = NegIdxGen(numents)

    checkembschange = True
    if checkembschange:
        #embed()
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        embvals = embvar.d.get_value()

    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingold]) \
        .negsamplegen(negidxgen) \
        .negrate(negrate) \
        .objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")

    if checkembschange:
        embvar = wordemb.W
        if embvar is None:
            if hasattr(wordemb, "inner"):
                embvar = wordemb.inner.W
            else:
                raise Exception("no clue where to find embedding values")
        newembvals = embvar.d.get_value()
        embschanged = not np.allclose(embvals, newembvals)
        sumsqdiff = np.sum((newembvals - embvals) ** 2)
        print "Embeddings {}: {} sum of square diffs"\
            .format("changed" if embschanged else "did not change", sumsqdiff)

    # evaluation
    tt.tick("evaluating")
    qenc_pred = question_enc.predict(testdata)
    scores = []
    dontembed = True
    if atleastcan > 0:
        print "ensuring at least {} cans".format(atleastcan)
    if totalrandomtest:
        print "total randomness"
    for i in range(qenc_pred.shape[0]):
        if totalrandomtest:
            cans = [testgold[i]]
        else:
            cans = testsubjsrels[i][0]  #+ testsubjsrels[i][1]
        if len(cans) < atleastcan:
            extracans = list(np.random.randint(0, numents, (atleastcan + 50,)))
            extracans = list(set(extracans).difference(set(cans)))
            cans = cans + extracans[:max(0, min(len(extracans),
                                                atleastcan - len(cans)))]
            #print len(cans), cans
            if not dontembed:
                embed()
        #cans = set(cans)
        #if atleastcan > 0:
        #    while len(cans) < atleastcan:
        #        rancan = np.random.randint(0, numents)
        #        if rancan not in cans:
        #            cans.add(rancan)
        #cans = list(cans)
        if len(cans) == 0:
            scores.append([(-1, -np.infty)])
            continue
        #canembs = predemb.predict.transform(predtransf)(cans)
        canembs = predemb.predict(cans)
        scoresi = scorer.s.predict(np.repeat(qenc_pred[np.newaxis, i],
                                             canembs.shape[0], axis=0),
                                   canembs)
        scores.append(zip(cans, scoresi))
        if debug:
            embed()
        tt.progress(i, qenc_pred.shape[0], live=True)

    sortedbest = [sorted(cansi, key=lambda (x, y): y, reverse=True)
                  for cansi in scores]
    best = np.array([sortedbesti[0][0] for sortedbesti in sortedbest])

    # Accuracy
    accuracy = np.sum(best == testgold) * 1. / testgold.shape[0]
    print("Accuracy: {}%".format(accuracy * 100))
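# Illustrative sketch (not from the original script): the fallback logic of
# NegIdxGenClose above. `revsamsp` maps a gold id to ids considered "close";
# the toy dict here is hypothetical.
import random
import numpy as np

revsamsp = {0: [3, 4, 5, 6, 7, 8], 1: [2]}      # id 1 has too few close ids
numents = 10
gold = np.array([0, 1])
neg = np.zeros_like(gold)
for i in range(gold.shape[0]):
    sampleset = revsamsp[gold[i]]
    if len(sampleset) > 5:
        neg[i] = random.sample(sampleset, 1)[0]     # close negative
    else:
        neg[i] = np.random.randint(0, numents)      # uniform fallback
print neg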
def run( epochs=50, mode="char", # "char" or "word" or "charword" numbats=1000, lr=0.1, wreg=0.000001, bidir=False, layers=1, encdim=200, decdim=200, embdim=100, negrate=1, margin=1., hingeloss=False, debug=False, preeval=False, sumhingeloss=False, checkdata=False, # starts interactive shell for data inspection printpreds=False, subjpred=False, predpred=False, specemb=-1, balancednegidx=False, usetypes=False, evalsplits=50, relembrep=False, ): if debug: # debug settings sumhingeloss = True numbats = 10 lr = 0.02 epochs = 10 printpreds = True whatpred = "all" if whatpred == "pred": predpred = True elif whatpred == "subj": subjpred = True #preeval = True specemb = 100 margin = 1. balancednegidx = True evalsplits = 1 relembrep = True #usetypes=True #mode = "charword" #checkdata = True # load the right file maskid = -1 tt = ticktock("script") specids = specemb > 0 tt.tick() (traindata, traingold), (validdata, validgold), (testdata, testgold), \ worddic, entdic, entmat, relstarts, canids, wordmat, chardic\ = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True, usetypes=usetypes, maskid=maskid) entmat = entmat.astype("int32") #embed() if subjpred is True and predpred is False: traingold = traingold[:, [0]] validgold = validgold[:, [0]] testgold = testgold[:, [0]] if predpred is True and subjpred is False: traingold = traingold[:, [1]] validgold = validgold[:, [1]] testgold = testgold[:, [1]] if checkdata: rwd = {v: k for k, v in worddic.items()} red = {v: k for k, v in entdic.items()} def p(xids): return (" " if mode == "word" else "").join( [rwd[xid] if xid > -1 else "" for xid in xids]) embed() print traindata.shape, traingold.shape, testdata.shape, testgold.shape tt.tock("data loaded") # *data: matrix of word ids (-1 filler), example per row # *gold: vector of true entity ids # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold # *dic: from word/ent-fbid to integer id, as used in data numwords = max(worddic.values()) + 1 numents = max(entdic.values()) + 1 print "%d words, %d entities" % (numwords, numents) if bidir: encinnerdim = [encdim / 2] * layers else: encinnerdim = [encdim] * layers memembdim = embdim memlayers = layers membidir = bidir if membidir: decinnerdim = [decdim / 2] * memlayers else: decinnerdim = [decdim] * memlayers entenc = EntEnc( SimpleSeq2Vec(indim=numwords, inpembdim=memembdim, innerdim=decinnerdim, maskid=maskid, bidir=membidir)) numentembs = len(np.unique(entmat[:, 0])) if specids: # include vectorembedder entenc = EntEmbEnc(entenc, numentembs, specemb) if relembrep: repsplit = entmat[relstarts, 0] entenc = EntEncRep(entenc, numentembs, repsplit) # adjust params for enc/dec construction #encinnerdim[-1] += specemb #innerdim[-1] += specemb encdec = SimpleSeqEncDecAtt(inpvocsize=numwords, inpembdim=embdim, encdim=encinnerdim, bidir=bidir, outembdim=entenc, decdim=decinnerdim, vecout=True, statetrans="matdot") scorerargs = ([encdec, SeqUnroll(entenc)], { "argproc": lambda x, y, z: ((x, y), (z, )), "scorer": GenDotDistance(decinnerdim[-1], entenc.outdim) }) if sumhingeloss: scorerargs[1]["aggregator"] = lambda x: x # no aggregation of scores scorer = SeqMatchScore(*scorerargs[0], **scorerargs[1]) #scorer.save("scorer.test.save") # TODO: below this line, check and test class PreProc(object): def __init__(self, entmat): self.f = PreProcE(entmat) def __call__(self, encdata, decsg, decgold): # gold: idx^(batsize, seqlen) return (encdata, self.f(decsg), self.f(decgold)), {} class PreProcE(object): def __init__(self, entmat): 
self.em = Val(entmat) def __call__(self, x): return self.em[x] transf = PreProc(entmat) class NegIdxGen(object): def __init__(self, rng, midsplit=None): self.min = 0 self.max = rng self.midsplit = midsplit def __call__( self, datas, sgold, gold ): # the whole target sequence is corrupted, corruption targets the whole set of entities and relations together if self.midsplit is None or not balancednegidx: return datas, sgold, np.random.randint( self.min, self.max, gold.shape).astype("int32") else: entrand = np.random.randint(self.min, self.midsplit, gold.shape) relrand = np.random.randint(self.midsplit, self.max, gold.shape) mask = np.random.randint(0, 2, gold.shape) ret = entrand * mask + relrand * (1 - mask) return datas, sgold, ret.astype("int32") obj = lambda p, n: n - p if hingeloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) if sumhingeloss: # obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1) traingoldshifted = shiftdata(traingold) validgoldshifted = shiftdata(validgold) #embed() # eval if preeval: tt.tick("pre-evaluating") s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg) eval = FullRankEval() pred, scores = s.decode(testdata, testgold.shape[1], candata=entmat, canids=canids, split=evalsplits, transform=transf.f, debug=printpreds) evalres = eval.eval(pred, testgold, debug=debug) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("pre-evaluated") negidxgenargs = ([numents], {"midsplit": relstarts}) if debug: pass #negidxgenargs = ([numents], {}) tt.tick("training") nscorer = scorer.nstrain([traindata, traingoldshifted, traingold]).transform(transf) \ .negsamplegen(NegIdxGen(*negidxgenargs[0], **negidxgenargs[1])).negrate(negrate).objective(obj) \ .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \ .validate_on([validdata, validgoldshifted, validgold]) \ .train(numbats=numbats, epochs=epochs) tt.tock("trained") #scorer.save("scorer.test.save") # eval tt.tick("evaluating") s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg) eval = FullRankEval() pred, scores = s.decode(testdata, testgold.shape[1], candata=entmat, canids=canids, split=evalsplits, transform=transf.f, debug=printpreds) if printpreds: print pred debugarg = "subj" if subjpred else "pred" if predpred else False evalres = eval.eval(pred, testgold, debug=debugarg) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("evaluated") # save basename = os.path.splitext(os.path.basename(__file__))[0] dirname = basename + ".results" if not os.path.exists(dirname): os.makedirs(dirname) savenamegen = lambda i: "{}/{}.res".format(dirname, i) savename = None for i in xrange(1000): savename = savenamegen(i) if not os.path.exists(savename): break savename = None if savename is None: raise Exception("exceeded number of saved results") with open(savename, "w") as f: f.write("{}\n".format(" ".join(sys.argv))) for k, evalre in evalres.items(): f.write("{}:\t{}\n".format(k, evalre))
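# Illustrative sketch (not from the original script): the balanced corruption
# in NegIdxGen above, which flips a coin per position between an entity id
# (below midsplit) and a relation id (at or above it). Toy numbers; pure numpy.
import numpy as np

midsplit, numids = 5, 10
goldshape = (4, 2)
entrand = np.random.randint(0, midsplit, goldshape)
relrand = np.random.randint(midsplit, numids, goldshape)
mask = np.random.randint(0, 2, goldshape)
negs = (entrand * mask + relrand * (1 - mask)).astype("int32")
print negs      # mix of entity and relation ids, roughly half-half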
def run( negsammode="closest", # "close" or "random" usetypes=True, mode="concat", # "seq" or "concat" or "multi" or "multic" or "bino" glove=True, embdim=100, charencdim=100, charembdim=50, encdim=400, bidir=False, layers=1, charenc="rnn", # "cnn" or "rnn" margin=0.5, lr=0.1, numbats=700, epochs=15, gradnorm=1.0, wreg=0.0001, loadmodel="no", debug=False, debugtest=False, forcesubjincl=False, randsameval=0, numtestcans=5, multiprune=-1, checkdata=False, testnegsam=False, testmodel=False, sepcharembs=False, ): tt = ticktock("script") tt.tick("loading data") (traindata, traingold), (validdata, validgold), (testdata, testgold), \ (subjmat, relmat), (subjdic, reldic), worddic, \ subjinfo, (testsubjcans, relsperent) = readdata(debug=debug, numtestcans=numtestcans if numtestcans > 0 else None) if usetypes: print "building type matrix" typmat = buildtypmat(subjmat, subjinfo, worddic) subjmat = np.concatenate([typmat, subjmat], axis=1) typlen = typmat.shape[1] relsamplespace = None subjsamplespace = None if negsammode == "closest" or negsammode == "close": relsamplespace, revind = buildrelsamplespace(relmat, worddic) subjsamplespace = loadsubjsamplespace() tt.tock("data loaded") if checkdata: embed() numwords = max(worddic.values()) + 1 numsubjs = max(subjdic.values()) + 1 numrels = max(reldic.values()) + 1 maskid = -1 numchars = 256 nsrelsperent = relsperent if negsammode == "closest" else None if testnegsam: nig = NegIdxGen(numsubjs - 1, numrels - 1, relclose=relsamplespace, subjclose=subjsamplespace, relsperent=nsrelsperent) embed() if mode == "seq" or mode == "multi": decdim = encdim elif mode == "concat" or mode == "multic" or mode == "bino": decdim = encdim / 2 else: raise Exception("unrecognized mode") print "{} mode: {} decdim".format(mode, decdim) # defining model if glove: wordemb = Glove(embdim).adapt(worddic) else: wordemb = WordEmb(dim=embdim, indim=numwords) charemb = VectorEmbed(indim=numchars, dim=charembdim) charemb2 = VectorEmbed(indim=numchars, dim=charembdim) if charenc == "cnn": print "using CNN char encoder" charenc = CNNSeqEncoder(inpemb=charemb, innerdim=[charencdim] * 2, maskid=maskid, stride=1) elif charenc == "rnn": print "using RNN char encoder" charenc = RNNSeqEncoder(inpemb=charemb, innerdim=charencdim) \ .maskoptions(maskid, MaskMode.AUTO) else: raise Exception("no other character encoding modes available") if bidir: encdim = encdim / 2 if mode != "bino": if mode == "multi" or mode == "multic": wordenc = \ SimpleSeq2MultiVec(inpemb=False, inpembdim=wordemb.outdim + charencdim, innerdim=encdim, bidir=bidir, numouts=2, mode="seq") else: encdim = [encdim] * layers wordenc = RNNSeqEncoder(inpemb=False, inpembdim=wordemb.outdim + charencdim, innerdim=encdim, bidir=bidir).maskoptions(MaskMode.NONE) question_encoder = TwoLevelEncoder(l1enc=charenc, l2emb=wordemb, l2enc=wordenc, maskid=maskid) else: question_encoder = BinoEncoder(charenc=charenc, wordemb=wordemb, maskid=maskid, scadim=100, encdim=encdim / 2, bidir=bidir, enclayers=layers, outdim=decdim, scabidir=True) # encode predicate on word level predemb = SimpleSeq2Vec(inpemb=wordemb, innerdim=decdim, maskid=maskid, bidir=False, layers=1) #predemb.load(relmat) scharemb = charemb2 if sepcharembs else charemb if usetypes: # encode subj type on word level subjtypemb = SimpleSeq2Vec(inpemb=wordemb, innerdim=int(np.ceil(decdim * 1. / 2)), maskid=maskid, bidir=False, layers=1) # encode subject on character level charbidir = True charencinnerdim = int(np.floor(decdim * 1. 
/ 2)) charenclayers = 1 if charbidir: charencinnerdim /= 2 charenclayers = 2 subjemb = SimpleSeq2Vec(inpemb=scharemb, innerdim=charencinnerdim, maskid=maskid, bidir=charbidir, layers=charenclayers) subjemb = TypedSubjBlock(typlen, subjemb, subjtypemb) else: # encode subject on character level subjemb = SimpleSeq2Vec(inpemb=scharemb, innerdim=decdim, maskid=maskid, bidir=False, layers=1) #subjemb.load(subjmat) if testmodel: embed() # package if mode == "seq": lb = SeqLeftBlock(question_encoder) rb = RightBlock(subjemb, predemb) elif mode == "concat": lb = ConcatLeftBlock(question_encoder) rb = RightBlock(subjemb, predemb) elif mode == "multi" or mode == "multic": lb = MultiLeftBlock(question_encoder, mode) rb = RightBlock(subjemb, predemb) elif mode == "bino": lb = question_encoder rb = RightBlock(subjemb, predemb) else: raise Exception("unrecognized mode") scorer = SeqMatchScore(lb, rb, scorer=CosineDistance(), aggregator=lambda x: x, argproc=lambda x, y, z: ((x, ), (y, z))) obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1) class PreProc(object): def __init__(self, subjmat, relmat): self.ef = PreProcEnt(subjmat) self.rf = PreProcEnt(relmat) def __call__(self, data, gold): # gold: idxs-(batsize, 2) st = self.ef(gold[:, 0])[0][0] rt = self.rf(gold[:, 1])[0][0] return (data, st, rt), {} class PreProcE(object): def __init__(self, subjmat, relmat): self.ef = PreProcEnt(subjmat) self.rf = PreProcEnt(relmat) def __call__(self, x): subjslice = self.ef(x[:, 0])[0][0] relslice = self.rf(x[:, 1])[0][0] return (subjslice, relslice), {} class PreProcEnt(object): def __init__(self, mat): self.entmat = Val(mat) def __call__(self, x): return (self.entmat[x], ), {} transf = PreProc(subjmat, relmat) if debug: embed() if epochs > 0 and loadmodel == "no": tt.tick("training") saveid = "".join([str(np.random.randint(0, 10)) for i in range(4)]) print("CHECKPOINTING AS: {}".format(saveid)) nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \ .negsamplegen(NegIdxGen(numsubjs-1, numrels-1, relclose=relsamplespace, subjclose=subjsamplespace, relsperent=nsrelsperent)) \ .objective(obj).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm) \ .validate_on([validdata, validgold]) \ .autosavethis(scorer, "fullrank{}.model".format(saveid)) \ .train(numbats=numbats, epochs=epochs) tt.tock("trained").tick() # saving #scorer.save("fullrank{}.model".format(saveid)) print("SAVED AS: {}".format(saveid)) if loadmodel is not "no": tt.tick("loading model") m = SeqMatchScore.load("fullrank{}.model".format(loadmodel)) #embed() lb = m.l subjemb = m.r.subjenc predemb = m.r.predenc tt.tock("loaded model") # evaluation predictor = CustomPredictor( questionencoder=lb, entityencoder=subjemb, relationencoder=predemb, #mode=mode, enttrans=transf.ef, reltrans=transf.rf, debug=debugtest, subjinfo=subjinfo) tt.tick("predicting") if forcesubjincl: # forces the intended subject entity to be among candidates for i in range(len(testsubjcans)): if testgold[i, 0] not in testsubjcans[i]: testsubjcans[i].append(testgold[i, 0]) if randsameval > 0: # generate random sampling eval data testsubjcans = np.random.randint(0, numsubjs, (testgold.shape[0], randsameval)) testrelcans = np.random.randint(0, numrels, (testgold.shape[0], randsameval)) testsubjcans = np.concatenate([testgold[:, 0:1], testsubjcans], axis=1) testrelcans = np.concatenate([testgold[:, 1:2], testrelcans], axis=1) testsubjcans = testsubjcans.tolist() testrelcans = testrelcans.tolist() prediction = predictor.predict(testdata, entcans=testsubjcans, 
relcans=testrelcans) else: prediction = predictor.predict(testdata, entcans=testsubjcans, relsperent=relsperent, multiprune=multiprune) tt.tock("predicted") tt.tick("evaluating") evalmat = prediction == testgold subjacc = np.sum(evalmat[:, 0]) * 1. / evalmat.shape[0] predacc = np.sum(evalmat[:, 1]) * 1. / evalmat.shape[0] totalacc = np.sum(np.sum(evalmat, axis=1) == 2) * 1. / evalmat.shape[0] print "Test results ::::::::::::::::" print "Total Acc: \t {}".format(totalacc) print "Subj Acc: \t {}".format(subjacc) print "Pred Acc: \t {}".format(predacc) tt.tock("evaluated") def subjinspect(subjrank, gold): ret = [ (("GOLD - " if gold == x else " ") + subjinfo[x][0] + " (" + " ".join(subjinfo[x][1]) + ")" + str(subjinfo[x][3]) + " rels", y) if x in subjinfo else (x, y) for x, y in subjrank ] return ret def inspectboth(hidecorrect=False, hidenotincan=False): rwd = {v: k for k, v in worddic.items()} for i in range(len(predictor.subjranks)): subjx = testgold[i, 0] predx = testgold[i, 1] subjrank = predictor.subjranks[i] predrank = predictor.relranks[i] if hidecorrect and subjx == subjrank[0][0] and predrank[0][ 0] == predx: continue if subjx not in [k for k, v in subjrank]: if hidenotincan: continue def inspectsubjs(hidecorrect=False, hidenotincan=False, shownotincan=False): rwd = {v: k for k, v in worddic.items()} for i in range(len(predictor.subjranks)): subjx = testgold[i, 0] subjrank = predictor.subjranks[i] if subjx == subjrank[0][0] and hidecorrect: # only look for errors continue if subjx not in [k for k, v in subjrank]: if hidenotincan: continue if shownotincan and subjx in [k for k, v in subjrank]: continue print "test question {}: {} \t GOLD: {}".format( i, wordids2string( testdata[i, :, 0], rwd), "{} ({}) - {} rels --- {}".format( *([ subjinfo[subjx][0], subjinfo[subjx][1], subjinfo[subjx][3], subjinfo[subjx][2] ] if subjx in subjinfo else ["<UNK>", "<UNK>", "<UNK>", "<UNK>"]))) inspres = subjinspect(subjrank, subjx) i = 1 for inspre in inspres: print "{}:\t{}\t{}".format(i, inspre[1], inspre[0]) if i % 50 == 0: inp() i += 1 inp() def inspectpreds(hidecorrect=False): rwd = {v: k for k, v in worddic.items()} for i in range(len(predictor.relranks)): relx = testgold[i, 1] subjx = testgold[i, 0] relrank = predictor.relranks[i] if relx == relrank[0][0] and hidecorrect: continue print "test question {}: {} \t GOLD: {}".format( i, wordids2string(testdata[i, :, 0], rwd), wordids2string(relmat[relx, :], rwd)) inspres = [(("GOLD - " if relx == x else " ") + wordids2string(relmat[x], rwd), y) for x, y in relrank] i = 1 for inspre in inspres: print "{}:\t{}\t{}".format(i, inspre[1], inspre[0]) if i % 50 == 0: inp() i += 1 inp() embed()
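# Illustrative sketch (not from the original script): the subject/predicate/
# total accuracy computation above, on a toy prediction matrix. Pure numpy;
# all values hypothetical.
import numpy as np

prediction = np.array([[3, 7], [3, 8], [4, 7]])
testgold = np.array([[3, 7], [3, 7], [3, 7]])
evalmat = prediction == testgold
subjacc = np.sum(evalmat[:, 0]) * 1. / evalmat.shape[0]                     # 2/3
predacc = np.sum(evalmat[:, 1]) * 1. / evalmat.shape[0]                     # 2/3
totalacc = np.sum(np.sum(evalmat, axis=1) == 2) * 1. / evalmat.shape[0]     # 1/3
print subjacc, predacc, totalacc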