def test_seq_scoring(self):
    """Scoring a token sequence against itself must give seqlen per sample.

    With a cosine scorer, each timestep of an identical pair scores 1.0,
    so the aggregated per-sample score equals the sequence length.
    """
    num_words, emb_dim = 100, 10
    batch_size, seq_len = 17, 5
    embedder = VectorEmbed(num_words, emb_dim)
    matcher = SeqMatchScore(SeqUnroll(embedder), SeqUnroll(embedder),
                            scorer=CosineDistance())
    inp = np.random.randint(0, num_words, (batch_size, seq_len))
    scores = matcher.predict(inp, inp)
    expected = np.ones_like(scores) * seq_len * 1.
    self.assertTrue(np.allclose(expected, scores))
def run( epochs=50, mode="char", # "char" or "word" or "charword" numbats=1000, lr=0.1, wreg=0.000001, bidir=False, layers=1, encdim=200, decdim=200, embdim=100, negrate=1, margin=1., hingeloss=False, debug=False, preeval=False, sumhingeloss=False, checkdata=False, # starts interactive shell for data inspection printpreds=False, subjpred=False, predpred=False, specemb=-1, usetypes=False, evalsplits=50, cosine=False, loadmodel=False, ): if debug: # debug settings sumhingeloss = True numbats = 10 lr = 0.02 epochs = 10 printpreds = True whatpred = "all" if whatpred == "pred": predpred = True elif whatpred == "subj": subjpred = True preeval = True #specemb = 100 margin = 1. evalsplits = 1 #usetypes=True #mode = "charword" #checkdata = True # load the right file maskid = -1 tt = ticktock("script") specids = specemb > 0 tt.tick() (traindata, traingold), (validdata, validgold), (testdata, testgold), \ worddic, entdic, entmat, relstarts, canids, wordmat, chardic\ = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True, usetypes=usetypes, maskid=maskid) entmat = entmat.astype("int32") if checkdata: rwd = {v: k for k, v in worddic.items()} red = {v: k for k, v in entdic.items()} def p(xids): return (" " if mode == "word" else "").join( [rwd[xid] if xid > -1 else "" for xid in xids]) embed() print traindata.shape, traingold.shape, testdata.shape, testgold.shape tt.tock("data loaded") numwords = max(worddic.values()) + 1 numents = max(entdic.values()) + 1 print "%d words, %d entities" % (numwords, numents) if bidir: encinnerdim = [encdim / 2] * layers else: encinnerdim = [encdim] * layers memembdim = embdim memlayers = layers membidir = bidir if membidir: decinnerdim = [decdim / 2] * memlayers else: decinnerdim = [decdim] * memlayers emb = VectorEmbed(numwords, embdim) subjenc = EntEnc( SimpleSeq2Vec(invocsize=numwords, inpembdim=embdim, innerdim=decinnerdim, maskid=maskid, bidir=membidir)) numentembs = len(np.unique(entmat[:, 0])) repsplit = entmat[relstarts, 0] 
if specids: # include vectorembedder subjenc = EntEmbEnc(subjenc, numentembs, specemb) predenc = VectorEmbed(indim=numents - relstarts + 1, dim=subjenc.outdim, init="zero") entenc = CustomEntEnc(subjenc, predenc, repsplit) inpenc = CustomSeq2Pair(inpemb=emb, encdim=encinnerdim, scadim=encinnerdim, enclayers=layers, scalayers=layers, bidir=bidir, maskid=maskid, outdim=subjenc.outdim) # adjust params for enc/dec construction # encinnerdim[-1] += specemb # innerdim[-1] += specemb dist = DotDistance() if not cosine else CosineDistance() scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist} if sumhingeloss: scorerkwargs["aggregator"] = lambda x: x # no aggregation of scores scorer = SeqMatchScore(inpenc, entenc, **scorerkwargs) class PreProc(object): def __init__(self, entmat, wordmat=None): self.f = PreProcE(entmat) self.w = PreProcL(wordmat) if wordmat is not None else wordmat def __call__(self, encdata, decgold): # gold: idx^(batsize, seqlen) if self.w is not None: encdata = self.w(encdata)[0][0] if self.f is not None: decgold = self.f(decgold)[0][0] return (encdata, decgold), {} class PreProcE(object): def __init__(self, entmat): self.em = Val(entmat) def __call__(self, x): ret = self.em[x] return (ret, ), {} class PreProcL(object): def __init__(self, wordmat): self.em = Val(wordmat) def __call__(self, x): ret = self.em[x] return (ret, ), {} transf = PreProc(entmat) class NegIdxGen(object): def __init__(self, rng, midsplit): self.min = 0 self.max = rng self.midsplit = midsplit def __call__(self, datas, gold): entrand = np.random.randint(self.min, self.midsplit, (gold.shape[0], 1)) relrand = np.random.randint(self.midsplit, self.max, (gold.shape[0], 1)) ret = np.concatenate([entrand, relrand], axis=1) return datas, ret.astype("int32") #embed() obj = lambda p, n: n - p if hingeloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) if sumhingeloss: # obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1) # embed() # eval if preeval: 
tt.tick("pre-evaluating") s = CustomRankSearch(inpenc, entenc, scorer.s, scorer.agg, relstarts=relstarts) eval = FullRankEval() pred, scores = s.search(testdata, testgold.shape[1], candata=entmat, canids=canids, split=evalsplits, transform=transf.f, debug=printpreds) evalres = eval.eval(pred, testgold, debug=debug) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("pre-evaluated") if not loadmodel: tt.tick("training") nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \ .negsamplegen(NegIdxGen(numents, relstarts)).negrate(negrate).objective(obj) \ .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \ .validate_on([validdata, validgold]) \ .train(numbats=numbats, epochs=epochs) tt.tock("trained") scorer.save("customfullrank.scorer.save") else: scorer = SeqMatchScore.load("customfullrank.scorer.save") # eval tt.tick("evaluating") s = CustomRankSearch(inpenc, entenc, scorer.s, scorer.agg, relstarts=relstarts) eval = FullRankEval() pred, scores = s.search(testdata, testgold.shape[1], candata=entmat, canids=canids, split=evalsplits, transform=transf.f, debug=printpreds) if printpreds: print pred debugarg = "subj" if subjpred else "pred" if predpred else False evalres = eval.eval(pred, testgold, debug=debugarg) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("evaluated") # save basename = os.path.splitext(os.path.basename(__file__))[0] dirname = basename + ".results" if not os.path.exists(dirname): os.makedirs(dirname) savenamegen = lambda i: "{}/{}.res".format(dirname, i) savename = None for i in xrange(1000): savename = savenamegen(i) if not os.path.exists(savename): break savename = None if savename is None: raise Exception("exceeded number of saved results") with open(savename, "w") as f: f.write("{}\n".format(" ".join(sys.argv))) for k, evalre in evalres.items(): f.write("{}:\t{}\n".format(k, evalre))
def run( epochs=50, mode="char", # "char" or "word" or "charword" numbats=1000, lr=0.1, wreg=0.000001, bidir=False, layers=1, encdim=200, decdim=200, embdim=100, negrate=1, margin=1., hingeloss=False, debug=False, preeval=False, sumhingeloss=False, checkdata=False, # starts interactive shell for data inspection printpreds=False, subjpred=False, predpred=False, specemb=-1, balancednegidx=False, usetypes=False, evalsplits=50, relembrep=False, ): if debug: # debug settings sumhingeloss = True numbats = 10 lr = 0.02 epochs = 10 printpreds = True whatpred = "all" if whatpred == "pred": predpred = True elif whatpred == "subj": subjpred = True #preeval = True specemb = 100 margin = 1. balancednegidx = True evalsplits = 1 relembrep = True #usetypes=True #mode = "charword" #checkdata = True # load the right file maskid = -1 tt = ticktock("script") specids = specemb > 0 tt.tick() (traindata, traingold), (validdata, validgold), (testdata, testgold), \ worddic, entdic, entmat, relstarts, canids, wordmat, chardic\ = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True, usetypes=usetypes, maskid=maskid) entmat = entmat.astype("int32") #embed() if subjpred is True and predpred is False: traingold = traingold[:, [0]] validgold = validgold[:, [0]] testgold = testgold[:, [0]] if predpred is True and subjpred is False: traingold = traingold[:, [1]] validgold = validgold[:, [1]] testgold = testgold[:, [1]] if checkdata: rwd = {v: k for k, v in worddic.items()} red = {v: k for k, v in entdic.items()} def p(xids): return (" " if mode == "word" else "").join( [rwd[xid] if xid > -1 else "" for xid in xids]) embed() print traindata.shape, traingold.shape, testdata.shape, testgold.shape tt.tock("data loaded") # *data: matrix of word ids (-1 filler), example per row # *gold: vector of true entity ids # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold # *dic: from word/ent-fbid to integer id, as used in data numwords = max(worddic.values()) 
+ 1 numents = max(entdic.values()) + 1 print "%d words, %d entities" % (numwords, numents) if bidir: encinnerdim = [encdim / 2] * layers else: encinnerdim = [encdim] * layers memembdim = embdim memlayers = layers membidir = bidir if membidir: decinnerdim = [decdim / 2] * memlayers else: decinnerdim = [decdim] * memlayers entenc = EntEnc( SimpleSeq2Vec(indim=numwords, inpembdim=memembdim, innerdim=decinnerdim, maskid=maskid, bidir=membidir)) numentembs = len(np.unique(entmat[:, 0])) if specids: # include vectorembedder entenc = EntEmbEnc(entenc, numentembs, specemb) if relembrep: repsplit = entmat[relstarts, 0] entenc = EntEncRep(entenc, numentembs, repsplit) # adjust params for enc/dec construction #encinnerdim[-1] += specemb #innerdim[-1] += specemb encdec = SimpleSeqEncDecAtt(inpvocsize=numwords, inpembdim=embdim, encdim=encinnerdim, bidir=bidir, outembdim=entenc, decdim=decinnerdim, vecout=True, statetrans="matdot") scorerargs = ([encdec, SeqUnroll(entenc)], { "argproc": lambda x, y, z: ((x, y), (z, )), "scorer": GenDotDistance(decinnerdim[-1], entenc.outdim) }) if sumhingeloss: scorerargs[1]["aggregator"] = lambda x: x # no aggregation of scores scorer = SeqMatchScore(*scorerargs[0], **scorerargs[1]) #scorer.save("scorer.test.save") # TODO: below this line, check and test class PreProc(object): def __init__(self, entmat): self.f = PreProcE(entmat) def __call__(self, encdata, decsg, decgold): # gold: idx^(batsize, seqlen) return (encdata, self.f(decsg), self.f(decgold)), {} class PreProcE(object): def __init__(self, entmat): self.em = Val(entmat) def __call__(self, x): return self.em[x] transf = PreProc(entmat) class NegIdxGen(object): def __init__(self, rng, midsplit=None): self.min = 0 self.max = rng self.midsplit = midsplit def __call__( self, datas, sgold, gold ): # the whole target sequence is corrupted, corruption targets the whole set of entities and relations together if self.midsplit is None or not balancednegidx: return datas, sgold, np.random.randint( 
self.min, self.max, gold.shape).astype("int32") else: entrand = np.random.randint(self.min, self.midsplit, gold.shape) relrand = np.random.randint(self.midsplit, self.max, gold.shape) mask = np.random.randint(0, 2, gold.shape) ret = entrand * mask + relrand * (1 - mask) return datas, sgold, ret.astype("int32") obj = lambda p, n: n - p if hingeloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) if sumhingeloss: # obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1) traingoldshifted = shiftdata(traingold) validgoldshifted = shiftdata(validgold) #embed() # eval if preeval: tt.tick("pre-evaluating") s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg) eval = FullRankEval() pred, scores = s.decode(testdata, testgold.shape[1], candata=entmat, canids=canids, split=evalsplits, transform=transf.f, debug=printpreds) evalres = eval.eval(pred, testgold, debug=debug) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("pre-evaluated") negidxgenargs = ([numents], {"midsplit": relstarts}) if debug: pass #negidxgenargs = ([numents], {}) tt.tick("training") nscorer = scorer.nstrain([traindata, traingoldshifted, traingold]).transform(transf) \ .negsamplegen(NegIdxGen(*negidxgenargs[0], **negidxgenargs[1])).negrate(negrate).objective(obj) \ .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \ .validate_on([validdata, validgoldshifted, validgold]) \ .train(numbats=numbats, epochs=epochs) tt.tock("trained") #scorer.save("scorer.test.save") # eval tt.tick("evaluating") s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg) eval = FullRankEval() pred, scores = s.decode(testdata, testgold.shape[1], candata=entmat, canids=canids, split=evalsplits, transform=transf.f, debug=printpreds) if printpreds: print pred debugarg = "subj" if subjpred else "pred" if predpred else False evalres = eval.eval(pred, testgold, debug=debugarg) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("evaluated") # save basename = 
os.path.splitext(os.path.basename(__file__))[0] dirname = basename + ".results" if not os.path.exists(dirname): os.makedirs(dirname) savenamegen = lambda i: "{}/{}.res".format(dirname, i) savename = None for i in xrange(1000): savename = savenamegen(i) if not os.path.exists(savename): break savename = None if savename is None: raise Exception("exceeded number of saved results") with open(savename, "w") as f: f.write("{}\n".format(" ".join(sys.argv))) for k, evalre in evalres.items(): f.write("{}:\t{}\n".format(k, evalre))
def run( epochs=50, mode="char", # "char" or "word" or "charword" numbats=100, lr=0.1, wreg=0.000001, bidir=False, layers=1, encdim=200, decdim=400, embdim=100, negrate=1, margin=1., hingeloss=False, debug=False, preeval=False, sumhingeloss=False, checkdata=False, # starts interactive shell for data inspection printpreds=False, subjpred=False, predpred=False, specemb=-1, balancednegidx=False, usetypes=False, ): if debug: # debug settings sumhingeloss = True numbats = 10 lr = 0.02 epochs = 10 printpreds = True whatpred = "all" if whatpred == "pred": predpred = True elif whatpred == "subj": subjpred = True #preeval = True specemb = 100 margin = 1. balancednegidx = True #usetypes=True # load the right file tt = ticktock("script") specids = specemb > 0 tt.tick() (traindata, traingold), (validdata, validgold), (testdata, testgold), \ worddic, entdic, entmat, relstarts, canids\ = readdata(mode, testcans="testcans.pkl", debug=debug, specids=specids, usetypes=usetypes) entmat = entmat.astype("int32") #embed() if subjpred is True and predpred is False: traingold = traingold[:, [0]] validgold = validgold[:, [0]] testgold = testgold[:, [0]] if predpred is True and subjpred is False: traingold = traingold[:, [1]] validgold = validgold[:, [1]] testgold = testgold[:, [1]] if checkdata: rwd = {v: k for k, v in worddic.items()} red = {v: k for k, v in entdic.items()} def p(xids): return (" " if mode == "word" else "").join([rwd[xid] if xid > -1 else "" for xid in xids]) embed() reventdic = {v: k for k, v in entdic.items()} revworddic = {v: k for k, v in worddic.items()} print traindata.shape, traingold.shape, testdata.shape, testgold.shape tt.tock("data loaded") # *data: matrix of word ids (-1 filler), example per row # *gold: vector of true entity ids # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold # *dic: from word/ent-fbid to integer id, as used in data numwords = max(worddic.values()) + 1 numents = max(entdic.values()) + 1 print "%d 
words, %d entities" % (numwords, numents) if bidir: encinnerdim = [encdim / 2] * layers else: encinnerdim = [encdim] * layers memembdim = embdim memlayers = layers membidir = bidir if membidir: decinnerdim = [decdim/2]*memlayers else: decinnerdim = [decdim]*memlayers entenc = SimpleSeq2Vec(indim=numwords, inpembdim=memembdim, innerdim=decinnerdim, maskid=-1, bidir=membidir) if specids: # include vectorembedder numentembs = len(np.unique(entmat[:, 0])) entenc = EntEmbEnc(entenc, numentembs, specemb) # adjust params for enc/dec construction #encinnerdim[-1] += specemb #innerdim[-1] += specemb encdec = SimpleSeqEncDecAtt(inpvocsize=numwords, inpembdim=embdim, encdim=encinnerdim, bidir=bidir, outembdim=entenc, decdim=decinnerdim, vecout=True, statetrans="matdot") scorerargs = ([encdec, SeqUnroll(entenc)], {"argproc": lambda x, y, z: ((x, y), (z,)), "scorer": GenDotDistance(decinnerdim[-1], entenc.outdim)}) if sumhingeloss: scorerargs[1]["aggregator"] = lambda x: x # no aggregation of scores scorer = SeqMatchScore(*scorerargs[0], **scorerargs[1]) #scorer.save("scorer.test.save") # TODO: below this line, check and test class PreProc(object): def __init__(self, entmat): self.f = PreProcE(entmat) def __call__(self, encdata, decsg, decgold): # gold: idx^(batsize, seqlen) return (encdata, self.f(decsg), self.f(decgold)), {} class PreProcE(object): def __init__(self, entmat): self.em = Val(entmat) def __call__(self, x): return self.em[x] transf = PreProc(entmat) class NegIdxGen(object): def __init__(self, rng, midsplit=None): self.min = 0 self.max = rng self.midsplit = midsplit def __call__(self, datas, sgold, gold): # the whole target sequence is corrupted, corruption targets the whole set of entities and relations together if self.midsplit is None or not balancednegidx: return datas, sgold, np.random.randint(self.min, self.max, gold.shape).astype("int32") else: entrand = np.random.randint(self.min, self.midsplit, gold.shape) relrand = np.random.randint(self.midsplit, 
self.max, gold.shape) mask = np.random.randint(0, 2, gold.shape) ret = entrand * mask + relrand * (1 - mask) return datas, sgold, ret.astype("int32") # !!! MASKS ON OUTPUT SHOULD BE IMPLEMENTED FOR VARIABLE LENGTH OUTPUT SEQS obj = lambda p, n: n - p if hingeloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) if sumhingeloss: # obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1) traingoldshifted = shiftdata(traingold) validgoldshifted = shiftdata(validgold) #embed() # eval if preeval: tt.tick("pre-evaluating") s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg) eval = FullRankEval() pred, scores = s.decode(testdata, 0, testgold.shape[1], candata=entmat, canids=canids, transform=transf.f, debug=printpreds) evalres = eval.eval(pred, testgold, debug=debug) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("pre-evaluated") negidxgenargs = ([numents], {"midsplit": relstarts}) if debug: pass #negidxgenargs = ([numents], {}) tt.tick("training") nscorer = scorer.nstrain([traindata, traingoldshifted, traingold]).transform(transf) \ .negsamplegen(NegIdxGen(*negidxgenargs[0], **negidxgenargs[1])).negrate(negrate).objective(obj) \ .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \ .validate_on([validdata, validgoldshifted, validgold]) \ .train(numbats=numbats, epochs=epochs) tt.tock("trained") #scorer.save("scorer.test.save") # eval tt.tick("evaluating") s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg) eval = FullRankEval() pred, scores = s.decode(testdata, 0, testgold.shape[1], candata=entmat, canids=canids, transform=transf.f, debug=printpreds) if printpreds: print pred debugarg = "subj" if subjpred else "pred" if predpred else False evalres = eval.eval(pred, testgold, debug=debugarg) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("evaluated") # save basename = os.path.splitext(os.path.basename(__file__))[0] dirname = basename + ".results" if not os.path.exists(dirname): 
os.makedirs(dirname) savenamegen = lambda i: "{}/{}.res".format(dirname, i) savename = None for i in xrange(100): savename = savenamegen(i) if not os.path.exists(savename): break savename = None if savename is None: raise Exception("exceeded number of saved results") with open(savename, "w") as f: f.write("{}\n".format(" ".join(sys.argv))) for k, evalre in evalres.items(): f.write("{}:\t{}\n".format(k, evalre))
def run(
        negsammode="closest",  # "close" or "random"
        usetypes=True,
        mode="concat",  # "seq" or "concat" or "multi" or "multic" or "bino"
        glove=True,
        embdim=100,
        charencdim=100,
        charembdim=50,
        encdim=400,
        bidir=False,
        layers=1,
        charenc="rnn",  # "cnn" or "rnn"
        margin=0.5,
        lr=0.1,
        numbats=700,
        epochs=15,
        gradnorm=1.0,
        wreg=0.0001,
        loadmodel="no",
        debug=False,
        debugtest=False,
        forcesubjincl=False,
        randsameval=0,
        numtestcans=5,
        multiprune=-1,
        checkdata=False,
        testnegsam=False,
        testmodel=False,
        sepcharembs=False,
):
    """Full-rank subject/predicate prediction: train a two-level (char+word)
    question encoder against char/word-level subject and predicate encoders
    with a summed-hinge cosine objective, then evaluate via CustomPredictor.

    Ends in IPython ``embed()`` so the nested inspect* helpers can be used
    interactively on the prediction results.

    NOTE(review): line breaks reconstructed from whitespace-mangled source;
    tokens preserved as found.
    """
    tt = ticktock("script")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    (subjmat, relmat), (subjdic, reldic), worddic, \
    subjinfo, (testsubjcans, relsperent) = readdata(
        debug=debug, numtestcans=numtestcans if numtestcans > 0 else None)

    if usetypes:
        # prepend type-word columns to the subject label matrix
        print "building type matrix"
        typmat = buildtypmat(subjmat, subjinfo, worddic)
        subjmat = np.concatenate([typmat, subjmat], axis=1)
        typlen = typmat.shape[1]

    # sample spaces for "close" negative sampling (None => uniform sampling)
    relsamplespace = None
    subjsamplespace = None
    if negsammode == "closest" or negsammode == "close":
        relsamplespace, revind = buildrelsamplespace(relmat, worddic)
        subjsamplespace = loadsubjsamplespace()
    tt.tock("data loaded")

    if checkdata:
        embed()

    numwords = max(worddic.values()) + 1
    numsubjs = max(subjdic.values()) + 1
    numrels = max(reldic.values()) + 1
    maskid = -1
    numchars = 256  # byte-level character vocabulary

    nsrelsperent = relsperent if negsammode == "closest" else None

    if testnegsam:
        # interactively poke at the negative sampler, then continue
        nig = NegIdxGen(numsubjs - 1, numrels - 1,
                        relclose=relsamplespace,
                        subjclose=subjsamplespace,
                        relsperent=nsrelsperent)
        embed()

    # decoder-side dim: halved for concat-style modes (Py2 int division)
    if mode == "seq" or mode == "multi":
        decdim = encdim
    elif mode == "concat" or mode == "multic" or mode == "bino":
        decdim = encdim / 2
    else:
        raise Exception("unrecognized mode")

    print "{} mode: {} decdim".format(mode, decdim)

    # defining model
    if glove:
        wordemb = Glove(embdim).adapt(worddic)
    else:
        wordemb = WordEmb(dim=embdim, indim=numwords)

    charemb = VectorEmbed(indim=numchars, dim=charembdim)
    charemb2 = VectorEmbed(indim=numchars, dim=charembdim)
    # NOTE: the ``charenc`` parameter is deliberately reassigned from a mode
    # string to the encoder object it selects
    if charenc == "cnn":
        print "using CNN char encoder"
        charenc = CNNSeqEncoder(inpemb=charemb,
                                innerdim=[charencdim] * 2,
                                maskid=maskid,
                                stride=1)
    elif charenc == "rnn":
        print "using RNN char encoder"
        charenc = RNNSeqEncoder(inpemb=charemb, innerdim=charencdim) \
            .maskoptions(maskid, MaskMode.AUTO)
    else:
        raise Exception("no other character encoding modes available")

    if bidir:
        encdim = encdim / 2  # per-direction dim when bidirectional

    if mode != "bino":
        if mode == "multi" or mode == "multic":
            wordenc = \
                SimpleSeq2MultiVec(inpemb=False,
                                   inpembdim=wordemb.outdim + charencdim,
                                   innerdim=encdim,
                                   bidir=bidir,
                                   numouts=2,
                                   mode="seq")
        else:
            encdim = [encdim] * layers
            wordenc = RNNSeqEncoder(inpemb=False,
                                    inpembdim=wordemb.outdim + charencdim,
                                    innerdim=encdim,
                                    bidir=bidir).maskoptions(MaskMode.NONE)
        question_encoder = TwoLevelEncoder(l1enc=charenc,
                                           l2emb=wordemb,
                                           l2enc=wordenc,
                                           maskid=maskid)
    else:
        question_encoder = BinoEncoder(charenc=charenc,
                                       wordemb=wordemb,
                                       maskid=maskid,
                                       scadim=100,
                                       encdim=encdim / 2,
                                       bidir=bidir,
                                       enclayers=layers,
                                       outdim=decdim,
                                       scabidir=True)

    # encode predicate on word level
    predemb = SimpleSeq2Vec(inpemb=wordemb,
                            innerdim=decdim,
                            maskid=maskid,
                            bidir=False,
                            layers=1)
    #predemb.load(relmat)

    # optionally separate char embeddings for the subject encoder
    scharemb = charemb2 if sepcharembs else charemb
    if usetypes:
        # encode subj type on word level
        subjtypemb = SimpleSeq2Vec(inpemb=wordemb,
                                   innerdim=int(np.ceil(decdim * 1. / 2)),
                                   maskid=maskid,
                                   bidir=False,
                                   layers=1)
        # encode subject on character level
        charbidir = True
        charencinnerdim = int(np.floor(decdim * 1. / 2))
        charenclayers = 1
        if charbidir:
            charencinnerdim /= 2  # Py2 int division: halved per direction
            charenclayers = 2
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=charencinnerdim,
                                maskid=maskid,
                                bidir=charbidir,
                                layers=charenclayers)
        subjemb = TypedSubjBlock(typlen, subjemb, subjtypemb)
    else:
        # encode subject on character level
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=False,
                                layers=1)
    #subjemb.load(subjmat)
    if testmodel:
        embed()

    # package: left block encodes the question, right block the (subj, pred) pair
    if mode == "seq":
        lb = SeqLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "concat":
        lb = ConcatLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "multi" or mode == "multic":
        lb = MultiLeftBlock(question_encoder, mode)
        rb = RightBlock(subjemb, predemb)
    elif mode == "bino":
        lb = question_encoder
        rb = RightBlock(subjemb, predemb)
    else:
        raise Exception("unrecognized mode")
    scorer = SeqMatchScore(lb, rb,
                           scorer=CosineDistance(),
                           aggregator=lambda x: x,
                           argproc=lambda x, y, z: ((x, ), (y, z)))

    # summed hinge over the two (subject, predicate) score positions
    obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    class PreProc(object):
        # maps gold (subj, rel) id pairs to their label word-id rows
        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, data, gold):  # gold: idxs-(batsize, 2)
            st = self.ef(gold[:, 0])[0][0]
            rt = self.rf(gold[:, 1])[0][0]
            return (data, st, rt), {}

    class PreProcE(object):
        # pair-wise variant; appears unused here — kept as in source
        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, x):
            subjslice = self.ef(x[:, 0])[0][0]
            relslice = self.rf(x[:, 1])[0][0]
            return (subjslice, relslice), {}

    class PreProcEnt(object):
        # id -> label word-id row lookup (shared Val)
        def __init__(self, mat):
            self.entmat = Val(mat)

        def __call__(self, x):
            return (self.entmat[x], ), {}

    transf = PreProc(subjmat, relmat)

    if debug:
        embed()

    if epochs > 0 and loadmodel == "no":
        tt.tick("training")
        # random 4-digit checkpoint id
        saveid = "".join([str(np.random.randint(0, 10)) for i in range(4)])
        print("CHECKPOINTING AS: {}".format(saveid))
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numsubjs - 1, numrels - 1,
                                    relclose=relsamplespace,
                                    subjclose=subjsamplespace,
                                    relsperent=nsrelsperent)) \
            .objective(obj).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm) \
            .validate_on([validdata, validgold]) \
            .autosavethis(scorer, "fullrank{}.model".format(saveid)) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained").tick()

        # saving
        #scorer.save("fullrank{}.model".format(saveid))
        print("SAVED AS: {}".format(saveid))

    # NOTE(review): ``is not`` compares identity, not equality; works in
    # CPython only because short string literals are interned — should be
    # ``loadmodel != "no"``. Left unchanged here.
    if loadmodel is not "no":
        tt.tick("loading model")
        m = SeqMatchScore.load("fullrank{}.model".format(loadmodel))
        #embed()
        lb = m.l
        subjemb = m.r.subjenc
        predemb = m.r.predenc
        tt.tock("loaded model")

    # evaluation
    predictor = CustomPredictor(questionencoder=lb,
                                entityencoder=subjemb,
                                relationencoder=predemb,
                                #mode=mode,
                                enttrans=transf.ef,
                                reltrans=transf.rf,
                                debug=debugtest,
                                subjinfo=subjinfo)

    tt.tick("predicting")
    if forcesubjincl:  # forces the intended subject entity to be among candidates
        for i in range(len(testsubjcans)):
            if testgold[i, 0] not in testsubjcans[i]:
                testsubjcans[i].append(testgold[i, 0])

    if randsameval > 0:  # generate random sampling eval data
        testsubjcans = np.random.randint(0, numsubjs,
                                         (testgold.shape[0], randsameval))
        testrelcans = np.random.randint(0, numrels,
                                        (testgold.shape[0], randsameval))
        # prepend the gold ids so the correct answer is always a candidate
        testsubjcans = np.concatenate([testgold[:, 0:1], testsubjcans], axis=1)
        testrelcans = np.concatenate([testgold[:, 1:2], testrelcans], axis=1)
        testsubjcans = testsubjcans.tolist()
        testrelcans = testrelcans.tolist()
        prediction = predictor.predict(testdata,
                                       entcans=testsubjcans,
                                       relcans=testrelcans)
    else:
        prediction = predictor.predict(testdata,
                                       entcans=testsubjcans,
                                       relsperent=relsperent,
                                       multiprune=multiprune)
    tt.tock("predicted")

    tt.tick("evaluating")
    # column 0 = subject hit, column 1 = predicate hit; both => total hit
    evalmat = prediction == testgold
    subjacc = np.sum(evalmat[:, 0]) * 1. / evalmat.shape[0]
    predacc = np.sum(evalmat[:, 1]) * 1. / evalmat.shape[0]
    totalacc = np.sum(np.sum(evalmat, axis=1) == 2) * 1. / evalmat.shape[0]
    print "Test results ::::::::::::::::"
    print "Total Acc: \t {}".format(totalacc)
    print "Subj Acc: \t {}".format(subjacc)
    print "Pred Acc: \t {}".format(predacc)
    tt.tock("evaluated")

    def subjinspect(subjrank, gold):
        # annotate a ranked (subj_id, score) list with subjinfo labels,
        # marking the gold subject
        ret = [
            (("GOLD - " if gold == x else " ") + subjinfo[x][0] + " (" +
             " ".join(subjinfo[x][1]) + ")" + str(subjinfo[x][3]) + " rels", y)
            if x in subjinfo else (x, y)
            for x, y in subjrank
        ]
        return ret

    def inspectboth(hidecorrect=False, hidenotincan=False):
        # NOTE(review): appears unfinished in the source — the loop filters
        # but prints nothing; kept as found
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            predx = testgold[i, 1]
            subjrank = predictor.subjranks[i]
            predrank = predictor.relranks[i]
            if hidecorrect and subjx == subjrank[0][0] and predrank[0][
                    0] == predx:
                continue
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue

    def inspectsubjs(hidecorrect=False, hidenotincan=False, shownotincan=False):
        # interactively page through subject rankings per test question
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            subjrank = predictor.subjranks[i]
            if subjx == subjrank[0][0] and hidecorrect:  # only look for errors
                continue
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue
            if shownotincan and subjx in [k for k, v in subjrank]:
                continue
            print "test question {}: {} \t GOLD: {}".format(
                i,
                wordids2string(testdata[i, :, 0], rwd),
                "{} ({}) - {} rels --- {}".format(
                    *([
                        subjinfo[subjx][0], subjinfo[subjx][1],
                        subjinfo[subjx][3], subjinfo[subjx][2]
                    ] if subjx in subjinfo else
                      ["<UNK>", "<UNK>", "<UNK>", "<UNK>"])))
            inspres = subjinspect(subjrank, subjx)
            # NOTE(review): reuses loop variable ``i`` as the page counter —
            # kept as found
            i = 1
            for inspre in inspres:
                print "{}:\t{}\t{}".format(i, inspre[1], inspre[0])
                if i % 50 == 0:
                    inp()  # pause every 50 entries
                i += 1
            inp()

    def inspectpreds(hidecorrect=False):
        # interactively page through predicate rankings per test question
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.relranks)):
            relx = testgold[i, 1]
            subjx = testgold[i, 0]
            relrank = predictor.relranks[i]
            if relx == relrank[0][0] and hidecorrect:
                continue
            print "test question {}: {} \t GOLD: {}".format(
                i,
                wordids2string(testdata[i, :, 0], rwd),
                wordids2string(relmat[relx, :], rwd))
            inspres = [(("GOLD - " if relx == x else " ") +
                        wordids2string(relmat[x], rwd), y) for x, y in relrank]
            # NOTE(review): reuses loop variable ``i`` as the page counter —
            # kept as found
            i = 1
            for inspre in inspres:
                print "{}:\t{}\t{}".format(i, inspre[1], inspre[0])
                if i % 50 == 0:
                    inp()
                i += 1
            inp()

    # drop into an interactive shell with the inspect* helpers in scope
    embed()