def train_and_eval(runid, module_prep_model, c, glove, vocab, gr, s0, grt, s0t,
                   s0pad=s0pad, s1pad=s1pad, do_eval=True):
    """Build, train and (optionally) evaluate an answer-selection model.

    Trains on graph ``gr``, validating on ``grt``; checkpoints the weights
    with the best validation MRR and reloads them before evaluation.
    Returns the trained model.
    """
    print('Model')
    model = build_model(glove, vocab, module_prep_model, c,
                        s0pad=s0pad, s1pad=s1pad)

    print('Training')
    if c.get('balance_class', False):
        # Down-weight class 0 by the positive ratio so both classes
        # contribute comparably to the loss.
        one_ratio = np.sum(gr['score'] == 1) / len(gr['score'])
        class_weight = {'score': {0: one_ratio, 1: 0.5}}
    else:
        class_weight = {}
    # XXX: samples_per_epoch is in brmson/keras fork, TODO fit_generator()?
    model.fit(gr, validation_data=grt,
              callbacks=[AnsSelCB(s0t, grt),
                         ModelCheckpoint('weights-'+runid+'-bestval.h5',
                                         save_best_only=True,
                                         monitor='mrr', mode='max'),
                         EarlyStopping(monitor='mrr', mode='max', patience=4)],
              class_weight=class_weight,
              batch_size=c['batch_size'], nb_epoch=c['nb_epoch'],
              samples_per_epoch=int(len(s0)*c['epoch_fract']))
    model.save_weights('weights-'+runid+'-final.h5', overwrite=True)
    if c['ptscorer'] is None:
        # No pointwise scorer: just reuse the final weights as "bestval".
        model.save_weights('weights-'+runid+'-bestval.h5', overwrite=True)
    model.load_weights('weights-'+runid+'-bestval.h5')

    if do_eval:
        print('Predict&Eval (best epoch)')
        ev.eval_anssel(model.predict(gr)['score'][:,0], s0, gr['score'], 'Train')
        ev.eval_anssel(model.predict(grt)['score'][:,0], s0t, grt['score'], 'Val')
    return model
def train_and_eval(runid, module_prep_model, c, glove, vocab, gr, s0, grt, s0t):
    """Build a model, train it with MRR-based checkpointing, and report
    train/validation answer-selection metrics."""
    print('Model')
    model = build_model(glove, vocab, module_prep_model, c)

    print('Training')
    # XXX: samples_per_epoch is in brmson/keras fork, TODO fit_generator()?
    model.fit(gr, validation_data=grt,
              callbacks=[AnsSelCB(s0t, grt),
                         ModelCheckpoint('weights-'+runid+'-bestval.h5',
                                         save_best_only=True,
                                         monitor='mrr', mode='max')],
              batch_size=160, nb_epoch=16, samples_per_epoch=5000)
    model.save_weights('weights-'+runid+'-final.h5', overwrite=True)

    ev.eval_anssel(model.predict(gr)['score'][:,0], s0, gr['score'], 'Train')
    ev.eval_anssel(model.predict(grt)['score'][:,0], s0t, grt['score'], 'Val')
def eval(self, model):
    """Evaluate `model` on the train/val/test graphs, returning one
    per-split metrics record (or None for missing splits)."""
    res = []
    splits = [(self.gr, self.trainf), (self.grv, self.valf), (self.grt, self.testf)]
    for gr, fname in splits:
        if gr is None:
            res.append(None)
            continue
        # In case of prescoring pruning, we want to predict only
        # on the prescoring subset, but evaluate on the complete
        # dataset, actually! Therefore, we then unprune again.
        # TODO: Cache the pruning
        gr_p = self.prescoring_apply(gr)
        ypred = self.predict(model, gr_p)
        gr, ypred = graph_input_unprune(
            gr, gr_p, ypred,
            0. if self.c['loss'] == 'binary_crossentropy' else float(-1e15))
        res.append(ev.eval_anssel(ypred,
                                  gr['si0'] + gr['sj0'],
                                  gr['si1'] + gr['sj1'],
                                  gr['score'], fname, MAP=True))
    return tuple(res)
def eval(self, model):
    """Predict on the train/val/test graphs and return per-split anssel
    metrics (None entries for splits that were not loaded)."""
    res = []
    for gr, fname in [(self.gr, self.trainf),
                      (self.grv, self.valf),
                      (self.grt, self.testf)]:
        if gr is None:
            res.append(None)
            continue
        ypred = model.predict(gr)['score'][:,0]
        res.append(ev.eval_anssel(ypred, gr['si0'], gr['si1'],
                                  gr['score'], fname, MAP=True))
    return tuple(res)
def train_and_eval(runid, module_prep_model, c, glove, vocab, gr, s0, grt, s0t):
    """Train an answer-selection model on `gr`, checkpointing on best
    validation MRR, then print Train/Val metrics."""
    print('Model')
    model = build_model(glove, vocab, module_prep_model, c)

    print('Training')
    checkpoint = ModelCheckpoint('weights-' + runid + '-bestval.h5',
                                 save_best_only=True, monitor='mrr', mode='max')
    # XXX: samples_per_epoch is in brmson/keras fork, TODO fit_generator()?
    model.fit(gr, validation_data=grt,
              callbacks=[AnsSelCB(s0t, grt), checkpoint],
              batch_size=160, nb_epoch=16, samples_per_epoch=5000)
    model.save_weights('weights-' + runid + '-final.h5', overwrite=True)

    ev.eval_anssel(model.predict(gr)['score'][:, 0], s0, gr['score'], 'Train')
    ev.eval_anssel(model.predict(grt)['score'][:, 0], s0t, grt['score'], 'Val')
def transfer_eval(runid, weightsf, module_prep_model, c, glove, vocab, gr, grv):
    """Transfer-learning evaluation: load pretrained weights from `weightsf`,
    fine-tune on graph `gr`, and evaluate on the anssel validation graph `grv`
    before and after training.

    Checkpoints on best validation MRR and reloads the best epoch for the
    final evaluation.
    """
    print('Model')
    model = anssel_train.build_model(glove, vocab, module_prep_model, c,
                                     s0pad=s0pad, s1pad=s1pad,
                                     optimizer=c['opt'],
                                     fix_layers=c['fix_layers'])
    print('Model (weights)')
    model.load_weights(weightsf)
    # Baseline before any fine-tuning.
    ev.eval_anssel(model.predict(grv)['score'][:,0], grv['si0'], grv['score'],
                   'anssel Val (bef. train)')

    print('Training')
    if c.get('balance_class', False):
        one_ratio = np.sum(gr['score'] == 1) / len(gr['score'])
        class_weight = {'score': {0: one_ratio, 1: 0.5}}
    else:
        class_weight = {}
    # BUGFIX: the fit() call previously read the module-level `conf` for
    # batch_size/nb_epoch/epoch_fract instead of the `c` parameter that the
    # rest of this function uses; use `c` consistently.
    model.fit(gr, validation_data=grv,
              # NOTE(review): `s0v` is not a parameter of this function —
              # presumably a module-level global holding the val s0 texts; verify.
              callbacks=[AnsSelCB(s0v, grv),
                         ModelCheckpoint('weights-'+runid+'-bestval.h5',
                                         save_best_only=True,
                                         monitor='mrr', mode='max'),
                         EarlyStopping(monitor='mrr', mode='max', patience=4)],
              class_weight=class_weight,
              batch_size=c['batch_size'], nb_epoch=c['nb_epoch'],
              samples_per_epoch=int(len(gr['score'])*c['epoch_fract']))
    model.save_weights('weights-'+runid+'-final.h5', overwrite=True)

    print('Predict&Eval (best epoch)')
    model.load_weights('weights-'+runid+'-bestval.h5')
    ev.eval_anssel(model.predict(grv)['score'][:,0], grv['si0'], grv['score'],
                   'anssel Val')
def eval(self, model):
    """Run `model` over each available split and collect anssel metrics.

    Returns a 3-tuple (train, val, test); a split that is not loaded
    contributes None.
    """
    splits = ((self.gr, self.trainf), (self.grv, self.valf), (self.grt, self.testf))
    res = []
    for gr, fname in splits:
        if gr is None:
            res.append(None)
        else:
            ypred = model.predict(gr)['score'][:, 0]
            res.append(ev.eval_anssel(ypred, gr['si0'], gr['si1'],
                                      gr['score'], fname, MAP=True))
    return tuple(res)
def eval(self, model):
    """Evaluate `model` on train/val/test, predicting on the prescoring-pruned
    subset but scoring against the full (unpruned) dataset."""
    res = []
    for gr, fname in [(self.gr, self.trainf),
                      (self.grv, self.valf),
                      (self.grt, self.testf)]:
        if gr is None:
            res.append(None)
            continue
        # In case of prescoring pruning, we want to predict only
        # on the prescoring subset, but evaluate on the complete
        # dataset, actually! Therefore, we then unprune again.
        # TODO: Cache the pruning
        gr_p = self.prescoring_apply(gr)
        ypred = model.predict(gr_p)['score'][:,0]
        # Pruned-away pairs get the worst possible score for the loss in use.
        fill = 0. if self.c['loss'] == 'binary_crossentropy' else float(-1e15)
        gr, ypred = graph_input_unprune(gr, gr_p, ypred, fill)
        res.append(ev.eval_anssel(ypred, gr['si0'], gr['si1'],
                                  gr['score'], fname, MAP=True))
    return tuple(res)
# Load a trained anssel model, predict on train/val sets and dump the
# validation predictions in TREC qrels/top format for external trec_eval.
modelname, weightsfile, trainf, valf, trec_qrels_file, trec_top_file = sys.argv[1:7]
params = sys.argv[7:]

module = importlib.import_module('.'+modelname, 'models')
conf, ps, h = anssel_train.config(module.config, params)

print('GloVe')
glove = emb.GloVe(N=conf['embdim'])

print('Dataset')
s0, s1, y, vocab, gr = anssel_train.load_set(trainf)
s0t, s1t, yt, _, grt = anssel_train.load_set(valf, vocab)

print('Model')
model = anssel_train.build_model(glove, vocab, module.prep_model, conf)
print('Weights')
model.load_weights(weightsfile)

print('Prediction')
ypred = model.predict(gr)['score'][:,0]
ypredt = model.predict(grt)['score'][:,0]
ev.eval_anssel(ypred, s0, y, trainf)
ev.eval_anssel(ypredt, s0t, yt, valf)

with open(trec_qrels_file, 'wt') as f:
    save_trec_qrels(f, s0t, s1t, yt)
with open(trec_top_file, 'wt') as f:
    save_trec_top(f, s0t, s1t, ypredt, modelname)
# NOTE(review): whitespace-mangled fragment that begins mid-way through a
# load_set(...) call whose opening is outside this view; left byte-identical
# rather than guessing the missing context. It loads train/val sets, trains
# a binary-crossentropy Keras model and prints Train/Test anssel metrics.
'anssel-yodaqa/curatedv1-training.csv', balance=(args.balance == 1)) Xtest, ytest = load_set(glove, 'anssel-yodaqa/curatedv1-val.csv', subsample0=1) model = prep_model(glove) model.compile(loss={'score': 'binary_crossentropy'}, optimizer='adam') model.fit({ 'e0': Xtrain[0], 'e1': Xtrain[1], 'score': ytrain }, batch_size=20, nb_epoch=2000, validation_data={ 'e0': Xtest[0], 'e1': Xtest[1], 'score': ytest }) ev.eval_anssel( model.predict({ 'e0': Xtrain[0], 'e1': Xtrain[1] })['score'][:, 0], Xtrain[0], ytrain, 'Train') ev.eval_anssel( model.predict({ 'e0': Xtest[0], 'e1': Xtest[1] })['score'][:, 0], Xtest[0], ytest, 'Test')
# NOTE(review): whitespace-mangled chunk ending inside an unterminated
# triple-quoted performance-notes block (its closing quotes are outside this
# view); left byte-identical rather than guessing the missing tail. It parses
# CLI args, loads a dataset and fits/evaluates a logistic-regression baseline.
parser.add_argument("-N", help="GloVe dim", type=int, default=50) # for our naive method, 300**2 would be too much parser.add_argument("--balance", help="whether to manually balance the dataset", type=int, default=1) parser.add_argument("--wang", help="whether to run on Wang inst. of YodaQA dataset", type=int, default=0) args = parser.parse_args() glove = emb.GloVe(N=args.N) if args.wang == 1: Xtrain, ytrain = load_set(glove, 'anssel-wang/train-all.csv', balance=(args.balance == 1)) Xtest, ytest = load_set(glove, 'anssel-wang/test.csv', subsample0=1) else: Xtrain, ytrain = load_set(glove, 'anssel-yodaqa/curatedv1-training.csv', balance=(args.balance == 1)) Xtest, ytest = load_set(glove, 'anssel-yodaqa/curatedv1-val.csv', subsample0=1) logreg = linear_model.LogisticRegression(C=0.01, verbose=1, n_jobs=7) logreg.fit(logreg_M(*Xtrain), ytrain) ev.eval_anssel(logreg.predict_proba(logreg_M(*Xtrain))[:, 1], Xtrain[0], ytrain, 'Train') ev.eval_anssel(logreg.predict_proba(logreg_M(*Xtest))[:, 1], Xtest[0], ytest, 'Test') """ Performance tuning on anssel-yodaqa: * Completely unbalanced, C=1 Train Accuracy: 0.899176 (y=0 0.983992, y=1 0.334139) Train MRR: 0.626233 (on training set, y=0 is subsampled!) Test Accuracy: 0.926688 (y=0 0.965770, y=1 0.095908) Test MRR: 0.218704 * sklearn balancing (class_weight='auto'), C=1 Train Accuracy: 0.816569 (y=0 0.812480, y=1 0.843807) Train MRR: 0.620643 (on training set, y=0 is subsampled!) Test Accuracy: 0.714450 (y=0 0.727787, y=1 0.430946) Test MRR: 0.235821
# NOTE(review): whitespace-mangled fragment beginning mid-`if` (the dangling
# `else:` implies the matching `if` is outside this view) and ending inside an
# unterminated triple-quoted notes block; left byte-identical rather than
# guessing the missing context. Duplicate variant of the logistic-regression
# baseline script above.
Xtrain, ytrain = load_set(glove, 'anssel-wang/train-all.csv', balance=(args.balance == 1)) Xtest, ytest = load_set(glove, 'anssel-wang/test.csv', subsample0=1) else: Xtrain, ytrain = load_set(glove, 'anssel-yodaqa/curatedv1-training.csv', balance=(args.balance == 1)) Xtest, ytest = load_set(glove, 'anssel-yodaqa/curatedv1-val.csv', subsample0=1) logreg = linear_model.LogisticRegression(C=0.01, verbose=1, n_jobs=7) logreg.fit(logreg_M(*Xtrain), ytrain) ev.eval_anssel( logreg.predict_proba(logreg_M(*Xtrain))[:, 1], Xtrain[0], ytrain, 'Train') ev.eval_anssel( logreg.predict_proba(logreg_M(*Xtest))[:, 1], Xtest[0], ytest, 'Test') """ Performance tuning on anssel-yodaqa: * Completely unbalanced, C=1 Train Accuracy: 0.899176 (y=0 0.983992, y=1 0.334139) Train MRR: 0.626233 (on training set, y=0 is subsampled!) Test Accuracy: 0.926688 (y=0 0.965770, y=1 0.095908) Test MRR: 0.218704 * sklearn balancing (class_weight='auto'), C=1 Train Accuracy: 0.816569 (y=0 0.812480, y=1 0.843807) Train MRR: 0.620643 (on training set, y=0 is subsampled!) Test Accuracy: 0.714450 (y=0 0.727787, y=1 0.430946) Test MRR: 0.235821
# Evaluate a saved anssel model on train/val and compute MAP via trec_eval.
if conf['embdim'] is not None:
    print('GloVe')
    glove = emb.GloVe(N=conf['embdim'])
else:
    # Model variant that does not use word embeddings.
    glove = None

print('Dataset')
s0, s1, y, vocab, gr = anssel_train.load_set(trainf)
s0t, s1t, yt, _, grt = anssel_train.load_set(valf, vocab)

print('Model')
model = anssel_train.build_model(glove, vocab, module.prep_model, conf)
print('Weights')
model.load_weights(weightsfile)

print('Prediction')
ypred = model.predict(gr)['score'][:,0]
ypredt = model.predict(grt)['score'][:,0]
ev.eval_anssel(ypred, s0, y, trainf)
ev.eval_anssel(ypredt, s0t, yt, valf)

with open(trec_qrels_file, 'wt') as f:
    save_trec_qrels(f, s0t, s1t, yt)
with open(trec_top_file, 'wt') as f:
    save_trec_top(f, s0t, s1t, ypredt, modelname)
mapt = trec_eval_get(trec_qrels_file, trec_top_file, 'map')
print('%s MAP: %f' % (valf, mapt))
# Load skip-thought embedded train/val sets, train the scorer and report
# Train/Test anssel metrics.
e0, e1, y = load_set(train_filename, st, args.cache_dir)
e0t, e1t, yt = load_set(val_filename, st, args.cache_dir)

model = prep_model(N)
model.compile(loss={'score': 'binary_crossentropy'}, optimizer=Adam(lr=0.001))
hist = model.fit({'e0': e0, 'e1': e1, 'score': y},
                 batch_size=20, nb_epoch=2000,
                 validation_data={'e0': e0t, 'e1': e1t, 'score': yt})

# BUGFIX: the Train-set evaluation previously compared train predictions
# against the *validation* labels `yt`; it must use the train labels `y`
# (loaded from train_filename above).
ev.eval_anssel(model.predict({'e0': e0, 'e1': e1})['score'][:, 0],
               e0, e1, y, 'Train')
ev.eval_anssel(model.predict({'e0': e0t, 'e1': e1t})['score'][:, 0],
               e0t, e1t, yt, 'Test')
parser.add_argument("--wang", help="whether to run on Wang inst. of YodaQA dataset", type=int, default=0)
parser.add_argument("--cache_dir", help="directory where to save/load cached datasets", type=str, default="")
# possible: /storage/ostrava1/home/nadvorj1/skip-thoughts/
parser.add_argument("--skipthoughts_datadir", help="directory with precomputed Skip_thoughts embeddings (containing bi_skip.npz...)", type=str, default="")
args = parser.parse_args()

if args.wang == 1:
    train_filename = "data/anssel/wang/train-all.csv"
    val_filename = "data/anssel/wang/test.csv"
else:
    train_filename = "data/anssel/yodaqa/curatedv2-training.csv"
    val_filename = "data/anssel/yodaqa/curatedv2-val.csv"

st = emb.SkipThought(datadir=args.skipthoughts_datadir, uni_bi="combined")
N = st.N

e0, e1, y = load_set(train_filename, st, args.cache_dir)
e0t, e1t, yt = load_set(val_filename, st, args.cache_dir)

model = prep_model(N)
model.compile(loss={'score': 'binary_crossentropy'}, optimizer=Adam(lr=0.001))
hist = model.fit({'e0': e0, 'e1': e1, 'score': y},
                 batch_size=20, nb_epoch=2000,
                 validation_data={'e0': e0t, 'e1': e1t, 'score': yt})

# BUGFIX: the Train-set evaluation previously used the *validation* labels
# `yt`; it must use the train labels `y` loaded from train_filename.
ev.eval_anssel(model.predict({'e0': e0, 'e1': e1})['score'][:, 0],
               e0, y, 'Train')
ev.eval_anssel(model.predict({'e0': e0t, 'e1': e1t})['score'][:, 0],
               e0t, yt, 'Test')
# NOTE(review): whitespace-mangled chunk — runs `niter` training/eval
# repetitions, accumulating per-run MRR/MAP for train/val/test and printing a
# README summary row. The exact extent of the `for i in range(niter):` body
# (in particular whether the `rdata` pickle dump happens per-iteration, as its
# runid-based filename suggests) cannot be recovered from this formatting;
# left byte-identical — confirm against upstream history before reflowing.
mrr = [] mrrv = [] mrrt = [] mapt = [] for i in range(niter): runid = '%s-%x-%02d' % (modelname, h, i) print('RunID: %s (%s)' % (runid, ps)) model = anssel_train.train_and_eval(runid, module.prep_model, conf, glove, vocab, gr, s0, grv, s0v, do_eval=False) print('Predict&Eval (best val epoch)') ypred = model.predict(gr)['score'][:,0] ypredv = model.predict(grv)['score'][:,0] ypredt = model.predict(grt)['score'][:,0] mrr.append(ev.eval_anssel(ypred, s0, y, trainf)) mrrv.append(ev.eval_anssel(ypredv, s0v, yv, valf)) mrrt.append(ev.eval_anssel(ypredt, s0t, yt, testf)) mapt.append(ev_map(s0t, s1t, yt, ypredt, testf)) rdata = {'ps': ps, 'ypred': (ypred, ypredv, ypredt), 'mrr': (mrr, mrrv, mrrt), 'map': (None, None, mapt)} pickle.dump(rdata, open('%s-res.pickle' % (runid,), 'wb'), protocol=2) brr = stat(niter, trainf, 'MRR', mrr) brrv = stat(niter, valf, 'MRR', mrrv) bapt = stat(niter, testf, 'MAP', mapt) brrt = stat(niter, testf, 'MRR', mrrt) # README table format: print( '| % -24s | %.6f | %.6f | %.6f | %.6f | %s' % (modelname, np.mean(mrr), np.mean(mrrv), np.mean(mapt), np.mean(mrrt), '(defaults)' if not params else ' '.join(['``%s``' % (p,) for p in params])))
# NOTE(review): whitespace-mangled chunk beginning with the cut-off tail of a
# prep_model definition (the `layer=Activation(oact))` call opens outside this
# view); left byte-identical rather than guessing the missing function header.
# The __main__ part loads a dataset, trains the CNN with ranknet loss and
# MRR checkpointing, and prints Train/Val metrics.
layer=Activation(oact)) model.add_output(name='score', input='scoreS') return model if __name__ == "__main__": parser = argparse.ArgumentParser(description="Benchmark CNN on a bipartite ranking task (answer selection)") parser.add_argument("-N", help="GloVe dim", type=int, default=300) parser.add_argument("--wang", help="whether to run on Wang inst. of YodaQA dataset", type=int, default=0) parser.add_argument("--params", help="additional training parameters", type=str, default='') args = parser.parse_args() glove = emb.GloVe(N=args.N) if args.wang == 1: s0, s1, y, vocab, gr = load_set('data/anssel/wang/train-all.csv') s0t, s1t, yt, _, grt = load_set('data/anssel/wang/dev.csv', vocab) else: s0, s1, y, vocab, gr = load_set('data/anssel/yodaqa/curatedv1-training.csv') s0t, s1t, yt, _, grt = load_set('data/anssel/yodaqa/curatedv1-val.csv', vocab) kwargs = eval('dict(' + args.params + ')') model = prep_model(glove, vocab, oact='linear', **kwargs) model.compile(loss={'score': ranknet}, optimizer='adam') # for 'binary_crossentropy', drop the custom oact model.fit(gr, validation_data=grt, callbacks=[AnsSelCB(s0t, grt), ModelCheckpoint('weights-cnn-bestval.h5', save_best_only=True, monitor='mrr', mode='max')], batch_size=160, nb_epoch=8) model.save_weights('weights-cnn-final.h5', overwrite=True) ev.eval_anssel(model.predict(gr)['score'][:,0], s0, y, 'Train') ev.eval_anssel(model.predict(grt)['score'][:,0], s0t, yt, 'Val')
# NOTE(review): whitespace-mangled chunk beginning with the cut-off tail of a
# model-builder (the `layer=Dense(1, ...)` call opens outside this view); left
# byte-identical rather than guessing the missing function header. The
# __main__ part loads a dataset and trains/evaluates the kst1503 classifier.
layer=Dense(1, W_regularizer=l2(l2reg))) model.add_node(name='outS', input='out', layer=Activation('sigmoid')) model.add_output(name='score', input='outS') return model if __name__ == "__main__": parser = argparse.ArgumentParser(description="Benchmark kst1503 on binary classification / point ranking task (anssel-yodaqa)") parser.add_argument("-N", help="GloVe dim", type=int, default=300) parser.add_argument("--balance", help="whether to manually balance the dataset", type=int, default=1) parser.add_argument("--wang", help="whether to run on Wang inst. of YodaQA dataset", type=int, default=0) args = parser.parse_args() glove = emb.GloVe(N=args.N) if args.wang == 1: Xtrain, ytrain = load_set(glove, 'data/anssel/wang/train-all.csv', balance=(args.balance == 1)) Xtest, ytest = load_set(glove, 'data/anssel/wang/test.csv', subsample0=1) else: Xtrain, ytrain = load_set(glove, 'data/anssel/yodaqa/curatedv1-training.csv', balance=(args.balance == 1)) Xtest, ytest = load_set(glove, 'data/anssel/yodaqa/curatedv1-val.csv', subsample0=1) model = prep_model(glove) model.compile(loss={'score': 'binary_crossentropy'}, optimizer='adam') model.fit({'e0': Xtrain[0], 'e1': Xtrain[1], 'score': ytrain}, batch_size=20, nb_epoch=2000, validation_data={'e0': Xtest[0], 'e1': Xtest[1], 'score': ytest}) ev.eval_anssel(model.predict({'e0': Xtrain[0], 'e1': Xtrain[1]})['score'][:, 0], Xtrain[0], ytrain, 'Train') ev.eval_anssel(model.predict({'e0': Xtest[0], 'e1': Xtest[1]})['score'][:, 0], Xtest[0], ytest, 'Test')
# Load the chosen dataset, then train and evaluate the CNN ranker.
if args.wang == 1:
    s0, s1, y, vocab, gr = load_set('data/anssel/wang/train-all.csv', glove)
    s0t, s1t, yt, _, grt = load_set('data/anssel/wang/dev.csv', glove, vocab)
else:
    s0, s1, y, vocab, gr = load_set('data/anssel/yodaqa/curatedv1-training.csv', glove)
    s0t, s1t, yt, _, grt = load_set('data/anssel/yodaqa/curatedv1-val.csv', glove, vocab)

# NOTE(review): eval() on a CLI-supplied string — acceptable for a research
# script, unsafe on untrusted input.
kwargs = eval('dict(' + args.params + ')')
model = prep_model(glove, vocab, oact='linear', **kwargs)
# for 'binary_crossentropy', drop the custom oact
model.compile(loss={'score': ranknet}, optimizer='adam')

model.fit(gr, validation_data=grt,
          callbacks=[AnsSelCB(s0t, grt),
                     ModelCheckpoint('weights-cnn-bestval.h5',
                                     save_best_only=True,
                                     monitor='mrr', mode='max')],
          batch_size=160, nb_epoch=8)
model.save_weights('weights-cnn-final.h5', overwrite=True)

ev.eval_anssel(model.predict(gr)['score'][:, 0], s0, s1, y, 'Train')
ev.eval_anssel(model.predict(grt)['score'][:, 0], s0t, s1t, yt, 'Val')
# CLI entry: parse options, load the chosen dataset, train the CNN ranker
# with ranknet loss and MRR checkpointing, then print Train/Val metrics.
parser = argparse.ArgumentParser(description="Benchmark CNN on a bipartite ranking task (answer selection)")
parser.add_argument("-N", help="GloVe dim", type=int, default=300)
parser.add_argument("--wang", help="whether to run on Wang inst. of YodaQA dataset", type=int, default=0)
parser.add_argument("--params", help="additional training parameters", type=str, default="")
args = parser.parse_args()

glove = emb.GloVe(N=args.N)
if args.wang == 1:
    s0, s1, y, vocab, gr = load_set("data/anssel/wang/train-all.csv")
    s0t, s1t, yt, _, grt = load_set("data/anssel/wang/dev.csv", vocab)
else:
    s0, s1, y, vocab, gr = load_set("data/anssel/yodaqa/curatedv1-training.csv")
    s0t, s1t, yt, _, grt = load_set("data/anssel/yodaqa/curatedv1-val.csv", vocab)

# NOTE(review): eval() on a CLI-supplied string — acceptable for a research
# script, unsafe on untrusted input.
kwargs = eval("dict(" + args.params + ")")
model = prep_model(glove, vocab, oact="linear", **kwargs)
# for 'binary_crossentropy', drop the custom oact
model.compile(loss={"score": ranknet}, optimizer="adam")

model.fit(gr, validation_data=grt,
          callbacks=[AnsSelCB(s0t, grt),
                     ModelCheckpoint("weights-cnn-bestval.h5",
                                     save_best_only=True,
                                     monitor="mrr", mode="max")],
          batch_size=160, nb_epoch=8)
model.save_weights("weights-cnn-final.h5", overwrite=True)

ev.eval_anssel(model.predict(gr)["score"][:, 0], s0, s1, y, "Train")
ev.eval_anssel(model.predict(grt)["score"][:, 0], s0t, s1t, yt, "Val")