def _load_harness_multipack(hconf, test_data=False):
    """Load the multipack that matches the current harness configuration.

    The stripped features file is preferred when the features themselves
    are not actually needed (this only makes sense on the cluster, where
    evaluation is broken up into separate stages that can be fired on
    different nodes). Concretely, the stripped paths are used when the
    run is at the ``start`` or ``end`` cluster stage *and* the stripped
    features file exists; otherwise the regular paths are used.

    Parameters
    ----------
    hconf : harness configuration
        Must provide ``mpack_paths(test_data, stripped=...)`` and
        ``runcfg.stage``.
    test_data : bool, defaults to False
        If True, it's the test data we want; forwarded to
        ``hconf.mpack_paths``.

    Returns
    -------
    mpack : Multipack
        Multipack loaded from the harness' configuration.
    """
    lean_paths = hconf.mpack_paths(test_data, stripped=True)
    # the existence check is only reached for the start/end stages
    want_stripped = (
        hconf.runcfg.stage in [ClusterStage.end, ClusterStage.start]
        and fp.exists(lean_paths['features'])
    )
    if want_stripped:
        chosen = lean_paths
    else:
        chosen = hconf.mpack_paths(test_data, stripped=False)

    return load_multipack(
        chosen['edu_input'],
        chosen['pairings'],
        chosen['features'],
        chosen['vocab'],
        corpus_path=chosen.get('corpus', None),  # WIP
        verbose=True)
def _load_harness_multipack(hconf, test_data=False):
    """Load the multipack that matches the current harness configuration.

    The stripped features file is preferred when the features themselves
    are not actually needed (this only makes sense on the cluster, where
    evaluation is broken up into separate stages that can be fired on
    different nodes). Concretely, the stripped paths are used when the
    run is at the ``start`` or ``end`` cluster stage *and* the file at
    index 2 of the stripped paths exists; otherwise the regular paths
    are used.

    Parameters
    ----------
    hconf : harness configuration
        Must provide ``mpack_paths(test_data, stripped=...)`` and
        ``runcfg.stage``.
    test_data : bool
        Forwarded to ``hconf.mpack_paths``.

    Returns
    -------
    mpack : Multipack
    """
    lean_paths = hconf.mpack_paths(test_data, stripped=True)
    # the existence check is only reached for the start/end stages
    want_stripped = (
        hconf.runcfg.stage in [ClusterStage.end, ClusterStage.start]
        and fp.exists(lean_paths[2])
    )
    if want_stripped:
        chosen = lean_paths
    else:
        chosen = hconf.mpack_paths(test_data, stripped=False)

    # the first four entries are handed to load_multipack positionally
    return load_multipack(
        chosen[0],
        chosen[1],
        chosen[2],
        chosen[3],
        verbose=True)
def load_args_multipack(args):
    """Load the multipack specified via command line arguments.

    Parameters
    ----------
    args : parsed command line arguments
        Must expose ``edus``, ``pairings``, ``features``, ``vocab`` and
        ``quiet``.

    Returns
    -------
    Whatever ``load_multipack`` returns for those inputs; loading is
    verbose unless ``args.quiet`` is set.
    """
    inputs = (args.edus, args.pairings, args.features, args.vocab)
    return load_multipack(*inputs, verbose=not args.quiet)
def _do_corpus(hconf):
    """Run evaluation on a corpus.

    Loads the (non-test) multipack named by the harness configuration,
    then learns a model for every evaluation in ``hconf.evaluations``
    and finally calls ``_mk_dialogue_act_model``.

    Parameters
    ----------
    hconf : harness configuration
        Must provide ``mpack_paths(test_data=...)`` and ``evaluations``.
    """
    mpack_files = hconf.mpack_paths(test_data=False)
    first_input = mpack_files[0]
    if not fp.exists(first_input):
        # nothing to load at the first path
        exit_ungathered()

    pack = load_multipack(first_input,
                          mpack_files[1],
                          mpack_files[2],
                          mpack_files[3],
                          verbose=True)
    data_conf = DataConfig(pack=pack, folds=None)

    # (re)learn combined model (we shouldn't assume
    # it's in some scratch directory)
    for eval_conf in hconf.evaluations:
        learn(hconf, eval_conf, data_conf, None)
    _mk_dialogue_act_model(hconf)
def decode(lconf, evaluations):
    """Decode the input using all the model/learner combos we know.

    Parameters
    ----------
    lconf : configuration object
        Must provide ``mpack_paths(test_data=...)`` and
        ``runcfg.n_jobs``; also passed to ``minicorpus_path`` and
        ``attelo_result_path``.
    evaluations : iterable
        Evaluation configurations; one set of decoding jobs is created
        per item, and one concatenated output is written per item.
    """
    # `evaluations` is traversed twice below (once to build the jobs,
    # once to concatenate the outputs); snapshot it so that a one-shot
    # iterator is not already exhausted when the second pass starts
    evaluations = list(evaluations)

    fpath = minicorpus_path(lconf) + '.relations.sparse'
    # only the vocabulary comes from the regular multipack paths
    vocab_path = lconf.mpack_paths(test_data=False)[3]
    mpack = load_multipack(fpath + '.edu_input',
                           fpath + '.pairings',
                           fpath,
                           vocab_path)

    decoder_jobs = concat_i(_get_decoding_jobs(mpack, lconf, econf)
                            for econf in evaluations)
    Parallel(n_jobs=lconf.runcfg.n_jobs, verbose=True)(decoder_jobs)

    for econf in evaluations:
        output_path = attelo_result_path(lconf, econf)
        ath_parse.concatenate_outputs(mpack, output_path)
def _do_corpus(hconf):
    """Run evaluation on a corpus.

    Loads the (non-test) multipack named by the harness configuration,
    then learns a model for every evaluation in ``hconf.evaluations``
    and finally calls ``_mk_dialogue_act_model``.

    Parameters
    ----------
    hconf : harness configuration
        Must provide ``mpack_paths(test_data=...)`` and ``evaluations``.
    """
    mpack_files = hconf.mpack_paths(test_data=False)
    edu_input = mpack_files['edu_input']
    if not fp.exists(edu_input):
        # the EDU input file is missing
        exit_ungathered()

    pack = load_multipack(edu_input,
                          mpack_files['pairings'],
                          mpack_files['features'],
                          mpack_files['vocab'],
                          verbose=True)
    data_conf = DataConfig(pack=pack, folds=None)

    # (re)learn combined model (we shouldn't assume
    # it's in some scratch directory)
    for eval_conf in hconf.evaluations:
        learn(hconf, eval_conf, data_conf, None)
    _mk_dialogue_act_model(hconf)
def decode(lconf, evaluations):
    """Decode the input using all the model/learner combos we know.

    Parameters
    ----------
    lconf : configuration object
        Must provide ``mpack_paths(test_data=...)`` and
        ``runcfg.n_jobs``; also passed to ``minicorpus_path`` and
        ``attelo_result_path``.
    evaluations : iterable
        Evaluation configurations; one set of decoding jobs is created
        per item, and one concatenated output is written per item. Any
        iterable is accepted, including one-shot iterators.
    """
    # `evaluations` is traversed twice below (once to build the jobs,
    # once to concatenate the outputs); snapshot it so that a one-shot
    # iterator is not already exhausted when the second pass starts
    evaluations = list(evaluations)

    fpath = minicorpus_path(lconf) + '.relations.sparse'
    # only the vocabulary comes from the regular multipack paths
    vocab_path = lconf.mpack_paths(test_data=False)['vocab']
    mpack = load_multipack(fpath + '.edu_input',
                           fpath + '.pairings',
                           fpath,
                           vocab_path)

    decoder_jobs = concat_i(
        _get_decoding_jobs(mpack, lconf, econf)
        for econf in evaluations)
    Parallel(n_jobs=lconf.runcfg.n_jobs, verbose=True)(decoder_jobs)

    for econf in evaluations:
        output_path = attelo_result_path(lconf, econf)
        ath_parse.concatenate_outputs(mpack, output_path)
from attelo.score import (score_edges) from attelo.table import (DataPack) from attelo.util import (mk_rng, Team) # pylint: disable=invalid-name WORKING_DIR = 'doc/example-corpus' PREFIX = fp.join(WORKING_DIR, 'tiny') TMP_OUTPUT = '/tmp/mini-evaluate' if not fp.exists(TMP_OUTPUT): os.makedirs(TMP_OUTPUT) # load the data mpack = load_multipack(PREFIX + '.edus', PREFIX + '.pairings', PREFIX + '.features.sparse', PREFIX + '.features.sparse.vocab', verbose=True) # divide the dataset into folds num_folds = min((10, len(mpack))) fold_dict = make_n_fold(mpack, num_folds, mk_rng()) # select a decoder and a learner team decoder = MstDecoder(root_strategy=MstRootStrategy.fake_root) learners = Team(attach=SklearnAttachClassifier(LogisticRegression()), label=SklearnLabelClassifier(LogisticRegression())) # put them together as a parser parser = JointPipeline(learner_attach=learners.attach, learner_label=learners.label,
def main(args): """ Load tiny corpus, extract features and labels, and learn the presence of semantic relation Parameters ---------- args : not used yet (but necessary) """ # load the data into a multipack mpackTrain = load_multipack(TRAIN_PREFIX + '.relations.sparse.edu_input', TRAIN_PREFIX + '.relations.sparse.pairings', TRAIN_PREFIX + '.relations.sparse', TRAIN_PREFIX + '.relations.sparse.vocab', verbose=True) mpackTest = load_multipack(TEST_PREFIX + '.relations.sparse.edu_input', TEST_PREFIX + '.relations.sparse.pairings', TEST_PREFIX + '.relations.sparse', TEST_PREFIX + '.relations.sparse.vocab', verbose=True) modelRidge = linear_model.Ridge modelLogistic = linear_model.LogisticRegression modelLinear = linear_model.LinearRegression modelRandomForestRegressor = ensemble.RandomForestRegressor modelGradientBoost = ensemble.GradientBoostingClassifier modelGradientBoostRegressor = ensemble.GradientBoostingRegressor decoderMST = MstDecoder(MstRootStrategy.fake_root, use_prob=False) decoder = ILPDecoder() label = collectTargetLabels(mpackTest) LAM = np.logspace(np.log10(6e-2), np.log10(.5), num=5) for lam in LAM: print("------------------------------------------------------") print("Lambda =", lam) # create learner learner = Learner(decoder=decoderMST, decoder2=decoder, Lambda=lam) s, w, dualGap = learner.fit(mpackTrain) arbre = predict(learner, mpackTrain.values()[5], w) print() print(arbre.graph[0]) dpack2 = learner.multiply(mpackTrain.values()[5], attach=1 * (mpackTrain.values()[5].target != 2)) targetpack = learner.decoder_.decode(dpack2) target = targetpack.graph[0] print(target) print(arbre.target) plt.plot(s, label="score") plt.plot(dualGap, label="duality gap") plt.plot(np.asarray(s) - np.asarray(dualGap), label="difference") plt.title("minimization") plt.xlabel('k') plt.ylabel('score') plt.grid(True) plt.legend(loc=1, borderaxespad=0.) 
plt.savefig(FIGURES_FOLDER + 'test'+str(round(lam, 5))+'.pdf') plt.close() np.savetxt('w_'+str(round(lam, 5))+'.out', w, delimiter=',') # test = np.loadtxt('test.out', delimiter=',') print() res = learner.predict_score(mpackTest) pred = [] for graph in res: pred.extend(graph.graph[0] != 2) vote = [] vote.append(1 * pred) print("f1 score =", round(metrics.f1_score(label, 1 * pred), 2)) P = metrics.precision_score(label, pred) R = metrics.recall_score(label, pred) print("Precision =", round(P, 2)) print("Recall =", round(R, 2)) print() message("IRIT-STAC", "Travail termine") exit(1) """ learner = SklearnAttachClassifier(modelLogistic()) decoder = MstDecoder(MstRootStrategy.fake_root) parser1 = AttachPipeline(learner=learner, decoder=decoder) # train the parser train_dpacks = mpackTrain.values() train_targets = [x.target for x in train_dpacks] parser1.fit(train_dpacks, train_targets) # now run on a test pack dpack = [] for i in range(len(mpackTest.values())): dpack.extend(parser1.transform(mpackTest.values()[i])) # print_results(dpack)""" """ # Clean features with high correlation or low variance featsTrain_filtered_1, featsTest_filtered_1 = cleanFeatures(featsTrain, featsTest, fileName=FIGURES_FOLDER + 'minFilter.pdf') featsTrain_filtered_2, featsTest_filtered_2 = cleanFeatures(featsTrain, featsTest, maxCorrelation=0.98, minVariance=0.005, fileName=FIGURES_FOLDER + 'maxFilter.pdf') featsTrain_filtered_3, featsTest_filtered_3 = cleanFeatures(featsTrain, featsTest, maxCorrelation=0.97, fileName=FIGURES_FOLDER + 'filter.pdf')""" # getAUPR(mpackTrain, mpackTest, modelRidge, 1e3, 1e4, 50, fileName='noFilter_ridge_2_50pts.pdf', folder=FIGURES_FOLDER) # exit(1) # getAUPR(mpackTrain, mpackTest, modelLogistic, 1e-3, 1e-1, 50, fileName='noFilter_3_50pts.pdf', folder=FIGURES_FOLDER, param='C', proba=True) # exit(1) ######################################################################## with Torpor("Learning and Scoring Linear Regression Ridge"): learner = 
Learner(learner=modelRidge, alpha=3500) learner.fit(mpackTrain) scores = learner.predict_score(mpackTest) label = collectTargetLabels(mpackTest) precision, recall, seuil = metrics.precision_recall_curve(label, scores) f1_score = [0] for s in seuil: f1 = metrics.f1_score(label, 1 * (scores > s)) f1_score.append(f1) precision = [x for (y, x) in sorted(zip(recall, precision))] f1_score = [x for (y, x) in sorted(zip(recall, f1_score))] recall.sort() # plt.plot(recall, precision) area = metrics.auc(recall, precision) print("Area =", area) plt.plot(recall, precision, label="PR") plt.plot(recall, f1_score, label="F1 score") precision = np.asarray(precision) recall = np.asarray(recall) print("max f1_score =", max(f1_score)) print("max precision =", precision[np.argwhere(f1_score == max(f1_score))]) print("max recall =", recall[np.argwhere(f1_score == max(f1_score))]) res = learner.predict_score(mpackTest, decoder=decoder) pred = [] for graph in res: pred.extend(graph.graph[0] != 2) vote = [] vote.append(1 * pred) plt.title("Linear Regression Ridge AUPR=0.41\n F1 score with decoder :" + str( round(metrics.f1_score(label, 1 * pred), 2))) plt.xlabel('Recall') plt.ylabel('Precision') plt.grid(True) P = metrics.precision_score(label, pred) R = metrics.recall_score(label, pred) R_not_decoded = min(recall, key=lambda x: abs(x - R)) P_not_decoded = max(precision[np.argwhere(recall == R_not_decoded)]) print("Precision =", P) print("Recall =", R) print("Precision before decoding =", P_not_decoded) plt.plot(R, P, 'ro') plt.annotate('with decoder', xy=(R + .005, P + .005), xytext=(.8, .7), arrowprops=dict(facecolor='black', shrink=0.02)) plt.legend(loc=1, borderaxespad=0.) 
plt.savefig(FIGURES_FOLDER + "Ridge.pdf") plt.close() ########################################################################################## with Torpor("Learning and Scoring Linear Regression"): learner._instantiateLearner(modelLinear) learner.fit(mpackTrain) scores = learner.predict_score(mpackTest) label = collectTargetLabels(mpackTest) precision, recall, seuil = metrics.precision_recall_curve(label, scores) f1_score = [0] for s in seuil: f1_score.extend([metrics.f1_score(label, 1 * (scores > s))]) precision = [x for (y, x) in sorted(zip(recall, precision))] f1_score = [x for (y, x) in sorted(zip(recall, f1_score))] recall.sort() # plt.plot(recall, precision) area = metrics.auc(recall, precision) print("Area =", area) plt.plot(recall, precision, label="PR") plt.plot(recall, f1_score, label="F1 score") precision = np.asarray(precision) recall = np.asarray(recall) print("max f1_score =", max(f1_score)) print("max precision =", precision[np.argwhere(f1_score == max(f1_score))]) print("max recall =", recall[np.argwhere(f1_score == max(f1_score))]) res = learner.predict_score(mpackTest, decoder=decoder) pred = [] for graph in res: pred.extend(graph.graph[0] != 2) vote.append(1 * pred) plt.title( "Linear Regression AUPR=0.33\n F1 score with decoder :" + str(round(metrics.f1_score(label, 1 * pred), 2))) plt.xlabel('Recall') plt.ylabel('Precision') plt.grid(True) P = metrics.precision_score(label, pred) R = metrics.recall_score(label, pred) R_not_decoded = min(recall, key=lambda x: abs(x - R)) P_not_decoded = max(precision[np.argwhere(recall == R_not_decoded)]) print("Precision =", P) print("Recall =", R) print("Precision before decoding =", P_not_decoded) plt.plot(R, P, 'ro') plt.annotate('with decoder', xy=(R + .005, P + .005), xytext=(.8, .7), arrowprops=dict(facecolor='black', shrink=0.02)) plt.legend(loc=1, borderaxespad=0.) 
plt.savefig(FIGURES_FOLDER + "RegLinear.pdf") plt.close() ############################################################################################# with Torpor("Learning and Scoring Logistic Regression"): learner = Learner(learner=modelLogistic) learner.fit(mpackTrain) scores = learner.predict_score(mpackTest, proba=True) label = collectTargetLabels(mpackTest) precision, recall, seuil = metrics.precision_recall_curve(label, scores) f1_score = [0] for s in seuil: f1_score.extend([metrics.f1_score(label, 1 * (scores > s))]) precision = [x for (y, x) in sorted(zip(recall, precision))] f1_score = [x for (y, x) in sorted(zip(recall, f1_score))] recall.sort() # plt.plot(recall, precision) area = metrics.auc(recall, precision) print("Area =", area) plt.plot(recall, precision, label="PR") plt.plot(recall, f1_score, label="F1 score") precision = np.asarray(precision) recall = np.asarray(recall) print("max f1_score =", max(f1_score)) print("max precision =", precision[np.argwhere(f1_score == max(f1_score))]) print("max recall =", recall[np.argwhere(f1_score == max(f1_score))]) res = learner.predict_score(mpackTest, decoder=decoderMST, proba=True) pred = [] for graph in res: pred.extend(graph.graph[0] != 2) vote.append(1 * pred) plt.title( "Logistic Regression AUPR=0.33\n F1 score with decoder :" + str( round(metrics.f1_score(label, 1 * pred), 2))) plt.xlabel('Recall') plt.ylabel('Precision') plt.grid(True) P = metrics.precision_score(label, pred) R = metrics.recall_score(label, pred) R_not_decoded = min(recall, key=lambda x: abs(x - R)) P_not_decoded = max(precision[np.argwhere(recall == R_not_decoded)]) print("Precision =", P) print("Recall =", R) print("Precision before decoding =", P_not_decoded) plt.plot(R, P, 'ro') plt.annotate('with decoder', xy=(R + .005, P + .005), xytext=(.8, .7), arrowprops=dict(facecolor='black', shrink=0.02)) plt.legend(loc=1, borderaxespad=0.) 
plt.savefig(FIGURES_FOLDER + "RegLogistic.pdf") plt.close() ######################################################################################### with Torpor("Learning and Scoring Gradient Boost Regression"): learner._instantiateLearner(modelGradientBoostRegressor) learner.fit(mpackTrain) scores = learner.predict_score(mpackTest, dense=True) precision, recall, seuil = metrics.precision_recall_curve(label, scores) f1_score = [0] for s in seuil: f1_score.extend([metrics.f1_score(label, 1 * (scores > s))]) precision = [x for (y, x) in sorted(zip(recall, precision))] f1_score = [x for (y, x) in sorted(zip(recall, f1_score))] recall.sort() # plt.plot(recall, precision) area = metrics.auc(recall, precision) print("Area =", area) plt.plot(recall, precision, label="PR") plt.plot(recall, f1_score, label="F1 score") precision = np.asarray(precision) recall = np.asarray(recall) print("max f1_score =", max(f1_score)) print("max precision =", precision[np.argwhere(f1_score == max(f1_score))]) print("max recall =", recall[np.argwhere(f1_score == max(f1_score))]) res = learner.predict_score(mpackTest, decoder=decoder, dense=True) pred = [] for graph in res: pred.extend(graph.graph[0] != 2) vote.append(1 * pred) plt.title("Gradient Boost Regression AUPR=0.69\n F1 score with decoder :" + str( round(metrics.f1_score(label, 1 * pred), 2))) plt.xlabel('Recall') plt.ylabel('Precision') plt.grid(True) P = metrics.precision_score(label, pred) R = metrics.recall_score(label, pred) R_not_decoded = min(recall, key=lambda x: abs(x - R)) P_not_decoded = max(precision[np.argwhere(recall == R_not_decoded)]) print("Precision =", P) print("Recall =", R) print("Precision before decoding =", P_not_decoded) plt.plot(R, P, 'ro') plt.annotate('with decoder', xy=(R + .005, P + .005), xytext=(.8, .7), arrowprops=dict(facecolor='black', shrink=0.02)) plt.legend(loc=1, borderaxespad=0.) 
plt.savefig(FIGURES_FOLDER + "GradientBoost.pdf") plt.close() #################################################################################################### with Torpor("Learning and Scoring Random Forest Regression"): learner = Learner(learner=modelRandomForestRegressor, n_estimators=20) learner.fit(mpackTrain) scores = learner.predict_score(mpackTest, dense=True) label = collectTargetLabels(mpackTest) precision, recall, seuil = metrics.precision_recall_curve(label, scores) print("len(seuil) =", len(seuil)) print("seuil =", seuil) f1_score = [0] for s in seuil: f1_score.append(metrics.f1_score(label, 1 * (scores > s))) precision = [x for (y, x) in sorted(zip(recall, precision))] f1_score = [x for (y, x) in sorted(zip(recall, f1_score))] recall.sort() # plt.plot(recall, precision) area = metrics.auc(recall, precision) print("Area =", area) plt.plot(recall, precision, label="PR") plt.plot(recall, f1_score, label="F1 score") precision = np.asarray(precision) recall = np.asarray(recall) print("max f1_score =", max(f1_score)) print("max precision =", precision[np.argwhere(f1_score == max(f1_score))]) print("max recall =", recall[np.argwhere(f1_score == max(f1_score))]) res = learner.predict_score(mpackTest, decoder=decoderMST, dense=True) pred = [] for graph in res: pred.extend(graph.graph[0] != 2) vote.append(1 * pred) plt.title("Random Forest Regression AUPR=0.72\n F1 score with decoder :" + str( round(metrics.f1_score(label, 1 * pred), 2))) plt.xlabel('Recall') plt.ylabel('Precision') plt.grid(True) P = metrics.precision_score(label, pred) R = metrics.recall_score(label, pred) R_not_decoded = min(recall, key=lambda x: abs(x - R)) P_not_decoded = max(precision[np.argwhere(recall == R_not_decoded)]) print("Precision =", P) print("Recall =", R) print("Precision before decoding =", P_not_decoded) plt.plot(R, P, 'ro') plt.annotate('with decoder', xy=(R + .005, P + .005), xytext=(.8, .7), arrowprops=dict(facecolor='black', shrink=0.02)) plt.legend(loc=1, 
borderaxespad=0.) plt.savefig(FIGURES_FOLDER + "RandomForest.pdf") plt.close() # print("res =", res) pred = 1 * (np.mean(vote, axis=0) >= 0.4) print("Vote majorite\nf1_score =", round(metrics.f1_score(label, 1 * pred), 2)) P = metrics.precision_score(label, pred) R = metrics.recall_score(label, pred) print("Precision =", P) print("Recall =", R) message("IRIT-STAC", "Travail termine") """