regressor = LinearRegression() # Used with default setting here. However, in the real DKPro system, its setting # were probably optmized by a CV gridsearch on the training data # TODO: this approach is brain dead, because it keeps reading features from files print "{:64s}\t".format("Features:"), print "\t".join(["{:>16s}".format(p[1]) for p in id_pairs]) for feat in feats: print "{:64s}\t".format(feat), for train_id, test_id in id_pairs: train_feat, train_scores = read_train_data(train_id, [feat]) regressor.fit(train_feat, train_scores) test_feat, test_scores = read_test_data(test_id, [feat]) sys_scores = regressor.predict(test_feat) sys_input = read_system_input(test_input_fnames[test_id]) postprocess(sys_input, sys_scores) if isinstance(train_id, tuple): train_id = "+".join(train_id) print "{:16.2f}\t".format(correlation(sys_scores, test_scores)), print
def score_stub(true, pred):
    """Return the correlation between *true* and the post-processed *pred*.

    The predictions are first passed through ``postprocess`` together with
    the module-level ``test_input_val``; the value it returns is then
    compared against *true* using ``correlation``.

    NOTE(review): the (true, pred) signature suggests this is meant as a
    scoring callback -- confirm against the caller.
    """
    processed = postprocess(test_input_val, pred)
    return correlation(true, processed)
# Build the submission archive: for every dataset triple, train on the
# combined 2012 train+test data, predict scores for the blind sts13 test set,
# write them to a file, and finally zip all score files together with the
# description file.
filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id in id_pairs:
    # combine 2012 training and test data
    X_sts12_train, y_sts12_train = read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = read_test_data(sts12_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test])

    regressor.fit(X_train, y_train)

    # predict scores for the blind test data
    X_test = read_blind_test_data(sts13_test_id, feats)
    y_test = regressor.predict(X_test)

    # The return value is ignored, so postprocess presumably adjusts y_test
    # in place -- TODO confirm.
    test_input = read_system_input(test_input_fnames[sts13_test_id])
    postprocess(test_input, y_test)

    fname = "{}/STScore.output.{}.txt".format(out_dir, sts13_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)

descr_fname = "{}/STScore-{}-{}.description.txt".format(
    out_dir, GROUP, APPROACH)
# Close the file explicitly so the description is flushed to disk before the
# external zip process reads it (do not rely on garbage collection).
with open(descr_fname, "w") as descr_file:
    descr_file.write(DESCRIPTION)
filenames.append(descr_fname)

# From here on `filenames` is a single space-separated string.
filenames = " ".join(filenames)

zipfile = "STScore-{}-{}.zip".format(GROUP, APPROACH)
# NOTE: the paths are interpolated into a shell command unquoted, so they
# must not contain spaces or shell metacharacters.
call("zip -rv {} {}".format(zipfile, filenames), shell=True)
# Build the submission archive: for every dataset tuple, train on the
# combined 2012 train+test and 2013 test data, predict scores for the blind
# sts14 test set, write them to a file, and finally zip all score files
# together with the description file.
# NOTE(review): `filenames` is expected to be an existing list at this point;
# its initialisation is not visible here -- confirm.
for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs:
    # combine 2012, 2013 training and test data
    X_sts12_train, y_sts12_train = ntnu_sts12.read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = ntnu_sts12.read_test_data(sts12_test_id, feats)
    X_sts13_test, y_sts13_test = sts13.read_test_data(sts13_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test, X_sts13_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test, y_sts13_test])

    regressor.fit(X_train, y_train)

    # predict scores for the blind test data
    X_test = read_blind_test_data(sts14_test_id, feats)
    y_test = regressor.predict(X_test)

    # The return value is ignored, so postprocess presumably adjusts y_test
    # in place -- TODO confirm.
    test_input = read_system_input(test_input_fnames[sts14_test_id])
    postprocess(test_input, y_test)

    fname = "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)

descr_fname = "{}/STS-en-{}-{}.description.txt".format(out_dir, GROUP, APPROACH)
# Close the file explicitly so the description is flushed to disk before the
# external zip process reads it (do not rely on garbage collection).
with open(descr_fname, "w") as descr_file:
    descr_file.write(DESCRIPTION)
filenames.append(descr_fname)

# From here on `filenames` is a single space-separated string.
filenames = " ".join(filenames)

zipfile = "STS-en-{}-{}.zip".format(GROUP, APPROACH)
# NOTE: the paths are interpolated into a shell command unquoted, so they
# must not contain spaces or shell metacharacters.
call("zip -rv {} {}".format(zipfile, filenames), shell=True)