#SVR(C=50, epsilon=0.2, gamma=0.02), #SVR(C=200, epsilon=0.5, gamma=0.02), #SVR(C=100, epsilon=0.2, gamma=0.02), #SVR(C=100, epsilon=0.2, gamma=0.02), #SVR(C=10, epsilon=0.5, gamma=0.02) #] regressors = [SVR() for i in range(5)] total = 0.0 for (train_id, test_id), regressor in zip(id_pairs, regressors): train_feat, train_scores = read_train_data(train_id, feats) regressor.fit(train_feat, train_scores) test_feat, test_scores = read_test_data(test_id, feats) sys_scores = regressor.predict(test_feat) sys_input = read_system_input(test_input_fnames[test_id]) postprocess(sys_input, sys_scores) if isinstance(train_id, tuple): train_id = "+".join(train_id) r = correlation(sys_scores, test_scores) total += r print "{:32s} {:32s} {:2.4f}".format(train_id, test_id, r) print "{:2.4f}".format(total / len(id_pairs))
regressor = LinearRegression() # Used with default setting here. However, in the real DKPro system, its setting # were probably optmized by a CV gridsearch on the training data # TODO: this approach is brain dead, because it keeps reading features from files print "{:64s}\t".format("Features:"), print "\t".join(["{:>16s}".format(p[1]) for p in id_pairs]) for feat in feats: print "{:64s}\t".format(feat), for train_id, test_id in id_pairs: train_feat, train_scores = read_train_data(train_id, [feat]) regressor.fit(train_feat, train_scores) test_feat, test_scores = read_test_data(test_id, [feat]) sys_scores = regressor.predict(test_feat) sys_input = read_system_input(test_input_fnames[test_id]) postprocess(sys_input, sys_scores) if isinstance(train_id, tuple): train_id = "+".join(train_id) print "{:16.2f}\t".format(correlation(sys_scores, test_scores)), print
#SVR(C=200, epsilon=0.5, gamma=0.02), #SVR(C=100, epsilon=0.2, gamma=0.02), #SVR(C=100, epsilon=0.2, gamma=0.02), #SVR(C=10, epsilon=0.5, gamma=0.02) #] regressors = [SVR(C=200) for i in range(5)] total = 0.0 for (train_id, test_id), regressor in zip(id_pairs, regressors): train_feat, train_scores = read_train_data(train_id, feats) regressor.fit(train_feat, train_scores) test_feat, test_scores = read_test_data(test_id, feats) sys_scores = regressor.predict(test_feat) sys_input = read_system_input(test_input_fnames[test_id]) postprocess(sys_input, sys_scores) if isinstance(train_id, tuple): train_id = "+".join(train_id) r = correlation(sys_scores, test_scores) total += r print "{:32s} {:32s} {:2.4f}".format(train_id, test_id, r) print "{:2.4f}".format(total / len(id_pairs))
"surprise.SMTnews": "SMTeuroparl", "surprise.OnWN": "MSRpar" } # read all training and test data X_train = {} y_train = {} X_test = {} y_test = {} sys_input = {} for train_id in regressors.keys(): X_train[train_id], y_train[train_id] = read_train_data(train_id, feats) for test_id in test_id2train_id.keys(): X_test[test_id], y_test[test_id] = read_test_data(test_id, feats) sys_input[test_id] = read_system_input(test_input_fnames[test_id]) # read headlines n = 25 print "number headline parts:", n hline_fnames = glob.glob("_npz_data/_hlines_part_???.npz")[:n] collect = [] for fn in hline_fnames: fh = open(fn) collect.append(np.load(fh)["X"]) fh.close() X_hline = np.vstack(collect) #X_hline = np.vstack([np.load(fn)["X"] for fn in hline_fnames])
("SMTeuroparl", ("SMTeuroparl", "surprise.SMTnews"), "SMT"), (train_ids, test_ids, "FNWN"), (train_ids, test_ids, "OnWN") ] feats = takelab_feats + takelab_lsa_feats + subsem_best_feats scores = [] X_sts12_train, y_sts12_train = read_train_data(train_ids, feats) X_sts12_test, y_sts12_test = read_test_data(test_ids, feats) X_train = vstack([X_sts12_train, X_sts12_test]) y_train = hstack([y_sts12_train, y_sts12_test]) test_input = [read_system_input(test_input_fnames[sts13_test_id]) for sts13_test_id in sts13.test_ids] test_input = concatenate(test_input) X_sts13, y_sts13 = sts13.read_test_data(sts13.test_ids, feats) X_sts13_val = X_sts13[0:X_sts13.shape[0]/2, :] X_sts13_held = X_sts13[X_sts13.shape[0]/2:, :] y_sts_val = y_sts13[0:len(y_sts13)/2] y_sts_held = y_sts13[len(y_sts13)/2:] test_input_val = test_input[0:len(test_input)/2]
# Features used for the final STScore system.
# feats = all_feats
feats = dkpro_feats + takelab_feats + takelab_lsa_feats + gleb_feats

# Learning algorithm in its default setting; refit once per id triple.
regressor = SVR()

out_dir = "STScore-{}-{}".format(GROUP, APPROACH)
if not exists(out_dir):
    mkdir(out_dir)

filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id in id_pairs:
    # Train on the pooled 2012 training and test data.
    X12_train, y12_train = read_train_data(sts12_train_id, feats)
    X12_test, y12_test = read_test_data(sts12_test_id, feats)
    X_train = np.vstack([X12_train, X12_test])
    y_train = np.hstack([y12_train, y12_test])
    regressor.fit(X_train, y_train)

    # Score the blind 2013 test set and post-process the predictions.
    X_test = read_blind_test_data(sts13_test_id, feats)
    y_test = regressor.predict(X_test)
    test_input = read_system_input(test_input_fnames[sts13_test_id])
    postprocess(test_input, y_test)

    # Write one output file per 2013 test id.
    fname = "{}/STScore.output.{}.txt".format(out_dir, sts13_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)
# Features used for the final STS-en system.
feats = takelab_feats + takelab_lsa_feats + subsem_best_feats

# Bagged SVR ensemble: 100 estimators, each fit on 80% of the samples and
# 80% of the features, trained with 3 parallel jobs.
regressor = BaggingRegressor(SVR(), verbose=1, n_jobs=3, n_estimators=100,
                             max_features=0.8, max_samples=0.8)

out_dir = "STS-en-{}-{}".format(GROUP, APPROACH)
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs:
    # Pool 2012 train/test and 2013 test data into one training set.
    X12_train, y12_train = ntnu_sts12.read_train_data(sts12_train_id, feats)
    X12_test, y12_test = ntnu_sts12.read_test_data(sts12_test_id, feats)
    X13_test, y13_test = sts13.read_test_data(sts13_test_id, feats)
    X_train = np.vstack([X12_train, X12_test, X13_test])
    y_train = np.hstack([y12_train, y12_test, y13_test])
    regressor.fit(X_train, y_train)

    # Score the blind 2014 test set and post-process the predictions.
    X_test = read_blind_test_data(sts14_test_id, feats)
    y_test = regressor.predict(X_test)
    test_input = read_system_input(test_input_fnames[sts14_test_id])
    postprocess(test_input, y_test)

    # Write one output file per 2014 test id.
    fname = "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)
# learning algorithm #regressor = SVR() regressor = LinearRegression() # TODO: this approach is brain dead, because it keeps reading features from files print "{:64s}\t".format("Features:"), print " ".join(["{:>16s}\t".format(p[2]) for p in id_pairs]) for feat in feats: print "{:64s}\t".format(feat), for sts12_train_id, sts12_test_id, sts13_test_id in id_pairs: # combine 2012 training and test data X_sts12_train, y_sts12_train = read_train_data(sts12_train_id, [feat]) X_sts12_test, y_sts12_test = read_test_data(sts12_test_id, [feat]) X_train = np.vstack([X_sts12_train, X_sts12_test]) y_train = np.hstack([y_sts12_train, y_sts12_test]) regressor.fit(X_train, y_train) X_test, y_test = sts13.read_test_data(sts13_test_id, [feat]) sys_scores = regressor.predict(X_test) sys_input = read_system_input(test_input_fnames[sts13_test_id]) postprocess(sys_input, sys_scores) print "{:16.2f}\t".format(correlation(sys_scores, y_test)), print
regressor = LinearRegression() # TODO: this approach is brain dead, because it keeps reading features from files print "{:64s}\t".format("Features:"), print " ".join(["{:>16s}\t".format(p[2]) for p in id_pairs]) for feat in feats: print "{:64s}\t".format(feat), for sts12_train_id, sts12_test_id, sts13_test_id in id_pairs: # combine 2012 training and test data X_sts12_train, y_sts12_train = read_train_data(sts12_train_id, [feat]) X_sts12_test, y_sts12_test = read_test_data(sts12_test_id, [feat]) X_train = np.vstack([X_sts12_train, X_sts12_test]) y_train = np.hstack([y_sts12_train, y_sts12_test]) regressor.fit(X_train, y_train) X_test, y_test = sts13.read_test_data(sts13_test_id, [feat]) sys_scores = regressor.predict(X_test) sys_input = read_system_input(test_input_fnames[sts13_test_id]) postprocess(sys_input, sys_scores) print "{:16.2f}\t".format(correlation(sys_scores, y_test)), print
verbose=1, n_jobs=3, n_estimators=100, max_features=0.8, max_samples=0.8) out_dir = "STS-en-{}-{}".format(GROUP, APPROACH) if not os.path.exists(out_dir): os.mkdir(out_dir) filenames = [] for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs: # combine 2012, 2013 training and test data X_sts12_train, y_sts12_train = ntnu_sts12.read_train_data( sts12_train_id, feats) X_sts12_test, y_sts12_test = ntnu_sts12.read_test_data( sts12_test_id, feats) X_sts13_test, y_sts13_test = sts13.read_test_data(sts13_test_id, feats) X_train = np.vstack([X_sts12_train, X_sts12_test, X_sts13_test]) y_train = np.hstack([y_sts12_train, y_sts12_test, y_sts13_test]) regressor.fit(X_train, y_train) X_test = read_blind_test_data(sts14_test_id, feats) y_test = regressor.predict(X_test) test_input = read_system_input(test_input_fnames[sts14_test_id]) postprocess(test_input, y_test) fname = "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id) write_scores(fname, y_test) filenames.append(fname)
# Hyperparameter grid with C pinned to 1; epsilon and gamma are searched.
FIXED_C_GRID = {
    'C': [1],
    'epsilon': [0.1, 0.3, 1, 3],
    'gamma': [0.0, 0.01, 0.03, 0.1, 0.3, 1]
}

# (train, test, eval-label) triples for the STS13 evaluation sets.
id_pairs = [
    (train_ids, test_ids, "headlines"),
    ("SMTeuroparl", ("SMTeuroparl", "surprise.SMTnews"), "SMT"),
    (train_ids, test_ids, "FNWN"),
    (train_ids, test_ids, "OnWN"),
]

feats = takelab_feats + takelab_lsa_feats + subsem_best_feats
scores = []

# Pool all STS12 training and test data into one training set.
X_sts12_train, y_sts12_train = read_train_data(train_ids, feats)
X_sts12_test, y_sts12_test = read_test_data(test_ids, feats)
X_train = vstack([X_sts12_train, X_sts12_test])
y_train = hstack([y_sts12_train, y_sts12_test])

# Concatenate the raw system input over every STS13 test set.
test_input = [
    read_system_input(test_input_fnames[sts13_test_id])
    for sts13_test_id in sts13.test_ids
]
test_input = concatenate(test_input)

# Begin the 50/50 validation split of the STS13 data
# (Python 2 integer division keeps the indices integral).
X_sts13, y_sts13 = sts13.read_test_data(sts13.test_ids, feats)
X_sts13_val = X_sts13[:X_sts13.shape[0] / 2, :]
X_sts13_held = X_sts13[X_sts13.shape[0] / 2:, :]
y_sts_val = y_sts13[:len(y_sts13) / 2]