Example #1
0
# learning algorithms, one per test set, where SVR settings result from
# grid-search.sh
#regressors = [
    #SVR(C=50,  epsilon=0.2, gamma=0.02),
    #SVR(C=200, epsilon=0.5, gamma=0.02),
    #SVR(C=100, epsilon=0.2, gamma=0.02),
    #SVR(C=100, epsilon=0.2, gamma=0.02),
    #SVR(C=10,  epsilon=0.5, gamma=0.02)
    #]

regressors = [SVR() for i in range(5)]

total = 0.0

for (train_id, test_id), regressor in zip(id_pairs, regressors):
    train_feat, train_scores = read_train_data(train_id, feats)
    regressor.fit(train_feat, train_scores)
    
    test_feat, test_scores = read_test_data(test_id, feats)
    sys_scores = regressor.predict(test_feat)
    
    sys_input = read_system_input(test_input_fnames[test_id])
    postprocess(sys_input,  sys_scores)
    
    if isinstance(train_id, tuple):
        train_id = "+".join(train_id)

    r = correlation(sys_scores, test_scores)
    total += r
    
    print "{:32s} {:32s} {:2.4f}".format(train_id, test_id, r)
Example #2
0
regressor = LinearRegression()
# Used with default setting here. However, in the real DKPro system, its setting
# were probably optmized by a CV gridsearch on the training data


# TODO: this approach is brain dead, because it keeps reading features from files

print "{:64s}\t".format("Features:"),

print "\t".join(["{:>16s}".format(p[1]) for p in id_pairs])


for feat in feats:
    print "{:64s}\t".format(feat),
    
    for train_id, test_id in id_pairs:
        train_feat, train_scores = read_train_data(train_id, [feat])
        regressor.fit(train_feat, train_scores)
        
        test_feat, test_scores = read_test_data(test_id, [feat])
        sys_scores = regressor.predict(test_feat)
        
        sys_input = read_system_input(test_input_fnames[test_id])
        postprocess(sys_input,  sys_scores)
        
        if isinstance(train_id, tuple):
            train_id = "+".join(train_id)
        
        print "{:16.2f}\t".format(correlation(sys_scores, test_scores)),
    print
    
Example #3
0
# grid-search.sh
#regressors = [
    #SVR(C=50,  epsilon=0.2, gamma=0.02),
    #SVR(C=200, epsilon=0.5, gamma=0.02),
    #SVR(C=100, epsilon=0.2, gamma=0.02),
    #SVR(C=100, epsilon=0.2, gamma=0.02),
    #SVR(C=10,  epsilon=0.5, gamma=0.02)
    #]

# five identical SVRs with C=200, one per train/test pair
regressors = [SVR(C=200) for i in range(5)]

# running sum of per-testset correlations
total = 0.0


# train and evaluate one regressor per (train, test) id pair
for (train_id, test_id), regressor in zip(id_pairs, regressors):
    train_feat, train_scores = read_train_data(train_id, feats)
    regressor.fit(train_feat, train_scores)

    test_feat, test_scores = read_test_data(test_id, feats)
    sys_scores = regressor.predict(test_feat)

    # postprocess mutates sys_scores in place
    # (exact semantics defined elsewhere -- confirm)
    sys_input = read_system_input(test_input_fnames[test_id])
    postprocess(sys_input,  sys_scores)

    # combined training sets are tuples of ids; join them for printing
    if isinstance(train_id, tuple):
        train_id = "+".join(train_id)

    r = correlation(sys_scores, test_scores)
    total += r

    print "{:32s} {:32s} {:2.4f}".format(train_id, test_id, r)
Example #4
0
    "MSRpar": "MSRpar",
    "MSRvid": "MSRvid",
    "SMTeuroparl": "SMTeuroparl",
    "surprise.SMTnews": "SMTeuroparl",
    "surprise.OnWN": "MSRpar"
}

# read all training and test data
X_train = {}
y_train = {}
X_test = {}
y_test = {}
sys_input = {}

for train_id in regressors.keys():
    X_train[train_id], y_train[train_id] = read_train_data(train_id, feats)

for test_id in test_id2train_id.keys():
    X_test[test_id], y_test[test_id] = read_test_data(test_id, feats)
    sys_input[test_id] = read_system_input(test_input_fnames[test_id])

# read headlines
n = 25
print "number headline parts:", n
hline_fnames = glob.glob("_npz_data/_hlines_part_???.npz")[:n]

collect = []
for fn in hline_fnames:
    fh = open(fn)
    collect.append(np.load(fh)["X"])
    fh.close()
Example #5
0
    "MSRpar":           "MSRpar",
    "MSRvid":           "MSRvid",
    "SMTeuroparl":      "SMTeuroparl",
    "surprise.SMTnews": "SMTeuroparl",
    "surprise.OnWN":    "MSRpar" 
}

# read all training and test data
X_train = {}
y_train = {}
X_test = {}
y_test = {}
sys_input = {}

for train_id in regressors.keys():
    X_train[train_id], y_train[train_id] = read_train_data(train_id, feats)
    
for test_id in test_id2train_id.keys():
    X_test[test_id], y_test[test_id] = read_test_data(test_id, feats)
    sys_input[test_id] = read_system_input(test_input_fnames[test_id])

# read headlines
n = 25
print "number headline parts:", n
hline_fnames = glob.glob("_npz_data/_hlines_part_???.npz")[:n] 

collect = []
for fn in hline_fnames:
    fh = open(fn)
    collect.append(np.load(fh)["X"])
    fh.close()
Example #6
0
     "headlines"),
    ("SMTeuroparl",
     ("SMTeuroparl", "surprise.SMTnews"),
     "SMT"),
    (train_ids,
     test_ids,
     "FNWN"),
    (train_ids,
     test_ids,
     "OnWN") ]

feats = takelab_feats + takelab_lsa_feats + subsem_best_feats

scores = []

# pool all STS12 training and test data into a single training set
X_sts12_train, y_sts12_train = read_train_data(train_ids, feats)
X_sts12_test, y_sts12_test = read_test_data(test_ids, feats)
X_train = vstack([X_sts12_train, X_sts12_test])
y_train = hstack([y_sts12_train, y_sts12_test])

# concatenated raw system input for all STS13 test sets
test_input = [read_system_input(test_input_fnames[sts13_test_id])
              for sts13_test_id in sts13.test_ids]
test_input = concatenate(test_input)

X_sts13, y_sts13 = sts13.read_test_data(sts13.test_ids, feats)

# split STS13 data into validation / held-out halves; use explicit
# floor division so the index stays an int on Python 3 (or with true
# division enabled) -- identical result under Python 2
X_sts13_val = X_sts13[0:X_sts13.shape[0] // 2, :]
X_sts13_held = X_sts13[X_sts13.shape[0] // 2:, :]

y_sts_val = y_sts13[0:len(y_sts13) // 2]
y_sts_held = y_sts13[len(y_sts13) // 2:]
Example #7
0
# features to be used
# feats = all_feats
feats = dkpro_feats + takelab_feats + takelab_lsa_feats + gleb_feats

# learning algorithm in default setting
regressor = SVR()

# output directory for the per-testset score files
out_dir = "STScore-{}-{}".format(GROUP, APPROACH)
if not exists(out_dir): mkdir(out_dir)

# collected output filenames, one per STS13 test set
filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id in id_pairs:
    # combine 2012 training and test data 
    X_sts12_train, y_sts12_train = read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = read_test_data(sts12_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test])

    # refit the shared regressor; fit() discards the previous model
    regressor.fit(X_train, y_train)

    # STS13 test data is blind (features only, no gold scores)
    X_test = read_blind_test_data(sts13_test_id, feats)
    y_test = regressor.predict(X_test)

    # post-process predictions in place before writing
    # (exact semantics defined elsewhere -- confirm)
    test_input = read_system_input(test_input_fnames[sts13_test_id])
    postprocess(test_input,  y_test)

    fname =  "{}/STScore.output.{}.txt".format(out_dir, sts13_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)
Example #8
0
# features to be used
feats = takelab_feats + takelab_lsa_feats + subsem_best_feats

# learning algorithm in default setting
regressor = BaggingRegressor(SVR(), verbose=1, n_jobs=3, n_estimators=100,
                             max_features=0.8, max_samples=0.8)


out_dir = "STS-en-{}-{}".format(GROUP, APPROACH)
if not os.path.exists(out_dir): os.mkdir(out_dir)

# collected output filenames, one per STS14 test set
filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs:
    # combine 2012, 2013 training and test data
    X_sts12_train, y_sts12_train = ntnu_sts12.read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = ntnu_sts12.read_test_data(sts12_test_id, feats)
    X_sts13_test, y_sts13_test = sts13.read_test_data(sts13_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test, X_sts13_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test, y_sts13_test])

    regressor.fit(X_train, y_train)

    # STS14 test data is blind (features only, no gold scores)
    X_test = read_blind_test_data(sts14_test_id, feats)
    y_test = regressor.predict(X_test)

    # post-process predictions in place before writing
    test_input = read_system_input(test_input_fnames[sts14_test_id])
    postprocess(test_input,  y_test)

    fname =  "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id)
    write_scores(fname, y_test)
    # record the filename: 'filenames' was initialized above but never
    # populated in the original; kept in sync with the parallel
    # STScore scripts, which append after write_scores
    filenames.append(fname)
Example #9
0
# features to be used
# feats = all_feats
feats = dkpro_feats + takelab_feats + takelab_lsa_feats + gleb_feats

# learning algorithm in default setting
regressor = SVR()

# output directory for the per-testset score files
out_dir = "STScore-{}-{}".format(GROUP, APPROACH)
if not exists(out_dir): mkdir(out_dir)

# collected output filenames, one per STS13 test set
filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id in id_pairs:
    # combine 2012 training and test data
    X_sts12_train, y_sts12_train = read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = read_test_data(sts12_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test])

    # refit the shared regressor; fit() discards the previous model
    regressor.fit(X_train, y_train)

    # STS13 test data is blind (features only, no gold scores)
    X_test = read_blind_test_data(sts13_test_id, feats)
    y_test = regressor.predict(X_test)

    # post-process predictions in place before writing
    # (exact semantics defined elsewhere -- confirm)
    test_input = read_system_input(test_input_fnames[sts13_test_id])
    postprocess(test_input, y_test)

    fname = "{}/STScore.output.{}.txt".format(out_dir, sts13_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)
Example #10
0
# learning algorithm
#regressor = SVR()
regressor = LinearRegression()

# TODO: this approach is brain dead, because it keeps reading features from files

# header row: right-aligned column label per STS13 test set
# (trailing comma = print without newline in Python 2)
print "{:64s}\t".format("Features:"),

print " ".join(["{:>16s}\t".format(p[2]) for p in id_pairs])

# one row per feature: correlation of a single-feature model on each pair
for feat in feats:
    print "{:64s}\t".format(feat),

    for sts12_train_id, sts12_test_id, sts13_test_id in id_pairs:
        # combine 2012 training and test data
        X_sts12_train, y_sts12_train = read_train_data(sts12_train_id, [feat])
        X_sts12_test, y_sts12_test = read_test_data(sts12_test_id, [feat])
        X_train = np.vstack([X_sts12_train, X_sts12_test])
        y_train = np.hstack([y_sts12_train, y_sts12_test])

        # refit the shared regressor; fit() discards the previous model
        regressor.fit(X_train, y_train)

        X_test, y_test = sts13.read_test_data(sts13_test_id, [feat])
        sys_scores = regressor.predict(X_test)

        # postprocess mutates sys_scores in place
        sys_input = read_system_input(test_input_fnames[sts13_test_id])
        postprocess(sys_input, sys_scores)

        print "{:16.2f}\t".format(correlation(sys_scores, y_test)),
    print
Example #11
0
# learning algorithm: plain linear regression per single feature
regressor = LinearRegression()


# TODO: this approach is brain dead, because it keeps reading features from files

# header row (trailing comma = print without newline in Python 2)
print "{:64s}\t".format("Features:"),

print " ".join(["{:>16s}\t".format(p[2]) for p in id_pairs])


# one row per feature: correlation of a single-feature model on each pair
for feat in feats:
    print "{:64s}\t".format(feat),

    for sts12_train_id, sts12_test_id, sts13_test_id in id_pairs:
        # combine 2012 training and test data 
        X_sts12_train, y_sts12_train = read_train_data(sts12_train_id, [feat])
        X_sts12_test, y_sts12_test = read_test_data(sts12_test_id, [feat])
        X_train = np.vstack([X_sts12_train, X_sts12_test])
        y_train = np.hstack([y_sts12_train, y_sts12_test])

        # refit the shared regressor; fit() discards the previous model
        regressor.fit(X_train, y_train)

        X_test, y_test = sts13.read_test_data(sts13_test_id, [feat])
        sys_scores = regressor.predict(X_test)

        # postprocess mutates sys_scores in place
        sys_input = read_system_input(test_input_fnames[sts13_test_id])
        postprocess(sys_input,  sys_scores)

        print "{:16.2f}\t".format(correlation(sys_scores, y_test)),
    print
    
Example #12
0
# learning algorithm in default setting
regressor = BaggingRegressor(SVR(),
                             verbose=1,
                             n_jobs=3,
                             n_estimators=100,
                             max_features=0.8,
                             max_samples=0.8)

out_dir = "STS-en-{}-{}".format(GROUP, APPROACH)
if not os.path.exists(out_dir): os.mkdir(out_dir)

# output filenames -- presumably populated after this excerpt ends; verify
filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs:
    # combine 2012, 2013 training and test data
    X_sts12_train, y_sts12_train = ntnu_sts12.read_train_data(
        sts12_train_id, feats)
    X_sts12_test, y_sts12_test = ntnu_sts12.read_test_data(
        sts12_test_id, feats)
    X_sts13_test, y_sts13_test = sts13.read_test_data(sts13_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test, X_sts13_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test, y_sts13_test])

    regressor.fit(X_train, y_train)

    # STS14 test data is blind (features only, no gold scores)
    X_test = read_blind_test_data(sts14_test_id, feats)
    y_test = regressor.predict(X_test)

    # post-process predictions in place before writing
    test_input = read_system_input(test_input_fnames[sts14_test_id])
    postprocess(test_input, y_test)

    # NOTE(review): the write_scores call for fname appears to follow
    # after this excerpt ends -- confirm
    fname = "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id)
Example #13
0
# SVR search grid with C fixed at 1; gamma 0.0 presumably selects the
# legacy sklearn "auto" behaviour -- confirm against the sklearn version
FIXED_C_GRID = {
    'C': [1],
    'epsilon': [0.1, 0.3, 1, 3],
    'gamma': [0.0, 0.01, 0.03, 0.1, 0.3, 1]
}

# (train ids, test ids, STS13 test-set label) triples
id_pairs = [(train_ids, test_ids, "headlines"),
            ("SMTeuroparl", ("SMTeuroparl", "surprise.SMTnews"), "SMT"),
            (train_ids, test_ids, "FNWN"), (train_ids, test_ids, "OnWN")]

feats = takelab_feats + takelab_lsa_feats + subsem_best_feats

scores = []

# pool all STS12 training and test data into a single training set
X_sts12_train, y_sts12_train = read_train_data(train_ids, feats)
X_sts12_test, y_sts12_test = read_test_data(test_ids, feats)
X_train = vstack([X_sts12_train, X_sts12_test])
y_train = hstack([y_sts12_train, y_sts12_test])

# concatenated raw system input for all STS13 test sets
test_input = [
    read_system_input(test_input_fnames[sts13_test_id])
    for sts13_test_id in sts13.test_ids
]
test_input = concatenate(test_input)

X_sts13, y_sts13 = sts13.read_test_data(sts13.test_ids, feats)

# first half = validation, second half = held-out; explicit floor
# division keeps the index an int on Python 3 (or with true division
# enabled) -- identical result under Python 2
X_sts13_val = X_sts13[0:X_sts13.shape[0] // 2, :]
X_sts13_held = X_sts13[X_sts13.shape[0] // 2:, :]