Example No. 1
from sklearn.linear_model import LinearRegression

from sts.io import read_system_input
from sts.score import correlation

regressor = LinearRegression()
# Used with default settings here. However, in the real DKPro system, its
# settings were probably optimized by a CV grid search on the training data.


# TODO: this approach is brain dead, because it keeps reading features from files

print "{:64s}\t".format("Features:"),

print "\t".join(["{:>16s}".format(p[1]) for p in id_pairs])


for feat in feats:
    print "{:64s}\t".format(feat),
    
    for train_id, test_id in id_pairs:
        train_feat, train_scores = read_train_data(train_id, [feat])
        regressor.fit(train_feat, train_scores)
        
        test_feat, test_scores = read_test_data(test_id, [feat])
        sys_scores = regressor.predict(test_feat)
        
        sys_input = read_system_input(test_input_fnames[test_id])
        postprocess(sys_input, sys_scores)
        
        if isinstance(train_id, tuple):
            train_id = "+".join(train_id)
        
        print "{:16.2f}\t".format(correlation(sys_scores, test_scores)),
    print
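
One way to address the TODO above is sketched below (an assumption-laden rewrite, not part of the original run): load all features for each dataset once and slice out single-feature columns instead of re-reading files on every iteration. It assumes read_train_data/read_test_data return a feature matrix whose columns follow the order of the feats list; postprocessing, correlation and printing would stay as in the loop above.

train_cache = {train_id: read_train_data(train_id, feats)
               for train_id, _ in id_pairs}
test_cache = {test_id: read_test_data(test_id, feats)
              for _, test_id in id_pairs}

for i, feat in enumerate(feats):
    for train_id, test_id in id_pairs:
        train_feat, train_scores = train_cache[train_id]
        test_feat, test_scores = test_cache[test_id]
        # fit and predict on feature column i only; the rest of the loop
        # (postprocessing, correlation, printing) is unchanged
        regressor.fit(train_feat[:, i:i + 1], train_scores)
        sys_scores = regressor.predict(test_feat[:, i:i + 1])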
    
Example No. 2
import numpy as np
from sklearn.svm import SVR

from sts.io import read_system_input, read_gold_standard
from sts.score import correlation


train = np.load("_npz_data/_STS2012.train.MSRpar.npz")
clf = SVR(kernel='rbf', C=50, epsilon=.2, gamma=.02)
print clf

clf.fit(train["X"], train["y"])
#print clf.score(train["X"], train["y"])
    
test = np.load("_npz_data/_STS2012.test.MSRpar.npz")
#print clf.score(test["X"], test["y"])
sys_scores = clf.predict(test["X"])

# postprocess
sys_inp = read_system_input("../../data/STS2012-test/STS.input.MSRpar.txt")
sys_scores[sys_inp["s1"] == sys_inp["s2"]] = 5.0 
sys_scores[sys_scores > 5.0] = 5.0 
sys_scores[sys_scores < 0.0] = 0.0 

# compute correlation score
gold_scores = read_gold_standard("../../data/STS2012-test/STS.gs.MSRpar.txt")["gold"]
print correlation(gold_scores, sys_scores)


#from sklearn.cross_validation import KFold
#from sklearn.grid_search import GridSearchCV
    
#C_range = 10.0 ** np.arange(-2, 9)
#gamma_range = 10.0 ** np.arange(-5, 4)
#param_grid = dict(gamma=gamma_range, C=C_range)
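
A hedged sketch of the grid search hinted at by the commented-out lines above, using the old sklearn.grid_search API those comments reference; cv=5 and the fixed epsilon are assumptions rather than settings from the original system.

from sklearn.grid_search import GridSearchCV

C_range = 10.0 ** np.arange(-2, 9)
gamma_range = 10.0 ** np.arange(-5, 4)
param_grid = dict(gamma=gamma_range, C=C_range)

search = GridSearchCV(SVR(kernel='rbf', epsilon=.2), param_grid, cv=5)
search.fit(train["X"], train["y"])
# search.best_estimator_ could then replace the hand-tuned SVR above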
Example No. 3
     test_ids,
     "FNWN"),
    (train_ids,
     test_ids,
     "OnWN") ]

feats = takelab_feats + takelab_lsa_feats + subsem_best_feats

scores = []

X_sts12_train, y_sts12_train = read_train_data(train_ids, feats)
X_sts12_test, y_sts12_test = read_test_data(test_ids, feats)
X_train = vstack([X_sts12_train, X_sts12_test])
y_train = hstack([y_sts12_train, y_sts12_test])

test_input = [read_system_input(test_input_fnames[sts13_test_id])
              for sts13_test_id in sts13.test_ids]
test_input = concatenate(test_input)

X_sts13, y_sts13 = sts13.read_test_data(sts13.test_ids, feats)

# split the STS13 test data in half: first half for validation, second half held out
X_sts13_val = X_sts13[:X_sts13.shape[0] // 2, :]
X_sts13_held = X_sts13[X_sts13.shape[0] // 2:, :]

y_sts_val = y_sts13[:len(y_sts13) // 2]
y_sts_held = y_sts13[len(y_sts13) // 2:]

test_input_val = test_input[:len(test_input) // 2]
test_input_held = test_input[len(test_input) // 2:]

n_train = len(y_train)
n_test = len(y_sts_val)
Example No. 4
filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs:
    # combine 2012, 2013 training and test data
    X_sts12_train, y_sts12_train = ntnu_sts12.read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = ntnu_sts12.read_test_data(sts12_test_id, feats)
    X_sts13_test, y_sts13_test = sts13.read_test_data(sts13_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test, X_sts13_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test, y_sts13_test])

    regressor.fit(X_train, y_train)

    X_test = read_blind_test_data(sts14_test_id, feats)
    y_test = regressor.predict(X_test)

    test_input = read_system_input(test_input_fnames[sts14_test_id])
    postprocess(test_input, y_test)

    fname = "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)

descr_fname = "{}/STS-en-{}-{}.description.txt".format(out_dir, GROUP, APPROACH)
with open(descr_fname, "w") as f:
    f.write(DESCRIPTION)
filenames.append(descr_fname)

filenames = " ".join(filenames)

zipfile = "STS-en-{}-{}.zip".format(GROUP, APPROACH)

call("zip -rv {} {}".format(zipfile, filenames),
Example No. 5
import numpy as np
from sklearn.svm import SVR

from sts.io import read_system_input, read_gold_standard
from sts.score import correlation

train = np.load("_npz_data/_STS2012.train.MSRpar.npz")
clf = SVR(kernel='rbf', C=50, epsilon=.2, gamma=.02)
print clf

clf.fit(train["X"], train["y"])
#print clf.score(train["X"], train["y"])

test = np.load("_npz_data/_STS2012.test.MSRpar.npz")
#print clf.score(test["X"], test["y"])
sys_scores = clf.predict(test["X"])

# postprocess
sys_inp = read_system_input("../../data/STS2012-test/STS.input.MSRpar.txt")
sys_scores[sys_inp["s1"] == sys_inp["s2"]] = 5.0
sys_scores[sys_scores > 5.0] = 5.0
sys_scores[sys_scores < 0.0] = 0.0

# compute correlation score
gold_scores = read_gold_standard(
    "../../data/STS2012-test/STS.gs.MSRpar.txt")["gold"]
print correlation(gold_scores, sys_scores)

#from sklearn.cross_validation import KFold
#from sklearn.grid_search import GridSearchCV

#C_range = 10.0 ** np.arange(-2, 9)
#gamma_range = 10.0 ** np.arange(-5, 4)
#param_grid = dict(gamma=gamma_range, C=C_range)
Example No. 6
filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id in id_pairs:
    # combine 2012 training and test data
    X_sts12_train, y_sts12_train = read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = read_test_data(sts12_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test])

    regressor.fit(X_train, y_train)

    X_test = read_blind_test_data(sts13_test_id, feats)
    y_test = regressor.predict(X_test)

    test_input = read_system_input(test_input_fnames[sts13_test_id])
    postprocess(test_input, y_test)

    fname = "{}/STScore.output.{}.txt".format(out_dir, sts13_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)

descr_fname = "{}/STScore-{}-{}.description.txt".format(
    out_dir, GROUP, APPROACH)
with open(descr_fname, "w") as f:
    f.write(DESCRIPTION)
filenames.append(descr_fname)

filenames = " ".join(filenames)

zipfile = "STScore-{}-{}.zip".format(GROUP, APPROACH)