def read_data(ids, feat_fnames, gs_fnames, features=None, convert_nan=True):
    """
    Create feature vectors and labels for one or more datasets.

    Parameters
    ----------
    ids: str or list of str
        dataset identifier(s)
    feat_fnames: mapping
        indexed as ``feat_fnames[id][feature]`` to obtain the feature file
        for one feature of one dataset
    gs_fnames: mapping
        indexed as ``gs_fnames[id]`` to obtain the gold standard file of a
        dataset
    features: list of str, optional
        names of the features to load; when empty or omitted, all feature
        files registered for the dataset are used
    convert_nan: True or False
        pass the feature values through ``np.nan_to_num``

    Returns
    -------
    X, y:
        feature values and labels of all datasets, stacked in the order
        of ``ids``
    """
    if isinstance(ids, basestring):
        ids = [ids]

    def feature_files(dataset_id):
        # Filenames must be looked up per dataset: each dataset has its own
        # entry in feat_fnames.
        if features:
            # select filenames for desired features
            return [feat_fnames[dataset_id][f] for f in features]
        # default to using all features for dataset
        return feat_fnames[dataset_id].values()

    y = read_gold_standard(gs_fnames[ids[0]])["gold"]
    X = read_features(feature_files(ids[0]), num_vals=len(y))

    for dataset_id in ids[1:]:
        y2 = read_gold_standard(gs_fnames[dataset_id])["gold"]
        y = np.hstack([y, y2])
        # Previously the filenames of ids[0] were reused here, so the first
        # dataset's features were loaded again for every later dataset.
        X2 = read_features(feature_files(dataset_id), num_vals=len(y2))
        X = np.vstack([X, X2])

    if convert_nan:
        X = np.nan_to_num(X)

    return X, y
def read_data(ids, feat_fnames, gs_fnames, features=None, convert_nan=True):
    """
    Create feature vectors and labels.

    Parameters
    ----------
    ids: str or list of str
        dataset identifier(s)
    feat_fnames: mapping
        mapping from dataset identifier to a mapping from feature names to
        feature files (indexed as ``feat_fnames[id][feature]``)
    gs_fnames: mapping
        mapping from dataset identifier to gold standard filename
    features: list of str, optional
        feature names; when empty or omitted, all feature files of each
        dataset are used
    convert_nan: True or False
        replace nan with zero and inf with finite numbers in feature values

    Returns
    -------
    X, y: numpy.array, numpy.array
        2-dimensional array of feature values and 1-dimensional array of
        labels, intended for use with sklearn

    Raises
    ------
    IndexError
        if ``ids`` is empty
    """
    if isinstance(ids, basestring):
        ids = [ids]

    feature_blocks = []
    label_blocks = []

    for dataset_id in ids:
        labels = read_gold_standard(gs_fnames[dataset_id])["gold"]

        if features:
            # select filenames for desired features
            filenames = [feat_fnames[dataset_id][f] for f in features]
        else:
            # default to using all features for dataset
            filenames = feat_fnames[dataset_id].values()

        feature_blocks.append(read_features(filenames, num_vals=len(labels)))
        label_blocks.append(labels)

    if not label_blocks:
        # Same exception type as the former ``ids[0]`` lookup, but with a
        # message that says what went wrong.
        raise IndexError("read_data requires at least one dataset id")

    if len(label_blocks) == 1:
        # Single dataset: hand back what the readers produced, unstacked.
        X, y = feature_blocks[0], label_blocks[0]
    else:
        # Stack once at the end instead of re-copying the accumulated arrays
        # on every iteration.
        X = np.vstack(feature_blocks)
        y = np.hstack(label_blocks)

    if convert_nan:
        X = np.nan_to_num(X)

    return X, y
# Fit the model on the training data (``clf`` and ``train`` are defined
# outside this excerpt).
clf.fit(train["X"], train["y"])
# print(clf.score(train["X"], train["y"]))

# Predict scores for the STS2012 MSRpar test data.
test = np.load("_npz_data/_STS2012.test.MSRpar.npz")
# print(clf.score(test["X"], test["y"]))
sys_scores = clf.predict(test["X"])

# postprocess: pairs whose two sentences are equal are set to 5.0, then all
# predictions are clamped to [0.0, 5.0] in place
sys_inp = read_system_input("../../data/STS2012-test/STS.input.MSRpar.txt")
identical_pairs = sys_inp["s1"] == sys_inp["s2"]
sys_scores[identical_pairs] = 5.0
np.clip(sys_scores, 0.0, 5.0, out=sys_scores)

# compute correlation score
gold_scores = read_gold_standard(
    "../../data/STS2012-test/STS.gs.MSRpar.txt")["gold"]
print(correlation(gold_scores, sys_scores))

# Disabled grid-search experiment, kept for reference:
#
# from sklearn.cross_validation import KFold
# from sklearn.grid_search import GridSearchCV
# C_range = 10.0 ** np.arange(-2, 9)
# gamma_range = 10.0 ** np.arange(-5, 4)
# param_grid = dict(gamma=gamma_range, C=C_range)
# cv = KFold(train["y"].size, k=3, shuffle=True)
# grid = GridSearchCV(SVR(kernel='rbf'), param_grid=param_grid, cv=cv)
# grid.fit(train["X"], train["y"])
# print("The best classifier is: ", grid.best_estimator_)
make scatterplots of system output on STS12 test data """ import numpy as np import matplotlib.pyplot as plt from sts.io import read_gold_standard, read_system_output from sts.score import correlation # Takelab system for data in "MSRpar", "MSRvid", "SMTeuroparl", "surprise.OnWN", "surprise.SMTnews": fig = plt.figure() ax = fig.add_subplot(111) gold = read_gold_standard("../../data/STS2012-test/STS.gs.{}.txt".format(data)) out = read_system_output("takelab-out/{}-output.txt".format(data.lower())) ax.plot(gold, out, ".") r = correlation(gold["gold"], out["output"]) ax.set_xlim(-0.5,5.5) ax.set_ylim(-0.5,5.5) ax.set_xlabel("Gold") ax.set_ylabel("System") ax.set_title("TakeLab.TST12.Test.{} (n={}, r={})".format(data, len(out), r)) ax.grid(True) plt.savefig("scatter-takelab-tst12-test-{}.png".format(data))
# --- training ---------------------------------------------------------------
# ``clf`` and ``train`` come from code outside this excerpt.
clf.fit(train["X"], train["y"])
# print(clf.score(train["X"], train["y"]))

# --- prediction on the STS2012 MSRpar test data -----------------------------
test = np.load("_npz_data/_STS2012.test.MSRpar.npz")
# print(clf.score(test["X"], test["y"]))
sys_scores = clf.predict(test["X"])

# --- postprocess ------------------------------------------------------------
sys_inp = read_system_input("../../data/STS2012-test/STS.input.MSRpar.txt")
# Pairs where s1 equals s2 are assigned 5.0 outright.
same_sentence = sys_inp["s1"] == sys_inp["s2"]
sys_scores[same_sentence] = 5.0
# Clamp the predictions to [0.0, 5.0], modifying the array in place.
np.clip(sys_scores, 0.0, 5.0, out=sys_scores)

# --- compute correlation score ----------------------------------------------
gold_scores = read_gold_standard(
    "../../data/STS2012-test/STS.gs.MSRpar.txt")["gold"]
print(correlation(gold_scores, sys_scores))

# --- disabled: grid search over C and gamma for an RBF SVR ------------------
# NOTE: the sklearn.cross_validation and sklearn.grid_search modules imported
# below were removed in later scikit-learn releases (superseded by
# sklearn.model_selection); update the imports before re-enabling.
#
# from sklearn.cross_validation import KFold
# from sklearn.grid_search import GridSearchCV
# C_range = 10.0 ** np.arange(-2, 9)
# gamma_range = 10.0 ** np.arange(-5, 4)
# param_grid = dict(gamma=gamma_range, C=C_range)
# cv = KFold(train["y"].size, k=3, shuffle=True)
# grid = GridSearchCV(SVR(kernel='rbf'), param_grid=param_grid, cv=cv)
# grid.fit(train["X"], train["y"])
# print("The best classifier is: ", grid.best_estimator_)