コード例 #1
0
ファイル: io.py プロジェクト: pombredanne/STS13
def read_data(ids, feat_fnames, gs_fnames, features=[], convert_nan=True):
    """
    Create feature vectors and labels for one or more datasets.

    Parameters
    ----------
    ids: str or list of str
        dataset identifier(s); a single string is wrapped in a list
    feat_fnames: mapping
        feat_fnames[dataset_id][feature_name] gives a feature filename
    gs_fnames: mapping
        gs_fnames[dataset_id] gives the gold standard filename
    features: list of str
        names of the features to use; if empty, all feature files
        registered for each dataset are used
    convert_nan: True or False
        if true, pass the feature values through numpy.nan_to_num

    Returns
    -------
    X, y:
        feature values as returned by read_features and the "gold" labels
        from read_gold_standard; for several ids the per-dataset results
        are stacked with numpy.vstack (X) and numpy.hstack (y)
    """
    if isinstance(ids, basestring):
        ids = [ids]

    def read_one(dataset_id):
        # Read labels and features of a single dataset.
        gold = read_gold_standard(gs_fnames[dataset_id])["gold"]

        if features:
            # select filenames for desired features
            filenames = [feat_fnames[dataset_id][name] for name in features]
        else:
            # default to using all features for dataset
            filenames = feat_fnames[dataset_id].values()

        return read_features(filenames, num_vals=len(gold)), gold

    X, y = read_one(ids[0])

    for dataset_id in ids[1:]:
        # The feature files must be looked up per dataset: reusing the
        # filenames of ids[0] would pair this dataset's labels with the
        # first dataset's feature values.
        X_next, y_next = read_one(dataset_id)
        y = np.hstack([y, y_next])
        X = np.vstack([X, X_next])

    if convert_nan:
        X = np.nan_to_num(X)

    return X, y
コード例 #2
0
ファイル: io.py プロジェクト: STS-NTNU/STS13
def read_data(ids, feat_fnames, gs_fnames, features=[], convert_nan=True):
    """
    Assemble feature values and gold labels for one or more datasets.

    Parameters
    ----------
    ids: str or list of str
        dataset identifier(s); a single string is wrapped in a list
    feat_fnames: mapping
        feat_fnames[dataset_id][feature_name] gives the feature filename
    gs_fnames: mapping
        gs_fnames[dataset_id] gives the gold standard filename
    features: list of str
        feature names to select; if empty, all feature files of each
        dataset are used
    convert_nan: True or False
        replace nan with zero and inf with finite numbers in feature values

    Returns
    -------
    X, y: numpy.array, numpy.array
        2-dimensional array of feature values and 1-dimensional array of
        labels, intended for use with sklearn
    """
    if isinstance(ids, basestring):
        ids = [ids]

    def feature_files(dataset_id):
        # An explicit feature selection wins; otherwise take every file
        # registered for the dataset.
        per_dataset = feat_fnames[dataset_id]
        if not features:
            return per_dataset.values()
        return [per_dataset[name] for name in features]

    head = ids[0]
    y = read_gold_standard(gs_fnames[head])["gold"]
    X = read_features(feature_files(head), num_vals=len(y))

    # Append the remaining datasets below the first one.
    for dataset_id in ids[1:]:
        filenames = feature_files(dataset_id)
        labels = read_gold_standard(gs_fnames[dataset_id])["gold"]
        y = np.hstack([y, labels])
        rows = read_features(filenames, num_vals=len(labels))
        X = np.vstack([X, rows])

    return (np.nan_to_num(X) if convert_nan else X), y
コード例 #3
0
def read_data(ids, feat_fnames, gs_fnames, features=[], convert_nan=True):
    """
    Read features and gold standard labels of the given dataset(s).

    Parameters
    ----------
    ids: str or list of str
        dataset identifier(s)
    feat_fnames: mapping
        per dataset identifier, a mapping from feature names to feature
        files
    gs_fnames: mapping
        per dataset identifier, the gold standard filename
    features: list of str
        feature names; when empty, all features of a dataset are read
    convert_nan: True or False
        replace nan with zero and inf with finite numbers in feature values

    Returns
    -------
    X, y: numpy.array, numpy.array
        2-dimensional array of feature values and 1-dimensional array of
        labels, intended for use with sklearn
    """
    if isinstance(ids, basestring):
        ids = [ids]

    first, rest = ids[0], ids[1:]

    y = read_gold_standard(gs_fnames[first])["gold"]
    # desired features if given, otherwise all features for the dataset
    filenames = ([feat_fnames[first][name] for name in features]
                 if features else feat_fnames[first].values())
    X = read_features(filenames, num_vals=len(y))

    for current in rest:
        # desired features if given, otherwise all features for the dataset
        filenames = ([feat_fnames[current][name] for name in features]
                     if features else feat_fnames[current].values())

        more_y = read_gold_standard(gs_fnames[current])["gold"]
        y = np.hstack([y, more_y])
        more_X = read_features(filenames, num_vals=len(more_y))
        X = np.vstack([X, more_X])

    if not convert_nan:
        return X, y

    return np.nan_to_num(X), y
コード例 #4
0
ファイル: run_exp_1.py プロジェクト: STS-NTNU/STS13
# Fit the model; `clf` and `train` are set up before this excerpt.
clf.fit(train["X"], train["y"])
#print clf.score(train["X"], train["y"])

# Predict scores for the MSRpar test data.
test = np.load("_npz_data/_STS2012.test.MSRpar.npz")
#print clf.score(test["X"], test["y"])
test_X = test["X"]
sys_scores = clf.predict(test_X)

# Postprocess: pairs with identical sentences get the top score, and all
# scores are forced into the range 0.0 to 5.0.
sys_inp = read_system_input("../../data/STS2012-test/STS.input.MSRpar.txt")
identical_pairs = sys_inp["s1"] == sys_inp["s2"]
sys_scores[identical_pairs] = 5.0
above_max = sys_scores > 5.0
sys_scores[above_max] = 5.0
below_min = sys_scores < 0.0
sys_scores[below_min] = 0.0

# Compute the correlation between gold standard and system scores.
gold_data = read_gold_standard("../../data/STS2012-test/STS.gs.MSRpar.txt")
gold_scores = gold_data["gold"]
print(correlation(gold_scores, sys_scores))


# Disabled grid search over C and gamma, kept for reference:
#from sklearn.cross_validation import KFold
#from sklearn.grid_search import GridSearchCV

#C_range = 10.0 ** np.arange(-2, 9)
#gamma_range = 10.0 ** np.arange(-5, 4)
#param_grid = dict(gamma=gamma_range, C=C_range)
#cv = KFold(train["y"].size, k=3, shuffle=True)
#grid = GridSearchCV(SVR(kernel='rbf'), param_grid=param_grid, cv=cv)
#grid.fit(train["X"], train["y"])

#print("The best classifier is: ", grid.best_estimator_)
コード例 #5
0
ファイル: scatterplots.py プロジェクト: STS-NTNU/STS13
Make scatterplots of system output on STS12 test data.
"""

import numpy as np
import matplotlib.pyplot as plt

from sts.io import read_gold_standard, read_system_output
from sts.score import correlation

# Takelab system

for data in ("MSRpar", "MSRvid", "SMTeuroparl", "surprise.OnWN",
             "surprise.SMTnews"):
    # One figure with a single axes per dataset.
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Gold standard scores and system output for this dataset; the output
    # file name uses the lower-cased dataset name.
    gold_fname = "../../data/STS2012-test/STS.gs.{}.txt".format(data)
    gold = read_gold_standard(gold_fname)
    out_fname = "takelab-out/{}-output.txt".format(data.lower())
    out = read_system_output(out_fname)

    # Scatter gold against system scores and compute their correlation.
    ax.plot(gold, out, ".")
    r = correlation(gold["gold"], out["output"])

    # Same fixed axis limits on every plot.
    ax.set_xlim(-0.5, 5.5)
    ax.set_ylim(-0.5, 5.5)
    ax.set_xlabel("Gold")
    ax.set_ylabel("System")
    title = "TakeLab.TST12.Test.{} (n={}, r={})".format(data, len(out), r)
    ax.set_title(title)
    ax.grid(True)

    png_fname = "scatter-takelab-tst12-test-{}.png".format(data)
    plt.savefig(png_fname)
コード例 #6
0
ファイル: run_exp_1.py プロジェクト: Huskyeder/STS13
# train (`clf` and `train` come from the part of the script above)
train_X, train_y = train["X"], train["y"]
clf.fit(train_X, train_y)
#print clf.score(train["X"], train["y"])

# predict
test = np.load("_npz_data/_STS2012.test.MSRpar.npz")
#print clf.score(test["X"], test["y"])
sys_scores = clf.predict(test["X"])

# postprocess
sys_inp = read_system_input("../../data/STS2012-test/STS.input.MSRpar.txt")
# a pair of identical sentences always gets the maximum score
same_sentence = (sys_inp["s1"] == sys_inp["s2"])
sys_scores[same_sentence] = 5.0
# cut off scores above 5.0 and below 0.0
too_high = (sys_scores > 5.0)
sys_scores[too_high] = 5.0
too_low = (sys_scores < 0.0)
sys_scores[too_low] = 0.0

# compute correlation score
gold_standard = read_gold_standard(
    "../../data/STS2012-test/STS.gs.MSRpar.txt")
gold_scores = gold_standard["gold"]
print(correlation(gold_scores, sys_scores))

# grid search for C and gamma (disabled)
#from sklearn.cross_validation import KFold
#from sklearn.grid_search import GridSearchCV

#C_range = 10.0 ** np.arange(-2, 9)
#gamma_range = 10.0 ** np.arange(-5, 4)
#param_grid = dict(gamma=gamma_range, C=C_range)
#cv = KFold(train["y"].size, k=3, shuffle=True)
#grid = GridSearchCV(SVR(kernel='rbf'), param_grid=param_grid, cv=cv)
#grid.fit(train["X"], train["y"])

#print("The best classifier is: ", grid.best_estimator_)