def run_experiment(frame, test_ids, train_ids=ALL_TRAIN_IDS, feats=FEATS,
                   svm_grid=SMALL_PARAM_GRID, cv=5, n_jobs=1, **drc_args):
    """Train and score one SVR per test dataset on selected training sets.

    A ``DatasetRelevanceClassifier`` built from ``drc_args`` is fitted on the
    per-dataset feature matrices of ``train_ids``. For every id in
    ``test_ids`` its prediction is used to pick the rows of ``frame`` to
    train on; an ``SVR`` (optionally tuned with ``GridSearchCV``) is fitted
    on those rows and scored on the first half of the matching
    'STS2013-test' rows. The second half is never touched here, so it stays
    available for final evaluation.

    Args:
        frame: DataFrame with ``data_id`` and ``data_set`` columns, the
            feature columns named by ``feats`` and the target column 'gs'.
        test_ids: iterable of ``data_id`` values to evaluate on.
        train_ids: iterable of ``data_id`` values the relevance classifier
            is fitted on.
        feats: column selection used as model features.
        svm_grid: parameter grid for ``GridSearchCV``; a falsy value skips
            the search and uses a default ``SVR()``.
        cv: ``cv`` argument forwarded to ``GridSearchCV``.
        n_jobs: ``n_jobs`` argument forwarded to ``GridSearchCV``.
        **drc_args: keyword arguments for ``DatasetRelevanceClassifier``.

    Returns:
        A list with one ``(data_id, score, train_sets, best_params)`` tuple
        per test dataset, where ``score`` is ``correlation(y_test, pred)``
        and ``best_params`` is ``{}`` when no grid search was run.
    """
    # .values replaces DataFrame.as_matrix(), which newer pandas removed.
    keyed_train_data = {
        data_id: frame[frame.data_id == data_id][feats].values
        for data_id in train_ids
    }
    keyed_test_data = {
        data_id: frame[frame.data_id == data_id][feats].values
        for data_id in test_ids
    }

    drc = DatasetRelevanceClassifier(**drc_args)
    drc.fit(keyed_train_data)

    # Loop-invariant: only rows of the STS2013 test split are scored.
    test_rows = frame[frame.data_set == 'STS2013-test']

    result = []
    for data_id, X in keyed_test_data.items():
        # presumably a collection of training data_ids; it is only used
        # through Series.isin below
        train_sets = drc.predict(X)

        train_rows = frame[frame.data_id.isin(train_sets)]
        x_train = train_rows[feats].values
        y_train = train_rows['gs'].values

        target_rows = test_rows[test_rows.data_id == data_id]
        x_test = target_rows[feats].values
        y_test = target_rows['gs'].values

        # reserve second half of test sets for final evaluation
        # (floor division keeps the slice bound an int on Python 3 as well)
        half = len(y_test) // 2
        x_test = x_test[:half, :]
        y_test = y_test[:half]

        if svm_grid:
            grid = GridSearchCV(SVR(), svm_grid, cv=cv, verbose=1,
                                n_jobs=n_jobs)
            grid.fit(x_train, y_train)
            best_params = grid.best_params_
        else:
            best_params = {}

        # Refit a fresh model with the chosen parameters on all train rows.
        model = SVR(**best_params)
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        score = correlation(y_test, pred)
        result.append((data_id, score, train_sets, best_params))

    return result
def run_experiment(frame, test_ids, train_ids=ALL_TRAIN_IDS, feats=FEATS,
                   svm_grid=SMALL_PARAM_GRID, cv=5, n_jobs=1, **drc_args):
    """Train and score one SVR per test dataset on selected training sets.

    A ``DatasetRelevanceClassifier`` built from ``drc_args`` is fitted on the
    per-dataset feature matrices of ``train_ids``. For every id in
    ``test_ids`` its prediction is used to pick the rows of ``frame`` to
    train on; an ``SVR`` (optionally tuned with ``GridSearchCV``) is fitted
    on those rows and scored on the first half of the matching
    'STS2013-test' rows. The second half is never touched here, so it stays
    available for final evaluation.

    Args:
        frame: DataFrame with ``data_id`` and ``data_set`` columns, the
            feature columns named by ``feats`` and the target column 'gs'.
        test_ids: iterable of ``data_id`` values to evaluate on.
        train_ids: iterable of ``data_id`` values the relevance classifier
            is fitted on.
        feats: column selection used as model features.
        svm_grid: parameter grid for ``GridSearchCV``; a falsy value skips
            the search and uses a default ``SVR()``.
        cv: ``cv`` argument forwarded to ``GridSearchCV``.
        n_jobs: ``n_jobs`` argument forwarded to ``GridSearchCV``.
        **drc_args: keyword arguments for ``DatasetRelevanceClassifier``.

    Returns:
        A list with one ``(data_id, score, train_sets, best_params)`` tuple
        per test dataset, where ``score`` is ``correlation(y_test, pred)``
        and ``best_params`` is ``{}`` when no grid search was run.
    """
    # .values replaces DataFrame.as_matrix(), which newer pandas removed.
    keyed_train_data = {
        data_id: frame[frame.data_id == data_id][feats].values
        for data_id in train_ids
    }
    keyed_test_data = {
        data_id: frame[frame.data_id == data_id][feats].values
        for data_id in test_ids
    }

    drc = DatasetRelevanceClassifier(**drc_args)
    drc.fit(keyed_train_data)

    # Loop-invariant: only rows of the STS2013 test split are scored.
    test_rows = frame[frame.data_set == 'STS2013-test']

    result = []
    for data_id, X in keyed_test_data.items():
        # presumably a collection of training data_ids; it is only used
        # through Series.isin below
        train_sets = drc.predict(X)

        train_rows = frame[frame.data_id.isin(train_sets)]
        x_train = train_rows[feats].values
        y_train = train_rows['gs'].values

        target_rows = test_rows[test_rows.data_id == data_id]
        x_test = target_rows[feats].values
        y_test = target_rows['gs'].values

        # reserve second half of test sets for final evaluation
        # (floor division keeps the slice bound an int on Python 3 as well)
        half = len(y_test) // 2
        x_test = x_test[:half, :]
        y_test = y_test[:half]

        if svm_grid:
            grid = GridSearchCV(SVR(), svm_grid, cv=cv, verbose=1,
                                n_jobs=n_jobs)
            grid.fit(x_train, y_train)
            best_params = grid.best_params_
        else:
            best_params = {}

        # Refit a fresh model with the chosen parameters on all train rows.
        model = SVR(**best_params)
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        score = correlation(y_test, pred)
        result.append((data_id, score, train_sets, best_params))

    return result
# Feature matrix of every test dataset, keyed by its data_id.
# .values replaces DataFrame.as_matrix(), which newer pandas removed.
keyed_test_data = {
    data_id: df[df.data_id == data_id][feats].values
    for data_id in ALL_TEST_IDS
}

# Relevance classifier that picks the training datasets for each test set.
drc = DatasetRelevanceClassifier(method='k-means', selection='top',
                                 representative='medoid', n_clusters=3)
drc.fit(keyed_train_data)

# Loop-invariant: only rows of the STS2013 test split are evaluated.
sts2013_test_rows = df[df.data_set == 'STS2013-test']

scores = []
for data_id, X in keyed_test_data.items():
    # presumably a collection of training data_ids; it is only used
    # through Series.isin below
    train_sets = drc.predict(X)

    train_rows = df[df.data_id.isin(train_sets)]
    x_train = train_rows[feats].values
    y_train = train_rows['gs'].values

    target_rows = sts2013_test_rows[sts2013_test_rows.data_id == data_id]
    x_test = target_rows[feats].values
    y_test = target_rows['gs'].values

    # use second half of test sets for final evaluation
    # (floor division keeps the slice bound an int on Python 3 as well)
    half = len(y_test) // 2
    x_test = x_test[half:, :]
    y_test = y_test[half:]

    model = SVR()
    model.fit(x_train, y_train)
m = df[df.data_id == data_id][feats].as_matrix() keyed_train_data[data_id] = m keyed_test_data = {} for data_id in ALL_TEST_IDS: m = df[df.data_id == data_id][feats].as_matrix() keyed_test_data[data_id] = m drc = DatasetRelevanceClassifier(method='k-means', selection='top', representative='medoid', n_clusters=3) drc.fit(keyed_train_data) scores = [] for data_id, X in keyed_test_data.items(): train_sets = drc.predict(X) x_train = df[df.data_id.isin(train_sets)][feats].as_matrix() y_train = df[df.data_id.isin(train_sets)]['gs'].values x_test = df[df.data_set == 'STS2013-test'] x_test = x_test[x_test.data_id == data_id][feats].as_matrix() y_test = df[df.data_set == 'STS2013-test'] y_test = y_test[y_test.data_id == data_id]['gs'].values # use second half of test sets for final evaluation x_test = x_test[len(y_test)/2:, :] y_test = y_test[len(y_test)/2:] model = SVR() model.fit(x_train, y_train)