def learn(examples, Classifier, classifierArgs, develFolds=10, verbose=3, n_jobs=1, predKey="ml_comb_pred", limitTerms=None): print "Parameter grid search" develExamples = getSubset(examples, ["devel"]) clf = GridSearchCV(Classifier(), classifierArgs, cv=develFolds, verbose=verbose, n_jobs=n_jobs, scoring="f1_micro") clf.fit(develExamples["features"], develExamples["classes"]) print "Best params", (clf.best_params_, clf.best_score_) print "Predicting all examples" minMax = MinMaxScaler((0.03, 1.0)) allPredictions = clf.predict(examples["features"]) if hasattr(clf, "predict_proba"): allProbabilities = clf.predict_proba(examples["features"]) else: allProbabilities = clf.decision_function(examples["features"]) #import pdb; pdb.set_trace() minMax.fit( allProbabilities) #minmax_scale(testProbabilities, (0.03, 1.0)) allProbabilities = minMax.transform( allProbabilities ) #allProbabilities = minmax_scale(allProbabilities, (0.03, 1.0)) print "Predicting the test set" testExamples = getSubset(examples, ["test"]) testPredictions = clf.predict(testExamples["features"]) if hasattr(clf, "predict_proba"): testProbabilities = clf.predict_proba(testExamples["features"]) else: testProbabilities = clf.decision_function(testExamples["features"]) testProbabilities = minMax.transform(testProbabilities) binaryToMultiLabel(testExamples, testPredictions, testProbabilities, predKey) print "Evaluating test set ensemble predictions" testProteins = {x["id"]: x for x in testExamples["proteins"]} multiLabelTestExamples = evaluateFile.makeExamples(testProteins, limitTerms=limitTerms, limitToSets=["test"], predKey=predKey) loading.vectorizeExamples(multiLabelTestExamples, None, sparseLabels=True) results = evaluation.evaluate(multiLabelTestExamples["labels"], multiLabelTestExamples["predictions"], multiLabelTestExamples, terms=None, averageOnly=True, noAUC=True) print "Average for test set:", evaluation.metricsToString( results["average"]) binaryToMultiLabel(examples, allPredictions, allProbabilities, predKey)
def classification_by_monkey(X, y, labelset, param_grid, stream, n_folds_test=10, n_folds_gridsearch=5, verbose=True): for monkey in X.keys(): if verbose: print '-' * len(monkey) print monkey print '-' * len(monkey) print >>stream, '***', monkey y_true = None y_pred = None pvals = None print >>stream, '\n**** Cross-validation scores\n' for fold in range(n_folds_test): if verbose: print ' FOLD:', fold X_train, X_test, y_train, y_test = train_test_split(X[monkey], y[monkey], test_size=0.1) if verbose: print 'training classifier...' clf = GridSearchCV(SVC(), param_grid, cv=n_folds_gridsearch, scoring='f1', verbose=1 if verbose else 0, n_jobs=-1) clf.fit(X_train, y_train) print >>stream, 'FOLD:', fold, clf.best_score_ print >>stream, pformat(clf.best_params_) if verbose: print 'predicting class labels...' if y_true is None: y_true = y_test y_pred = clf.predict(X_test) pvals = expit(clf.decision_function(X_test)) else: y_true = np.hstack((y_true, y_test)) y_pred = np.hstack((y_pred, clf.predict(X_test))) pvals = np.hstack((pvals, expit(clf.decision_function(X_test)))) print >>stream, '\n**** Classification report\n' print >>stream, metrics.classification_report(y_true, y_pred, target_names=labelset[monkey]) print >>stream, '\n**** Confusion matrix\n' print_cm(stream, metrics.confusion_matrix(y_true, y_pred), labelset[monkey]) print >>stream, '' stream.flush() with open('results/clf_by_monkey_{0}_blue_merged.pkl'.format(monkey), 'wb') as fid: pickle.dump((y_true, y_pred, pvals, labelset[monkey]), fid, -1)
def fit_validate_and_predict(self, train_idx, test_idx, sklearn_model, sklearn_params): print(sklearn_params) clf = GridSearchCV(sklearn_model, sklearn_params, scoring='roc_auc', n_jobs=8, cv=5, verbose=4) clf.fit(self.data[train_idx, :], self.labels[train_idx] - 1) # handle '' added for test labels print('Best Estimator:') print(clf.best_estimator_) print() print('Grid Scores:') for params, mean_score, scores in clf.grid_scores_: print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)) print() try: prob = clf.predict_proba(self.data[test_idx, :]) except: scores = clf.decision_function(self.data[test_idx, :]) prob = 1. / (1. + np.exp(-scores)) return prob
def gridsearch(model, params): gs = GridSearchCV(model, params, scoring='roc_auc', n_jobs=-1) gs.fit(X_train, y_train) print ('Best params: ', gs.best_params_) print ('Best auc on training set: ', gs.best_score_) print ('Best auc on test set: ', gs.score(X_test, y_test)) return gs.predict(X_test), gs.decision_function(X_test)
def test_grid_search(): """Test that the best estimator contains the right value for foo_param""" clf = MockClassifier() grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3) # make sure it selects the smallest parameter in case of ties old_stdout = sys.stdout sys.stdout = StringIO() grid_search.fit(X, y) sys.stdout = old_stdout assert_equal(grid_search.best_estimator_.foo_param, 2) for i, foo_i in enumerate([1, 2, 3]): assert_true(grid_search.cv_scores_[i][0] == {'foo_param': foo_i}) # Smoke test the score etc: grid_search.score(X, y) grid_search.predict_proba(X) grid_search.decision_function(X) grid_search.transform(X)
def trainLinearSVC(trainData, trainLabels, testData): print("\nTraining Linear SVC...") trainData = np.asarray(trainData) trainLabels = np.asarray(trainLabels) print(trainData.shape) print(trainLabels.shape) iter = 2000 cross_val = 5 Cs = np.power(2, np.linspace(-3, 9, num=7)) parameters = { "estimator__C": Cs, } osvc = OneVsRestClassifier(LinearSVC(class_weight='balanced', verbose=False, multi_class='ovr', max_iter=iter), n_jobs=-1) svc = GridSearchCV(osvc, cv=cross_val, param_grid=parameters, n_jobs=-1) t0 = time() svc.fit(trainData, trainLabels) print("\nTraining finished in %0.3fs \n" % (time() - t0)) print("Best parameters: ") print(svc.best_params_) print("\nBest estimator: ") print(svc.best_estimator_) print("Best score: ") print(svc.best_score_) t0 = time() predictedLabels = svc.predict(testData) print("\nTesting finished in %0.3fs" % (time() - t0)) t0 = time() confidence_scores = svc.decision_function(testData) print("\nTesting finished in %0.3fs" % (time() - t0)) print("\nPredicted Labels") print("----------------------------------") print(predictedLabels) print("\nConfidence Scores") print("----------------------------------") print(confidence_scores) params = { 'iter': iter, 'cv': cross_val, } return confidence_scores, predictedLabels, params
def test_grid_search(): """Test that the best estimator contains the right value for foo_param""" clf = MockClassifier() grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3) # make sure it selects the smallest parameter in case of ties old_stdout = sys.stdout sys.stdout = StringIO() grid_search.fit(X, y) sys.stdout = old_stdout assert_equal(grid_search.best_estimator_.foo_param, 2) for i, foo_i in enumerate([1, 2, 3]): assert_true(grid_search.cv_scores_[i][0] == {'foo_param': foo_i}) # Smoke test the score etc: grid_search.score(X, y) grid_search.predict_proba(X) grid_search.decision_function(X) grid_search.transform(X)
def trainAndTestLinearSVM_withfolds(train, test, GT_train, GT_test, folds, start, end, numparams): print 'Training and Testing a Linear SVM' init = time.time() stdSlr = StandardScaler().fit(train) train = stdSlr.transform(train) kernelMatrix = histogramIntersection(train, train) tuned_parameters = [{ 'kernel': ['linear'], 'C': np.linspace(start, end, num=numparams) }] clf = GridSearchCV(svm.SVC(kernel='linear', decision_function_shape='ovr'), tuned_parameters, cv=folds, scoring='accuracy', n_jobs=8) clf.fit(kernelMatrix, GT_train) print(clf.best_params_) predictMatrix = histogramIntersection(stdSlr.transform(test), train) SVMpredictions = clf.predict(predictMatrix) correct = sum(1.0 * (SVMpredictions == GT_test)) accuracy = correct / len(GT_test) cm = confusion_matrix(GT_test, SVMpredictions) end = time.time() fpr = dict() tpr = dict() roc_auc = dict() y_score = clf.decision_function(predictMatrix) for i in range(8): fpr[i], tpr[i], _ = roc_curve(np.asarray(GT_test), y_score[:, i], pos_label=i) roc_auc[i] = auc(fpr[i], tpr[i]) # First aggregate all false positive rates all_fpr = np.unique(np.concatenate([fpr[i] for i in range(8)])) # Then interpolate all ROC curves at this points mean_tpr = np.zeros_like(all_fpr) for i in range(8): mean_tpr += interp(all_fpr, fpr[i], tpr[i]) # Finally average it and compute AUC mean_tpr /= 8 fpr["macro"] = all_fpr tpr["macro"] = mean_tpr roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) print 'Done in ' + str(end - init) + ' secs.' return accuracy, cm, fpr, tpr, roc_auc
def fit_predict_classification_pr(X,y,Xtest,ytest): #Linear SVC with hinge loss metric="roc_auc_score" param={'C':[100,10,1,0.1,0.01,0.001,0.0001]} model=GridSearchCV(svm.LinearSVC(loss='hinge'),param,scoring=metric) model.fit(X,y) print "Model:LinearSVC, metric:%s, best_param:" %(metric), model.best_params_ print model.grid_scores_ ypred=model.decision_function(Xtest) return {'ypred': ypred, 'ytest': ytest, 'auprc': metrics.roc_auc_score(ytest,ypred)}
def test_grid_search(): # Test that the best estimator contains the right value for foo_param clf = MockClassifier() grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, verbose=3) # make sure it selects the smallest parameter in case of ties old_stdout = sys.stdout sys.stdout = StringIO() grid_search.fit(X, y) sys.stdout = old_stdout assert_equal(grid_search.best_estimator_.foo_param, 2) for i, foo_i in enumerate([1, 2, 3]): assert_true(grid_search.grid_scores_[i][0] == {"foo_param": foo_i}) # Smoke test the score etc: grid_search.score(X, y) grid_search.predict_proba(X) grid_search.decision_function(X) grid_search.transform(X) # Test exception handling on scoring grid_search.scoring = "sklearn" assert_raises(ValueError, grid_search.fit, X, y)
def fit_predict_classification_pr(X,y,Xtest,ytest): #Linear SVC with hinge loss metric="roc_auc_score" param={'C':[100,10,1,0.1,0.01,0.001,0.0001]} model=GridSearchCV(svm.LinearSVC(loss='hinge'),param,scoring=metric) model.fit(X,y) print "Model:LinearSVC, metric:%s, best_param:" %(metric), model.best_params_ print model.grid_scores_ ypred=model.decision_function(Xtest) return {'ypred': ypred, 'ytest': ytest, 'auprc': metrics.roc_auc_score(ytest,ypred)}
def fit_validate_and_predict(self, train_idx, test_idx, sklearn_model, sklearn_params): print(sklearn_params) clf = GridSearchCV(sklearn_model, sklearn_params, scoring='roc_auc', n_jobs=8, cv=5, verbose=4) clf.fit(self.data[train_idx, :], self.labels[train_idx]-1) # handle '' added for test labels print('Best Estimator:') print(clf.best_estimator_) print() print('Grid Scores:') for params, mean_score, scores in clf.grid_scores_: print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)) print() try: prob = clf.predict_proba(self.data[test_idx, :]) except: scores = clf.decision_function(self.data[test_idx, :]) prob = 1. / (1. + np.exp(-scores)) return prob
def fit_clf(args, clf_name, val, n_fold, project_name, save, scoring): ''' Run fit method from val with X and y clf_name is a string with the classifier name ''' train, test = args[0]['kf'][n_fold] X = args[0]['X'][train, :] y = args[0]['y'][train] file_name = 'poly_{}/models/{}_{}.p'.format( project_name, clf_name, n_fold + 1) start = time.time() if os.path.isfile(file_name): logger.info('Loading {} {}'.format(file_name, n_fold)) clf = joblib.load(file_name) else: logger.info('Training {} {}'.format(clf_name, n_fold)) clf = deepcopy(val['clf']) if val['parameters']: clf = GridSearchCV(clf, val['parameters'], n_jobs=1, cv=3, scoring=_scorer) clf.fit(X, y) if save: joblib.dump(clf, file_name) train_score = _scorer(clf, X, y) X = args[0]['X'][test, :] y = args[0]['y'][test] # Scores test_score = _scorer(clf, X, y) ypred = clf.predict(X) if hasattr(clf, 'predict_proba'): yprob = clf.predict_proba(X) elif hasattr(clf, 'decision_function'): yprob = clf.decision_function(X) confusion = confusion_matrix(y, ypred) duration = time.time() - start logger.info('{0:25} {1:2}: Train {2:.2f}/Test {3:.2f}, {4:.2f} sec'.format( clf_name, n_fold, train_score, test_score, duration)) return (train_score, test_score, ypred, yprob, # predictions and probabilities confusion, # confusion matrix clf) # fitted clf
class PageClassifier(object): def __init__(self, feature_extractors): self._fx_exts = feature_extractors self._clf = None def train(self, wikicode_list, labels): features = [ext.fit_extract(wikicode_list) for ext in self._fx_exts] X = np.concatenate(features, axis=1) kbest = SelectKBest(f_classif) # model = RandomForestClassifier( # class_weight='balanced') # model = GradientBoostingClassifier() model = LinearSVC(class_weight='balanced', dual=False, penalty='l1') pipe = Pipeline([('kbest', kbest), ('model', model)]) self._clf = GridSearchCV(pipe, {'kbest__k': list(range(1, X.shape[1], 10))}, scoring='roc_auc', cv=10 ).fit(X, labels) @_ensure_trained def predict(self, wikicode_list): X = self._extract_feature_vectors_from_wikicode_list(wikicode_list) return self._clf.predict(X) @_ensure_trained def predict_proba(self, wikicode_list): X = self._extract_feature_vectors_from_wikicode_list(wikicode_list) # return [cls1 for cls0, cls1 in self._clf.predict_proba(X)] return self._clf.decision_function(X) def _extract_feature_vectors_from_wikicode_list(self, wikicode_list): features = [ext.extract(wikicode_list) for ext in self._fx_exts] X = np.concatenate(features, axis=1) return X
def logreg(): # Считайте таблицу с признаками из файла features.csv. train_data = pd.read_csv('data/features_training.csv', index_col='match_id') X_train = train_data.drop(['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'], axis=1) y_train = train_data['radiant_win'] # Замените пропуски на нули с помощью функции fillna(). X_train.fillna(value=0, inplace=True) # 1. Оцените качество логистической регрессии (sklearn.linear_model.LogisticRegression с L2-регуляризацией) с # помощью кросс-валидации по той же схеме, которая использовалась для градиентного бустинга. Подберите при этом # лучший параметр регуляризации (C). Какое наилучшее качество у вас получилось? Как оно соотносится с качеством # градиентного бустинга? Чем вы можете объяснить эту разницу? Быстрее ли работает логистическая регрессия по # сравнению с градиентным бустингом? scaler = StandardScaler() scaler.fit(X_train) X_train_scaled = pd.DataFrame(scaler.transform(X_train)) # Разобъем данные на тестовую и обучающую выборки X_small_train, X_small_test, y_small_train, y_small_test = train_test_split(X_train_scaled, y_train, test_size = 0.5, random_state = 1) # Зафиксируйте генератор разбиений для кросс-валидации по 5 блокам (KFold), не # забудьте перемешать при этом выборку (shuffle=True), поскольку данные в таблице отсортированы по времени, и без # перемешивания можно столкнуться с нежелательными эффектами при оценивании качества. k_fold = KFold(len(X_small_train), n_folds=5, shuffle=True, random_state=1) # Проведем поиск по сетке параметров - в качестве параметра будет выступать коэффициент регуляризации 'C': logreg_params = {'C': numpy.power(10.0, numpy.arange(-5, 6, 1))} clf_logreg = LogisticRegression(random_state=1, verbose=0) clf_logreg_grid = GridSearchCV(clf_logreg, logreg_params, cv=k_fold, scoring='roc_auc' ) clf_logreg_grid.fit(X_small_train, y_small_train) y_train_score_logreg = clf_logreg_grid.decision_function(X_small_train) y_test_score_logreg = clf_logreg_grid.decision_function(X_small_test) # Какое качество получилось у логистической регрессии над всеми исходными признаками? # Как оно соотносится с качеством градиентного бустинга? Чем можно объяснить эту разницу? # Быстрее ли работает логистическая регрессия по сравнению с градиентным бустингом? print(clf_logreg_grid.best_params_) print(roc_auc_score(y_small_train, y_train_score_logreg)) print(roc_auc_score(y_small_test, y_test_score_logreg)) ################################################################################################################### # 2. Среди признаков в выборке есть категориальные, которые мы использовали как числовые, что вряд ли является # хорошей идеей. Категориальных признаков в этой задаче одиннадцать: lobby_type и r1_hero, r2_hero, ..., r5_hero, # d1_hero, d2_hero, ..., d5_hero. Уберите их из выборки, и проведите кросс-валидацию для логистической регрессии # на новой выборке с подбором лучшего параметра регуляризации. Изменилось ли качество? Чем вы можете это объяснить? 
X_train_cleaned = X_train.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1) # масштабируем признаки scaler = StandardScaler() scaler.fit(X_train_cleaned) X_train_cleaned_scaled = pd.DataFrame(scaler.transform(X_train_cleaned)) # Разобъем данные на тестовую и обучающую выборки X_small_train, X_small_test, y_small_train, y_small_test = train_test_split(X_train_cleaned_scaled, y_train, test_size=0.5, random_state=1) k_fold = KFold(len(X_small_train), n_folds=5, shuffle=True, random_state=1) # Проведем поиск по сетке параметров - в качестве параметра будет выступать коэффициент регуляризации 'C': logreg_params = {'C': numpy.power(10.0, numpy.arange(-5, 6, 1))} clf_logreg = LogisticRegression(random_state=1, verbose=0) clf_logreg_grid = GridSearchCV(clf_logreg, logreg_params, cv=k_fold, scoring='roc_auc' ) clf_logreg_grid.fit(X_small_train, y_small_train) y_train_score_logreg = clf_logreg_grid.decision_function(X_small_train) y_test_score_logreg = clf_logreg_grid.decision_function(X_small_test) # Какое качество получилось у логистической регрессии над некатегориальными исходными признаками? print(clf_logreg_grid.best_params_) print(roc_auc_score(y_small_train, y_train_score_logreg)) print(roc_auc_score(y_small_test, y_test_score_logreg)) ################################################################################################################### # 3. На предыдущем шаге мы исключили из выборки признаки rM_hero и dM_hero, которые показывают, какие именно герои # играли за каждую команду. Это важные признаки — герои имеют разные характеристики, и некоторые из них # выигрывают чаще, чем другие. Выясните из данных, сколько различных идентификаторов героев существует в данной # игре (вам может пригодиться фукнция unique или value_counts). heroes = train_data[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']] heroes = heroes.stack().unique() #print(heroes.head()) print(len(heroes)) ################################################################################################################### # 4. Воспользуемся подходом "мешок слов" для кодирования информации о героях. Пусть всего в игре имеет N различных # героев. Сформируем N признаков, при этом i-й будет равен # нулю, если i-й герой не участвовал в матче; # единице, если i-й герой играл за команду Radiant; # минус единице, если i-й герой играл за команду Dire. # N — количество различных героев в выборке N = heroes.max() X_pick = numpy.zeros((X_train.shape[0], N)) for i, match_id in enumerate(X_train.index): for p in range(5): X_pick[i, X_train.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1 X_pick[i, X_train.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1 #print(X_pick) X_pick_df = pd.DataFrame(X_pick) X_pick_df.columns = range(1, N + 1) cols = [col for col in X_pick_df.columns if col in heroes] X_pick_df = X_pick_df[cols] X_fin = pd.concat([X_train_cleaned_scaled, X_pick_df], axis = 1) #print(X_fin.head()) ################################################################################################################### # 5. Проведите кросс-валидацию для логистической регрессии на новой выборке с подбором лучшего параметра # регуляризации. Какое получилось качество? Улучшилось ли оно? Чем вы можете это объяснить? 
# Разобъем данные на тестовую и обучающую выборки X_small_train, X_small_test, y_small_train, y_small_test = train_test_split(X_fin, y_train, test_size=0.5, random_state=1) k_fold = KFold(len(X_small_train), n_folds=5, shuffle=True, random_state=1) # Проведем поиск по сетке параметров - в качестве параметра будет выступать коэффициент регуляризации 'C': logreg_params = {'C': numpy.power(10.0, numpy.arange(-5, 6, 1))} clf_logreg = LogisticRegression(random_state=1, verbose=0) clf_logreg_grid = GridSearchCV(clf_logreg, logreg_params, cv=k_fold, scoring='roc_auc' ) clf_logreg_grid.fit(X_small_train, y_small_train) y_train_score_logreg = clf_logreg_grid.decision_function(X_small_train) y_test_score_logreg = clf_logreg_grid.decision_function(X_small_test) # Какое качество получилось у логистической регрессии над некатегориальными исходными признаками? print(clf_logreg_grid.best_params_) print(roc_auc_score(y_small_train, y_train_score_logreg)) print(roc_auc_score(y_small_test, y_test_score_logreg)) ################################################################################################################### # 6. Постройте предсказания вероятностей победы команды Radiant для тестовой выборки с помощью лучшей из изученных # моделей (лучшей с точки зрения AUC-ROC на кросс-валидации). Убедитесь, что предсказанные вероятности адекватные # — находятся на отрезке [0, 1], не совпадают между собой (т.е. что модель не получилась константной). # Считайте таблицу с признаками из файла features.csv. test_data = pd.read_csv('data/features_test.csv', index_col='match_id') X_test = test_data.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1) # Замените пропуски на нули с помощью функции fillna(). X_test.fillna(value=0, inplace=True) scaler = StandardScaler() scaler.fit(X_test) X_test_scaled = pd.DataFrame(scaler.transform(X_test)) # соберем список героев для "bag of words" heroes = test_data[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']].stack().unique() # Воспользуемся подходом "мешок слов" для кодирования информации о героях. Пусть всего в игре имеет N различных # героев. Сформируем N признаков, при этом i-й будет равен # нулю, если i-й герой не участвовал в матче; # единице, если i-й герой играл за команду Radiant; # минус единице, если i-й герой играл за команду Dire. # N — количество различных героев в выборке N = heroes.max() X_pick = numpy.zeros((X_test.shape[0], N)) for i, match_id in enumerate(X_test.index): for p in range(5): X_pick[i, test_data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1 X_pick[i, test_data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1 #print(X_pick) X_pick_df = pd.DataFrame(X_pick) X_pick_df.columns = range(1, N + 1) cols = [col for col in X_pick_df.columns if col in heroes] X_pick_df = X_pick_df[cols] X_fin = pd.concat([X_test_scaled, X_pick_df], axis=1) #print(X_fin.head()) logistic = clf_logreg_grid.best_estimator_.predict_proba(X_fin) #print(logistic[:, 1]) print('Min probability: %s' % str(min(logistic[:, 1]))) print('Max probability: %s' % str(max(logistic[:, 1])))
def main(): model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS) stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS) f = open('test_data.csv','wb') w = csv.DictWriter(f, ["pmid", "domain", "sent_text", "random", "human", "algorithm", "top3", "top1"], escapechar="\\") w.writeheader() # parse the risk of bias data from Cochrane data = riskofbias.RoBData(test_mode=False) data.generate_data(doc_level_only=False) docs = riskofbias.MultiTaskSentFilter(data) uids = np.array(docs.get_ids()) no_studies = len(uids) kf = KFold(no_studies, n_folds=5, shuffle=False) tuned_parameters = {"alpha": np.logspace(-4, -1, 5), "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)]} vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space for k_i, (train, test) in enumerate(kf): if k_i == 1: break y_train = docs.y(uids[train]) vec.builder_clear() vec.builder_add_interaction_features(docs.X(uids[train]), low=7) # add base features vec.builder_add_interaction_features(docs.X_i(uids[train]), low=2) # then add interactions X_train = vec.builder_fit_transform() clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='recall', n_jobs=16) # import pdb; pdb.set_trace() clf.fit(X_train, y_train) del X_train, y_train clf = clf.best_estimator_ # and we only need the best performing, discard the rest # Test on each domain in turn # filtered_data = riskofbias.SentFilter(data) for domain in riskofbias.CORE_DOMAINS: print "Testing on %s" % domain vec.builder_clear() vec.builder_add_interaction_features(docs.X(uids[test], domain=domain)) # add base features vec.builder_add_interaction_features(docs.X_i(uids[test], domain=domain)) # then add interactions X_test = vec.builder_transform() y_test = docs.y(uids[test], domain=domain) y_preds = clf.predict(X_test) y_df = clf.decision_function(X_test) # get distances from the decision boundary # positive distances = more likely to be relevant sentences r_len = len(y_preds) y_top3 = [] y_top1 = [] y_rand = [] y_uids = np.array(docs.y_uids(uids[test], domain=domain)) # import pdb; pdb.set_trace() for y_uid in np.unique(y_uids): mask = np.where(y_uids == y_uid)[0] doc_df = y_df[mask] doc_top3 = np.argpartition(doc_df, -3)[-3:] y_top3.extend(list(mask[doc_top3])) doc_top1 = np.argmax(doc_df) y_top1.append(mask[doc_top1]) doc_rand = np.random.randint(0, len(doc_df)) y_rand.append(mask[doc_rand]) human_sent_indices = np.where(y_test==1)[0] algorithm_sent_indices = np.where(y_preds==1)[0] model_metrics.add_preds_test(y_preds, y_test, domain=domain) stupid_metrics.add_preds_test([-1] * len(y_test), y_test, domain=domain) # import pdb; pdb.set_trace() for doc_i, (doc, pmid) in enumerate(izip(docs.X(uids[test], domain=domain), docs.iter_pmid(uids[test], domain=domain))): row = {"domain": domain, "sent_text": doc, "random": doc_i in y_rand, "human": doc_i in human_sent_indices, "algorithm": doc_i in algorithm_sent_indices, "top3": doc_i in y_top3, "top1": doc_i in y_top1, "pmid": pmid} if row["random"] or row["human"] or row["top3"] or row["top1"]: # please note, the sentences will only be included in the analysis if # in the top1 or top3 # we do have data on whether the raw classifier has predicted yes/no # # this in effect means where the classifier picks <= 3 sentences # we use all raw classifier data # where >3 sentences are predicted by raw classifier, only the # top 3 are used; the rest are 
discarded w.writerow(row) del X_test, y_test, y_preds del clf model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model"))) stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline"))) f.close()
class WangBaseSenser(BaseSenser): """Abstract class for disambiguating relation senses. Attributes: n_y (int): number of distinct classes """ __metaclass__ = abc.ABCMeta # private members PARAM_GRID = {"clf__C": [float(i) / 100. for i in xrange(1, 3)]} N_JOBS = -1 def __init__(self, a_clf=None, a_grid_search=False): """Class constructor. Initialize classifier. Args: a_clf (classifier or None): classifier to use or None for default a_grid_search (bool): use grid search for estimating hyper-parameters """ classifier = a_clf or LinearSVC(C=DFLT_C, **DFLT_PARAMS) self._gs = a_grid_search self._model = Pipeline([("vect", DictVectorizer()), ("clf", classifier)]) def train(self, a_train_data, a_dev_data=None, a_n_y=-1, a_i=-1, a_train_out=None, a_dev_out=None): """Method for training the model. Args: a_train_data (tuple[list, dict]): list of training JSON data a_dev_data (tuple[list, dict] or None): list of development JSON data a_n_y (int): number of distinct classes a_i (int): row index for the output predictions a_train_out (np.array or None): predictions for the training set a_dev_out (np.array or None): predictions for the training set Returns: void: Note: updates ``a_train_out`` and ``a_dev_out`` in place """ self.n_y = a_n_y x_train, y_train = self._generate_ts(a_train_data) x_dev, y_dev = self._generate_ts(a_dev_data) # determine cross-validation and grid-search strategy and fit the model if self._gs: if a_dev_data is None or not a_dev_data[0]: cv = StratifiedKFold(y_train, n_folds=NFOLDS, shuffle=True) else: cv = self._devset_cv(y_train, len(y_dev), NFOLDS) x_train = x_train + x_dev y_train = y_train + y_dev scorer = make_scorer(f1_score, average="macro") self._model = GridSearchCV(self._model, self.PARAM_GRID, scoring=scorer, cv=cv, n_jobs=self.N_JOBS, verbose=1) self._model.fit([el[-1] for el in x_train], y_train) # output best hyper-parameters if self._gs: print("Best params:", repr(self._model.best_params_), file=sys.stderr) if a_i >= 0: if a_train_out is not None: if self._gs and a_dev_data and a_dev_data[0]: x_train = x_train[:-len(x_dev)] for i, x_i in x_train: self._predict(x_i, a_train_out[i], a_i) if a_dev_out is not None: for i, x_i in x_dev: self._predict(x_i, a_dev_out[i], a_i) def predict(self, a_rel, a_data, a_ret, a_i): """Method for predicting sense of single relation. Args: a_rel (dict): discourse relation whose sense should be predicted a_data (2-tuple(dict, dict)): list of input JSON data a_ret (np.array): output prediction vector a_i (int): row index in the output vector Returns: void: Note: updates ``a_ret[a_i]`` in place """ feats = self._extract_features(a_rel, a_data[-1]) self._predict(feats, a_ret, a_i) @abc.abstractmethod def _extract_features(self, a_rel, a_parses): """Extract classification features for a given relation. Args: a_rel (dict): discourse relation to extract features for a_parses (dict): parsed sentences Returns: void: """ raise NotImplementedError def _predict(self, a_feats, a_ret, a_i): """Method for predicting sense of single relation. 
Args: a_feats (dict): features of the input instance a_ret (np.array): output prediction vector a_i (int): row index in the output vector Returns: void: updates ``a_ret[a_i]`` in place """ # obtain model's estimates dec = self._model.decision_function(a_feats) if len(dec.shape) > 1: dec = np.mean(dec, axis=0) # normalize using softmax dec = np.exp(dec) exp_ret = np.sum(dec) or 1e10 dec /= exp_ret # map model's classes to original indices for i, ival in enumerate(dec): a_ret[a_i][self._model.classes_[i]] += ival def _free(self): """Free resources used by the model. """ self.n_y = -1 def _generate_ts(self, a_data): """Generate training set. Args: a_data (2-tuple(list, dict)): input data (discourse relations and parses) Returns: tuple(list, list): lists of input features and expected classes """ x, y = [], [] if a_data is None: return (x, y) x_i = y_i = None # generate features for i, irel in a_data[0]: x_i = self._extract_features(irel, a_data[1]) if not x_i: continue x.append((i, x_i)) y_i = np.argmax(irel[SENSE]) y.append(y_i) return (x, y) def _devset_cv(self, a_y_train, a_n_dev, a_n_folds): """Generate train-test split from training and development data. Args: a_y_train (list[int]): list of training instances' tags a_n_dev (int): number of devset instances a_n_folds (int): number of folds Returrns: list[tuple]: list of training/testing folds """ folds = [] n_train = len(a_y_train) dev_ids = [n_train + i for i in xrange(a_n_dev)] # create stratified K-folds over the training data skf = StratifiedKFold(a_y_train, a_n_folds) for train_ids, test_ids in skf: folds.append((train_ids, np.concatenate((test_ids, dev_ids)))) return folds
#clf = GridSearchCV(svm.SVC(C=1, probability=True), param_grid, cv=5) #try other alg for svm #clf = GridSearchCV(svm.NuSVC(nu=.5, probability=True), param_grid, cv=5) #clf = GridSearchCV(svm.SVR(degree=3), param_grid, cv=5) clf = GridSearchCV(svm.LinearSVC(C=1, class_weight='balanced'), param_grid, cv=5) #fit the classifier #clf=svm.SVC(C=C, kernel=kernel, gamma=g) #use this line if only run once, w/out param grid clf.fit(Xlearn, Ylearn) confidence_scores = [] best_candidates = [] best_can = None confidence_scores = clf.decision_function(Xtest) #prev_page_image=[names_test[0][names_test[0].index('#'):names_test[0].index(',')]] predictions_by_page = {} for i in range(0, len(names_test)): name_parts = names_test[i].strip().split(',') page_img = name_parts[0] book = name_parts[2] pred = name_parts[1] score = confidence_scores[i] page_id = book + "_" + page_img print(page_id, pred, score) if page_id in predictions_by_page: predictions_by_page[page_id] += [(pred, score)] else:
def multi_SVM(needcv = False): NeedReFetch = False allGenreSongsTrain,allGenreSongsTest = fetchData_eTA(NUM_NEED_PER_GENRE,GENRES,NeedReFetch,USED_GENRES) # allGenreSongsTrain,allGenreSongsTest = featureSelection (allGenreSongsTrain,allGenreSongsTest,method = 'MIC',testmode = False,n_features_to_select = 85) # assert(len(allGenreSongsTrain[0][0]) == 106) TrainX = [] TrainY = [] TestX = [] TestY = [] for i in range(sum(USED_GENRES)): for j in allGenreSongsTrain[i]: TrainX.append(j) TrainY.append(i) for k in allGenreSongsTest[i]: TestX.append(k) TestY.append(i) confuseMat = [[0 for i in range(sum(USED_GENRES))] for j in range(sum(USED_GENRES))]; if not needcv: print "Start SVM training ... " model = SVC(probability=True,decision_function_shape='ovr',kernel = 'rbf',gamma = 0.0078125, C = 8) model.fit(TrainX,TrainY) print "Start SVM predicting ... " PredY = model.predict(TestX) print model.decision_function(TestX) for i in range(len(TestY)): confuseMat[TestY[i]][PredY[i]] += 1 print(clfr(TestY, PredY)) else: tuned_parameters = [ ## remained to be play with {'kernel': ['rbf'], 'gamma': [2**i for i in range(-15,-4)], 'C': [2**i for i in range(-5,8)]}, # {'kernel': ['linear'], 'C': [2**i for i in range(-8,9,2)]}, # {'kernel': ['poly'], 'gamma': [2**i for i in range(-8,9,2)], 'C': [2**i for i in range(-8,9,2)], 'degree':[2,3,4]}, ] print "Start SVM CV ... " clf = GSCV(SVC(decision_function_shape='ovr'), tuned_parameters, cv=7) clf.fit(TrainX, TrainY) print clf.decision_function(TestX) print("Best parameters set found on development set:") print(clf.best_params_) # print("Grid scores on development set:") # print() # for params, mean_score, scores in clf.grid_scores_: # print("%0.4f (+/-%0.03f) for %r" % (mean_score, scores.std(), params)) # print() print "Start SVM predicting ... " PredY = clf.predict(TestX) print(clfr(TestY, PredY)) for i in range(len(TestY)): confuseMat[TestY[i]][PredY[i]] += 1 return confuseMat
ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) i += 1 # iterate over classifiers for name, clf in zip(names, classifiers): ax = plt.subplot(len(datasets), len(classifiers) + 1, i) clf.fit(X_train, y_train) score = clf.score(X_test, y_test) # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. if hasattr(clf, "decision_function"): Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) else: Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] # Put the result into a color plot Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) # Plot also the training points ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright) # and testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6) ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max())
clf = GridSearchCV(svm.LinearSVC(C=1, random_state=1), param_grid, cv=5) #^^FOUND LINEARSVC works best for my data right now... #fit the classifier #clf=svm.SVC(C=C, kernel=kernel, gamma=g) #use this line if only run once, w/out param grid clf.fit(Xlearn,Ylearn) #print optimal parameter set print ("Optimal Parameters:", clf.best_params_) #make predictions using model Yhat=clf.predict(Xtest) #expected outcomes, using the model #Yhat=clf.predict(Xlearn) #see if it can predict the training ones right at all (if not, 3 features are currently garbage) Yd=clf.decision_function(Xtest) #changed Xtest to Xlearn # decision_function is similar to predict_proba, but for LinearSVC (bigger # means comp more confident about it's prediction; closer to 0=less confident) #try adding in function to score data points, to see how far off things are from being marked as recipe or not #score=clf.predict_proba(Xtest) #print(score) #compute precision-recall and plot curve precision=dict() recall=dict() average_precision=dict() for i in range(2): #2 b/c have two target values (recipe or not) precision[i], recall[i],_=precision_recall_curve(Ytest,Yd) #CHANGED TO YLEARN FROM YTEST average_precision[i]=average_precision_score(Ytest,Yd)
fig = visualize_cv(clf) plt.show() elif action == 3: # run svm on test set assert len( sys.argv) > 4 and sys.argv[3] == '--label', 'Label unspecified' label = int(sys.argv[4]) with open(model_path.format(label), 'rb') as f: clf = pickle.load(f) with open(scaler_path, 'rb') as f: mas = pickle.load(f) X = np.load(feature_test_path) Y = np.load(label_test_path) X_scaled = mas.transform(X) # probs = clf.predict_proba(X) probs = clf.decision_function(X_scaled) metrics = eval_prediction(probs, Y[:, label]) # plt.clf() # plt.xlabel('Recall') # plt.ylabel('Precision') # plt.ylim([0, 1.05]) # plt.xlim([0, 1.]) # plt.plot(metrics['pr'].recall, metrics['pr'].precision, lw=1) # plt.show() print(metrics) print('> AP = ' + str(metrics['ap'])) ''' elif action == 2: # features visualization assert len(sys.argv) > 4 and sys.argv[3] == '--label', 'Label unspecified' label = int(sys.argv[4])
print("AUC for SVC: {:.3f}".format(svc_auc)) #在模型选择中使用评估指标 roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9, scoring="roc_auc") print("AUC scoring: {}".format(roc_auc)) X_train, X_test, y_train, y_test = train_test_split( digits.data, digits.target == 9, random_state=0) param_grid = {'gamma': [0.0001, 0.01, 0.1, 1, 10]} grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc") grid.fit(X_train, y_train) print("\nGrid-Search with AUC") print("Best parameters:", grid.best_params_) print("Best cross-validation score (AUC): {:.3f}".format(grid.best_score_)) print("Test set AUC: {:.3f}".format( roc_auc_score(y_test, grid.decision_function(X_test)))) print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test))) ###算法链与管道,将模型、预处理、数据划分集合在一起,数据划为训练部分、验证部分、测试部分 #简单管道 from sklearn.preprocessing import MinMaxScaler from sklearn.pipeline import Pipeline pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())]) #scaler是MinMaxScaler()的实例,svm是SVC()的实例 pipe.fit(X_train, y_train) print("Test score: {:.2f}".format(pipe.score(X_test, y_test))) #在网格搜索中使用管道 from sklearn.preprocessing import MinMaxScaler from sklearn.pipeline import Pipeline pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())]) param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100], #“模型__参数” 'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]} grid = GridSearchCV(pipe, param_grid=param_grid, cv=5) #交叉验证5次
print "X_train shape =", X_train.shape, " y_train shape=", y_train.shape #print "X_test shape =", X==.shape, " y_test shape=", y_test.shape print """ The following lines train the SVM using our extracted training dataset and is parameterized based on the gridding results. Then the trained SVM is used to carry out predictions on the test data set. The percentage of accuracy predictions is printed """ clf = svm.SVC(kernel='rbf', C=10, gamma = 0.00001, degree = 3.0, coef0 = 0.0).fit(X_train, y_train) print "clf.get_params(deep=True) =", clf.get_params(deep=True) print "clf.score(X_test, y_test) = {0}%".format(int((clf.score(X_test, y_test) * 10000))/100.) print "clf.predict(X_test) = ", clf.predict(X_test) print "clf.decision_function(X_test) = ", clf.decision_function(X_test) print "=======================" print "clf.score(X_train, y_train) = {0}%".format(int((clf.score(X_train, y_train) * 10000))/100.) print "clf.predict(X_train) = ", clf.predict(X_train) print "clf.decision_function(X_train) = ", clf.decision_function(X_train) print "=======================" print print print "#####################################" """ http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC """ print "clf.support_ = ", clf.support_ print print "len(clf.support_) = ", len(clf.support_) print
class TensorParser(object): def train(self, train_set): logger.info("Converting features to list") features = [] values = [] for source, examples in train_set: group_features = list(self.get_features(examples)) values += [feature[1] for feature in group_features] group_features = [feature[0] for feature in group_features] features += group_features logger.info("Total number of instances: %d", len(features)) assert len(features) == len(values) logger.info("Training - building vectors") self.vectorizer = DictVectorizer() vectors = self.vectorizer.fit_transform(features) logger.info("Training classifier") svm = LinearSVC() parameters = {'C': [0.1, 1.0, 10.0, 100.0]} self.classifier = GridSearchCV(svm, parameters, scoring='f1') self.classifier.fit(vectors, values) self.classifier = self.classifier.best_estimator_ logger.info("SVM classes: %r", self.classifier.classes_) feature_scores = self.vectorizer.inverse_transform(self.classifier.coef_) best_features = sorted(feature_scores[0].iteritems(), key=itemgetter(1), reverse=True) logger.debug("Top SVM parameters: %r", best_features[:100]) logger.debug("Top negative SVM parameters: %r", best_features[::-1][:100]) logger.info("Finished training") def test(self, test_set): logger.info("Evaluation on test set") count = 0 results = [] for source, group in test_set: data = self.get_features(group) features, values = zip(*list(data)) vectors = self.vectorizer.transform(features) predictions = self.classifier.decision_function(vectors) best_index = np.argmax(predictions) results.append(group[best_index]) count += 1 if count % 100 == 0: logger.info("Processed %d items", count) return results def get_features(self, examples): for example in examples: features = self.get_example_features(example) yield features, example['score'] > 0.0 def get_example_features(self, example): source_tokens = self.get_sentence_features(example['source']) target_tokens = self.get_sentence_features(example['target']) features = [] for source in source_tokens: for target in target_tokens: features.append(source + ':' + target) return {f: 1.0 for f in features} def get_sentence_features(self, sentence): tokens = tokenize(sentence) return [token for token in tokens if token not in STOPWORDS] def __repr__(self): return type(self).__name__
class TensorSystem(object): def __init__(self, oracle_class=OracleSystem): self.random = random.Random(1) self.connector = Connector() self.possible_connections = None self.oracle_class = oracle_class self.expression_features = {} def set_best_expression_set(self, train_set): expression_counts = Counter() for expressions in self.query_expressions.values(): expression_counts.update(expressions) logger.info("Found %d unique expressions", len(expression_counts)) self.frequent_expressions = set() covered = 0 uncovered_set = train_set while len(uncovered_set) > 0: frequent = expression_counts.most_common(1)[0][0] self.frequent_expressions.add(frequent) logger.info("Most frequent expression: %r", frequent) covered = 0 removed = Counter() new_uncovered_set = [] new_expression_counts = Counter() for query, target in uncovered_set: _, oracle_expressions = self.oracle.get_best_results_and_expressions(query) oracle_expressions = set(oracle_expressions) if frequent not in oracle_expressions and len(oracle_expressions) > 0: new_uncovered_set.append((query, target)) new_expression_counts.update(oracle_expressions) uncovered_set = new_uncovered_set expression_counts = new_expression_counts logger.info("Frequent expressions: %d, uncovered: %d, expressions_remaining: %d", len(self.frequent_expressions), len(uncovered_set), len(expression_counts)) def train(self, train_set): logger.info("Training tensor based classifier") self.oracle = self.oracle_class(train_set) self.query_expressions = {} for query, target in train_set: _, expressions = self.oracle.get_best_results_and_expressions(query) if len(expressions) == 0: continue self.query_expressions[query] = expressions logger.info("Obtained %d items from oracle", len(self.query_expressions)) features = [] values = [] self.set_best_expression_set(train_set) all_features = [] values = [] for query, correct_expressions in self.query_expressions.iteritems(): logger.debug("Building features for query %r, %d correct expressions", query, len(correct_expressions)) query_tokens = self.get_sentence_features(query) for expression in correct_expressions & self.frequent_expressions: features = self.get_query_expression_features(query_tokens, expression) all_features.append(features) values.append(1) for expression in self.frequent_expressions - correct_expressions: features = self.get_query_expression_features(query_tokens, expression) all_features.append(features) values.append(0) self.frequent_expressions = list(self.frequent_expressions) logger.info("Training - building vectors with %d features", len(all_features)) self.vectorizer = DictVectorizer() vectors = self.vectorizer.fit_transform(all_features) logger.info("Training classifier") #svm = LinearSVC(random_state=1, tol=1e-5) svm = LinearSVC(tol=1e-6) parameters = {'C': [0.1, 1.0, 10.0, 100.0]} self.classifier = GridSearchCV(svm, parameters, scoring='f1') self.classifier.fit(vectors, values) logger.info("Best score in cross validation: %f", self.classifier.best_score_) self.classifier = self.classifier.best_estimator_ logger.info("SVM classes: %r", self.classifier.classes_) self.feature_scores = self.vectorizer.inverse_transform(self.classifier.coef_)[0] best_features = sorted(self.feature_scores.iteritems(), key=itemgetter(1), reverse=True) logger.debug("Top SVM parameters: %r", best_features[:100]) logger.debug("Top negative SVM parameters: %r", best_features[::-1][:100]) logger.info("Finished training") def make_features(self, all_query_tokens, all_connections): return 
[self.get_query_expression_features(query_tokens, connection) for query_tokens, connection in zip(all_query_tokens, all_connections)] def get_best_expressions(self, query): query_features = self.get_sentence_features(query) logger.debug("Query features: %r", query_features) all_features = [self.get_query_expression_features(query_features, expression) for expression in self.frequent_expressions] vectors = self.vectorizer.transform(all_features) predictions = self.classifier.decision_function(vectors) best_indices = np.argsort(predictions)[::-1] best_expressions = [self.frequent_expressions[i] for i in best_indices] return best_expressions # random_expressions = list(self.all_expressions) # random.shuffle(random_expressions) # return random_expressions def execute(self, query): logger.debug("Executing query: %r", query) best_expressions = self.get_best_expressions(query) entities = self.connector.get_query_entities(query) for expression in best_expressions: try: result_ids = expression.apply(entities, self.connector.related) except Exception: logger.exception("Exception applying expression") result_ids = [] result = set(self.connector.related.get_names(result) for result in result_ids) logger.debug("Searching for best expression, expression: %r, result: %r", expression, result) if len(result) > 0: return result return set() def get_expression_features(self, expression): if expression in self.expression_features: return self.expression_features[expression] try: connections = [expression.connection] except AttributeError: connections = [expression.expression1.connection, expression.expression2.connection] relations = reduce(list.__add__, [list(c) for c in connections]) connection_names= [self.connector.related.get_names(relation) for relation in relations] logger.info("Connections: %r, Connection names: %s", connections, connection_names) pseudo_sentence = ' '.join(connection_names) features = self.get_sentence_features(pseudo_sentence) self.expression_features[expression] = features return features def get_query_expression_features(self, query, expression): expression_features = self.get_expression_features(expression) return self.get_tensor_features(query, expression_features) def get_tensor_features(self, source_tokens, target_tokens): features = [] for source in source_tokens: for target in target_tokens: features.append(source + ':' + target) return {f: 1.0 for f in features} def get_sentence_features(self, sentence): tokens = tokenize(sentence) return [token for token in tokens if token not in STOPWORDS] def __repr__(self): return type(self).__name__
class TensorSystem(object): def __init__(self, oracle_class=OracleSystem): self.random = random.Random(1) self.connector = Connector() self.possible_connections = None self.oracle_class = oracle_class self.expression_features = {} def set_best_expression_set(self, train_set): expression_counts = Counter() for expressions in self.query_expressions.values(): expression_counts.update(expressions) logger.info("Found %d unique expressions", len(expression_counts)) self.frequent_expressions = set() covered = 0 uncovered_set = train_set while len(uncovered_set) > 0: frequent = expression_counts.most_common(1)[0][0] self.frequent_expressions.add(frequent) logger.info("Most frequent expression: %r", frequent) covered = 0 removed = Counter() new_uncovered_set = [] new_expression_counts = Counter() for query, target in uncovered_set: _, oracle_expressions = self.oracle.get_best_results_and_expressions( query) oracle_expressions = set(oracle_expressions) if frequent not in oracle_expressions and len( oracle_expressions) > 0: new_uncovered_set.append((query, target)) new_expression_counts.update(oracle_expressions) uncovered_set = new_uncovered_set expression_counts = new_expression_counts logger.info( "Frequent expressions: %d, uncovered: %d, expressions_remaining: %d", len(self.frequent_expressions), len(uncovered_set), len(expression_counts)) def train(self, train_set): logger.info("Training tensor based classifier") self.oracle = self.oracle_class(train_set) self.query_expressions = {} for query, target in train_set: _, expressions = self.oracle.get_best_results_and_expressions( query) if len(expressions) == 0: continue self.query_expressions[query] = expressions logger.info("Obtained %d items from oracle", len(self.query_expressions)) features = [] values = [] self.set_best_expression_set(train_set) all_features = [] values = [] for query, correct_expressions in self.query_expressions.iteritems(): logger.debug( "Building features for query %r, %d correct expressions", query, len(correct_expressions)) query_tokens = self.get_sentence_features(query) for expression in correct_expressions & self.frequent_expressions: features = self.get_query_expression_features( query_tokens, expression) all_features.append(features) values.append(1) for expression in self.frequent_expressions - correct_expressions: features = self.get_query_expression_features( query_tokens, expression) all_features.append(features) values.append(0) self.frequent_expressions = list(self.frequent_expressions) logger.info("Training - building vectors with %d features", len(all_features)) self.vectorizer = DictVectorizer() vectors = self.vectorizer.fit_transform(all_features) logger.info("Training classifier") #svm = LinearSVC(random_state=1, tol=1e-5) svm = LinearSVC(tol=1e-6) parameters = {'C': [0.1, 1.0, 10.0, 100.0]} self.classifier = GridSearchCV(svm, parameters, scoring='f1') self.classifier.fit(vectors, values) logger.info("Best score in cross validation: %f", self.classifier.best_score_) self.classifier = self.classifier.best_estimator_ logger.info("SVM classes: %r", self.classifier.classes_) self.feature_scores = self.vectorizer.inverse_transform( self.classifier.coef_)[0] best_features = sorted(self.feature_scores.iteritems(), key=itemgetter(1), reverse=True) logger.debug("Top SVM parameters: %r", best_features[:100]) logger.debug("Top negative SVM parameters: %r", best_features[::-1][:100]) logger.info("Finished training") def make_features(self, all_query_tokens, all_connections): return [ 
self.get_query_expression_features(query_tokens, connection) for query_tokens, connection in zip(all_query_tokens, all_connections) ] def get_best_expressions(self, query): query_features = self.get_sentence_features(query) logger.debug("Query features: %r", query_features) all_features = [ self.get_query_expression_features(query_features, expression) for expression in self.frequent_expressions ] vectors = self.vectorizer.transform(all_features) predictions = self.classifier.decision_function(vectors) best_indices = np.argsort(predictions)[::-1] best_expressions = [self.frequent_expressions[i] for i in best_indices] return best_expressions # random_expressions = list(self.all_expressions) # random.shuffle(random_expressions) # return random_expressions def execute(self, query): logger.debug("Executing query: %r", query) best_expressions = self.get_best_expressions(query) entities = self.connector.get_query_entities(query) for expression in best_expressions: try: result_ids = expression.apply(entities, self.connector.related) except Exception: logger.exception("Exception applying expression") result_ids = [] result = set( self.connector.related.get_names(result) for result in result_ids) logger.debug( "Searching for best expression, expression: %r, result: %r", expression, result) if len(result) > 0: return result return set() def get_expression_features(self, expression): if expression in self.expression_features: return self.expression_features[expression] try: connections = [expression.connection] except AttributeError: connections = [ expression.expression1.connection, expression.expression2.connection ] relations = reduce(list.__add__, [list(c) for c in connections]) connection_names = [ self.connector.related.get_names(relation) for relation in relations ] logger.info("Connections: %r, Connection names: %s", connections, connection_names) pseudo_sentence = ' '.join(connection_names) features = self.get_sentence_features(pseudo_sentence) self.expression_features[expression] = features return features def get_query_expression_features(self, query, expression): expression_features = self.get_expression_features(expression) return self.get_tensor_features(query, expression_features) def get_tensor_features(self, source_tokens, target_tokens): features = [] for source in source_tokens: for target in target_tokens: features.append(source + ':' + target) return {f: 1.0 for f in features} def get_sentence_features(self, sentence): tokens = tokenize(sentence) return [token for token in tokens if token not in STOPWORDS] def __repr__(self): return type(self).__name__
class WangBaseSenser(BaseSenser): """Abstract class for disambiguating relation senses. Attributes: n_y (int): number of distinct classes """ __metaclass__ = abc.ABCMeta # private members PARAM_GRID = {"clf__C": [float(i)/100. for i in xrange(1, 3)]} N_JOBS = -1 def __init__(self, a_clf=None, a_grid_search=False): """Class constructor. Initialize classifier. Args: a_clf (classifier or None): classifier to use or None for default a_grid_search (bool): use grid search for estimating hyper-parameters """ classifier = a_clf or LinearSVC(C=DFLT_C, **DFLT_PARAMS) self._gs = a_grid_search self._model = Pipeline([("vect", DictVectorizer()), ("clf", classifier)]) def train(self, a_train_data, a_dev_data=None, a_n_y=-1, a_i=-1, a_train_out=None, a_dev_out=None): """Method for training the model. Args: a_train_data (tuple[list, dict]): list of training JSON data a_dev_data (tuple[list, dict] or None): list of development JSON data a_n_y (int): number of distinct classes a_i (int): row index for the output predictions a_train_out (np.array or None): predictions for the training set a_dev_out (np.array or None): predictions for the training set Returns: void: Note: updates ``a_train_out`` and ``a_dev_out`` in place """ self.n_y = a_n_y x_train, y_train = self._generate_ts(a_train_data) x_dev, y_dev = self._generate_ts(a_dev_data) # determine cross-validation and grid-search strategy and fit the model if self._gs: if a_dev_data is None or not a_dev_data[0]: cv = StratifiedKFold(y_train, n_folds=NFOLDS, shuffle=True) else: cv = self._devset_cv(y_train, len(y_dev), NFOLDS) x_train = x_train + x_dev y_train = y_train + y_dev scorer = make_scorer(f1_score, average="macro") self._model = GridSearchCV(self._model, self.PARAM_GRID, scoring=scorer, cv=cv, n_jobs=self.N_JOBS, verbose=1) self._model.fit([el[-1] for el in x_train], y_train) # output best hyper-parameters if self._gs: print("Best params:", repr(self._model.best_params_), file=sys.stderr) if a_i >= 0: if a_train_out is not None: if self._gs and a_dev_data and a_dev_data[0]: x_train = x_train[:-len(x_dev)] for i, x_i in x_train: self._predict(x_i, a_train_out[i], a_i) if a_dev_out is not None: for i, x_i in x_dev: self._predict(x_i, a_dev_out[i], a_i) def predict(self, a_rel, a_data, a_ret, a_i): """Method for predicting sense of single relation. Args: a_rel (dict): discourse relation whose sense should be predicted a_data (2-tuple(dict, dict)): list of input JSON data a_ret (np.array): output prediction vector a_i (int): row index in the output vector Returns: void: Note: updates ``a_ret[a_i]`` in place """ feats = self._extract_features(a_rel, a_data[-1]) self._predict(feats, a_ret, a_i) @abc.abstractmethod def _extract_features(self, a_rel, a_parses): """Extract classification features for a given relation. Args: a_rel (dict): discourse relation to extract features for a_parses (dict): parsed sentences Returns: void: """ raise NotImplementedError def _predict(self, a_feats, a_ret, a_i): """Method for predicting sense of single relation. 
Args: a_feats (dict): features of the input instance a_ret (np.array): output prediction vector a_i (int): row index in the output vector Returns: void: updates ``a_ret[a_i]`` in place """ # obtain model's estimates dec = self._model.decision_function(a_feats) if len(dec.shape) > 1: dec = np.mean(dec, axis=0) # normalize using softmax dec = np.exp(dec) exp_ret = np.sum(dec) or 1e10 dec /= exp_ret # map model's classes to original indices for i, ival in enumerate(dec): a_ret[a_i][self._model.classes_[i]] += ival def _free(self): """Free resources used by the model. """ self.n_y = -1 def _generate_ts(self, a_data): """Generate training set. Args: a_data (2-tuple(list, dict)): input data (discourse relations and parses) Returns: tuple(list, list): lists of input features and expected classes """ x, y = [], [] if a_data is None: return (x, y) x_i = y_i = None # generate features for i, irel in a_data[0]: x_i = self._extract_features(irel, a_data[1]) if not x_i: continue x.append((i, x_i)) y_i = np.argmax(irel[SENSE]) y.append(y_i) return (x, y) def _devset_cv(self, a_y_train, a_n_dev, a_n_folds): """Generate train-test split from training and development data. Args: a_y_train (list[int]): list of training instances' tags a_n_dev (int): number of devset instances a_n_folds (int): number of folds Returrns: list[tuple]: list of training/testing folds """ folds = [] n_train = len(a_y_train) dev_ids = [n_train + i for i in xrange(a_n_dev)] # create stratified K-folds over the training data skf = StratifiedKFold(a_y_train, a_n_folds) for train_ids, test_ids in skf: folds.append((train_ids, np.concatenate((test_ids, dev_ids)))) return folds
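The _predict method above turns raw decision_function margins into a normalized distribution and scatters it into a global score row via the model's classes_. A small sketch of the same normalization on toy data; the stabilized softmax (subtracting the max before exponentiating) and the n_labels value are assumptions for illustration and differ slightly from the raw exponentiation used above.

import numpy as np
from sklearn.svm import LinearSVC

X = [[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]
y = [2, 5, 7]                      # class ids are indices into a larger label inventory
model = LinearSVC().fit(X, y)

n_labels = 10
row = np.zeros(n_labels)
dec = model.decision_function([[0.5, 0.5]])[0]   # one margin per class (one-vs-rest)
dec = np.exp(dec - dec.max())                    # stabilized softmax over the margins
dec /= dec.sum() or 1.0
for i, p in enumerate(dec):
    row[model.classes_[i]] += p                  # map model-internal class order to label ids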
print "Ridge Classifier:" # print dtc.best_params_ # print dtc.grid_scores_ print "\tTraining accuracy: " + str(rc.score(Train_feat, yTrain) * 100) print "\tTesting accuracy: " + str(rc.score(Test_feat, yTest) * 100) print confusion_matrix(rc.predict(Test_feat), yTest) # print 'auc: first predict first then real first' # print metrics.roc_auc_score(rc.predict(Test_feat), yTest) # print metrics.roc_auc_score(yTest, rc.predict(Test_feat)) print "\n" # print rc.decision_function(Test_feat).shape # sys.exit() add_to_curve_c(rc.decision_function(Test_feat), metrics.roc_auc_score(yTest, rc.decision_function(Test_feat), average='weighted'),yTrain, yTest, 'Ridge Regression ') # print "sgd Classifier:" # # print dtc.best_params_ # # print dtc.grid_scores_ # print "\tTraining accuracy: " + str(sgdclas.score(Train_feat, yTrain) * 100) # print "\tTesting accuracy: " + str(sgdclas.score(Test_feat, yTest) * 100) # print confusion_matrix(sgdclas.predict(Test_feat), yTest) # print 'auc:' # print metrics.roc_auc_score(sgdclas.predict(Test_feat), yTest) # print "\n" # add_to_curve(sgdclas.predict_proba(Test_feat), metrics.roc_auc_score(yTest, sgdclas.predict_proba(Test_feat)[:,1]),yTrain, yTest, 'SGD based Logistic Regression') print "log regress Classifier:"
def run():
    # Create log file and grab script text
    create_log()
    script_text = get_file_text('run.py')

    # Create output directory if it does not exist
    if not os.path.isdir(OUTPUTS_DIR):
        os.mkdir(OUTPUTS_DIR)

    # The code below follows a performance estimation procedure suggested by the following
    # post on Stack Overflow: https://stats.stackexchange.com/questions/102631/k-fold-cross-validation-of-ensemble-learning

    # Load ROIs
    roi_names = load_roi_names(FILE_HC_SZ)
    rois = {}
    for roi_name in roi_names:
        roi = load_roi(os.path.join(ROIS_DIR, 'hc_sz', roi_name + '_age_matched.txt'))
        for i in roi.index:
            diagnosis = roi.loc[i, 'diagnosis1']
            roi.set_value(i, 'diagnosis1', 0 if diagnosis == 'HC' else 1)
        roi['diagnosis1'] = roi['diagnosis1'].astype(int)
        rois[roi_name] = roi
        log('added ROI: {}'.format(roi_name))

    # Define parameter range for grid search later
    param_grid = [{'C': [2**x for x in range(-5, 15, 2)]}]

    # Get subject IDs and labels
    roi = rois[roi_names[0]]
    subject_ids = roi.index
    subject_labels = roi['diagnosis1']
    log('nr. subjects: {}'.format(len(subject_ids)))

    scores_pred = []
    scores_dist = []
    fold = 1

    # This outer CV loop is meant for averaging scores
    for train, test in StratifiedKFold(subject_labels, n_folds=10, shuffle=True):

        predictions_file = 'outputs/predictions_train{}.txt'.format(fold)
        distances_file = 'outputs/distances_train{}.txt'.format(fold)

        if not os.path.isfile(predictions_file):

            # Create empty tables for holding predictions and distances
            predictions = dict()
            predictions['diagnosis'] = subject_labels[train]
            distances = dict()
            distances['diagnosis'] = subject_labels[train]

            # Run through all ROIs
            for roi_name in roi_names:
                log('calculating out-of-sample predictions for {}'.format(roi_name))

                # Initialize prediction table for this ROI's column
                predictions[roi_name] = []
                distances[roi_name] = []

                # Get training data from the data frame
                X, y = get_xy(rois[roi_name].loc[subject_ids[train]],
                              label_column='diagnosis1',
                              exclude_columns=['diagnosis1', 'diagnosis2'])

                # Use 4-fold CV to get out-of-sample predictions for all training points
                i = 1
                for train1, test1 in StratifiedKFold(subject_labels[train], n_folds=4):
                    # Do grid search to find optimal C parameter
                    classifier = GridSearchCV(SVC(kernel='linear'), param_grid=param_grid, cv=5)
                    classifier.fit(X[train1], y[train1])

                    # Store predictions and distances for this ROI
                    y_pred = classifier.predict(X[test1])
                    predictions[roi_name].extend(y_pred)
                    y_dist = classifier.decision_function(X[test1])
                    distances[roi_name].extend(y_dist)

                    print(' step {} - {}'.format(i, 4))
                    i += 1

            # Save predictions to file
            log('saving file: {}'.format(predictions_file))
            predictions = pd.DataFrame(predictions, index=subject_ids[train])
            predictions.to_csv(predictions_file, index_label='id')

            # Save distances to file
            log('saving file: {}'.format(distances_file))
            distances = pd.DataFrame(distances, index=subject_ids[train])
            distances.to_csv(distances_file, index_label='id')

        # ---------------------

        param_grid_rbf = [{
            'C': [2**x for x in range(-5, 15, 2)],
            'gamma': [2**x for x in range(-15, 4, 2)]}]

        # Train classifier on predictions
        log('training level-2 prediction classifier')
        predictions = pd.read_csv(predictions_file, index_col='id')
        X, y = get_xy(predictions, label_column='diagnosis', exclude_columns=['diagnosis'])
        classifier_pred = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5)
        classifier_pred.fit(X, y)
        log('saving level-2 prediction classifier')
        joblib.dump(classifier_pred, 'outputs/classifier_pred{}.pkl'.format(fold))

        # Train classifier on distances
        log('training level-2 distance classifier')
        distances = pd.read_csv(distances_file, index_col='id')
        X, y = get_xy(distances, label_column='diagnosis', exclude_columns=['diagnosis'])
        classifier_dist = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5)
        classifier_dist.fit(X, y)
        log('saving level-2 distance classifier')
        joblib.dump(classifier_dist, 'outputs/classifier_dist{}.pkl'.format(fold))

        # ---------------------

        # Train each ROI classifier on all training points and save it to disk
        for roi_name in roi_names:
            log('training {} on all training points'.format(roi_name))

            # Skip this step if exported classifier already exists
            classifier_file = 'outputs/classifier_' + roi_name + '_train{}.pkl'.format(fold)
            if os.path.isfile(classifier_file):
                continue

            # Get training data for this fold
            X, y = get_xy(rois[roi_name].loc[subject_ids[train]],
                          label_column='diagnosis1',
                          exclude_columns=['diagnosis1', 'diagnosis2'])

            # Train classifier using grid search
            classifier = GridSearchCV(SVC(kernel='linear'), param_grid=param_grid, cv=5)
            classifier.fit(X, y)

            # Save best classifier to file
            log('saving {} classifier to disk'.format(roi_name))
            joblib.dump(classifier, classifier_file)

        # ---------------------

        # Load ROI classifiers from file
        classifiers = {}
        for roi_name in roi_names:
            classifier_file = 'outputs/classifier_' + roi_name + '_train{}.pkl'.format(fold)
            classifiers[roi_name] = joblib.load(classifier_file)

        # ---------------------

        predictions_test_file = 'outputs/predictions_test{}.txt'.format(fold)
        distances_test_file = 'outputs/distances_test{}.txt'.format(fold)

        if not os.path.isfile(predictions_test_file):

            predictions_test = dict()
            predictions_test['diagnosis'] = subject_labels[test]
            distances_test = dict()
            distances_test['diagnosis'] = subject_labels[test]

            for roi_name in roi_names:
                predictions_test[roi_name] = []
                distances_test[roi_name] = []

                # Get test data from the data frame
                X, y = get_xy(rois[roi_name].loc[subject_ids[test]],
                              label_column='diagnosis1',
                              exclude_columns=['diagnosis1', 'diagnosis2'])

                log('calculating predictions and distances for {}'.format(roi_name))

                # Store predictions and distances
                y_pred = classifiers[roi_name].predict(X)
                predictions_test[roi_name].extend(y_pred)
                y_dist = classifiers[roi_name].decision_function(X)
                distances_test[roi_name].extend(y_dist)

            # Save predictions to file
            log('saving predictions to file')
            predictions_test = pd.DataFrame(predictions_test, index=subject_ids[test])
            predictions_test.to_csv(predictions_test_file, index_label='id')

            # Save distances to file
            log('saving distances to file')
            distances_test = pd.DataFrame(distances_test, index=subject_ids[test])
            distances_test.to_csv(distances_test_file, index_label='id')

        # ---------------------

        # Load prediction classifier and run it on test predictions
        predictions_test = pd.read_csv(predictions_test_file, index_col='id')
        X_test, y_test = get_xy(predictions_test, label_column='diagnosis', exclude_columns=['diagnosis'])
        classifier_pred = joblib.load('outputs/classifier_pred{}.pkl'.format(fold))
        y_pred = classifier_pred.predict(X_test)
        scores_pred.append(accuracy_score(y_test, y_pred))
        log('score: {} (predictions)'.format(scores_pred[-1]))

        # Load distance classifier and run it on test distances
        distances_test = pd.read_csv(distances_test_file, index_col='id')
        X_test, y_test = get_xy(distances_test, label_column='diagnosis', exclude_columns=['diagnosis'])
        classifier_dist = joblib.load('outputs/classifier_dist{}.pkl'.format(fold))
        y_pred = classifier_dist.predict(X_test)
        scores_dist.append(accuracy_score(y_test, y_pred))
        log('score: {} (distances)'.format(scores_dist[-1]))

        fold += 1

    log('overall score: {} (predictions)'.format(np.mean(scores_pred)))
    log('overall score: {} (distances)'.format(np.mean(scores_dist)))

    # Append script to log and close it
    add_text_to_log(script_text)
    finish_log()
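The procedure above is a stacked (two-level) scheme: out-of-fold decision_function outputs from the per-ROI level-1 classifiers become the feature matrix of a level-2 SVC, so the level-2 model is never trained on in-sample distances. A compact sketch of the same idea on synthetic data, written against the current scikit-learn cross-validation API (the two feature "views" stand in for ROIs and are assumptions, not the original data).

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
views = [X[:, :10], X[:, 10:]]          # stand-ins for per-ROI feature sets

# level 1: out-of-fold distances per view, so the level-2 features are not overfit
level2 = np.zeros((len(y), len(views)))
for j, Xv in enumerate(views):
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
    for tr, te in skf.split(Xv, y):
        clf = SVC(kernel='linear').fit(Xv[tr], y[tr])
        level2[te, j] = clf.decision_function(Xv[te])

# level 2: classifier trained on the stacked distances
print(cross_val_score(SVC(kernel='rbf'), level2, y, cv=5).mean())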
grid_predictions = clf.predict(X_test)
# print(confusion_matrix(y_test, grid_predictions))

# decision function on testing data
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=30)
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# create grid to evaluate model
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = grid.decision_function(xy).reshape(XX.shape)

# plot decision boundary and margins
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])

# plot support vectors
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
           s=10, linewidth=1, c='b', facecolors='none', label='Support Vectors')

plt.title("Decision Function after Grid Search")
plt.legend()
plt.savefig('grid-search-decision-function-on-testing-data.png', dpi=600)
plt.show()
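The snippet above draws on both clf and grid; with GridSearchCV the tuned SVC (which actually carries support_vectors_, and to which decision_function delegates given the default refit=True) is reachable as best_estimator_. A short sketch of that access pattern on toy data; the dataset and parameter grid are assumptions for illustration.

from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = make_blobs(n_samples=100, centers=2, random_state=0)
grid = GridSearchCV(SVC(kernel='linear'), {'C': [0.1, 1, 10]}, cv=5)
grid.fit(X, y)

best_svc = grid.best_estimator_          # the winning SVC, refit on all of X, y
print(best_svc.support_vectors_.shape)   # support vectors live on the refit estimator
scores = grid.decision_function(X)       # delegates to best_estimator_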
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("assigned weights to features: ")
print(clf.coef_)

#target_names = ['','']
print(classification_report(y_test, y_pred)) #target_names=target_names

'''
Save classifier
'''
#joblib.dump(clf, 'classifier_cdiff.pkl')

'''
Get distance from the hyperplane
'''
distance_from_boundry = clf.decision_function(X_test)
#print(distance_from_boundry)

if (wanna_see_graphs == 1):
    '''
    Stacked graph of results
    '''
    distances_1s = [i for (i, j) in zip(distance_from_boundry, y_test) if j > 0]
    distances_0s = [i for (i, j) in zip(distance_from_boundry, y_test) if j <= 0]

    graph_val_neg = []
    graph_val_pos = []
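The distances_1s / distances_0s split above feeds a stacked plot of decision_function margins per true class. A minimal sketch of that plot, with synthetic margins standing in for distance_from_boundry and y_test.

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
y_test = rng.randint(0, 2, size=200)
distances = rng.normal(loc=2.0 * y_test - 1.0, scale=1.0)   # fake margins, one mode per class

distances_1s = [d for d, label in zip(distances, y_test) if label > 0]
distances_0s = [d for d, label in zip(distances, y_test) if label <= 0]

# stacked histogram of margins by true class
plt.hist([distances_0s, distances_1s], bins=20, stacked=True,
         label=['class 0', 'class 1'])
plt.xlabel('distance from hyperplane')
plt.legend()
plt.show()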