def estimate_best_c():
    """Sweep the SVM regularization term C over powers of ten and report accuracy.

    Runs the classifier once per C in {1, 10, 100, 1000} on the Higgs
    training data and logs a DataFrame of training/test scores indexed by C.

    :return: DataFrame with columns ``training_score`` and ``test_score``,
        indexed by ``C``
    """
    data = load_higgs_train()
    rows = []
    for c in (10 ** exponent for exponent in range(4)):
        # run_svm yields (training_score, test_score) for this C setting
        rows.append([c, *run_svm(data=data, regularization_term=c)])
    LOGGER.info('Performed evaluation of the C setting choice')
    column_names = ['C', 'training_score', 'test_score']
    result = pd.DataFrame.from_records(rows, columns=column_names, index='C')
    LOGGER.info(result)
    return result
def estimate_best_gamma():
    """Sweep the SVM RBF gamma parameter and report accuracy for each setting.

    Runs the classifier once per gamma in {0.2, 0.4, 0.6, 0.8} on the Higgs
    training data and logs a DataFrame of training/test scores indexed by gamma.

    :return: DataFrame with columns ``training_score`` and ``test_score``,
        indexed by ``gamma``
    """
    # BUG FIX: np.arange(0.0, 1.0, 0.2) included gamma=0.0, which is not a
    # valid RBF kernel coefficient (scikit-learn rejects gamma <= 0), so the
    # very first run_svm call would fail. Start the sweep at 0.2 instead.
    gamma_range = np.arange(0.2, 1.0, 0.2)
    data = load_higgs_train()
    records = [[gamma] + list(run_svm(data=data, gamma=gamma))
               for gamma in gamma_range]
    LOGGER.info('Performed evaluation of the gamma setting choice')
    columns = ['gamma', 'training_score', 'test_score']
    df = pd.DataFrame.from_records(records, columns=columns, index=columns[0])
    LOGGER.info(df)
    return df
def grid_search_best_parameter(split_dataset, clf, tuned_parameters, scores=('precision', 'recall')):
    """Grid-search *tuned_parameters* for the classifier factory *clf*.

    For each scoring metric in *scores*, fits a cross-validated grid search
    on the training split, logs every grid point, and logs a classification
    report on the test split.

    NOTE(review): this module defines a second, one-argument
    ``grid_search_best_parameter`` later in the file; whichever ``def`` runs
    last wins at import time — confirm which one callers actually want.

    :param split_dataset: dict with ``'training'``/``'test'`` keys, each a dict
        holding ``'features'`` and ``'labels'``
    :param clf: zero-argument classifier factory (e.g. a class like ``svm.SVC``)
    :param tuned_parameters: parameter grid passed to ``GridSearchCV``
    :param scores: scoring metric names to tune for
    :return: DataFrame with one record per (metric, grid point)
    """
    data = []
    for score in scores:
        LOGGER.info("# Tuning hyper-parameters for %s" % score)
        # BUG FIX: the original rebound `clf` to the fitted GridSearchCV, so
        # the second metric's iteration called a GridSearchCV instance as a
        # factory and crashed. Use a distinct local name.
        search = GridSearchCV(clf(), tuned_parameters, cv=5, scoring=score)
        search.fit(split_dataset['training']['features'], split_dataset['training']['labels'])
        LOGGER.info("Best parameters set found on development set:")
        LOGGER.info(search.best_estimator_)
        LOGGER.info("Grid scores on development set:")
        # BUG FIX: the inner loop variable `scores` shadowed the parameter,
        # and `[score, mean_score] + scores + params` concatenated a list
        # with an ndarray and a dict (TypeError). Rename and flatten properly.
        # NOTE(review): `grid_scores_` was removed in scikit-learn 0.20 in
        # favor of `cv_results_` — confirm the pinned sklearn version.
        for params, mean_score, cv_scores in search.grid_scores_:
            LOGGER.info("%0.3f (+/-%0.03f) for %r" % (mean_score, cv_scores.std() / 2, params))
            data.append([score, mean_score] + list(cv_scores) + list(params.values()))
        LOGGER.info("Detailed classification report:")
        LOGGER.info("The model is trained on the full development set.")
        LOGGER.info("The scores are computed on the full evaluation set.")
        y_true, y_pred = split_dataset['test']['labels'], search.predict(split_dataset['test']['features'])
        LOGGER.info(classification_report(y_true, y_pred))
    return pd.DataFrame.from_records(data)
def load_bidding_test():
    """Load the bidding test dataset from ``BIDDING_DATA['test']``.

    :return: DataFrame of the bidding test set
    """
    df = pd.read_csv(BIDDING_DATA['test'])
    # BUG FIX: the log message said "higgs test dataset" (copy-paste from
    # load_higgs_test) although this loads the bidding dataset.
    LOGGER.info('Loaded bidding test dataset of size %s', len(df))
    return df
def load_higgs_test():
    """Load the Higgs test dataset from ``HIGGS_DATA['test']``.

    :return: DataFrame of the Higgs test set
    """
    test_frame = pd.read_csv(HIGGS_DATA['test'])
    LOGGER.info('Loaded higgs test dataset of size %s', len(test_frame))
    return test_frame
def grid_search_best_parameter(data):
    """Grid-search RBF-SVM hyper-parameters (C, gamma) on a Higgs-style dataset.

    Relabels string labels ('b' -> 1, otherwise 0), splits the data, then for
    each scoring metric fits a cross-validated grid search and collects a
    classification report on the held-out split.

    NOTE(review): this redefines ``grid_search_best_parameter`` from earlier
    in the file — at import time this later ``def`` silently replaces the
    four-argument version. One of the two should be renamed.

    :param data: tuple ``(features, weights, labels)``
    :return: dict mapping each scoring metric name to its classification report
    """
    features, weights, labels = data
    labels = np.array([1 if l == 'b' else 0 for l in labels])
    trnfeatures, tstfeatures, trnweights, tstweights, trnlabels, tstlabels = split_dataset(features, weights, labels)
    # Set the parameters by cross-validation
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                         'C': [1, 10, 100, 1000]}]
    metrics = ['precision', 'recall']
    reports = {}
    for score in metrics:
        LOGGER.info("# Tuning hyper-parameters for %s" % score)
        clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring=score)
        clf.fit(trnfeatures, trnlabels)
        LOGGER.info("Best parameters set found on development set:")
        LOGGER.info(clf.best_estimator_)
        LOGGER.info("Grid scores on development set:")
        # FIX: the inner loop previously rebound `scores`, shadowing the list
        # the outer loop iterated — confusing and fragile. Renamed to
        # `cv_scores` (and the outer list to `metrics`) so nothing is shadowed.
        # NOTE(review): `grid_scores_` was removed in scikit-learn 0.20 in
        # favor of `cv_results_` — confirm the pinned sklearn version.
        for params, mean_score, cv_scores in clf.grid_scores_:
            LOGGER.info("%0.3f (+/-%0.03f) for %r" % (mean_score, cv_scores.std() / 2, params))
        LOGGER.info("Detailed classification report:")
        LOGGER.info("The model is trained on the full development set.")
        LOGGER.info("The scores are computed on the full evaluation set.")
        y_true, y_pred = tstlabels, clf.predict(tstfeatures)
        reports[score] = classification_report(y_true, y_pred)
    return reports