Beispiel #1
0
def estimate_best_c():
    """
    Evaluate the svm classifier for several values of the
    regularization term C (1, 10, 100 and 1000).

    The Higgs training data is loaded once and ``run_svm`` is called
    once per C value; the resulting table is logged.

    :return: a DataFrame indexed by 'C' with the columns
             'training_score' and 'test_score'
             (assumes run_svm yields exactly these two scores, in
             that order -- TODO confirm against run_svm)
    """
    data = load_higgs_train()

    rows = []
    for exponent in range(4):
        c_value = 10 ** exponent
        svm_scores = run_svm(data=data, regularization_term=c_value)
        rows.append([c_value] + list(svm_scores))
    LOGGER.info('Performed evaluation of the C setting choice')

    score_table = pd.DataFrame.from_records(
        rows,
        columns=['C', 'training_score', 'test_score'],
        index='C',
    )
    LOGGER.info(score_table)
    return score_table
Beispiel #2
0
def estimate_best_gamma():
    """
    Evaluate the svm classifier for several values of gamma
    (0.0 up to, but excluding, 1.0 in steps of 0.2).

    The Higgs training data is loaded once and ``run_svm`` is called
    once per gamma value; the resulting table is logged.

    :return: a DataFrame indexed by 'gamma' with the columns
             'training_score' and 'test_score'
             (assumes run_svm yields exactly these two scores, in
             that order -- TODO confirm against run_svm)
    """
    data = load_higgs_train()

    # NOTE(review): the sweep starts at gamma == 0.0 -- confirm that
    # run_svm / the underlying classifier accepts a zero gamma.
    rows = []
    for gamma_value in np.arange(0.0, 1.0, 0.2):
        svm_scores = run_svm(data=data, gamma=gamma_value)
        rows.append([gamma_value] + list(svm_scores))
    LOGGER.info('Performed evaluation of the gamma setting choice')

    score_table = pd.DataFrame.from_records(
        rows,
        columns=['gamma', 'training_score', 'test_score'],
        index='gamma',
    )
    LOGGER.info(score_table)
    return score_table
Beispiel #3
0
def grid_search_best_parameter(split_dataset, clf, tuned_parameters, scores=('precision', 'recall')):
    """
    Run a 5-fold cross-validated grid search over ``tuned_parameters``
    once for every scoring metric in ``scores``.

    For each metric the best estimator, the per-setting grid scores and a
    classification report on the test split are logged.

    :param split_dataset: mapping with the keys 'training' and 'test', each
                          holding a mapping with 'features' and 'labels'
    :param clf: zero-argument callable (e.g. an estimator class) that
                returns a fresh estimator to tune
    :param tuned_parameters: parameter grid handed to GridSearchCV
    :param scores: iterable of scoring names handed to GridSearchCV
    :return: a DataFrame with one row per (metric, parameter setting):
             the metric name, the mean validation score, the individual
             fold scores and finally the parameter dict
    """
    training = split_dataset['training']
    test = split_dataset['test']
    data = []

    for score in scores:
        LOGGER.info("# Tuning hyper-parameters for %s", score)
        # Keep the search in its own name: rebinding ``clf`` would make the
        # ``clf()`` call fail on the next metric.
        search = GridSearchCV(clf(), tuned_parameters, cv=5, scoring=score)
        search.fit(training['features'], training['labels'])

        LOGGER.info("Best parameters set found on development set:")
        LOGGER.info(search.best_estimator_)
        LOGGER.info("Grid scores on development set:")
        # NOTE: ``grid_scores_`` only exists in scikit-learn < 0.20; newer
        # releases expose the same information through ``cv_results_``.
        for params, mean_score, cv_scores in search.grid_scores_:
            LOGGER.info("%0.3f (+/-%0.03f) for %r", mean_score, cv_scores.std() / 2, params)
            # Flatten the fold scores and keep the parameter dict as a
            # single trailing cell so every record has the same length.
            data.append([score, mean_score] + list(cv_scores) + [params])

        LOGGER.info("Detailed classification report:")
        LOGGER.info("The model is trained on the full development set.")
        LOGGER.info("The scores are computed on the full evaluation set.")
        y_true, y_pred = test['labels'], search.predict(test['features'])
        LOGGER.info(classification_report(y_true, y_pred))

    # Return only after every metric has been evaluated.
    return pd.DataFrame.from_records(data)
Beispiel #4
0
def load_bidding_test():
    """
    Read the bidding test set from the CSV file at ``BIDDING_DATA['test']``.

    :return: the loaded data as a DataFrame
    """
    df = pd.read_csv(BIDDING_DATA['test'])
    # The message previously said "higgs" (copy-paste from load_higgs_test).
    LOGGER.info('Loaded bidding test dataset of size %s', len(df))
    return df
Beispiel #5
0
def load_higgs_test():
    """
    Read the Higgs test set from the CSV file at ``HIGGS_DATA['test']``.

    :return: the loaded data as a DataFrame
    """
    test_path = HIGGS_DATA['test']
    frame = pd.read_csv(test_path)
    row_count = len(frame)
    LOGGER.info('Loaded higgs test dataset of size %s', row_count)
    return frame
Beispiel #6
0
def grid_search_best_parameter(data):
    """
    Grid-search RBF-kernel SVC hyper-parameters (gamma and C) with 5-fold
    cross-validation, once for 'precision' and once for 'recall'.

    Labels equal to 'b' are encoded as 1, every other label as 0. The data
    is divided by ``split_dataset`` and the search is fitted on the training
    part; progress and grid scores are logged.

    :param data: a (features, weights, labels) triple
    :return: dict mapping each scoring name to the classification report
             computed on the test part
    """
    features, weights, raw_labels = data
    encoded = [1 if label == 'b' else 0 for label in raw_labels]
    labels = np.array(encoded)

    # The weight splits are produced by split_dataset but not used here.
    (train_features, test_features,
     _train_weights, _test_weights,
     train_labels, test_labels) = split_dataset(features, weights, labels)

    # Set the parameters by cross-validation
    param_grid = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000],
    }]

    reports = {}
    for metric in ['precision', 'recall']:
        LOGGER.info("# Tuning hyper-parameters for %s" % metric)
        search = GridSearchCV(svm.SVC(C=1), param_grid, cv=5, scoring=metric)
        search.fit(train_features, train_labels)

        LOGGER.info("Best parameters set found on development set:")
        LOGGER.info(search.best_estimator_)
        LOGGER.info("Grid scores on development set:")
        for setting, mean_score, fold_scores in search.grid_scores_:
            spread = fold_scores.std() / 2
            LOGGER.info("%0.3f (+/-%0.03f) for %r" % (mean_score, spread, setting))

        LOGGER.info("Detailed classification report:")
        LOGGER.info("The model is trained on the full development set.")
        LOGGER.info("The scores are computed on the full evaluation set.")
        predicted = search.predict(test_features)
        reports[metric] = classification_report(test_labels, predicted)
    return reports