예제 #1
0
 def test_clf_default(self):
     '''clf_default sets class_weight for labels [0, 1] only

     Without labels, or with labels [-1, 1], the wrapped estimator's
     class_weight must be falsy.
     '''
     # cases that must not produce a class weighting
     for label_args in ((), ([-1, 1],)):
         unweighted = utils.clf_default(*label_args)
         self.assertFalse(unweighted.estimator.class_weight)
     # the [0, 1] label set must produce one
     weighted = utils.clf_default([0, 1])
     self.assertTrue(weighted.estimator.class_weight)
예제 #2
0
def simulated_open_world(scenario_obj, auc_bound, binary, bg_size,
                         current_sites):
    '''Run an open-world experiment on a scenario and collect its metrics.

    @param scenario_obj scenario to evaluate; if current_sites is truthy, its
        .traces attribute is overwritten with the cleaned traces of those sites
    @param auc_bound forwarded to the grid search, and used as max_fpr when
        the auroc is computed
    @param binary if truthy, additionally compute the auroc
    @param bg_size passed as num= to scenario_obj.get_open_world
    @param current_sites forwarded to trace, open-world and feature extraction

    @return (tpr, fpr, auroc, C, gamma, accuracy, y, y_pred, d); auroc is None
        unless binary is truthy, in which case y_pred holds the predict_proba
        output instead of the predicted labels
    @raise ValueError re-raised (after logging) from get_open_world
    '''
    if current_sites:  # necessary? todo: write test before removal
        raw_traces = scenario_obj.get_traces(current_sites)
        scenario_obj.traces = sites.clean(raw_traces)
    try:
        open_world = scenario_obj.get_open_world(
            num=bg_size, same=True, current_sites=current_sites)
    except ValueError:
        logging.error("no fitting background set found for %r", scenario_obj)
        raise
    # NOTE: binarizing the scenario (open_world.binarized()) when `binary`
    # is set is currently disabled.
    X, y, d = open_world.get_features_cumul(current_sites)
    min_max = preprocessing.MinMaxScaler()
    X = min_max.fit_transform(X)  # scaling is idempotent

    # grid search without probability estimates, then cross-validated labels
    clf_noprob, accuracy, _ = fit.my_grid(X, y, auc_bound=auc_bound)
    y_pred = model_selection.cross_val_predict(
        clf_noprob, X, y, cv=config.FOLDS, n_jobs=config.JOBS_NUM)
    binary_confmat = _binmat(metrics.confusion_matrix(y, y_pred))
    tpr, fpr, _tpa = mymetrics.tpr_fpr_tpa(binary_confmat)[1]

    best_estimator = clf_noprob.estimator
    C = best_estimator.C
    gamma = best_estimator.gamma

    auroc = None
    if binary:  # can (easily) compute auroc
        # refit with the found parameters, this time with probabilities
        clf = utils.clf_default(y, C=C, gamma=gamma, probability=True)
        y_pred = model_selection.cross_val_predict(
            clf, X, y, cv=config.FOLDS, n_jobs=config.JOBS_NUM,
            method="predict_proba")
        y_true_bin = mymetrics.binarize(y, transform_to=1)
        positive_proba = mymetrics.binarize_probability(y_pred)[:, 1]
        auroc = metrics.roc_auc_score(
            y_true_bin, positive_proba, max_fpr=auc_bound)
    return (tpr, fpr, auroc, C, gamma, accuracy, y, y_pred, d)
예제 #3
0
def sci_grid(X,
             y,
             C=2**14,
             gamma=2**-10,
             step=2,
             scoring=None,
             probability=False,
             simple=False):
    '''(scikit-)grid-search on fixed params, searching laterally and in depth

    @param X,y,C,gamma as for the classifier
    @param step exponential step size, c-range = [c/2**step, c, c*2**step], etc
    @param scoring scoring passed to every grid-search (ignored if simple)
    @param probability whether the classifier is created with
        probability=True (ignored if simple, which always enables it)
    @param simple if set, run one fixed grid-search over C and gamma in
        2**-3 .. 2**3, scored by "roc_auc", and return it

    @return fitted gridsearchcv classifier (with .best_score_ and
        .best_params_)
    >>> test = sci_grid([[1, 0], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1]], [0, 0, 0, 1, 1, 1], 0.0001, 0.000001); test.best_score_
    1.0
    '''
    if simple:
        clf = model_selection.GridSearchCV(
            estimator=utils.clf_default(y, probability=True),
            param_grid={
                "estimator__C": np.logspace(-3, 3, base=2, num=7),
                "estimator__gamma": np.logspace(-3, 3, base=2, num=7),
            },
            n_jobs=config.JOBS_NUM,
            verbose=config.VERBOSE,
            cv=config.FOLDS,
            scoring="roc_auc")
        clf.fit(X, y)
        return clf

    previous = []
    clf = _sci_fit(C, gamma, step, X, y, scoring, probability)
    while not _stop(y, step, clf.best_score_, previous):
        logging.info('C: %s, gamma: %s, step: %s, score: %f',
                     clf.best_params_['estimator__C'],
                     clf.best_params_['estimator__gamma'], step,
                     clf.best_score_)
        # best at border: keep step and search laterally; else refine
        if not _sci_best_at_border(clf):
            step = step / 2.
        previous.append(clf.best_score_)
        # scoring and probability must be forwarded on every refinement,
        # otherwise later fits silently fall back to the defaults and their
        # best_score_ is not comparable to the first one
        clf = _sci_fit(clf.best_params_['estimator__C'],
                       clf.best_params_['estimator__gamma'], step, X, y,
                       scoring, probability)
    return clf
예제 #4
0
def _sci_fit(C, gamma, step, X, y, scoring=None, probability=False):
    '''Grid-search the ranges around C and gamma once.

    @param C,gamma centers of the searched ranges (see _search_range)
    @param step forwarded to _search_range for both parameters
    @param scoring forwarded to the grid-search
    @param probability forwarded to utils.clf_default

    @return result of fitting the gridsearchcv with X and y
    '''
    search_space = {
        "estimator__C": _search_range(C, step),
        "estimator__gamma": _search_range(gamma, step),
    }
    base_clf = utils.clf_default(y, probability=probability)
    grid = model_selection.GridSearchCV(
        estimator=base_clf,
        param_grid=search_space,
        n_jobs=config.JOBS_NUM,
        verbose=config.VERBOSE,
        cv=config.FOLDS,
        scoring=scoring)
    return grid.fit(X, y)
예제 #5
0
def my_grid(X,
            y,
            C=2**4,
            gamma=2**-4,
            step=2,
            results=None,
            auc_bound=None,
            previous=None,
            folds=config.FOLDS,
            delta=0.01):
    '''Recursive grid search over C and gamma around a moving center.

    Evaluates every (c, gamma) pair in the ranges around C and gamma,
    then either stops (see _stop) or recurses around the best pair: with
    the same step if the best is ambiguous or lies on the border of the
    searched range, with half the step otherwise.

    @param X,y data and labels, as for the classifier
    @param C,gamma centers of the searched ranges
    @param step forwarded to _search_range; halved when refining
    @param results are previously computed results {(C,gamma): accuracy, ...};
        if empty or None, the search starts fresh (previous is reset and the
        module-level scaler is set to None)
    @param auc_bound if set, use the bounded auc score with this y_bound
    @param previous best scores of earlier recursion levels
    @param folds forwarded to _eval (only used if auc_bound is not set)
    @param delta currently unused here; forwarded to recursive calls
    @return Result(clf, best_score_, results) (the Result namedtuple)'''
    global scaler
    if not results:
        previous = []
        results = {}
        scaler = None  # guesstimate: one grid search per data set (TD: refact)
    elif previous is None:
        # results were handed in without a score history
        previous = []
    use_proba = bool(auc_bound)
    # NOTE(review): assumes _search_range is deterministic, so it is
    # computed once per level instead of once per use
    c_range = _search_range(C, step)
    gamma_range = _search_range(gamma, step)
    bestclf = None
    bestres = np.array([0])
    for c in c_range:
        for g in gamma_range:
            clf = utils.clf_default(y, gamma=g, C=c, probability=use_proba)
            if (c, g) in results:
                current = results[(c, g)]
            else:
                if auc_bound:
                    current = _bounded_auc_eval(X, y, clf, auc_bound)
                else:
                    current = _eval(X, y, clf, folds=folds)
                results[(c, g)] = current
            if bestclf is None or bestres < current:
                bestclf = clf
                bestres = current
            logging.info('c: %8s g: %15s res: %.6f', c, g, current.mean())
    best_score = np.mean(bestres)
    previous.append(best_score)
    # how many parameter pairs (over all levels so far) share the best result
    n_best = collections.Counter(results.values())[bestres]
    if _stop(y, step, best_score, previous, C):
        if n_best > 1:
            logging.warning(
                'more than 1 optimal result, "middle" clf returned')
            best_C, best_gamma = _middle(results, best_score)
            bestclf = utils.clf_default(y,
                                        C=best_C,
                                        gamma=best_gamma,
                                        probability=use_proba)
        logging.info('grid result: %s', bestclf)
        return Result(bestclf, best_score, results)
    best_C = best_gamma = None
    best_at_border = (
        bestclf.estimator.C in (c_range[0], c_range[-1])
        or bestclf.estimator.gamma in (gamma_range[0], gamma_range[-1]))
    if n_best > 1:
        logging.warning("more than 1 optimal result")
        best_C, best_gamma = _middle(results, bestres)
    elif best_at_border and n_best == 1:
        # keep the step: the optimum may lie outside the searched range
        logging.warning('optimal at border. c:%f, g:%f, score: %f',
                        bestclf.estimator.C, bestclf.estimator.gamma, bestres)
    else:
        step /= 2.
    # folds and delta are forwarded so that every recursion level uses the
    # caller's settings instead of falling back to the defaults
    return my_grid(X,
                   y,
                   best_C or bestclf.estimator.C,
                   best_gamma or bestclf.estimator.gamma,
                   step,
                   results,
                   previous=previous,
                   auc_bound=auc_bound,
                   folds=folds,
                   delta=delta)
예제 #6
0
 def get_classifier(self, probability=True):
     '''Build a classifier with this result's C and gamma.

     @param probability forwarded to utils.clf_default
     @return classifier from utils.clf_default; class_weight is None for
         open-world results and "balanced" otherwise
     '''
     clf_params = {"C": self.C, "gamma": self.gamma}
     if self.open_world:
         clf_params["class_weight"] = None
     else:
         clf_params["class_weight"] = "balanced"
     clf_params["probability"] = probability
     return utils.clf_default(**clf_params)