def test_clf_default(self):
    clf = utils.clf_default()
    self.assertFalse(clf.estimator.class_weight)
    clf = utils.clf_default([-1, 1])
    self.assertFalse(clf.estimator.class_weight)
    clf = utils.clf_default([0, 1])
    self.assertTrue(clf.estimator.class_weight)
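# Hedged sketch of the behavior that test_clf_default above checks: class
# weighting is enabled only when the label set contains label 0. The real
# utils.clf_default lives elsewhere; the wrapper, defaults, and override
# handling below (OneVsRestClassifier around an SVC) are assumptions chosen
# to satisfy the assertions, not the project's implementation.
def _example_clf_default(y=None, C=2 ** 14, gamma=2 ** -10, probability=False,
                         class_weight=None):
    from sklearn import multiclass, svm

    if y is not None and 0 in y and class_weight is None:
        class_weight = "balanced"  # weight classes when label 0 is present
    return multiclass.OneVsRestClassifier(svm.SVC(
        C=C, gamma=gamma, probability=probability, class_weight=class_weight))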
def simulated_open_world(scenario_obj, auc_bound, binary, bg_size, current_sites):
    '''@return metrics for open world experiment'''
    if current_sites:  # necessary? todo: write test before removal
        scenario_obj.traces = sites.clean(scenario_obj.get_traces(current_sites))
    try:
        scenario_obj = scenario_obj.get_open_world(num=bg_size, same=True,
                                                   current_sites=current_sites)
    except ValueError:
        logging.error("no fitting background set found for %r", scenario_obj)
        raise
    # if binary:
    #     scenario_obj = scenario_obj.binarized()
    X, y, d = scenario_obj.get_features_cumul(current_sites)
    X = preprocessing.MinMaxScaler().fit_transform(X)  # scaling is idempotent
    (clf_noprob, accuracy, _) = fit.my_grid(X, y, auc_bound=auc_bound)
    y_pred = model_selection.cross_val_predict(
        clf_noprob, X, y, cv=config.FOLDS, n_jobs=config.JOBS_NUM)
    confmat = metrics.confusion_matrix(y, y_pred)
    (tpr, fpr, tpa) = mymetrics.tpr_fpr_tpa(_binmat(confmat))[1]
    C = clf_noprob.estimator.C
    gamma = clf_noprob.estimator.gamma
    if binary:  # can (easily) compute auroc
        clf = utils.clf_default(y, C=C, gamma=gamma, probability=True)
        y_pred = model_selection.cross_val_predict(
            clf, X, y, cv=config.FOLDS, n_jobs=config.JOBS_NUM,
            method="predict_proba")
        auroc = metrics.roc_auc_score(
            mymetrics.binarize(y, transform_to=1),
            mymetrics.binarize_probability(y_pred)[:, 1],
            max_fpr=auc_bound)
    else:
        auroc = None
    return (tpr, fpr, auroc, C, gamma, accuracy, y, y_pred, d)
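# Illustrative, self-contained sketch (not part of the pipeline above): how the
# bounded AUROC used in simulated_open_world can be computed with plain
# scikit-learn. The synthetic data and the already-binary labels stand in for
# the project's mymetrics.binarize/binarize_probability helpers, whose exact
# behavior is assumed here: foreground maps to 1 and the score is the
# predicted foreground probability.
def _example_bounded_auroc():
    import numpy as np
    from sklearn import metrics, model_selection, preprocessing, svm

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 5))
    y = rng.randint(0, 2, size=100)          # 1 = foreground, 0 = background
    X = preprocessing.MinMaxScaler().fit_transform(X)
    clf = svm.SVC(probability=True, gamma="scale")
    y_prob = model_selection.cross_val_predict(
        clf, X, y, cv=3, method="predict_proba")
    # max_fpr restricts the ROC integration range, mirroring the auc_bound
    # argument above (supported by newer scikit-learn versions).
    return metrics.roc_auc_score(y, y_prob[:, 1], max_fpr=0.01)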
def sci_grid(X, y, C=2**14, gamma=2**-10, step=2, scoring=None,
             probability=False, simple=False):
    '''(scikit-)grid-search on fixed params, searching laterally and in depth

    @param X, y, C, gamma: as for the classifier
    @param step: exponential step size, c-range = [c/2**step, c, c*2**step], etc.
    @param scoring, probability: passed on to the grid search and classifier
    @return gridsearchcv classifier (with .best_score_ and .best_params_)

    >>> test = sci_grid([[1, 0], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1]],
    ...                 [0, 0, 0, 1, 1, 1], 0.0001, 0.000001); test.best_score_
    1.0
    '''
    if simple:
        clf = model_selection.GridSearchCV(
            estimator=utils.clf_default(y, probability=True),
            param_grid={
                "estimator__C": np.logspace(-3, 3, base=2, num=7),
                "estimator__gamma": np.logspace(-3, 3, base=2, num=7)
            },
            n_jobs=config.JOBS_NUM, verbose=config.VERBOSE,
            cv=config.FOLDS, scoring="roc_auc")
        clf.fit(X, y)
        return clf
    previous = []
    clf = _sci_fit(C, gamma, step, X, y, scoring, probability)
    while not _stop(y, step, clf.best_score_, previous):
        logging.info('C: %s, gamma: %s, step: %s, score: %f',
                     clf.best_params_['estimator__C'],
                     clf.best_params_['estimator__gamma'],
                     step, clf.best_score_)
        if _sci_best_at_border(clf):
            pass  # keep step size, search laterally
        else:
            step = step / 2.
        previous.append(clf.best_score_)
        clf = _sci_fit(clf.best_params_['estimator__C'],
                       clf.best_params_['estimator__gamma'],
                       step, X, y, scoring, probability)
    return clf
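# Illustrative sketch of the search-range construction that sci_grid's
# docstring describes. The real helper is _search_range (defined elsewhere),
# so this reimplementation is an assumption about its behavior, not a copy:
# around a center value, span one exponential step down and one up.
def _example_search_range(center, step):
    """Return [center / 2**step, center, center * 2**step]."""
    return [center / 2 ** step, center, center * 2 ** step]

# e.g. _example_search_range(2 ** 14, 2) == [4096.0, 16384, 65536]
# sci_grid keeps the step size while the optimum sits at a border of this
# range (search laterally) and halves it otherwise (search in depth).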
def _sci_fit(C, gamma, step, X, y, scoring=None, probability=False):
    '''@return appropriate gridsearchcv, fitted with X and y'''
    cs = _search_range(C, step)
    gammas = _search_range(gamma, step)
    clf = model_selection.GridSearchCV(
        estimator=utils.clf_default(y, probability=probability),
        param_grid={"estimator__C": cs, "estimator__gamma": gammas},
        n_jobs=config.JOBS_NUM, verbose=config.VERBOSE,
        cv=config.FOLDS, scoring=scoring)
    return clf.fit(X, y)
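# Minimal, self-contained sketch of the kind of search _sci_fit sets up. The
# "estimator__" prefixes in param_grid imply that utils.clf_default returns a
# wrapper whose inner SVM is exposed as .estimator; OneVsRestClassifier around
# an SVC is used here as a stand-in for that wrapper, which is an assumption,
# not the project's actual clf_default.
def _example_sci_fit(X, y):
    from sklearn import model_selection, multiclass, svm

    wrapper = multiclass.OneVsRestClassifier(svm.SVC())
    grid = model_selection.GridSearchCV(
        estimator=wrapper,
        param_grid={"estimator__C": [2 ** 12, 2 ** 14, 2 ** 16],
                    "estimator__gamma": [2 ** -12, 2 ** -10, 2 ** -8]},
        cv=3)
    return grid.fit(X, y)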
def my_grid(X, y, C=2**4, gamma=2**-4, step=2, results=None,
            auc_bound=None, previous=None, folds=config.FOLDS, delta=0.01):
    '''@param results: previously computed results {(C, gamma): accuracy, ...}
    @param auc_bound: if set, use the bounded auc score with this y_bound
    @return Result(clf, best_score_, results) (namedtuple, see above)'''
    global scaler
    if not results:
        previous = []
        results = {}
        scaler = None  # guesstimate: one grid search per data set (TD: refact)
    bestclf = None
    bestres = np.array([0])
    for c in _search_range(C, step):
        for g in _search_range(gamma, step):
            clf = utils.clf_default(y, gamma=g, C=c,
                                    probability=bool(auc_bound))
            if (c, g) in results:
                current = results[(c, g)]
            else:
                if auc_bound:
                    current = _bounded_auc_eval(X, y, clf, auc_bound)
                else:
                    current = _eval(X, y, clf, folds=folds)
                results[(c, g)] = current
            if not bestclf or bestres < current:
                bestclf = clf
                bestres = current
            logging.info('c: %8s g: %15s res: %.6f', c, g, current.mean())
    previous.append(np.mean(bestres))
    if _stop(y, step, np.mean(bestres), previous, C):
        if collections.Counter(results.values())[bestres] > 1:
            logging.warn('more than 1 optimal result, "middle" clf returned')
            best_C, best_gamma = _middle(results, np.mean(bestres))
            bestclf = utils.clf_default(y, C=best_C, gamma=best_gamma,
                                        probability=bool(auc_bound))
        logging.info('grid result: %s', bestclf)
        return Result(bestclf, np.mean(bestres), results)
    best_C = best_gamma = None
    if collections.Counter(results.values())[bestres] > 1:
        logging.warn("more than 1 optimal result")
        best_C, best_gamma = _middle(results, bestres)
    elif ((bestclf.estimator.C in (_search_range(C, step)[0],
                                   _search_range(C, step)[-1])
           or bestclf.estimator.gamma in (_search_range(gamma, step)[0],
                                          _search_range(gamma, step)[-1]))
          and collections.Counter(results.values())[bestres] == 1):
        logging.warn('optimal at border. c:%f, g:%f, score: %f',
                     bestclf.estimator.C, bestclf.estimator.gamma, bestres)
    else:
        step /= 2.
    return my_grid(X, y,
                   best_C or bestclf.estimator.C,
                   best_gamma or bestclf.estimator.gamma,
                   step, results, previous=previous, auc_bound=auc_bound)
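# Hedged sketch of the tie-breaking step in my_grid: when several (C, gamma)
# pairs share the best score, a "middle" pair is returned. The real helper is
# _middle (defined elsewhere); this version only illustrates one plausible
# reading -- pick the median C and median gamma among the tied optima -- and
# is an assumption, not the project's implementation.
def _example_middle(results, best_score):
    tied = [(c, g) for (c, g), score in results.items() if score == best_score]
    cs = sorted(c for c, _ in tied)
    gammas = sorted(g for _, g in tied)
    return cs[len(cs) // 2], gammas[len(gammas) // 2]

# e.g. _example_middle({(1, 1): 0.9, (2, 2): 0.9, (4, 4): 0.9}, 0.9) == (2, 2)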
def get_classifier(self, probability=True):
    '''@return classifier that achieved this result'''
    return utils.clf_default(
        C=self.C, gamma=self.gamma,
        class_weight=None if self.open_world else "balanced",
        probability=probability)