def _check_classification_report(n_classes=2):
    classifiers = ClassifiersFactory()
    classifiers.add_classifier('gb', GradientBoostingClassifier(n_estimators=10))
    classifiers.add_classifier('rf', RandomForestClassifier())
    classifiers.add_classifier('ada', AdaBoostClassifier(n_estimators=10))

    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    classifiers.fit(X, y)

    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    test_lds = LabeledDataStorage(X, y, sample_weight=None)
    report = classifiers.test_on_lds(test_lds)

    val = numpy.mean(X['column0'])
    labels_dict = None
    if n_classes > 2:
        labels_dict = {i: str(i) for i in range(n_classes)}
    _classification_mask_report(report, "column0 > %f" % val, X, labels_dict)
    _classification_mask_report(report, lambda x: numpy.array(x['column0']) < val, X, labels_dict)
    _classification_mask_report(report, None, X, labels_dict)
    check_classification_learning_curve_masks(report, n_classes=n_classes)
def test_factory():
    factory = ClassifiersFactory()
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns),
                                  parallel_profile='threads-4')
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)

    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'
        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), 2)
        # the last staged prediction must coincide with the final prediction
        assert numpy.all(p == proba[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)
    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    val = numpy.mean(X['column0'])
    check_report_with_mask(report, "column0 > %f" % (val / 2.), X)
    check_report_with_mask(report, lambda x: numpy.array(x['column0']) < val * 2., X)
    check_report_with_mask(report, None, X)
def _test_classification_report(n_classes=2):
    classifiers = ClassifiersFactory()
    classifiers.add_classifier('gb', GradientBoostingClassifier(n_estimators=10))
    classifiers.add_classifier('rf', RandomForestClassifier())
    classifiers.add_classifier('ada', AdaBoostClassifier(n_estimators=10))

    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    classifiers.fit(X, y)

    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    test_lds = LabeledDataStorage(X, y, sample_weight=None)
    report = classifiers.test_on_lds(test_lds)

    val = numpy.mean(X['column0'])
    labels_dict = None
    if n_classes > 2:
        labels_dict = {}
        for i in range(n_classes):
            labels_dict[i] = str(i)
    _classification_mask_report(report, "column0 > %f" % val, X, labels_dict)
    _classification_mask_report(report, lambda x: numpy.array(x['column0']) < val, X, labels_dict)
    _classification_mask_report(report, None, X, labels_dict)
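
# Hypothetical test entry points (not in the original source): a minimal sketch of how
# the report helper above could be driven for the binary and a multiclass case.
def test_classification_report():
    _test_classification_report(n_classes=2)


def test_multiclassification_report():
    _test_classification_report(n_classes=4)
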
class Mayou(Classifier):
    """Classifier for raredecay analysis"""

    __DEFAULT_CLF_CFG = dict(
        xgb=dict(
            n_estimators=450,
            eta=0.1,
            subsample=0.9,
            bagging=None
        ),
        rdf=dict(
            n_estimators=1600,
            max_features='auto',  # only 1 feature seems to be pretty good...
            max_depth=200,
            min_samples_split=250,
            min_samples_leaf=150,
            min_weight_fraction_leaf=0.,
            max_leaf_nodes=None,
            bootstrap=False,
            oob_score=False,
            n_jobs=7,
            class_weight=None,
            bagging=None
        ),
        nn=dict(
            layers=[100, 100],
            hidden_activation='logistic',
            output_activation='linear',
            input_noise=0,  # [0, 1, 2, 3, 4, 5, 10, 20],
            hidden_noise=0,
            input_dropout=0,
            hidden_dropout=0.05,
            decode_from=1,
            weight_l1=0.01,
            weight_l2=0.03,
            scaler='standard',
            trainers=[{'optimize': 'adagrad', 'patience': 2, 'momentum': 0.5,
                       'nesterov': True, 'learning_rate': 0.2, 'min_improvement': 0.01}],
            bagging=None
        ),
        gb=dict(
            learning_rate=0.05,
            n_estimators=500,
            max_depth=4,
            min_samples_split=600,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.,
            subsample=0.8,
            max_features=None,
            max_leaf_nodes=None,
            bagging=None
        ),
    )

    __DEFAULT_BAG_CFG = dict(
        n_estimators=20,
        max_samples=0.9,
        max_features=1.0,
    )

    def __init__(self, base_estimators=None, bagging_base=None, stacking='xgb',
                 features_stack=None, bagging_stack=None, hunting=False,
                 transform=True, transform_pred=True):
        """Stacked classifier: several level-0 base estimators whose predictions
        are combined by a level-1 (stacking) classifier.

        Parameters
        ----------
        base_estimators : dict('clf': classifier OR keyword-parameters)
            Contains all the level-0 classifiers. The key is the name of the
            classifier and the value is either a **prefitted** classifier or a
            dictionary containing the keyword arguments to instantiate such a
            classifier. If no pre-trained classifier is provided, the key has to
            be one of the following:

            - **'xgb'** creates an XGBoost classifier
            - **'rdf'** creates a Random Forest classifier
            - **'erf'** creates a Forest consisting of Extra Randomized Trees
            - **'nn'** creates an artificial Neural Network from TheaNets
            - **'ada'** creates an AdaBoost instance with Decision Trees as basis
            - **'gb'** creates a Gradient Boosted classifier with Decision Trees as basis
        """
        if base_estimators is None:
            self._base_estimators = OrderedDict(self.__DEFAULT_CLF_CFG)
        else:
            self._base_estimators = base_estimators

        if isinstance(stacking, str):
            self._clf_1 = {stacking: None}
        elif isinstance(stacking, dict):
            self._clf_1 = stacking
        elif stacking in (False, None):
            self._clf_1 = False  # no stacking classifier
        else:
            self._clf_1 = {'clf_stacking': stacking}  # stacking is a classifier

        self._transform_data = transform
        # stored under a separate name so it does not shadow the _transform_pred method
        self._transform_pred_data = transform_pred
        self._bagging = bagging_base
        self._hunting = hunting
        self._clf_1_bagging = bagging_stack
        self._features_stack = features_stack
        self._clf_0 = {}
        self._factory = ClassifiersFactory()
        self._base_scaler = None
        self._pred_scaler = None

    def get_params(self, deep=True):
        out = dict(
            base_estimators=None,
            bagging_base=None,
            stacking='xgb',
            features_stack=None,
            bagging_stack=None,
            hunting=False
        )
        return out

    def _transform(self, X, fit=False):
        if self._transform_data:
            columns = copy.deepcopy(X.keys())
            index = copy.deepcopy(X.index)
            if fit:
                self._base_scaler = StandardScaler(copy=True)
                X = self._base_scaler.fit_transform(X)
            else:
                X = self._base_scaler.transform(X)
            X = pd.DataFrame(X, index=index, columns=columns)
        return X

    def _transform_pred(self, X, fit=False):
        if self._transform_pred_data:
            columns = copy.deepcopy(X.keys())
            index = copy.deepcopy(X.index)
            if fit:
                self._pred_scaler = StandardScaler(copy=True)  # don't change data!
                X = self._pred_scaler.fit_transform(X)
            else:
                X = self._pred_scaler.transform(X)
            X = pd.DataFrame(X, index=index, columns=columns)
        return X

    def _make_clf(self, clf, bagging=None):
        """Creates a classifier from a dict or returns the clf"""
        if isinstance(clf, dict):
            key, val = clf.popitem()
            try:
                val = self.__DEFAULT_CLF_CFG[key] if val is None else val
            except KeyError:
                logger.error(str(key) + " not an implemented classifier.")
                raise

            temp_bagging = val.pop('bagging', bagging)
            bagging = temp_bagging if bagging is None else bagging

            if key == 'rdf':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(RandomForestClassifier(**config_clf))
            elif key == 'erf':
                config_clf = dict(val)
                clf = SklearnClassifier(ExtraTreesClassifier(**config_clf))
            elif key == 'nn':
                config_clf = dict(val)
                clf = TheanetsClassifier(**config_clf)
            elif key == 'ada':
                config_clf = dict(val)
                clf = SklearnClassifier(AdaBoostClassifier(**config_clf))
            elif key == 'gb':
                config_clf = dict(val)
                clf = SklearnClassifier(GradientBoostingClassifier(**config_clf))
            elif key == 'xgb':
                config_clf = dict(val)
                clf = XGBoostClassifier(**config_clf)
            elif hasattr(clf, 'fit'):
                bagging = False  # return the classifier

            # bagging over the instantiated estimators
            if isinstance(bagging, int) and bagging >= 1:
                bagging = dict(self.__DEFAULT_BAG_CFG, n_estimators=bagging)
            if isinstance(bagging, dict):
                # TODO: implement multi-threading
                bagging.update({'base_estimator': clf})
                clf = SklearnClassifier(BaggingClassifier(**bagging))
        else:
            raise ValueError(str(clf) + " not valid as a classifier.")

        clf = {key: clf}
        return clf

    def _factory_fit(self, X, y, sample_weight):
        # create classifiers from the initial dictionary
        if self._base_estimators != {}:
            for key, val in self._base_estimators.items():
                clf = self._make_clf({key: val}, bagging=self._bagging)
                self._clf_0.update(clf)
            self._base_estimators = {}

        # add base estimators to factory
        for key, val in self._clf_0.items():
            self._factory.add_classifier(key, val)

        # parallel on factory level -> good mixture of clfs (one uses lots of RAM, one cpu...)
        parallel_profile = 'threads-' + str(min([len(self._factory.items()), globals_.free_cpus()]))

        # fit all classifiers
        print("start fitting factory")
        self._factory.fit(X, y, sample_weight, parallel_profile=parallel_profile)
        return self

    def _factory_predict(self, X):
        index = copy.deepcopy(X.index)

        # parallel on factory level -> good mixture of clfs (one uses lots of RAM, one cpu...)
        parallel_profile = 'threads-' + str(min([len(self._factory.items()), globals_.free_cpus()]))

        # predict, return a dictionary
        predictions = self._factory.predict(X, parallel_profile=parallel_profile)

        # slice the arrays of predictions in the dict
        for key, val in predictions.items():
            predictions[key] = val[:, 1]
        # one column per base classifier, indexed like X
        return pd.DataFrame(predictions, index=index)

    # @profile
    def _factory_predict_proba(self, X):
        index = X.index

        # parallel on factory level -> good mixture of clfs (one uses lots of RAM, one cpu...)
        parallel_profile = 'threads-' + str(min([len(self._factory.items()), globals_.free_cpus()]))
        print(parallel_profile)
        parallel_profile = None  # parallel prediction currently disabled

        # predict, return a dictionary
        predictions = self._factory.predict_proba(X, parallel_profile=parallel_profile)

        # keep only the signal-probability column of every prediction
        for key, val in predictions.items():
            predictions[key] = val[:, 1]
        return pd.DataFrame(predictions, index=index)

    def _get_X_stack(self, X, fit_scaler=False):
        # get the predictions of the base estimators
        lvl_0_proba = pd.DataFrame(self._factory_predict_proba(X), index=X.index,
                                   columns=list(self._factory.keys()))
        lvl_0_proba = self._transform_pred(lvl_0_proba, fit=fit_scaler)

        # add data features to stacking data
        if self._features_stack is not None:
            if self._features_stack == 'all':
                self._features_stack = self.features
            elif not set(self._features_stack).issubset(self.features):
                raise RuntimeError("Stacked features not in features of the data fitted to")
            X_data = pd.DataFrame(X, columns=self._features_stack)
            lvl_0_proba = pd.concat([lvl_0_proba, X_data], axis=1, copy=False)
        return lvl_0_proba

    def _clf_1_fit(self, X, y, sample_weight):
        X_stack = self._get_X_stack(X, fit_scaler=True)
        if self._clf_1 not in (False, None):
            stacker = list(self._clf_1.values())[0]
            if stacker is None or isinstance(stacker, dict):
                self._clf_1 = self._make_clf(self._clf_1, bagging=self._clf_1_bagging)
            self._clf = copy.deepcopy(list(self._clf_1.values())[0])
            self._clf.fit(X_stack, y, sample_weight)

    def _set_features(self, X):
        """Set the 'features' attribute for the classifier"""
        if isinstance(X, pd.DataFrame):
            self.features = X.columns.values
        else:
            self.features = ["Feature_" + str(i) for i in range(X.shape[1])]

    def fit(self, X, y, sample_weight=None):
        # initiate properties
        self._set_features(X)
        self.classes_ = list(range(len(set(y))))

        X = self._transform(X, fit=True)

        # fit the base estimators
        self._factory_fit(X, y, sample_weight)

        # fit the stacking classifier
        self._clf_1_fit(X, y, sample_weight)
        return self

    def predict(self, X):
        # TODO: inside get_X_stack: lvl_0_proba = self._factory_predict_proba(X)
        X = self._transform(X)
        X_stack = self._get_X_stack(X)
        return self._clf.predict(X_stack)

    def predict_proba(self, X):
        X = self._transform(X)
        X_stack = self._get_X_stack(X)
        return self._clf.predict_proba(X_stack)

    def test_on(self, X, y, sample_weight=None):
        lds = LabeledDataStorage(X, y, sample_weight)
        return self.test_on_lds(lds)

    def test_on_lds(self, lds):
        lds.data = self._transform(lds.data)
        return ClassificationReport({'Mayou clf': self}, lds)

    def staged_predict_proba(self, X):
        X = self._transform(X)
        X_stack = self._get_X_stack(X)
        temp_proba = None
        # TODO: change error catching
        try:
            temp_proba = self._clf.staged_predict_proba(X_stack)
        except AttributeError:
            print("error occurred in Mayou, staged_predict_proba not supported")
        return temp_proba

    def stacker_test_on_lds(self, lds):
        """Return report for the stacker only"""
        lds.data = self._get_X_stack(self._transform(lds.data))
        return ClassificationReport({'Mayou stacker': self._clf}, lds)

    def stacker_test_on(self, X, y, sample_weight=None):
        """Return report for the stacker only"""
        lds = LabeledDataStorage(X, y, sample_weight)
        return self.stacker_test_on_lds(lds)
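
# A minimal usage sketch (not part of the original module) showing how Mayou could be
# instantiated and trained. The variables X_train, y_train, w_train, X_test and y_test
# are placeholders (pandas DataFrames / arrays), and the parameter values are
# illustrative only, not tuned defaults.
def _example_mayou_usage(X_train, y_train, w_train, X_test, y_test):
    base_estimators = {
        'xgb': dict(n_estimators=300, eta=0.1),   # built via _make_clf
        'rdf': dict(n_estimators=800, n_jobs=4),
    }
    clf = Mayou(base_estimators=base_estimators, stacking='xgb', features_stack='all')
    clf.fit(X_train, y_train, sample_weight=w_train)
    proba = clf.predict_proba(X_test)             # probabilities from the stacker
    report = clf.test_on(X_test, y_test)          # rep ClassificationReport
    return proba, report
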
if args.verbose:
    fprint("Start loading input")
with open(args.dir + "/train_uniform_reports.pkl", 'rb') as infile:
    reports = pickle.load(infile)
if args.verbose:
    fprint("Finish loading input")

from mods import config_path
config_path()

args.config = args.config.replace(".py", "")
uconfig = getattr(__import__(args.config, fromlist=["uconfig"]), "uconfig")

# check for subset of classifiers
if len(args.classifiers) > 0:
    for rname, report in reports.iteritems():
        est_old = report.estimators
        pred_old = report.prediction
        est_new = ClassifiersFactory()
        pred_new = OrderedDict()
        for classifier in args.classifiers:
            if classifier in est_old:
                est_new[classifier] = est_old[classifier]
                pred_new[classifier] = pred_old[classifier]
            else:
                raise ValueError("Requested classifier " + classifier + " not found in report " + rname)
        report.estimators = est_new
        report.prediction = pred_new

# to serialize plots
repplots = OrderedDict()
matplots = OrderedDict()
trainX = train_data[0]
testX = test_data[0]
trainY = train_data[1]
testY = test_data[1]
trainW = {}
testW = {}
for iw, weight in enumerate(sorted(W)):
    trainW[weight] = train_data[iw + 2]
    testW[weight] = test_data[iw + 2]
if args.verbose:
    fprint("Split data into train_size=" + str(uconfig.training.size) +
           ", test_size=" + str(uconfig.training.size))

# create classifiers
classifiers = ClassifiersFactory()
weights = OrderedDict()

# standard bdt
if "bdt" in uconfig.training.algorithms:
    base_grad = GradientBoostingClassifier(
        max_depth=uconfig.hyper.max_depth,
        n_estimators=uconfig.hyper.n_estimators,
        subsample=uconfig.hyper.subsample,
        learning_rate=uconfig.hyper.learning_rate,
        min_samples_leaf=uconfig.hyper.min_samples_leaf,
    )
    classifiers["bdt"] = SklearnClassifier(base_grad, features=uconfig.features.train)
    weights["bdt"] = trainW[uconfig.training.algorithms["bdt"]]
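
# Sketch of the step that would typically follow (assumed, not shown in this excerpt):
# fit every configured classifier on the training split with its matching per-algorithm
# weights. Each entry is a rep SklearnClassifier, so fit(X, y, sample_weight=...) applies
# the feature selection configured above.
for name in classifiers:
    classifiers[name].fit(trainX, trainY, sample_weight=weights[name])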