def feat_importance(X, cont, clf=None, n_estimators=1000, n_splits=10, max_samples=1., num_threads=24, pct_embargo=0., scoring='accuracy', method='SFI', min_w_leaf=0., **kwargs):
    """Compute feature importance for a financial ML classifier.

    Fits (by default) a bagged ensemble of single-feature decision trees and
    scores each feature with one of three schemes:

    - 'MDI': in-sample Mean Decrease Impurity from the fitted ensemble.
    - 'MDA': out-of-sample Mean Decrease Accuracy via purged/embargoed CV.
    - 'SFI': Single Feature Importance, one purged CV run per feature,
      parallelized across features.

    Parameters
    ----------
    X : pd.DataFrame of features.
    cont : pd.DataFrame with columns 'bin' (labels), 'w' (sample weights)
        and 't1' (label end times, used for purging/embargo).
    clf : optional pre-built classifier; when None a bagged decision-tree
        ensemble with max_features=1 per tree is created.
    n_estimators, max_samples, min_w_leaf : default-ensemble hyperparameters.
    n_splits, pct_embargo, scoring : purged cross-validation settings.
    num_threads : worker count for the SFI feature-level parallelism.
    method : 'MDI', 'MDA' or 'SFI'.
    **kwargs : accepted for interface compatibility; unused here.

    Returns
    -------
    (imp, oob, oos) : importance DataFrame, out-of-bag score (or None when
        the fitted classifier exposes no ``oob_score_``), and the mean
        out-of-sample CV score.

    Raises
    ------
    ValueError : if ``method`` is not one of 'MDI', 'MDA', 'SFI'.
    """
    n_jobs = (-1 if num_threads > 1 else 1)
    # Build the default ensemble: one feature per tree so that MDI importance
    # is not diluted by masking effects between correlated features.
    if clf is None:
        base_clf = DecisionTreeClassifier(criterion='entropy',
                                          max_features=1,
                                          class_weight='balanced',
                                          min_weight_fraction_leaf=min_w_leaf)
        clf = BaggingClassifier(base_estimator=base_clf,
                                n_estimators=n_estimators,
                                max_features=1.,
                                max_samples=max_samples,
                                oob_score=True,
                                n_jobs=n_jobs)
    fit_clf = clf.fit(X, cont['bin'], sample_weight=cont['w'].values)
    # A user-supplied clf may not track an out-of-bag score.
    oob = fit_clf.oob_score_ if hasattr(fit_clf, 'oob_score_') else None
    if method == 'MDI':
        imp = feat_imp_MDI(fit_clf, feat_names=X.columns)
        oos = cv_score(clf, X=X, y=cont['bin'], n_splits=n_splits,
                       sample_weight=cont['w'], t1=cont['t1'],
                       pct_embargo=pct_embargo, scoring=scoring).mean()
    elif method == 'MDA':
        imp, oos = feat_imp_MDA(clf, X=X, y=cont['bin'], n_splits=n_splits,
                                sample_weight=cont['w'], t1=cont['t1'],
                                pct_embargo=pct_embargo, scoring=scoring)
    elif method == 'SFI':
        cv_gen = PurgedKFold(n_splits=n_splits, t1=cont['t1'],
                             pct_embargo=pct_embargo)
        # Fix: take the mean of the per-fold scores so that `oos` is a scalar
        # for every method (the MDI branch already does this).
        oos = cv_score(clf, X=X, y=cont['bin'], sample_weight=cont['w'],
                       scoring=scoring, cv_gen=cv_gen).mean()
        # SFI parallelizes across features via mp_pandas_obj; force the
        # classifier itself to be single-threaded to avoid nested parallelism.
        clf.n_jobs = 1
        imp = mp_pandas_obj(aux_feat_imp_SFI, ('feat_names', X.columns),
                            num_threads, clf=clf, X=X, cont=cont,
                            scoring=scoring, cv_gen=cv_gen)
    else:
        # Fail fast with a clear message instead of a NameError at `return`.
        raise ValueError("method must be 'MDI', 'MDA' or 'SFI', got {!r}".format(method))
    return imp, oob, oos
def main(options):
    """Train and serialize a robust ("treant") decision-tree forest.

    Loads the train/valid/test splits, builds an attacker from a rules file,
    pre-computes attacks on the training instances, then trains one of three
    model types selected by ``options['model_type']``:

    - 'robust':     project-native ``rf.RobustForest`` of robust trees.
    - 'par-robust': sklearn ``BaggingClassifier`` over a robust tree
                    (parallel fitting via ``options['jobs']``).
    - 'icml2019':   same bagging setup but with the ICML-2019 split
                    optimizer variant and ``affine=False`` trees.

    ``options`` is a dict of CLI/config settings (paths, hyperparameters,
    attacker budget, feature exclusions). The trained model is written to
    ``options['output_dirname']``.
    """
    logger = configure_logging(options['dataset_name'])
    logger.info(
        "==> Loading training, validation, and test set from {}, {}, and {}".
        format(options['training_set'], options['valid_set'],
               options['test_set']))
    train, valid, test = load_atk_train_valid_test(options['training_set'],
                                                   options['valid_set'],
                                                   options['test_set'])
    # Datasets are laid out as (instances x features+label); last column is
    # the label.
    logger.info(
        "- Shape of the training set: number of instances = {}; number of features = {} ({} is the label)"
        .format(train.shape[0], train.shape[1] - 1, train.shape[1]))
    logger.info(
        "- Shape of the validation set: number of instances = {}; number of features = {} ({} is the label)"
        .format(valid.shape[0], valid.shape[1] - 1, valid.shape[1]))
    logger.info(
        "- Shape of the test set: number of instances = {}; number of features = {} ({} is the label)"
        .format(test.shape[0], test.shape[1] - 1, test.shape[1]))
    logger.info("==> Extract column names and numerical features...")
    # column names
    colnames = train.columns.tolist()
    # Attack rules are resolved against the column names of this dataset.
    logger.info("==> Loading attack rules from {}".format(
        options['attack_rules_filename']))
    attack_rules = rf.load_attack_rules(options['attack_rules_filename'],
                                        colnames)
    logger.info("==> Create the corresponding attacker...")
    attacker = rf.Attacker(attack_rules, options['attacker_budget'])
    logger.info(
        "==> Extract feature matrix from {} instances of training set".format(
            options['n_instances']))
    # Only the first n_instances rows are used for training.
    X_train = train.iloc[:, :-1].values[:options['n_instances']]
    logger.info(
        "==> Extract label vector from {} instances of training set".format(
            options['n_instances']))
    # Labels are remapped from {-1, 1} to {0, 1}.
    y_train = train.iloc[:, -1].replace(-1, 0).values[:options['n_instances']]
    # Pre-compute (and cache to disk) the attacks on the training set; the
    # cache file name embeds the attacker budget.
    attacker.attack_dataset(X_train,
                            attacks_filename='{}_B{}.atks'.format(
                                options['attacks_filename'],
                                str(options['attacker_budget'])))
    # Optionally exclude features from training, keyed by column index.
    feature_blacklist = {}
    if options['exclude_features']:
        logger.info(
            "==> Excluding the following features from training: [{}]".format(
                ", ".join([f for f in options['exclude_features']])))
        feature_blacklist_names = options['exclude_features']
        feature_blacklist_ids = [
            colnames.index(fb) for fb in feature_blacklist_names
        ]
        feature_blacklist = dict(
            zip(feature_blacklist_ids, feature_blacklist_names))
    # Dataset name is recovered from the attacks file path:
    # <dir>/<dataset>_... -> <dataset>.
    dataset_name = options['attacks_filename'].split('/')[-1].split('_')[0]
    output_model_filename = build_output_model_filename(
        dataset_name, options['model_type'], options['loss_function'],
        options['n_estimators'], options['max_depth'],
        options['instances_per_node'], options['attacker_budget'])
    # Base name (without extension) used for intermediate tree dumps.
    partial_output_model_filename = output_model_filename.split('.')[0]
    # changes implemented here in regard to optimizer options
    logger.info(
        "==> Create the split optimizer which will be used for this training..."
    )
    # NOTE(review): both branches construct the identical optimizer, and any
    # other loss_function value leaves `optimizer` unbound (NameError later
    # in the 'robust'/'par-robust' branches) — worth confirming and
    # collapsing to a single unconditional assignment.
    if options['loss_function'] == 'sse':
        optimizer = rf.SplitOptimizer(
            split_function_name=options['loss_function'])
    if options['loss_function'] == 'logloss':
        optimizer = rf.SplitOptimizer(
            split_function_name=options['loss_function'])
    #logger.info("==> Create the split optimizer which will be used for this training...")
    if options['model_type'] == 'robust':
        logger.info("==> Training \"{}\" random forest...".format(
            options['model_type']))
        # base robust tree
        # bootstrap_samples / bootstrap_features are given as percentages.
        rdt = rf.RobustDecisionTree(
            0,
            attacker=attacker,
            split_optimizer=optimizer,
            max_depth=options['max_depth'],
            min_instances_per_node=options['instances_per_node'],
            max_samples=options['bootstrap_samples'] / 100.0,
            max_features=options['bootstrap_features'] / 100.0,
            feature_blacklist=feature_blacklist)
        # create the robust forest
        rrf = rf.RobustForest(0,
                              base_estimator=rdt,
                              n_estimators=options['n_estimators'])
        # Periodically dumps partial forests (every 10 trees) while fitting.
        rrf.fit(X_train,
                y=y_train,
                dump_filename=options['output_dirname'] + '/' +
                partial_output_model_filename,
                dump_n_trees=10)
        logger.info(
            "==> Eventually, serialize the \"{}\" random forest just trained to {}"
            .format(options['model_type'],
                    options['output_dirname'] + '/' + output_model_filename))
        rrf.save(options['output_dirname'] + '/' + output_model_filename)
    if options['model_type'] == 'par-robust':
        # Parallel variant: wrap the robust tree in a sklearn bagger.
        from sklearn.ensemble import BaggingClassifier
        logger.info("training \"{}\" treant".format(options['model_type']))
        import time
        start_time = time.time()
        # base robust tree
        rdt = rf.RobustDecisionTree(
            0,
            attacker=attacker,
            split_optimizer=optimizer,
            max_depth=options['max_depth'],
            min_instances_per_node=options['instances_per_node'],
            max_samples=options['bootstrap_samples'] / 100.0,
            max_features=options['bootstrap_features'] / 100.0,
            feature_blacklist=feature_blacklist)
        # Sampling is delegated to the robust tree itself, so the bagger
        # runs without bootstrap and at full feature/sample fractions.
        bagging = BaggingClassifier(base_estimator=rdt,
                                    n_estimators=options['n_estimators'],
                                    max_features=1.0,
                                    max_samples=1.0,
                                    bootstrap=False,
                                    bootstrap_features=False,
                                    n_jobs=options['jobs'])
        bagging.fit(X_train, y_train)
        # do some cleaning and prepare to evaluation
        bagging.n_jobs = None
        bagging.base_estimator_.clean_after_training()
        save(bagging, options['output_dirname'] + '/' + output_model_filename,
             options['n_estimators'])
        print("--- %s seconds ---" % (time.time() - start_time))
    if options['model_type'] == 'icml2019':
        logger.info(
            "==> Create the split optimizer which will be used for this training..."
        )
        # ICML-2019 variant uses its own optimizer flag and non-affine trees.
        icml_optimizer = rf.SplitOptimizer(
            split_function_name=options['loss_function'], icml2019=True)
        logger.info("training \"{}\" treant".format(options['model_type']))
        from sklearn.ensemble import BaggingClassifier
        # base robust tree
        # NOTE(review): this branch duplicates the 'par-robust' setup except
        # for the optimizer and affine=False — a shared helper would remove
        # the duplication.
        rdt = rf.RobustDecisionTree(
            0,
            attacker=attacker,
            split_optimizer=icml_optimizer,
            max_depth=options['max_depth'],
            min_instances_per_node=options['instances_per_node'],
            max_samples=options['bootstrap_samples'] / 100.0,
            max_features=options['bootstrap_features'] / 100.0,
            feature_blacklist=feature_blacklist,
            affine=False)
        bagging = BaggingClassifier(base_estimator=rdt,
                                    n_estimators=options['n_estimators'],
                                    max_features=1.0,
                                    max_samples=1.0,
                                    bootstrap=False,
                                    bootstrap_features=False,
                                    n_jobs=options['jobs'])
        bagging.fit(X_train, y_train)
        # do some cleaning and prepare to evaluation
        bagging.n_jobs = None
        bagging.base_estimator_.clean_after_training()
        save(bagging, options['output_dirname'] + '/' + output_model_filename,
             options['n_estimators'])