Example #1
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# feat_imp_MDI, feat_imp_MDA, cv_score, PurgedKFold, mp_pandas_obj and
# aux_feat_imp_SFI are assumed to be defined elsewhere in this module.
def feat_importance(X, cont, clf=None, n_estimators=1000, n_splits=10, max_samples=1.,
                    num_threads=24, pct_embargo=0., scoring='accuracy',
                    method='SFI', min_w_leaf=0., **kwargs):
    n_jobs = (-1 if num_threads > 1 else 1)
    # Build classifiers
    if clf is None:
        base_clf = DecisionTreeClassifier(criterion='entropy', max_features=1,
                                          class_weight='balanced',
                                          min_weight_fraction_leaf=min_w_leaf)
        clf = BaggingClassifier(base_estimator=base_clf, n_estimators=n_estimators,
                                max_features=1., max_samples=max_samples,
                                oob_score=True, n_jobs=n_jobs)
    fit_clf = clf.fit(X, cont['bin'], sample_weight=cont['w'].values)
    # OOB score is only available for bagged ensembles fitted with oob_score=True
    oob = getattr(fit_clf, 'oob_score_', None)
    if method == 'MDI':  # mean decrease impurity (in-sample)
        imp = feat_imp_MDI(fit_clf, feat_names=X.columns)
        oos = cv_score(clf, X=X, y=cont['bin'], n_splits=n_splits,
                       sample_weight=cont['w'], t1=cont['t1'],
                       pct_embargo=pct_embargo, scoring=scoring).mean()
    elif method == 'MDA':  # mean decrease accuracy (out-of-sample permutation)
        imp, oos = feat_imp_MDA(clf, X=X, y=cont['bin'], n_splits=n_splits,
                                sample_weight=cont['w'], t1=cont['t1'],
                                pct_embargo=pct_embargo, scoring=scoring)
    elif method == 'SFI':  # single feature importance (one feature at a time)
        cv_gen = PurgedKFold(n_splits=n_splits, t1=cont['t1'], pct_embargo=pct_embargo)
        oos = cv_score(clf, X=X, y=cont['bin'], sample_weight=cont['w'],
                       scoring=scoring, cv_gen=cv_gen).mean()
        clf.n_jobs = 1  # parallelize across features via mp_pandas_obj instead
        imp = mp_pandas_obj(aux_feat_imp_SFI, ('feat_names', X.columns),
                            num_threads, clf=clf, X=X, cont=cont,
                            scoring=scoring, cv_gen=cv_gen)
    else:
        raise ValueError('Unknown method: {}'.format(method))
    return imp, oob, oos
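
A minimal usage sketch, assuming the helper functions referenced above (cv_score, feat_imp_MDI, etc.) are importable; the column names, dates, and parameter values below are made up for illustration only:

import numpy as np
import pandas as pd

# Hypothetical setup: X holds the features; cont holds the binary label
# ('bin'), the sample weights ('w') and the label end times ('t1'),
# all indexed by event timestamp.
idx = pd.date_range('2020-01-01', periods=500, freq='B')
X = pd.DataFrame(np.random.randn(500, 5), index=idx,
                 columns=['feat_%d' % i for i in range(5)])
cont = pd.DataFrame({'bin': np.random.randint(0, 2, 500),
                     'w': np.ones(500),
                     't1': idx + pd.Timedelta(days=5)}, index=idx)

imp, oob, oos = feat_importance(X, cont, method='MDI',
                                n_estimators=100, n_splits=5, num_threads=1)
print(imp.head())  # per-feature importances (a DataFrame, per feat_imp_MDI)
print(oob, oos)    # in-sample OOB score vs. purged-CV out-of-sample score
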
def main(options):

    logger = configure_logging(options['dataset_name'])

    logger.info(
        "==> Loading training, validation, and test set from {}, {}, and {}".
        format(options['training_set'], options['valid_set'],
               options['test_set']))
    train, valid, test = load_atk_train_valid_test(options['training_set'],
                                                   options['valid_set'],
                                                   options['test_set'])

    logger.info(
        "- Shape of the training set: number of instances = {}; number of features = {} (column {} is the label)"
        .format(train.shape[0], train.shape[1] - 1, train.shape[1]))

    logger.info(
        "- Shape of the validation set: number of instances = {}; number of features = {} (column {} is the label)"
        .format(valid.shape[0], valid.shape[1] - 1, valid.shape[1]))

    logger.info(
        "- Shape of the test set: number of instances = {}; number of features = {} (column {} is the label)"
        .format(test.shape[0], test.shape[1] - 1, test.shape[1]))

    logger.info("==> Extract column names and numerical features...")
    # column names
    colnames = train.columns.tolist()

    logger.info("==> Loading attack rules from {}".format(
        options['attack_rules_filename']))
    attack_rules = rf.load_attack_rules(options['attack_rules_filename'],
                                        colnames)
    logger.info("==> Create the corresponding attacker...")
    attacker = rf.Attacker(attack_rules, options['attacker_budget'])

    logger.info(
        "==> Extract feature matrix from {} instances of training set".format(
            options['n_instances']))
    X_train = train.iloc[:, :-1].values[:options['n_instances']]
    logger.info(
        "==> Extract label vector from {} instances of training set".format(
            options['n_instances']))
    y_train = train.iloc[:, -1].replace(-1, 0).values[:options['n_instances']]

    attacker.attack_dataset(X_train,
                            attacks_filename='{}_B{}.atks'.format(
                                options['attacks_filename'],
                                str(options['attacker_budget'])))

    feature_blacklist = {}
    if options['exclude_features']:
        logger.info(
            "==> Excluding the following features from training: [{}]".format(
                ", ".join(options['exclude_features'])))
        feature_blacklist_names = options['exclude_features']
        feature_blacklist_ids = [
            colnames.index(fb) for fb in feature_blacklist_names
        ]
        feature_blacklist = dict(
            zip(feature_blacklist_ids, feature_blacklist_names))
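        # Hypothetical illustration: with colnames == ['f0', 'f1', 'label']
        # and options['exclude_features'] == ['f1'], feature_blacklist ends
        # up as {1: 'f1'} (feature id -> feature name).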

    dataset_name = options['attacks_filename'].split('/')[-1].split('_')[0]
    output_model_filename = build_output_model_filename(
        dataset_name, options['model_type'], options['loss_function'],
        options['n_estimators'], options['max_depth'],
        options['instances_per_node'], options['attacker_budget'])
    partial_output_model_filename = output_model_filename.split('.')[0]
    logger.info(
        "==> Create the split optimizer which will be used for this training..."
    )
    if options['loss_function'] in ('sse', 'logloss'):
        optimizer = rf.SplitOptimizer(
            split_function_name=options['loss_function'])

    if options['model_type'] == 'robust':
        logger.info("==> Training \"{}\" random forest...".format(
            options['model_type']))

        # base robust tree
        rdt = rf.RobustDecisionTree(
            0,
            attacker=attacker,
            split_optimizer=optimizer,
            max_depth=options['max_depth'],
            min_instances_per_node=options['instances_per_node'],
            max_samples=options['bootstrap_samples'] / 100.0,
            max_features=options['bootstrap_features'] / 100.0,
            feature_blacklist=feature_blacklist)

        # create the robust forest
        rrf = rf.RobustForest(0,
                              base_estimator=rdt,
                              n_estimators=options['n_estimators'])
        rrf.fit(X_train,
                y=y_train,
                dump_filename=options['output_dirname'] + '/' +
                partial_output_model_filename,
                dump_n_trees=10)

        logger.info(
            "==> Finally, serialize the \"{}\" random forest just trained to {}"
            .format(options['model_type'],
                    options['output_dirname'] + '/' + output_model_filename))
        rrf.save(options['output_dirname'] + '/' + output_model_filename)

    if options['model_type'] == 'par-robust':
        import time
        from sklearn.ensemble import BaggingClassifier

        logger.info("==> Training \"{}\" treant...".format(
            options['model_type']))
        start_time = time.time()
        # base robust tree
        rdt = rf.RobustDecisionTree(
            0,
            attacker=attacker,
            split_optimizer=optimizer,
            max_depth=options['max_depth'],
            min_instances_per_node=options['instances_per_node'],
            max_samples=options['bootstrap_samples'] / 100.0,
            max_features=options['bootstrap_features'] / 100.0,
            feature_blacklist=feature_blacklist)

        bagging = BaggingClassifier(base_estimator=rdt,
                                    n_estimators=options['n_estimators'],
                                    max_features=1.0,
                                    max_samples=1.0,
                                    bootstrap=False,
                                    bootstrap_features=False,
                                    n_jobs=options['jobs'])
        bagging.fit(X_train, y_train)
        # clean up and prepare for evaluation
        bagging.n_jobs = None
        bagging.base_estimator_.clean_after_training()

        save(bagging, options['output_dirname'] + '/' + output_model_filename,
             options['n_estimators'])
        logger.info("--- %s seconds ---" % (time.time() - start_time))
    if options['model_type'] == 'icml2019':
        from sklearn.ensemble import BaggingClassifier

        logger.info(
            "==> Create the split optimizer which will be used for this training..."
        )
        icml_optimizer = rf.SplitOptimizer(
            split_function_name=options['loss_function'], icml2019=True)
        logger.info("==> Training \"{}\" treant...".format(
            options['model_type']))

        # base robust tree
        rdt = rf.RobustDecisionTree(
            0,
            attacker=attacker,
            split_optimizer=icml_optimizer,
            max_depth=options['max_depth'],
            min_instances_per_node=options['instances_per_node'],
            max_samples=options['bootstrap_samples'] / 100.0,
            max_features=options['bootstrap_features'] / 100.0,
            feature_blacklist=feature_blacklist,
            affine=False)

        bagging = BaggingClassifier(base_estimator=rdt,
                                    n_estimators=options['n_estimators'],
                                    max_features=1.0,
                                    max_samples=1.0,
                                    bootstrap=False,
                                    bootstrap_features=False,
                                    n_jobs=options['jobs'])
        bagging.fit(X_train, y_train)
        # clean up and prepare for evaluation
        bagging.n_jobs = None
        bagging.base_estimator_.clean_after_training()

        save(bagging, options['output_dirname'] + '/' + output_model_filename,
             options['n_estimators'])
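
The `save` helper called by the two bagging branches is not defined in this listing. A minimal pickle-based sketch matching the call sites above might look like the following; the three-argument signature is inferred from usage, and everything else is an assumption:

import pickle

def save(model, model_filename, n_estimators):
    # Hypothetical sketch: persist the fitted ensemble to disk. The real
    # helper may dump trees individually or record n_estimators as metadata;
    # here the third argument is accepted but unused.
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)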