def load_data(dataset, data_dir):
    if dataset == 'iris':
        data = load_iris()
        X = data['data']
        y = data['target']

        # make into binary classification dataset
        indices = np.where(y != 2)[0]
        X = X[indices]
        y = y[indices]

        # use the full dataset for both train and test
        X_train, X_test, y_train, y_test = X, X, y, y

    elif dataset == 'boston':
        # note: load_boston was removed in scikit-learn 1.2;
        # this script assumes an older scikit-learn version
        data = load_boston()
        X = data['data']
        y = data['target']

        # make into binary classification dataset
        y = np.where(y < np.mean(y), 0, 1)
        X_train, X_test, y_train, y_test = X, X, y, y

    else:
        X_train, X_test, y_train, y_test = data_util.get_data(dataset, data_dir)
        X_train = X_train[:, :50]
        X_test = X_test[:, :50]

    return X_train, X_test, y_train, y_test
def main(args):

    # create output directory
    out_dir = os.path.join(args.out_dir, args.dataset)
    os.makedirs(out_dir, exist_ok=True)

    # create logger
    logger_fp = os.path.join(out_dir, 'log.txt')
    logger = print_util.get_logger(logger_fp)
    logger.info('{}'.format(args))
    logger.info('\ntimestamp: {}'.format(datetime.now()))

    # get dataset
    X_train, X_test, y_train, y_test = data_util.get_data(args.dataset, args.data_dir)
    logger.info('X_train.shape: {}'.format(X_train.shape))

    # collect top threshold scores
    top_scores = []

    # get best threshold(s) for each feature
    for i in range(X_train.shape[1]):
        vals = np.unique(X_train[:, i])
        C = get_thresholds(X_train[:, i], y_train)
        S = compute_scores(C)
        logger.info('\n[FEATURE {}] no. unique: {:,}, no. valid thresholds: {:,}'.format(i, len(vals), len(C)))

        # sort thresholds based on score (lower Gini index is better)
        S = sorted(S, key=lambda x: x[1])

        # display split score for each threshold
        for T, s in S[:args.k]:
            logger.info('    threshold value: {:.5f}, score: {:.5f}'.format(T.v, s))
            top_scores.append(s)

    # plot distribution of top threshold scores
    # note: sns.distplot is deprecated in seaborn>=0.11; kdeplot + rugplot is the replacement
    ax = sns.distplot(top_scores, rug=True, hist=False)
    ax.set_title('{}: Scores for Top {} Threshold(s) / Feature'.format(args.dataset.title(), args.k))
    ax.set_xlabel('Gini index')
    ax.set_ylabel('Density')
    plt.savefig(os.path.join(out_dir, 'k_{}.pdf'.format(args.k)), bbox_inches='tight')
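
# The two helpers below are hedged sketches, not the original implementations.
# `get_thresholds` and `compute_scores` are assumed to enumerate candidate split
# thresholds for one feature and score each by the weighted Gini index of the
# induced two-way partition; the `Threshold` container (with a `.v` attribute,
# matching the `T.v` access above) is an assumption.
from collections import namedtuple

Threshold = namedtuple('Threshold', ['v'])

def get_thresholds(x, y):
    # candidate thresholds: midpoints between adjacent unique feature values
    vals = np.unique(x)
    return [(Threshold(v=(v1 + v2) / 2.0), x, y) for v1, v2 in zip(vals[:-1], vals[1:])]

def compute_scores(C):
    # weighted Gini index of the partition induced by each candidate threshold
    S = []
    for T, x, y in C:
        score = 0.0
        for mask in (x <= T.v, x > T.v):
            n = mask.sum()
            if n > 0:
                p = y[mask].mean()  # fraction of positive labels in this branch
                score += (n / len(y)) * (1.0 - (p ** 2 + (1 - p) ** 2))
        S.append((T, score))
    return S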
def experiment(args, logger, out_dir, seed):
    """
    Main method comparing performance of tree ensembles and SVM models.
    """

    # get model and data
    clf, params = _get_classifier(args)
    data = data_util.get_data(args.dataset, random_state=seed, data_dir=args.data_dir)
    X_train, X_test, y_train, y_test, label = data

    logger.info('train instances: {:,}'.format(len(X_train)))
    logger.info('test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # train model, tuning hyperparameters unless tuning is disabled
    logger.info('\nmodel: {}, params: {}'.format(args.model, params))
    if not args.no_tune:
        gs = GridSearchCV(clf, params, cv=args.cv, verbose=args.verbose).fit(X_train, y_train)

        cols = ['mean_fit_time', 'mean_test_score', 'rank_test_score']
        cols += ['param_{}'.format(param) for param in params.keys()]

        df = pd.DataFrame(gs.cv_results_)
        logger.info('gridsearch results:')
        logger.info(df[cols].sort_values('rank_test_score'))

        model = gs.best_estimator_
        logger.info('best params: {}'.format(gs.best_params_))

    else:
        model = clf.fit(X_train, y_train)

    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)
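
# Hedged sketch of `_get_classifier` (the original is not shown): it is assumed
# to return an unfitted estimator plus a hyperparameter grid for GridSearchCV,
# keyed on args.model. The model names and grids below are illustrative.
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import lightgbm as lgb

def _get_classifier(args):
    if args.model == 'rf':
        clf = RandomForestClassifier(random_state=args.rs)
        params = {'n_estimators': [10, 100, 250], 'max_depth': [3, 5, 10, None]}
    elif args.model == 'lgb':
        clf = lgb.LGBMClassifier(random_state=args.rs)
        params = {'n_estimators': [10, 100, 250], 'max_depth': [3, 5, 10, -1]}
    elif args.model == 'svm':
        clf = SVC(random_state=args.rs, probability=True)
        params = {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']}
    else:
        raise ValueError('model {} unknown!'.format(args.model))
    return clf, params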
def experiment(args, logger, out_dir):
    """
    Obtains data, trains model, and generates instance-attribution explanations.
    """

    # get data
    X_train, X_test, y_train, y_test = data_util.get_data(args.dataset, data_dir=args.data_dir)
    logger.info('\nno. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # add noise
    y_train_noisy, noisy_indices = flip_labels(y_train, seed=args.rs, k=args.flip_frac)
    noisy_indices = np.array(sorted(noisy_indices))
    logger.info('no. noisy labels: {:,}'.format(len(noisy_indices)))

    # number of checkpoints to record
    n_check = int(len(y_train) * args.check_frac)
    snapshot_interval = n_check / args.n_snapshots
    logger.info('no. check: {:,}'.format(n_check))
    logger.info('no. snapshots: {:,}'.format(args.n_snapshots))

    # experiment settings
    logger.info('\nrandom state: {}'.format(args.rs))
    logger.info('criterion: {}'.format(args.criterion))
    logger.info('n_estimators: {}'.format(args.n_estimators))
    logger.info('max_depth: {}'.format(args.max_depth))
    logger.info('max_features: {}\n'.format(args.max_features))

    # clean model
    model = _get_model(args).fit(X_train, y_train)
    acc_clean, auc_clean, ap_clean = exp_util.performance(model, X_test, y_test, logger=logger, name='clean')

    # noisy model
    model = _get_model(args).fit(X_train, y_train_noisy)
    exp_util.performance(model, X_test, y_test, logger=logger, name='noisy')

    start = time.time()

    # random method
    if args.method == 'random':
        logger.info('\nOrdering by random...')

        # +1 to avoid choosing the same indices as the noisy labels
        np.random.seed(args.rs + 1)
        train_order = np.random.choice(len(y_train), size=n_check, replace=False)

    # D-DART: ordered by biggest change in prediction for each training sample on itself
    elif args.method == 'dart':
        logger.info('\nOrdering by D-DART...')
        start = time.time()

        initial_proba = model.predict_proba(X_train)[:, 1]
        explanation = np.zeros(shape=(X_train.shape[0],))

        # delete each sample, measure the change in its own prediction, then add it back
        for i in range(X_train.shape[0]):
            model.delete(i)
            proba = model.predict_proba(X_train[[i]])[:, 1][0]
            explanation[i] = np.abs(proba - initial_proba[i])

            if i % PRINT_COUNTER == 0:
                elapsed = time.time() - start
                logger.info('[Influence on sample {}] cum time: {:.3f}s'.format(i, elapsed))

            model.add(X_train[[i]], y_train_noisy[[i]])

        train_order = np.argsort(explanation)[::-1]

    # D-DART loss: ordered by largest loss on training samples
    elif args.method == 'dart_loss':
        logger.info('\nOrdering by D-DART loss...')
        proba = model.predict_proba(X_train)[:, 1]
        loss = np.abs(proba - y_train_noisy)
        train_order = np.argsort(loss)[::-1]

    # save results
    checkpoints, fixed_indices = record_fixes(train_order[:n_check], noisy_indices, snapshot_interval)
    results = measure_performance(args, checkpoints, fixed_indices, noisy_indices, model,
                                  X_train, y_train, X_test, y_test, logger=logger)
    results['acc_clean'] = acc_clean
    results['auc_clean'] = auc_clean
    results['ap_clean'] = ap_clean
    np.save(os.path.join(out_dir, 'results.npy'), results)

    logger.info('time: {:.3f}s'.format(time.time() - start))
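
# Hedged sketch of `flip_labels` (assumed semantics, implied by the call site
# above): randomly flip a fraction `k` of binary train labels and return the
# new labels plus the indices that were flipped.
def flip_labels(y, k=0.4, seed=1):
    rng = np.random.default_rng(seed)
    n_flip = int(len(y) * k) if isinstance(k, float) else k
    flip_indices = rng.choice(len(y), size=n_flip, replace=False)
    y_new = y.copy()
    y_new[flip_indices] = 1 - y_new[flip_indices]  # flip binary labels
    return y_new, flip_indices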
def experiment(args, logger, out_dir, seed):

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=args.rs)
    X_train, X_test, y_train, y_test, label = data_util.get_data(args.dataset,
                                                                 random_state=args.rs,
                                                                 data_dir=args.data_dir)

    # reduce train size
    if 0.0 < args.train_frac < 1.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]
    data = X_train, y_train, X_test, y_test

    logger.info('train instances: {}'.format(len(X_train)))
    logger.info('test instances: {}'.format(len(X_test)))
    logger.info('no. features: {}'.format(X_train.shape[1]))
    logger.info('no. trees: {:,}'.format(args.n_estimators))
    logger.info('max depth: {}'.format(args.max_depth))

    # train a tree ensemble
    logger.info('fitting tree ensemble...')
    tree = clf.fit(X_train, y_train)

    if args.teknn:

        # transform data
        extractor = trex.TreeExtractor(tree, tree_kernel=args.tree_kernel)

        logger.info('transforming training data...')
        X_train_alt = extractor.fit_transform(X_train)

        logger.info('transforming test data...')
        X_test_alt = extractor.transform(X_test)

        train_label = y_train if args.true_label else tree.predict(X_train)

        # tune and train teknn
        start = time.time()
        logger.info('TE-KNN...')
        if args.k:
            knn_clf = KNeighborsClassifier(n_neighbors=args.k, weights='uniform')
            knn_clf = knn_clf.fit(X_train_alt, train_label)
        else:
            knn_clf = exp_util.tune_knn(tree, X_train, X_train_alt, train_label,
                                        args.val_frac, seed=seed, logger=logger)

        logger.info('generating predictions...')
        results = _get_knn_predictions(tree, knn_clf, X_test, X_test_alt, y_train,
                                       pred_size=args.pred_size, out_dir=out_dir, logger=logger)
        logger.info('time: {:.3f}s'.format(time.time() - start))

        # save results
        if results:
            results['n_neighbors'] = knn_clf.get_params()['n_neighbors']
            np.save(os.path.join(out_dir, 'tree.npy'), results['tree'])
            np.save(os.path.join(out_dir, 'surrogate.npy'), results['teknn'])

    if args.trex:
        start = time.time()
        explainer = trex.TreeExplainer(tree, X_train, y_train,
                                       tree_kernel=args.tree_kernel,
                                       kernel_model=args.kernel_model,
                                       random_state=args.rs,
                                       logger=logger,
                                       true_label=not args.true_label,
                                       val_frac=args.val_frac)

        logger.info('generating predictions...')
        results = _get_trex_predictions(tree, explainer, data)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        results['C'] = explainer.C

        # save data
        np.save(os.path.join(out_dir, 'tree.npy'), results['tree'])
        np.save(os.path.join(out_dir, 'surrogate.npy'), results['trex'])
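
# Hedged sketch of `_get_trex_predictions`: assumed to collect the tree
# ensemble's and the TREX surrogate's probability predictions on the same test
# data so their fidelity can be compared downstream; the dict keys match the
# save calls above.
def _get_trex_predictions(tree, explainer, data):
    X_train, y_train, X_test, y_test = data
    results = {}
    results['tree'] = tree.predict_proba(X_test)[:, 1]       # ensemble predictions
    results['trex'] = explainer.predict_proba(X_test)[:, 1]  # surrogate predictions
    return results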
def experiment(args, logger, out_dir, seed):
    """
    Delete as many samples as possible in the time it takes the naive
    approach (retraining from scratch) to delete one sample.
    """

    # random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    X_train, X_test, y_train, y_test = data_util.get_data(args.dataset, data_dir=args.data_dir)

    # dataset statistics
    logger.info('\ntrain instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('features: {:,}'.format(X_train.shape[1]))

    # experiment settings
    logger.info('\nrandom state: {}'.format(seed))
    logger.info('criterion: {}'.format(args.criterion))
    logger.info('n_estimators: {}'.format(args.n_estimators))
    logger.info('max_depth: {}'.format(args.max_depth))
    logger.info('topd: {}'.format(args.topd))
    logger.info('k: {}'.format(args.k))
    logger.info('subsample_size: {}'.format(args.subsample_size))
    logger.info('n_delete: {}'.format(args.n_delete))

    # train a naive model, before and after deleting 1 sample
    naive_avg_delete_time, naive_utility = train_naive(args, X_train, y_train, X_test, y_test,
                                                       rng, logger=logger)

    # begin experiment
    begin = time.time()

    # amount of time given to delete as many samples as possible
    allotted_time = naive_avg_delete_time

    # result containers
    total_delete_time = 0
    delete_types_list = []
    delete_depths_list = []
    delete_costs_list = []

    # train target model
    model = get_model(args)
    start = time.time()
    model = model.fit(X_train, y_train)
    train_time = time.time() - start
    logger.info('[{}] train time: {:.3f}s'.format('model', train_time))

    # compare predictive performance of the naive model and the target model
    naive_auc, naive_acc, naive_ap = naive_utility
    model_auc, model_acc, model_ap = exp_util.performance(model, X_test, y_test,
                                                          logger=logger, name='model')

    # available indices
    indices = np.arange(len(X_train))

    # find the most damaging samples heuristically
    progress_str = '[{}] sample {}, sample_cost: {:,}, search time: {:.3f}s, allotted: {:.3f}s, cum time: {:.3f}s'
    logger.info('\nDelete samples:')

    n_deleted = 0
    while allotted_time > 0 and time.time() - begin <= args.time_limit:

        # adversarially select a sample out of a subset of candidate samples
        delete_ndx, search_time = get_delete_index(model, X_train, y_train, indices, rng)

        # delete the adversarially selected sample
        start = time.time()
        model.delete(delete_ndx)
        delete_time = time.time() - start

        # get deletion statistics
        delete_types, delete_depths, delete_costs = model.get_delete_metrics()
        delete_types_list.append(delete_types)
        delete_depths_list.append(delete_depths)
        delete_costs_list.append(delete_costs)
        sample_cost = np.sum(delete_costs)  # sum over all trees
        model.clear_delete_metrics()

        # update counters
        allotted_time -= delete_time       # available time
        total_delete_time += delete_time   # total deletion time
        cum_time = time.time() - begin     # total time
        n_deleted += 1

        # progress update
        logger.info(progress_str.format(n_deleted, delete_ndx, sample_cost,
                                        search_time, allotted_time, cum_time))

        # remove the chosen ndx from the list of available indices
        indices = np.setdiff1d(indices, [delete_ndx])

    # estimate how many additional deletions would finish in the remaining time
    if allotted_time > 0:
        average_delete_time = total_delete_time / n_deleted
        n_deleted += int(allotted_time / average_delete_time)

    # get model statistics
    n_nodes_avg, n_random_nodes_avg, n_greedy_nodes_avg = model.get_node_statistics()

    delete_types = np.concatenate(delete_types_list)
    delete_depths = np.concatenate(delete_depths_list)
    delete_costs = np.concatenate(delete_costs_list)

    # save model results
    result = model.get_params()
    result['naive_auc'] = naive_auc
    result['naive_acc'] = naive_acc
    result['naive_ap'] = naive_ap
    result['naive_avg_delete_time'] = naive_avg_delete_time
    result['naive_n_deleted'] = args.n_delete
    result['model_n_deleted'] = n_deleted
    result['model_train_%_deleted'] = n_deleted / len(X_train)
    result['model_delete_depths'] = count_depths(delete_types, delete_depths)
    result['model_delete_costs'] = count_costs(delete_types, delete_depths, delete_costs)
    result['model_auc'] = model_auc
    result['model_acc'] = model_acc
    result['model_ap'] = model_ap
    result['model_n_nodes_avg'] = n_nodes_avg
    result['model_n_random_nodes_avg'] = n_random_nodes_avg
    result['model_n_greedy_nodes_avg'] = n_greedy_nodes_avg
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    logger.info('\nResults:\n{}'.format(result))
    np.save(os.path.join(out_dir, 'results.npy'), result)

    return result
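
# Hedged sketch of `get_delete_index`: assumed to subsample candidate indices
# and pick the one whose deletion looks most expensive for the model. The
# proxy cost used here (distance of the predicted probability from 0.5) is
# purely illustrative; the real heuristic may query model internals instead.
def get_delete_index(model, X_train, y_train, indices, rng, subsample_size=20):
    start = time.time()

    # candidate subset of the remaining train indices
    candidates = rng.choice(indices, size=min(subsample_size, len(indices)), replace=False)

    # illustrative proxy for deletion cost
    proba = model.predict_proba(X_train[candidates])[:, 1]
    delete_ndx = candidates[np.argmax(np.abs(proba - 0.5))]

    search_time = time.time() - start
    return delete_ndx, search_time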
def performance(args, logger):
    begin = time.time()

    # obtain data
    X_train, X_test, y_train, y_test = data_util.get_data(args.dataset, data_dir=args.data_dir)

    # dataset statistics
    logger.info('train instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('attributes: {:,}'.format(X_train.shape[1]))

    # tune on a fraction of the training data
    if not args.no_tune:
        if args.tune_frac < 1.0:
            sss = StratifiedShuffleSplit(n_splits=1, test_size=2,
                                         train_size=args.tune_frac,
                                         random_state=args.rs)
            tune_indices, _ = list(sss.split(X_train, y_train))[0]
            X_train_sub, y_train_sub = X_train[tune_indices], y_train[tune_indices]
            logger.info('tune instances: {:,}'.format(X_train_sub.shape[0]))
        else:
            X_train_sub, y_train_sub = X_train, y_train

    # hyperparameter grid
    n_estimators = [10, 50, 100, 250]
    max_depth = [1, 3, 5, 10, 20]
    param_grid = {'max_depth': max_depth, 'n_estimators': n_estimators}

    # get model
    logger.info('\n{}'.format(args.model_type.capitalize()))
    model = lgb.LGBMClassifier(num_leaves=2 ** 10)

    # tune model
    if args.no_tune:
        model = model.fit(X_train, y_train)
    else:
        logger.info('param_grid: {}'.format(param_grid))
        skf = StratifiedKFold(n_splits=args.cv, shuffle=True, random_state=args.rs)
        gs = GridSearchCV(model, param_grid, scoring=args.scoring,
                          cv=skf, verbose=args.verbose, refit=True)
        gs = gs.fit(X_train_sub, y_train_sub)
        model = gs.best_estimator_
        logger.info('best params: {}'.format(gs.best_params_))

    # retrain the chosen model on the full training set and evaluate
    start = time.time()
    model = model.fit(X_train, y_train)
    exp_util.performance(model, X_test, y_test, name=args.model_type, logger=logger)
    logger.info('train time: {:.3f}s'.format(time.time() - start))

    logger.info('total time: {:.3f}s'.format(time.time() - begin))
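
# Hedged sketch mirroring `exp_util.performance` (assumed semantics, based on
# how its return value is unpacked elsewhere in these scripts): evaluate a
# fitted binary classifier and return (auc, acc, ap). Named `performance_sketch`
# to avoid shadowing the module-level `performance` above.
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score

def performance_sketch(model, X_test, y_test, logger=None, name=''):
    proba = model.predict_proba(X_test)[:, 1]
    pred = model.predict(X_test)
    auc = roc_auc_score(y_test, proba)
    acc = accuracy_score(y_test, pred)
    ap = average_precision_score(y_test, proba)
    if logger:
        logger.info('[{}] auc: {:.3f}, acc: {:.3f}, ap: {:.3f}'.format(name, auc, acc, ap))
    return auc, acc, ap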
def experiment(args, logger, out_dir, seed):
    """
    Main method that trains a tree ensemble, then compares the runtime
    of different methods to explain a single test instance.
    """

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)
    data = data_util.get_data(args.dataset, random_state=seed, data_dir=args.data_dir)
    X_train, X_test, y_train, y_test, label = data

    logger.info('train instances: {:,}'.format(len(X_train)))
    logger.info('test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)

    # randomly pick a test instance to explain
    np.random.seed(seed)
    test_ndx = np.random.choice(len(y_test), size=1, replace=False)

    # train on predicted labels
    train_label = y_train if args.true_label else model.predict(X_train)

    # note: every method below saves to the same 'method.npy' file,
    # so only one method is expected to be enabled per run

    # TREX
    if args.trex:
        logger.info('\nTREX...')
        fine_tune, test_time = _trex_method(args, model, test_ndx, X_test,
                                            X_train, y_train, seed=seed, logger=logger)
        logger.info('fine tune: {:.3f}s'.format(fine_tune))
        logger.info('computation time: {:.3f}s'.format(test_time))
        r = {'fine_tune': fine_tune, 'test_time': test_time}
        np.save(os.path.join(out_dir, 'method.npy'), r)

    # Leaf Influence
    if args.tree_type == 'cb' and args.inf_k is not None:
        logger.info('\nleafinfluence...')
        fine_tune, test_time = _influence_method(model, test_ndx, X_train, y_train,
                                                 X_test, y_test, args.inf_k)
        if test_time is not None:
            logger.info('fine tune: {:.3f}s'.format(fine_tune))
            logger.info('computation time: {:.3f}s'.format(test_time))
            r = {'fine_tune': fine_tune, 'test_time': test_time}
            np.save(os.path.join(out_dir, 'method.npy'), r)
        else:
            logger.info('time limit reached!')

    # MAPLE
    if args.maple:
        logger.info('\nMAPLE...')
        fine_tune, test_time = _maple_method(model, test_ndx, X_train, train_label,
                                             X_test, y_test, dstump=args.dstump, logger=logger)
        if fine_tune is not None and test_time is not None:
            logger.info('fine tune: {:.3f}s'.format(fine_tune))
            logger.info('computation time: {:.3f}s'.format(test_time))
            r = {'fine_tune': fine_tune, 'test_time': test_time}
            np.save(os.path.join(out_dir, 'method.npy'), r)
        else:
            logger.info('time limit reached!')

    # TEKNN
    if args.teknn:
        logger.info('\nTEKNN...')
        fine_tune, test_time = _teknn_method(args, model, test_ndx, X_train,
                                             train_label, X_test, seed, logger=logger)
        if fine_tune is not None and test_time is not None:
            logger.info('fine tune: {:.3f}s'.format(fine_tune))
            logger.info('computation time: {:.3f}s'.format(test_time))
            r = {'fine_tune': fine_tune, 'test_time': test_time}
            np.save(os.path.join(out_dir, 'method.npy'), r)
        else:
            logger.info('time limit reached!')
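
# Hedged sketch of `_trex_method`: assumed to time the two phases this
# experiment compares -- fine-tuning (building the TREX explainer) and
# explaining a single test instance. The TreeExplainer arguments mirror
# usage elsewhere in these scripts.
def _trex_method(args, model, test_ndx, X_test, X_train, y_train, seed=1, logger=None):
    start = time.time()
    explainer = trex.TreeExplainer(model, X_train, y_train,
                                   tree_kernel=args.tree_kernel,
                                   kernel_model=args.kernel_model,
                                   random_state=seed,
                                   logger=logger)
    fine_tune = time.time() - start

    start = time.time()
    explainer.explain(X_test[test_ndx])  # contributions of all train instances
    test_time = time.time() - start

    return fine_tune, test_time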
def experiment(args, logger, out_dir, seed):
    """
    Main method that trains a tree ensemble, flips a percentage of train labels,
    prioritizes train instances using various methods, and computes how effective
    each method is at cleaning the data.
    """

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=1)
    data = data_util.get_data(args.dataset, random_state=1, data_dir=args.data_dir)
    X_train, X_test, y_train, y_test, label = data

    # use part of the train data
    if 0.0 < args.train_frac < 1.0:
        n_train_samples = int(X_train.shape[0] * args.train_frac)
        train_indices = np.random.choice(X_train.shape[0], size=n_train_samples, replace=False)
        X_train, y_train = X_train[train_indices], y_train[train_indices]

    # use part of the test data for evaluation
    n_test_samples = args.n_test if args.n_test is not None else int(X_test.shape[0] * args.test_frac)
    np.random.seed(seed)
    test_indices = np.random.choice(X_test.shape[0], size=n_test_samples, replace=False)
    X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    # choose a new subset if the test subset contains only one label
    new_seed = seed
    while y_test_sub.sum() == len(y_test_sub) or y_test_sub.sum() == 0:
        np.random.seed(new_seed)
        new_seed += np.random.randint(MAX_SEED_INCREASE)
        np.random.seed(new_seed)
        test_indices = np.random.choice(X_test.shape[0], size=n_test_samples, replace=False)
        X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    X_test = X_test_sub
    y_test = y_test_sub

    logger.info('no. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)

    # fractions of the train data to inspect
    pcts = list(range(0, 100, 10))
    np.save(os.path.join(out_dir, 'percentages.npy'), pcts)

    # note: every non-random method below saves to the same 'method.npy' file,
    # so only one method is expected to be enabled per run

    # random method
    logger.info('\nordering by random...')
    start = time.time()
    np.random.seed(seed)
    train_order = np.random.choice(np.arange(X_train.shape[0]), size=X_train.shape[0], replace=False)
    random_res = _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'random.npy'), random_res)

    # TREX method
    if args.trex:
        logger.info('\nordering by our method...')
        start = time.time()
        train_order = _trex_method(args, model, X_test, X_train, y_train, seed, logger)
        trex_res = _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), trex_res)

    # MAPLE method
    if args.maple:
        logger.info('\nordering by MAPLE...')
        start = time.time()
        train_order = _maple_method(X_test, args, model, X_train, y_train, logger)
        maple_res = _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), maple_res)

    # influence method
    if args.tree_type == 'cb' and args.inf_k is not None:
        logger.info('\nordering by LeafInfluence...')
        start = time.time()
        train_order = _influence_method(X_test, args, model, X_train, y_train, y_test, logger)
        leafinfluence_res = _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), leafinfluence_res)

    # TEKNN method
    if args.teknn:
        logger.info('\nordering by teknn...')
        start = time.time()
        train_order = _teknn_method(args, model, X_test, X_train, y_train, y_test, seed, logger)
        knn_res = _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), knn_res)
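
# Hedged sketch of `_measure_performance`: assumed to simulate inspecting the
# train data in the given priority order -- for each percentage, remove that
# fraction of the highest-ranked instances, retrain the classifier, and record
# test accuracy.
from sklearn.base import clone
from sklearn.metrics import accuracy_score

def _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf):
    accs = []
    for pct in pcts:
        n_remove = int(len(train_order) * (pct / 100.0))
        remove_ndx = train_order[:n_remove]

        # retrain on the remaining data and evaluate
        new_X_train = np.delete(X_train, remove_ndx, axis=0)
        new_y_train = np.delete(y_train, remove_ndx)
        model = clone(clf).fit(new_X_train, new_y_train)
        accs.append(accuracy_score(y_test, model.predict(X_test)))
    return np.array(accs)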
def performance(args, out_dir, logger):
    begin = time.time()

    # obtain data
    X_train, X_test, y_train, y_test = data_util.get_data(args.dataset, data_dir=args.data_dir)

    # dataset statistics
    logger.info('train instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('attributes: {:,}'.format(X_train.shape[1]))
    logger.info('split criterion: {}'.format(args.criterion))

    # tune on a fraction of the training data
    if not args.no_tune:
        if args.tune_frac < 1.0:
            sss = StratifiedShuffleSplit(n_splits=1, test_size=2,
                                         train_size=args.tune_frac,
                                         random_state=args.rs)
            tune_indices, _ = list(sss.split(X_train, y_train))[0]
            X_train_sub, y_train_sub = X_train[tune_indices], y_train[tune_indices]
            logger.info('tune instances: {:,}'.format(X_train_sub.shape[0]))
        else:
            X_train_sub, y_train_sub = X_train, y_train
    else:
        X_train_sub, y_train_sub = X_train, y_train

    # hyperparameter grid
    n_estimators = [10, 50, 100, 250]
    max_depth = [1, 3, 5, 10, 20]
    param_grid = {'max_depth': max_depth, 'n_estimators': n_estimators}

    # add additional parameter for DaRE
    if args.model == 'dare':
        param_grid['k'] = [5, 10, 25, 50]

    # get hyperparameter names
    keys = list(param_grid.keys())

    # get model
    logger.info('\n{}'.format(args.model.capitalize()))
    start = time.time()
    model = _get_model(args)

    # tune hyperparameters
    if not args.no_tune:
        logger.info('param_grid: {}'.format(param_grid))

        # cross-validation
        skf = StratifiedKFold(n_splits=args.cv, shuffle=True, random_state=args.rs)
        gs = GridSearchCV(model, param_grid, scoring=args.scoring,
                          cv=skf, verbose=args.verbose, refit=False)
        gs = gs.fit(X_train_sub, y_train_sub)
        best_params = _get_best_params(gs, param_grid, keys, logger, args.tol)
        model = _get_model_dict(args, best_params)

    # record time it takes to tune the model
    tune_time = time.time() - start

    # train best model
    start = time.time()
    model = model.fit(X_train, y_train)
    train_time = time.time() - start
    logger.info('train time: {:.3f}s'.format(train_time))

    # structure and memory statistics
    n_nodes, n_random, n_greedy = model.trees_[0].get_node_statistics()
    logger.info('[Tree 0] no. nodes: {:,}, no. random: {:,}, no. greedy: {:,}'.format(n_nodes, n_random, n_greedy))
    logger.info('[Tree 0] memory usage: {:,} bytes'.format(model.trees_[0].get_memory_usage()))
    logger.info('[Forest] memory usage: {:,} bytes'.format(model.get_memory_usage()))

    # evaluate
    auc, acc, ap = exp_util.performance(model, X_test, y_test, name=args.model, logger=logger)

    # save results
    result = model.get_params()
    result['model'] = args.model
    result['bootstrap'] = args.bootstrap
    result['auc'] = auc
    result['acc'] = acc
    result['ap'] = ap
    result['train_time'] = train_time
    result['tune_train_time'] = tune_time + train_time
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    np.save(os.path.join(out_dir, 'results.npy'), result)

    logger.info('total time: {:.3f}s'.format(time.time() - begin))
    logger.info('max_rss: {:,}'.format(result['max_rss']))
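
# Hedged sketch of `_get_best_params`: given grid-search results and a
# tolerance, it is assumed to pick not the absolute best setting but the
# cheapest setting whose mean CV score is within `tol` of the best, which is
# why the grid search above runs with refit=False.
def _get_best_params(gs, param_grid, keys, logger, tol=1e-3):
    df = pd.DataFrame(gs.cv_results_).sort_values('rank_test_score')
    best_score = df.iloc[0]['mean_test_score']

    # candidates within tolerance of the best mean CV score
    qualified = df[df['mean_test_score'] >= best_score - tol]

    # prefer the cheapest qualified setting (fastest to fit)
    chosen = qualified.sort_values('mean_fit_time').iloc[0]
    best_params = {key: chosen['param_{}'.format(key)] for key in keys}
    logger.info('chosen params (tol={}): {}'.format(tol, best_params))
    return best_params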
def experiment(args, logger, out_dir, seed):
    """
    Main method that trains a tree ensemble, flips a percentage of train labels,
    prioritizes train instances using various methods, and computes how effective
    each method is at cleaning the data.
    """

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)
    X_train, X_test, y_train, y_test, label = data_util.get_data(args.dataset,
                                                                 random_state=seed,
                                                                 data_dir=args.data_dir)

    # reduce train size
    if 0.0 < args.train_frac < 1.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]
    data = X_train, y_train, X_test, y_test

    logger.info('no. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # add noise
    y_train_noisy, noisy_ndx = data_util.flip_labels(y_train, k=args.flip_frac, random_state=seed)
    noisy_ndx = np.array(sorted(noisy_ndx))
    logger.info('no. noisy labels: {:,}'.format(len(noisy_ndx)))

    # train a tree ensemble on the clean and noisy labels
    model = clone(clf).fit(X_train, y_train)
    model_noisy = clone(clf).fit(X_train, y_train_noisy)

    # show model performance before and after noise
    logger.info('\nBefore noise:')
    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)
    logger.info('\nAfter noise:')
    model_util.performance(model_noisy, X_train, y_train_noisy, X_test=X_test, y_test=y_test, logger=logger)

    # check accuracy before and after noise
    acc_test_clean = accuracy_score(y_test, model.predict(X_test))
    acc_test_noisy = accuracy_score(y_test, model_noisy.predict(X_test))

    # find how many corrupted/non-corrupted labels were incorrectly predicted
    if not args.true_label:
        logger.info('\nUsing predicted labels:')
        predicted_labels = model_noisy.predict(X_train).flatten()
        incorrect_ndx = np.where(y_train_noisy != predicted_labels)[0]
        incorrect_corrupted_ndx = np.intersect1d(noisy_ndx, incorrect_ndx)
        logger.info('incorrectly predicted corrupted labels: {:,}'.format(incorrect_corrupted_ndx.shape[0]))
        logger.info('total number of incorrectly predicted labels: {:,}'.format(incorrect_ndx.shape[0]))

    # number of checkpoints to record
    n_check = int(len(y_train) * args.check_pct)
    interval = (n_check / len(y_train)) / args.n_plot_points

    # random method
    logger.info('\nordering by random...')
    start = time.time()
    ckpt_ndx, fix_ndx = _random_method(noisy_ndx, y_train, interval,
                                       to_check=n_check, random_state=seed)
    check_pct, random_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'random.npy'), random_res)

    # save global lines
    np.save(os.path.join(out_dir, 'test_clean.npy'), acc_test_clean)
    np.save(os.path.join(out_dir, 'check_pct.npy'), check_pct)

    # tree loss method
    logger.info('\nordering by tree loss...')
    start = time.time()
    y_train_proba = model_noisy.predict_proba(X_train)
    ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
    _, tree_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'tree.npy'), tree_loss_res)

    # note: the remaining methods all save to 'method.npy' (or 'method_loss.npy'),
    # so only one method is expected to be enabled per run

    # TREX method
    if args.trex:
        logger.info('\nordering by TREX...')
        start = time.time()
        explainer = trex.TreeExplainer(model_noisy, X_train, y_train_noisy,
                                       tree_kernel=args.tree_kernel,
                                       random_state=seed,
                                       true_label=args.true_label,
                                       kernel_model=args.kernel_model,
                                       verbose=args.verbose,
                                       val_frac=args.val_frac,
                                       logger=logger)
        ckpt_ndx, fix_ndx, _ = _our_method(explainer, noisy_ndx, y_train, n_check, interval)
        check_pct, trex_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), trex_res)

        # TREX loss method
        logger.info('\nordering by TREX loss...')
        start = time.time()
        y_train_proba = explainer.predict_proba(X_train)
        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, trex_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method_loss.npy'), trex_loss_res)

    # influence method
    if args.tree_type == 'cb' and args.inf_k is not None:
        logger.info('\nordering by leafinfluence...')
        start = time.time()

        model_path = '.model.json'
        model_noisy.save_model(model_path, format='json')

        if args.inf_k == -1:
            update_set = 'AllPoints'
        elif args.inf_k == 0:
            update_set = 'SinglePoint'
        else:
            update_set = 'TopKLeaves'

        leaf_influence = CBLeafInfluenceEnsemble(model_path, X_train, y_train_noisy,
                                                 k=args.inf_k,
                                                 learning_rate=model.learning_rate_,
                                                 update_set=update_set)
        ckpt_ndx, fix_ndx, _, _ = _influence_method(leaf_influence, noisy_ndx, X_train, y_train,
                                                    y_train_noisy, interval, to_check=n_check)
        _, leafinfluence_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), leafinfluence_res)

    # MAPLE method
    if args.maple:
        logger.info('\nordering by MAPLE...')
        start = time.time()
        train_label = y_train_noisy if args.true_label else model_noisy.predict(X_train)
        maple_exp = MAPLE(X_train, train_label, X_train, train_label,
                          verbose=args.verbose, dstump=False)
        ckpt_ndx, fix_ndx, map_scores, map_order = _maple_method(maple_exp, X_train, noisy_ndx,
                                                                 interval, to_check=n_check)
        _, maple_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), maple_res)

    # TEKNN method
    if args.teknn:
        logger.info('\nordering by teknn...')
        start = time.time()

        # transform the data
        extractor = trex.TreeExtractor(model_noisy, tree_kernel=args.tree_kernel)
        X_train_alt = extractor.fit_transform(X_train)
        train_label = y_train if args.true_label else model_noisy.predict(X_train)

        # tune and train teknn
        knn_clf = exp_util.tune_knn(model_noisy, X_train, X_train_alt, train_label,
                                    args.val_frac, seed=seed, logger=logger)

        ckpt_ndx, fix_ndx, _ = _knn_method(knn_clf, X_train_alt, noisy_ndx, interval, to_check=n_check)
        _, teknn_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), teknn_res)

        # TEKNN loss method
        logger.info('\nordering by teknn loss...')
        start = time.time()
        y_train_proba = knn_clf.predict_proba(X_train_alt)
        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, teknn_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method_loss.npy'), teknn_loss_res)

    # MMD-Critic method
    if args.mmd:
        logger.info('\nordering by mmd-critic...')
        start = time.time()
        ckpt_ndx, fix_ndx = _mmd_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, mmd_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), mmd_res)

    # Prototype method
    if args.proto:
        logger.info('\nordering by proto...')
        start = time.time()
        ckpt_ndx, fix_ndx = _proto_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, proto_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), proto_res)
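
# Hedged sketch of `_loss_method`: assumed to order train instances by the
# model's loss on its own (possibly corrupted) training labels, then record
# evenly spaced checkpoints of which noisy labels an analyst would have found
# after inspecting each prefix of that ordering. The exact checkpoint bookkeeping
# is an assumption; only the return arity matches the call sites above.
def _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=1000):
    n = len(y_train_noisy)

    # per-instance loss: 1 - probability assigned to the given label
    losses = 1.0 - y_train_proba[np.arange(n), y_train_noisy]
    train_order = np.argsort(losses)[::-1]  # largest loss first

    # walk down the ordering, recording a checkpoint every `interval` fraction
    ckpt_ndx, fix_ndx = [], []
    step = max(1, int(n * interval))
    for i in range(step, to_check + 1, step):
        checked = train_order[:i]
        ckpt_ndx.append(i)
        fix_ndx.append(np.intersect1d(checked, noisy_ndx))  # noisy labels found so far
    return ckpt_ndx, fix_ndx, losses, train_order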
def experiment(args, logger, out_dir):
    """
    Obtains data, trains model, and generates instance-attribution explanations.
    """

    # get data
    X_train, X_test, y_train, y_test = data_util.get_data(args.dataset, data_dir=args.data_dir)

    # select a subset of the test data for evaluation
    n_test_samples = args.n_test if args.n_test is not None else int(X_test.shape[0] * args.test_frac)
    np.random.seed(args.rs)
    test_indices = np.random.choice(X_test.shape[0], size=n_test_samples, replace=False)
    X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    # choose a new subset if the test subset contains only one label
    new_seed = args.rs
    while y_test_sub.sum() == len(y_test_sub) or y_test_sub.sum() == 0:
        np.random.seed(new_seed)
        new_seed += np.random.randint(MAX_SEED_INCREASE)
        np.random.seed(new_seed)
        test_indices = np.random.choice(X_test.shape[0], size=n_test_samples, replace=False)
        X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    X_test = X_test_sub
    y_test = y_test_sub

    # dataset statistics
    logger.info('\ntrain instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('features: {:,}'.format(X_train.shape[1]))

    # experiment settings
    logger.info('\nrandom state: {}'.format(args.rs))
    logger.info('criterion: {}'.format(args.criterion))
    logger.info('n_estimators: {}'.format(args.n_estimators))
    logger.info('max_depth: {}'.format(args.max_depth))
    logger.info('k: {}'.format(args.k))
    logger.info('max_features: {}'.format(args.max_features))
    logger.info('n_test: {}\n'.format(args.n_test))

    # train target model
    model = _get_model(args)
    name = 'G-DaRE'

    start = time.time()
    model = model.fit(X_train, y_train)
    train_time = time.time() - start

    logger.info('[{}] train time: {:.3f}s'.format(name, train_time))
    exp_util.performance(model, X_test, y_test, logger=logger, name=name)

    percentages = list(range(0, 100, 1))
    start = time.time()

    # random method
    if args.method == 'random':
        logger.info('\nordering by random...')
        np.random.seed(args.rs)
        train_order = np.random.choice(np.arange(X_train.shape[0]), size=X_train.shape[0], replace=False)
        results = measure_performance(train_order, percentages, X_test, y_test, X_train, y_train, logger)

    # G-DaRE 1: ordered from biggest sum increase in positive label confidence to least
    elif args.method == 'dare1':
        logger.info('\nordering by G-DaRE 1...')
        explanation = exp_util.explain_lite(model, X_train, y_train, X_test)
        train_order = np.argsort(explanation)[::-1]
        results = measure_performance(train_order, percentages, X_test, y_test, X_train, y_train, logger)

    # G-DaRE 2: ordered from most to least positively influential
    elif args.method == 'dare2':
        logger.info('\nordering by G-DaRE 2...')
        explanation = exp_util.explain_lite(model, X_train, y_train, X_test, y_test=y_test)
        train_order = np.argsort(explanation)[::-1]
        results = measure_performance(train_order, percentages, X_test, y_test, X_train, y_train, logger)

    # G-DaRE 3: ordered by biggest sum of absolute change in predictions
    elif args.method == 'dare3':
        logger.info('\nordering by G-DaRE 3...')
        explanation = exp_util.explain_lite(model, X_train, y_train, X_test, use_abs=True)
        train_order = np.argsort(explanation)[::-1]
        results = measure_performance(train_order, percentages, X_test, y_test, X_train, y_train, logger)

    logger.info('time: {:.3f}s'.format(time.time() - start))

    results['percentage'] = percentages
    np.save(os.path.join(out_dir, 'results.npy'), results)
def experiment(args, logger, out_dir, seed):

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)
    data = data_util.get_data(args.dataset, random_state=seed,
                              data_dir=args.data_dir, return_feature=True)
    X_train, X_test, y_train, y_test, label, feature = data

    logger.info('train instances: {:,}'.format(len(X_train)))
    logger.info('test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # train a tree ensemble and explainer
    tree = clone(clf).fit(X_train, y_train)
    model_util.performance(tree, X_train, y_train, X_test, y_test, logger=logger)

    original_auc = roc_auc_score(y_test, tree.predict_proba(X_test)[:, 1])
    original_acc = accuracy_score(y_test, tree.predict(X_test))

    # train TREX
    explainer = trex.TreeExplainer(tree, X_train, y_train,
                                   tree_kernel=args.tree_kernel,
                                   random_state=seed,
                                   kernel_model=args.kernel_model,
                                   kernel_model_kernel=args.kernel_model_kernel,
                                   true_label=args.true_label)

    # get mispredicted test instances
    missed_indices = np.where(tree.predict(X_test) != y_test)[0]
    np.random.seed(seed)
    explain_indices = np.random.choice(missed_indices, replace=False,
                                       size=int(len(missed_indices) * args.sample_frac))

    logger.info('no. incorrect instances: {:,}'.format(len(missed_indices)))
    logger.info('no. explain instances: {:,}'.format(len(explain_indices)))

    # compute total impact of train instances on the chosen test instances
    contributions = explainer.explain(X_test[explain_indices], y=y_test[explain_indices])
    impact_sum = np.sum(contributions, axis=0)

    # get train instances with a negative impact on the predictions,
    # ordered from most to least negative
    neg_contributors = np.where(impact_sum < 0)[0]
    neg_impact = impact_sum[neg_contributors]
    neg_contributors = neg_contributors[np.argsort(neg_impact)]

    # remove offending train instances in segments and measure performance
    aucs = []
    accs = []
    n_removed = []
    for i in tqdm.tqdm(range(args.n_iterations + 1)):

        # remove these instances from the train data
        delete_ndx = neg_contributors[:args.n_remove * i]
        new_X_train = np.delete(X_train, delete_ndx, axis=0)
        new_y_train = np.delete(y_train, delete_ndx)

        # retrain and evaluate
        tree = clone(clf).fit(new_X_train, new_y_train)
        aucs.append(roc_auc_score(y_test, tree.predict_proba(X_test)[:, 1]))
        accs.append(accuracy_score(y_test, tree.predict(X_test)))
        n_removed.append(args.n_remove * i)

    # save results
    result = tree.get_params()
    result['original_auc'] = original_auc
    result['original_acc'] = original_acc
    result['auc'] = aucs
    result['acc'] = accs
    result['n_remove'] = n_removed
    np.save(os.path.join(out_dir, 'results.npy'), result)
def performance(args, out_dir, logger):
    begin = time.time()

    # obtain data
    X_train, X_test, y_train, y_test = data_util.get_data(args.dataset, data_dir=args.data_dir)

    # dataset statistics
    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))
    logger.info('split criterion: {}'.format(args.criterion))
    logger.info('scoring: {}'.format(args.scoring))

    # tune on a fraction of the training data
    if args.tune_frac < 1.0:
        sss = StratifiedShuffleSplit(n_splits=1, test_size=2,
                                     train_size=args.tune_frac,
                                     random_state=args.rs)
        tune_indices, _ = list(sss.split(X_train, y_train))[0]
        X_train_sub, y_train_sub = X_train[tune_indices], y_train[tune_indices]
        logger.info('tune instances: {:,}'.format(X_train_sub.shape[0]))
    else:
        X_train_sub, y_train_sub = X_train, y_train

    skf = StratifiedKFold(n_splits=args.cv, shuffle=True, random_state=args.rs)

    # train exact (fully greedy, topd=0) model
    start = time.time()
    model = _get_model(args, topd=0)
    exact_score = cross_val_score(model, X_train_sub, y_train_sub, scoring=args.scoring, cv=skf).mean()
    logger.info('\n[topd=0] CV score: {:.5f}, time: {:.3f}s'.format(exact_score, time.time() - start))

    # increase topd until the CV score drops more than the allowed tolerance
    s = '[topd={}] CV score: {:.5f}, CV diff: {:.5f}, time: {:.3f}s'
    scores = {}
    best_scores = {tol: 0 for tol in args.tol}
    for topd in range(1, args.max_depth + 1):
        start = time.time()

        # obtain score for this topd
        model = _get_model(args, topd=topd)
        score = cross_val_score(model, X_train_sub, y_train_sub, scoring=args.scoring, cv=skf).mean()
        score_diff = exact_score - score
        scores[topd] = score

        end = time.time() - start
        logger.info(s.format(topd, score, score_diff, end))

        # update best score for each tolerance; a topd qualifies only if
        # every smaller topd also qualified (contiguous from 0)
        for tol in args.tol:
            if best_scores[tol] == topd - 1 and score_diff <= tol:
                best_scores[tol] = topd

    total_time = time.time() - begin
    logger.info('{}, total time: {:.3f}s'.format(best_scores, total_time))
    logger.info('max_rss: {:,}'.format(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

    np.save(os.path.join(out_dir, 'results.npy'), best_scores)
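
# Hedged sketch of `_get_model`: assumed to build a DaRE forest with this
# experiment's settings. `dare.Forest` follows the public dare-rf API, but the
# exact argument plumbing from `args` here is an assumption.
import dare

def _get_model(args, topd=0):
    return dare.Forest(criterion=args.criterion,
                       topd=topd,                     # no. top layers with random (removal-efficient) splits
                       k=args.k,                      # no. candidate thresholds per feature
                       n_estimators=args.n_estimators,
                       max_depth=args.max_depth,
                       random_state=args.rs)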
def experiment(args, logger, out_dir, seed):

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)
    data = data_util.get_data(args.dataset, random_state=seed, data_dir=args.data_dir,
                              return_image_id=True, test_size=args.test_size)
    X_train, X_test, y_train, y_test, label = data

    logger.info('train instances: {}'.format(len(X_train)))
    logger.info('test instances: {}'.format(len(X_test)))
    logger.info('labels: {}'.format(label))

    # optionally reduce dimensionality with PCA
    if args.pca_components is not None:
        logger.info('reducing {} features to {} using PCA...'.format(X_train.shape[1], args.pca_components))
        pca = PCA(args.pca_components, random_state=args.rs).fit(X_train)
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
    else:
        X_train_pca, X_test_pca = X_train, X_test

    # fit a tree ensemble and an explainer for that tree ensemble
    logger.info('fitting {}...'.format(args.tree_type))
    tree = clone(clf).fit(X_train_pca, y_train)

    # show GBDT performance
    model_util.performance(tree, X_train_pca, y_train, X_test_pca, y_test, logger=logger)

    logger.info('fitting TREX...')
    explainer = trex.TreeExplainer(tree, X_train_pca, y_train,
                                   tree_kernel=args.tree_kernel,
                                   random_state=seed,
                                   kernel_model=args.kernel_model,
                                   val_frac=args.val_frac,
                                   verbose=args.verbose,
                                   true_label=args.true_label,
                                   cv=2,
                                   logger=logger)

    # pick a random test instance to explain
    if args.random_test:
        np.random.seed(seed)
        test_ndx = np.random.choice(len(y_test))

    # pick a random high-loss (likely mispredicted) test instance to explain
    else:
        test_dist = exp_util.instance_loss(tree.predict_proba(X_test_pca), y_test)
        test_dist_ndx = np.argsort(test_dist)[::-1]
        np.random.seed(seed)
        test_ndx = np.random.choice(test_dist_ndx[:50])

    x_test = X_test_pca[test_ndx].reshape(1, -1)
    test_pred = tree.predict(x_test)[0]
    test_actual = y_test[test_ndx]

    # compute the impact of each training instance on the test instance
    impact = explainer.explain(x_test)[0]
    alpha = explainer.get_weight()[0]
    sim = explainer.similarity(x_test)[0]

    # sort the training instances by impact in descending order
    sort_ndx = np.argsort(impact)[::-1]

    # matplotlib settings
    plt.rc('font', family='serif')
    plt.rc('xtick', labelsize=13)
    plt.rc('ytick', labelsize=13)
    plt.rc('axes', labelsize=13)
    plt.rc('axes', titlesize=13)
    plt.rc('legend', fontsize=11)
    plt.rc('legend', title_fontsize=11)
    plt.rc('lines', linewidth=1)
    plt.rc('lines', markersize=6)

    # figure dimensions in inches
    width = 5.5  # NeurIPS 2020
    width, height = set_size(width=width * 3, fraction=1, subplots=(1, 3))
    fig, axs = plt.subplots(2, 1 + args.topk_train * 2, figsize=(width, height))

    # plot the test image
    identifier = 'test_id{}'.format(test_ndx)
    _display_image(args, X_test[test_ndx], identifier=identifier,
                   predicted=test_pred, actual=test_actual, ax=axs[0][0])
    plt.setp(axs[0][0].spines.values(), color='blue')

    topk_train = args.topk_train if args.show_negatives else args.topk_train * 2

    # show most positively impactful train images
    for i, train_ndx in enumerate(sort_ndx[:topk_train]):
        i += 1
        identifier = 'train_id{}'.format(train_ndx)
        train_pred = tree.predict(X_train_pca[train_ndx].reshape(1, -1))[0]
        similarity = sim[train_ndx] if args.show_similarity else None
        weight = alpha[train_ndx] if args.show_weight else None
        plt.setp(axs[0][i].spines.values(), color='green')
        _display_image(args, X_train[train_ndx], ax=axs[0][i], identifier=identifier,
                       predicted=train_pred, actual=y_train[train_ndx],
                       similarity=similarity, weight=weight)

    # show most negatively impactful train images
    if args.show_negatives:
        for i, train_ndx in enumerate(sort_ndx[::-1][:topk_train]):
            i += 1 + args.topk_train
            identifier = 'train_id{}'.format(train_ndx)
            train_pred = tree.predict(X_train_pca[train_ndx].reshape(1, -1))[0]
            similarity = sim[train_ndx] if args.show_similarity else None
            weight = alpha[train_ndx] if args.show_weight else None
            plt.setp(axs[0][i].spines.values(), color='red')
            _display_image(args, X_train[train_ndx], ax=axs[0][i], identifier=identifier,
                           predicted=train_pred, actual=y_train[train_ndx],
                           similarity=similarity, weight=weight)

    # second row: highest and lowest weighted samples
    alpha_indices = np.argsort(alpha)

    # plot most negatively weighted samples
    for i, train_ndx in enumerate(alpha_indices[:topk_train]):
        i += 1
        identifier = 'train_id{}'.format(train_ndx)
        train_pred = tree.predict(X_train_pca[train_ndx].reshape(1, -1))[0]
        similarity = sim[train_ndx] if args.show_similarity else None
        weight = alpha[train_ndx] if args.show_weight else None
        plt.setp(axs[1][i].spines.values(), color='red')
        _display_image(args, X_train[train_ndx], ax=axs[1][i], identifier=identifier,
                       predicted=train_pred, actual=y_train[train_ndx],
                       similarity=similarity, weight=weight)

    # plot most positively weighted samples
    for i, train_ndx in enumerate(alpha_indices[::-1][:topk_train]):
        i += 1 + args.topk_train
        identifier = 'train_id{}'.format(train_ndx)
        train_pred = tree.predict(X_train_pca[train_ndx].reshape(1, -1))[0]
        similarity = sim[train_ndx] if args.show_similarity else None
        weight = alpha[train_ndx] if args.show_weight else None
        plt.setp(axs[1][i].spines.values(), color='green')
        _display_image(args, X_train[train_ndx], ax=axs[1][i], identifier=identifier,
                       predicted=train_pred, actual=y_train[train_ndx],
                       similarity=similarity, weight=weight)

    # save only after both rows have been drawn
    plt.savefig(os.path.join(out_dir, 'plot.pdf'), format='pdf', bbox_inches='tight')
    plt.show()
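
# Hedged sketch of `_display_image`: assumed to render one flattened image on
# the given axis and annotate it with its identifier, predicted/actual labels,
# and optional similarity/weight values. The square-grayscale inference is an
# assumption.
def _display_image(args, x, identifier='', predicted=None, actual=None,
                   similarity=None, weight=None, ax=None):
    side = int(np.sqrt(x.shape[0]))  # assume a square grayscale image
    ax.imshow(x.reshape(side, side), cmap='gray')

    title = '{}\npred: {}, actual: {}'.format(identifier, predicted, actual)
    if similarity is not None:
        title += '\nsim: {:.3f}'.format(similarity)
    if weight is not None:
        title += '\nalpha: {:.3f}'.format(weight)

    ax.set_title(title)
    ax.set_xticks([])
    ax.set_yticks([])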
def experiment(args, logger, out_dir, seed):

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)

    # get original feature space
    data = data_util.get_data(args.dataset, random_state=seed,
                              data_dir=args.data_dir, return_feature=True)
    X_train, X_test, y_train, y_test, label, feature = data

    logger.info('\ntrain instances: {}'.format(len(X_train)))
    logger.info('test instances: {}'.format(len(X_test)))
    logger.info('no. features: {}'.format(X_train.shape[1]))

    # filter the features to be the same as MFC18
    mapping = {'NC17_EvalPart1': 'nc17_mfc18',
               'MFC18_EvalPart1': 'mfc18_mfc19',
               'MFC19_EvalPart1': 'mfc19_mfc20'}
    if args.dataset in mapping:
        reduced_feature = data_util.get_data(mapping[args.dataset], random_state=seed,
                                             data_dir=args.data_dir, return_feature=True)[-1]
        keep_ndx = align_feature(feature, reduced_feature)
        feature = feature[keep_ndx]
        X_train = X_train[:, keep_ndx]
        X_test = X_test[:, keep_ndx]

    # train a tree ensemble
    tree = clone(clf).fit(X_train, y_train)
    model_util.performance(tree, X_train, y_train, X_test, y_test)

    # store indices of the different subgroups
    train_neg = np.where(y_train == 0)[0]
    train_pos = np.where(y_train == 1)[0]

    # transform features to tree kernel space
    logger.info('\ntransforming features into tree kernel space')
    start = time.time()
    extractor = TreeExtractor(tree, tree_kernel=args.tree_kernel)
    X_train_tree = extractor.fit_transform(X_train)
    logger.info('  train transform time: {:.3f}s'.format(time.time() - start))

    start = time.time()
    X_test_tree = extractor.transform(X_test)
    logger.info('  test transform time: {:.3f}s'.format(time.time() - start))

    # reduce dimensionality of the original and tree kernel feature spaces
    logger.info('\nembedding original features into a lower-dimensional space')
    X_train, X_test = reduce_and_embed(args, X_train, X_test, logger, init='random')

    logger.info('\nembedding tree kernel features into a lower-dimensional space')
    X_train_tree, X_test_tree = reduce_and_embed(args, X_train_tree, X_test_tree, logger, init='pca')

    # save original feature space results
    np.save(os.path.join(out_dir, 'train_negative'), X_train[train_neg])
    np.save(os.path.join(out_dir, 'train_positive'), X_train[train_pos])

    # save tree kernel space results
    np.save(os.path.join(out_dir, 'train_tree_negative'), X_train_tree[train_neg])
    np.save(os.path.join(out_dir, 'train_tree_positive'), X_train_tree[train_pos])
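
# Hedged sketch of `reduce_and_embed`: assumed to first reduce each feature
# space with TruncatedSVD (useful for the sparse, high-dimensional tree kernel
# space) and then embed train and test together into 2-D with t-SNE. The SVD
# dimensionality and the joint embedding are assumptions.
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

def reduce_and_embed(args, X_train, X_test, logger, init='pca'):

    # linear reduction before t-SNE keeps the pairwise-distance computation tractable
    if X_train.shape[1] > 50:
        svd = TruncatedSVD(n_components=50, random_state=args.rs)
        X_train = svd.fit_transform(X_train)
        X_test = svd.transform(X_test)

    # embed train and test jointly so they share one 2-D space
    X_all = np.vstack([X_train, X_test])
    X_embed = TSNE(n_components=2, init=init, random_state=args.rs).fit_transform(X_all)
    logger.info('  embedded shape: {}'.format(X_embed.shape))

    n_train = X_train.shape[0]
    return X_embed[:n_train], X_embed[n_train:]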