def _influence_method(X_test, args, model, X_train, y_train, y_test, logger):
    """
    Sort training instances by their aggregate Leaf Influence on the test set.

    Parameters
    ----------
    X_test : 2d array-like
        Test data.
    args : object
        Experiment arguments; `args.inf_k` selects the LeafInfluence update set.
    model : object
        Learned CatBoost tree ensemble.
    X_train : 2d array-like
        Train data.
    y_train : 1d array-like
        Train labels.
    y_test : 1d array-like
        Test labels.
    logger : object
        Unused here; kept for a signature consistent with sibling methods.

    Returns
    -------
    1d array of train indices, sorted by descending total influence
    (most helpful train instances first).
    """
    # save the CatBoost model to a unique temporary location so concurrent
    # runs do not clobber each other's model files (matches the convention
    # used by the other leaf-influence methods in this file)
    temp_dir = os.path.join('.catboost_info', 'leaf_influence_{}'.format(str(uuid.uuid4())))
    model_path = os.path.join(temp_dir, 'cb.json')
    os.makedirs(temp_dir, exist_ok=True)
    model.save_model(model_path, format='json')

    # LeafInfluence settings
    if args.inf_k == -1:
        update_set = 'AllPoints'
    elif args.inf_k == 0:
        update_set = 'SinglePoint'
    else:
        update_set = 'TopKLeaves'

    explainer = CBLeafInfluenceEnsemble(model_path, X_train, y_train, k=args.inf_k,
                                        learning_rate=model.learning_rate_,
                                        update_set=update_set)

    contributions_sum = np.zeros(X_train.shape[0])

    # fit the explainer ONCE per train instance instead of once per
    # (train, test) pair: `fit` overwrites `buf` entirely and does not
    # depend on the test instance, so the result is identical but the
    # expensive fit runs n_train times rather than n_train * n_test times
    buf = deepcopy(explainer)
    for j in tqdm.tqdm(range(X_train.shape[0])):
        explainer.fit(removed_point_idx=j, destination_model=buf)

        # accumulate this train instance's influence over all test instances
        for i in range(X_test.shape[0]):
            contributions_sum[j] += buf.loss_derivative(X_test[[i]], y_test[[i]])[0]

    # sort by descending order; the most positive train instances
    # are the ones that decrease the log loss the most, and are the most helpful
    train_order = np.argsort(contributions_sum)[::-1]

    # clean up
    shutil.rmtree(temp_dir)

    return train_order
def get_influence_explainer(model, X_train, y_train, inf_k):
    """
    Returns a CBLeafInfluenceEnsemble explainer.

    Parameters
    ----------
    model : object
        Learned CatBoost tree ensemble.
    X_train : 2d array-like
        Train data.
    y_train : 1d array-like
        Train labels.
    inf_k : int
        Number of leaves to use in the explainer;
        -1 selects 'AllPoints', 0 selects 'SinglePoint'
        (FastLeafInfluence), any other value selects 'TopKLeaves'.
    """
    model_path = '.model.json'
    model.save_model(model_path, format='json')

    # map inf_k onto the LeafInfluence update-set strategy
    if inf_k == -1:
        update_set = 'AllPoints'
    elif inf_k == 0:
        update_set = 'SinglePoint'
    else:
        update_set = 'TopKLeaves'

    leaf_influence = CBLeafInfluenceEnsemble(model_path, X_train, y_train,
                                             learning_rate=model.learning_rate_,
                                             update_set=update_set,
                                             k=inf_k)

    return leaf_influence
def influence_method(args, model, X_train, y_train, X_test, y_test, logger=None,
                     k=-1, update_set='AllPoints', frac_progress_update=0.1):
    """
    Sort training instances based on their Leaf Influence on the test set.

    Reference:
    https://github.com/kohpangwei/influence-release/blob/master/influence/experiments.py

    Parameters
    ----------
    args : object
        Experiment arguments; `args.method` may select the 'fast' variant,
        `args.model` must be 'cb' (CatBoost).
    model : object
        Learned CatBoost tree ensemble.
    X_train, y_train : array-like
        Train data and labels.
    X_test, y_test : array-like
        Test data and labels.
    logger : object, optional
        If given, progress is logged.
    k : int
        No. leaves to update; overridden to 0 for the 'fast' method.
    update_set : str
        LeafInfluence update strategy; overridden for the 'fast' method.
    frac_progress_update : float
        Fraction of train instances between progress log lines.

    Returns
    -------
    1d array of train indices, sorted by descending aggregate influence.
    """
    # LeafInfluence settings
    if 'fast' in args.method:
        k = 0
        update_set = 'SinglePoint'

    assert args.model == 'cb', 'tree-ensemble is not a CatBoost model!'

    # save CatBoost model to a unique temporary path
    temp_dir = os.path.join('.catboost_info', 'leaf_influence_{}'.format(str(uuid.uuid4())))
    temp_fp = os.path.join(temp_dir, 'cb.json')
    os.makedirs(temp_dir, exist_ok=True)
    model.save_model(temp_fp, format='json')

    # initialize Leaf Influence
    explainer = CBLeafInfluenceEnsemble(temp_fp, X_train, y_train, k=k,
                                        learning_rate=model.learning_rate_,
                                        update_set=update_set)

    # display status
    if logger:
        logger.info('\ncomputing influence of each training sample on the test set...')

    # contributions container
    start = time.time()
    attributions = np.zeros((X_train.shape[0], X_test.shape[0]))

    # clamp the progress modulus to >= 1; otherwise a small train set
    # (shape[0] * frac_progress_update < 1) raises ZeroDivisionError
    progress_interval = max(1, int(X_train.shape[0] * frac_progress_update))

    # compute influence on each test instance
    buf = deepcopy(explainer)
    for i in range(X_train.shape[0]):
        explainer.fit(removed_point_idx=i, destination_model=buf)

        # compute influence for each training instance
        for j in range(X_test.shape[0]):
            attributions[i][j] = buf.loss_derivative(X_test[[j]], y_test[[j]])[0]

        # display progress
        if logger and i % progress_interval == 0:
            elapsed = time.time() - start
            train_frac_complete = i / X_train.shape[0] * 100
            logger.info('train {:.1f}%...{:.3f}s'.format(train_frac_complete, elapsed))

    # aggregate influences
    attributions_sum = np.sum(attributions, axis=0)

    # sort by descending order; the most positive train instances
    # are the ones that decrease the log loss the most, and are the most helpful
    train_indices = np.argsort(attributions_sum)[::-1]

    # clean up
    shutil.rmtree(temp_dir)

    return train_indices
def leaf_influence_method(args, model, test_ndx, X_train, y_train, X_test, y_test,
                          k=-1, update_set='AllPoints', frac_progress_update=0.1,
                          logger=None):
    """
    Computes the influence on each test instance if train instance i were
    upweighted/removed.

    NOTE: This uses the FastLeafInfluence (k=0) method by Sharchilev et al.
    NOTE: requires the label for the test instance.

    Returns
    -------
    dict with 'train_time' (explainer setup) and 'test_time'
    (influence computation) in seconds.
    """
    # LeafInfluence settings
    if 'fast' in args.method:
        k = 0
        update_set = 'SinglePoint'

    # initialize Leaf Influence
    start = time.time()

    # save CatBoost model to a unique temporary path
    temp_dir = os.path.join('.catboost_info', 'leaf_influence_{}'.format(str(uuid.uuid4())))
    temp_fp = os.path.join(temp_dir, 'cb.json')
    os.makedirs(temp_dir, exist_ok=True)
    model.save_model(temp_fp, format='json')

    explainer = CBLeafInfluenceEnsemble(temp_fp, X_train, y_train, k=k,
                                        learning_rate=model.learning_rate_,
                                        update_set=update_set)
    train_time = time.time() - start

    if logger:
        logger.info('\ncomputing influence of each training instance on the test loss...')

    # clamp the progress modulus to >= 1; otherwise a small train set
    # (shape[0] * frac_progress_update < 1) raises ZeroDivisionError
    progress_interval = max(1, int(X_train.shape[0] * frac_progress_update))

    # compute influence of each training instance on the test instance
    with timeout(seconds=args.max_time):
        try:
            start = time.time()

            contributions = []
            buf = deepcopy(explainer)
            for i in range(X_train.shape[0]):
                explainer.fit(removed_point_idx=i, destination_model=buf)
                contributions.append(buf.loss_derivative(X_test[test_ndx], y_test[test_ndx])[0])

                # display progress
                if logger and i % progress_interval == 0:
                    elapsed = time.time() - start
                    train_frac_complete = i / X_train.shape[0] * 100
                    logger.info('Train {:.1f}%...{:.3f}s'.format(train_frac_complete, elapsed))

            contributions = np.array(contributions)
            test_time = time.time() - start

        except TimeoutError:
            if logger:
                logger.info('Leaf Influence test time exceeded!')
            exit(0)

    # clean up
    shutil.rmtree(temp_dir)

    # result object
    result = {'train_time': train_time, 'test_time': test_time}

    return result
def leaf_influence_method(args, model_noisy, y_train_noisy,
                          noisy_indices, n_check, n_checkpoint,
                          clf, X_train, y_train, X_test, y_test,
                          acc_noisy, auc_noisy,
                          logger=None, k=-1,
                          update_set='AllPoints',
                          out_dir='.',
                          frac_progress_update=0.1):
    """
    Computes the influence on train instance i if train instance i were
    upweighted/removed. This uses the FastLeafInfluence method by Sharchilev et al.

    Reference:
    https://github.com/kohpangwei/influence-release/blob/master/influence/experiments.py

    Returns
    -------
    Result object from `fix_noisy_instances`, evaluated over the training
    instances ordered by ascending self-influence (most harmful first).
    """
    assert args.model == 'cb', 'tree-ensemble is not a CatBoost model!'

    # LeafInfluence settings
    if 'fast' in args.method:
        k = 0
        update_set = 'SinglePoint'

    # save CatBoost model to a unique temporary path
    temp_dir = os.path.join('.catboost_info', 'leaf_influence_{}'.format(str(uuid.uuid4())))
    temp_fp = os.path.join(temp_dir, 'cb.json')
    os.makedirs(temp_dir, exist_ok=True)
    model_noisy.save_model(temp_fp, format='json')

    # initialize explainer
    explainer = CBLeafInfluenceEnsemble(temp_fp, X_train, y_train_noisy, k=k,
                                        learning_rate=model_noisy.learning_rate_,
                                        update_set=update_set)

    # display progress
    if logger:
        logger.info('\ncomputing self-influence of training instances...')
        start = time.time()

    # clamp the progress modulus to >= 1; otherwise a small train set
    # (shape[0] * frac_progress_update < 1) raises ZeroDivisionError
    progress_interval = max(1, int(X_train.shape[0] * frac_progress_update))

    # score container
    influence_scores = []

    # compute self-influence score for each training instance
    buf = deepcopy(explainer)
    for i in range(X_train.shape[0]):
        explainer.fit(removed_point_idx=i, destination_model=buf)
        influence_scores.append(buf.loss_derivative(X_train[[i]], y_train_noisy[[i]])[0])

        # display progress
        if logger and i % progress_interval == 0:
            elapsed = time.time() - start
            logger.info('finished {:.1f}% train instances...{:.3f}s'.format((i / X_train.shape[0]) * 100, elapsed))

    # convert scores to a numpy array
    influence_scores = np.array(influence_scores)

    # sort by ascending order; the most negative train instances
    # are the ones that increase the log loss the most, and are the most harmful
    train_indices = np.argsort(influence_scores)

    result = fix_noisy_instances(train_indices, noisy_indices, n_check, n_checkpoint,
                                 clf, X_train, y_train, X_test, y_test,
                                 acc_noisy, auc_noisy, logger=logger)

    # clean up
    shutil.rmtree(temp_dir)

    return result
def experiment(args, logger, out_dir, seed):
    """
    Main method that trains a tree ensemble, flips a percentage of train labels,
    prioritizes train instances using various methods, and computes how effective
    each method is at cleaning the data.

    Results are saved to `out_dir` as one .npy file per method
    ('random.npy', 'tree.npy', 'trex.npy', 'trex_loss.npy', 'leaf_influence.npy',
    'maple.npy', 'teknn.npy', 'teknn_loss.npy', 'mmd.npy', 'proto.npy'),
    plus the global lines 'test_clean.npy' and 'check_pct.npy'.
    """
    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)
    X_train, X_test, y_train, y_test, label = data_util.get_data(args.dataset,
                                                                 random_state=seed,
                                                                 data_dir=args.data_dir)

    # reduce train size
    if args.train_frac < 1.0 and args.train_frac > 0.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]
    data = X_train, y_train, X_test, y_test

    logger.info('no. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # add noise
    y_train_noisy, noisy_ndx = data_util.flip_labels(y_train, k=args.flip_frac, random_state=seed)
    noisy_ndx = np.array(sorted(noisy_ndx))
    logger.info('no. noisy labels: {:,}'.format(len(noisy_ndx)))

    # train a tree ensemble on the clean and noisy labels
    model = clone(clf).fit(X_train, y_train)
    model_noisy = clone(clf).fit(X_train, y_train_noisy)

    # show model performance before and after noise
    logger.info('\nBefore noise:')
    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)
    logger.info('\nAfter noise:')
    model_util.performance(model_noisy, X_train, y_train_noisy, X_test=X_test, y_test=y_test, logger=logger)

    # check accuracy before and after noise
    acc_test_clean = accuracy_score(y_test, model.predict(X_test))
    acc_test_noisy = accuracy_score(y_test, model_noisy.predict(X_test))

    # find how many corrupted/non-corrupted labels were incorrectly predicted
    if not args.true_label:
        logger.info('\nUsing predicted labels:')
        predicted_labels = model_noisy.predict(X_train).flatten()
        incorrect_ndx = np.where(y_train_noisy != predicted_labels)[0]
        incorrect_corrupted_ndx = np.intersect1d(noisy_ndx, incorrect_ndx)
        logger.info('incorrectly predicted corrupted labels: {:,}'.format(incorrect_corrupted_ndx.shape[0]))
        logger.info('total number of incorrectly predicted labels: {:,}'.format(incorrect_ndx.shape[0]))

    # number of checkpoints to record
    n_check = int(len(y_train) * args.check_pct)
    interval = (n_check / len(y_train)) / args.n_plot_points

    # random method
    logger.info('\nordering by random...')
    start = time.time()
    ckpt_ndx, fix_ndx = _random_method(noisy_ndx, y_train, interval,
                                       to_check=n_check,
                                       random_state=seed)
    check_pct, random_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'random.npy'), random_res)

    # save global lines
    np.save(os.path.join(out_dir, 'test_clean.npy'), acc_test_clean)
    np.save(os.path.join(out_dir, 'check_pct.npy'), check_pct)

    # tree loss method
    logger.info('\nordering by tree loss...')
    start = time.time()
    y_train_proba = model_noisy.predict_proba(X_train)
    ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
    _, tree_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'tree.npy'), tree_loss_res)

    # trex method
    if args.trex:
        logger.info('\nordering by TREX...')
        start = time.time()
        explainer = trex.TreeExplainer(model_noisy, X_train, y_train_noisy,
                                       tree_kernel=args.tree_kernel,
                                       random_state=seed,
                                       true_label=args.true_label,
                                       kernel_model=args.kernel_model,
                                       verbose=args.verbose,
                                       val_frac=args.val_frac,
                                       logger=logger)
        ckpt_ndx, fix_ndx, _ = _our_method(explainer, noisy_ndx, y_train, n_check, interval)
        check_pct, trex_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        # each method gets its own output file so results do not overwrite
        # each other when several methods run in the same experiment
        np.save(os.path.join(out_dir, 'trex.npy'), trex_res)

        # trex loss method
        logger.info('\nordering by TREX loss...')
        start = time.time()
        y_train_proba = explainer.predict_proba(X_train)
        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, trex_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'trex_loss.npy'), trex_loss_res)

    # influence method
    if args.tree_type == 'cb' and args.inf_k is not None:
        logger.info('\nordering by leafinfluence...')
        start = time.time()

        model_path = '.model.json'
        model_noisy.save_model(model_path, format='json')

        if args.inf_k == -1:
            update_set = 'AllPoints'
        elif args.inf_k == 0:
            update_set = 'SinglePoint'
        else:
            update_set = 'TopKLeaves'

        # the explainer wraps the NOISY model, so use its learning rate
        # (the original passed the clean model's rate; both are clones of
        # the same clf, but this keeps the pairing explicit)
        leaf_influence = CBLeafInfluenceEnsemble(model_path, X_train, y_train_noisy,
                                                 k=args.inf_k,
                                                 learning_rate=model_noisy.learning_rate_,
                                                 update_set=update_set)
        ckpt_ndx, fix_ndx, _, _ = _influence_method(leaf_influence, noisy_ndx, X_train, y_train,
                                                    y_train_noisy, interval, to_check=n_check)
        _, leafinfluence_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'leaf_influence.npy'), leafinfluence_res)

    # MAPLE method
    if args.maple:
        logger.info('\nordering by MAPLE...')
        start = time.time()
        # NOTE(review): the TEKNN branch below uses y_train (clean labels)
        # when args.true_label is set, whereas this uses y_train_noisy —
        # confirm which is intended
        train_label = y_train_noisy if args.true_label else model_noisy.predict(X_train)
        maple_exp = MAPLE(X_train, train_label, X_train, train_label,
                          verbose=args.verbose, dstump=False)
        ckpt_ndx, fix_ndx, map_scores, map_order = _maple_method(maple_exp, X_train, noisy_ndx,
                                                                 interval, to_check=n_check)
        _, maple_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'maple.npy'), maple_res)

    # TEKNN method
    if args.teknn:
        logger.info('\nordering by teknn...')
        start = time.time()

        # transform the data
        extractor = trex.TreeExtractor(model_noisy, tree_kernel=args.tree_kernel)
        X_train_alt = extractor.fit_transform(X_train)
        train_label = y_train if args.true_label else model_noisy.predict(X_train)

        # tune and train teknn
        knn_clf = exp_util.tune_knn(model_noisy, X_train, X_train_alt, train_label,
                                    args.val_frac, seed=seed, logger=logger)

        ckpt_ndx, fix_ndx, _ = _knn_method(knn_clf, X_train_alt, noisy_ndx, interval, to_check=n_check)
        _, teknn_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'teknn.npy'), teknn_res)

        # TEKNN loss method
        logger.info('\nordering by teknn loss...')
        start = time.time()
        y_train_proba = knn_clf.predict_proba(X_train_alt)
        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, teknn_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'teknn_loss.npy'), teknn_loss_res)

    # MMD-Critic method
    if args.mmd:
        logger.info('\nordering by mmd-critic...')
        start = time.time()
        ckpt_ndx, fix_ndx = _mmd_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, mmd_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'mmd.npy'), mmd_res)

    # Prototype method
    if args.proto:
        logger.info('\nordering by proto...')
        start = time.time()
        ckpt_ndx, fix_ndx = _proto_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, proto_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'proto.npy'), proto_res)