Example #1
def _influence_method(X_test, args, model, X_train, y_train, y_test, logger):
    """
    Sort training instances by their LeafInfluence on the test set,
    most helpful (loss-decreasing) instances first.
    """

    model_path = '.model.json'
    model.save_model(model_path, format='json')

    if args.inf_k == -1:
        update_set = 'AllPoints'
    elif args.inf_k == 0:
        update_set = 'SinglePoint'
    else:
        update_set = 'TopKLeaves'

    explainer = CBLeafInfluenceEnsemble(model_path, X_train, y_train, k=args.inf_k,
                                        learning_rate=model.learning_rate_,
                                        update_set=update_set)

    # accumulate the influence of every training instance over all test instances
    contributions_sum = np.zeros(X_train.shape[0])
    for i in tqdm.tqdm(range(X_test.shape[0])):
        contributions = []
        buf = deepcopy(explainer)

        # refit the explainer with each training instance removed and measure
        # its effect on the loss of test instance i
        for j in tqdm.tqdm(range(len(X_train))):
            explainer.fit(removed_point_idx=j, destination_model=buf)
            contributions.append(buf.loss_derivative(X_test[[i]], y_test[[i]])[0])

        contributions = np.array(contributions)
        contributions_sum += contributions

    # sort by descending order; the most positive train instances
    # are the ones that decrease the log loss the most, and are the most helpful
    train_order = np.argsort(contributions_sum)[::-1]
    return train_order
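
A minimal usage sketch for _influence_method follows; the toy data, the CatBoostClassifier setup, and the args namespace (only inf_k is read here) are illustrative assumptions, not code from the source project.

import numpy as np
from argparse import Namespace
from catboost import CatBoostClassifier

# toy data (hypothetical, for illustration only)
rng = np.random.default_rng(0)
X_train, y_train = rng.random((100, 5)), rng.integers(0, 2, 100)
X_test, y_test = rng.random((10, 5)), rng.integers(0, 2, 10)

# small CatBoost model; the explainer reads its learning_rate_ attribute
model = CatBoostClassifier(iterations=10, verbose=0).fit(X_train, y_train)

args = Namespace(inf_k=-1)  # -1 selects the 'AllPoints' update set
train_order = _influence_method(X_test, args, model, X_train, y_train, y_test, logger=None)
print(train_order[:10])  # indices of the ten most helpful training instances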
Example #2
def get_influence_explainer(model, X_train, y_train, inf_k):
    """
    Returns a CBLeafInfluenceEnsemble explainer.
    Parameters
    ----------
    model : object
        Learned CatBoost tree ensemble.
    X_train : 2d array-like
        Train data.
    y_train : 1d array-like
        Train labels.
    inf_k : int
        Number of leaves to update (-1: 'AllPoints', 0: 'SinglePoint', otherwise 'TopKLeaves').
    """

    model_path = '.model.json'
    model.save_model(model_path, format='json')

    if inf_k == -1:
        update_set = 'AllPoints'
    elif inf_k == 0:
        update_set = 'SinglePoint'
    else:
        update_set = 'TopKLeaves'

    leaf_influence = CBLeafInfluenceEnsemble(
        model_path,
        X_train,
        y_train,
        learning_rate=model.learning_rate_,
        update_set=update_set,
        k=inf_k)

    return leaf_influence
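
The returned explainer is typically consumed with the fit / loss_derivative pattern shown in the other examples on this page; below is a minimal sketch, assuming model, X_train, y_train, X_test, and y_test are already defined as in Example #1.

from copy import deepcopy
import numpy as np

leaf_influence = get_influence_explainer(model, X_train, y_train, inf_k=-1)

# influence of each training instance on the loss of the first test instance
buf = deepcopy(leaf_influence)
scores = []
for i in range(X_train.shape[0]):
    leaf_influence.fit(removed_point_idx=i, destination_model=buf)
    scores.append(buf.loss_derivative(X_test[[0]], y_test[[0]])[0])

scores = np.array(scores)
print(np.argsort(scores)[::-1][:10])  # most helpful training instances first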
Example #3
File: roar.py Project: jjbrophy47/trex
def influence_method(args, model, X_train, y_train, X_test, y_test, logger=None,
                     k=-1, update_set='AllPoints', frac_progress_update=0.1):
    """
    Sort training instances based on their Leaf Influence on the test set.

    Reference:
    https://github.com/kohpangwei/influence-release/blob/master/influence/experiments.py
    """

    # LeafInfluence settings
    if 'fast' in args.method:
        k = 0
        update_set = 'SinglePoint'

    assert args.model == 'cb', 'tree-ensemble is not a CatBoost model!'

    # save CatBoost model
    temp_dir = os.path.join('.catboost_info', 'leaf_influence_{}'.format(str(uuid.uuid4())))
    temp_fp = os.path.join(temp_dir, 'cb.json')
    os.makedirs(temp_dir, exist_ok=True)
    model.save_model(temp_fp, format='json')

    # initialize Leaf Influence
    explainer = CBLeafInfluenceEnsemble(temp_fp,
                                        X_train,
                                        y_train,
                                        k=k,
                                        learning_rate=model.learning_rate_,
                                        update_set=update_set)

    # display status
    if logger:
        logger.info('\ncomputing influence of each training sample on the test set...')

    # contributions container
    start = time.time()
    attributions = np.zeros((X_train.shape[0], X_test.shape[0]))

    # compute the influence of each training instance
    buf = deepcopy(explainer)
    for i in range(X_train.shape[0]):
        explainer.fit(removed_point_idx=i, destination_model=buf)

        # measure its effect on each test instance
        for j in range(X_test.shape[0]):
            attributions[i, j] = buf.loss_derivative(X_test[[j]], y_test[[j]])[0]

        # display progress
        if logger and i % int(X_train.shape[0] * frac_progress_update) == 0:
            elapsed = time.time() - start
            train_frac_complete = i / X_train.shape[0] * 100
            logger.info('train {:.1f}%...{:.3f}s'.format(train_frac_complete, elapsed))

    # aggregate influences
    attributions_sum = np.sum(attributions, axis=0)

    # sort by descending order; the most positive train instances
    # are the ones that decrease the log loss the most, and are the most helpful
    train_indices = np.argsort(attributions_sum)[::-1]

    # clean up
    shutil.rmtree(temp_dir)

    return train_indices
Example #4
File: runtime.py Project: jjbrophy47/trex
def leaf_influence_method(args,
                          model,
                          test_ndx,
                          X_train,
                          y_train,
                          X_test,
                          y_test,
                          k=-1,
                          update_set='AllPoints',
                          frac_progress_update=0.1,
                          logger=None):
    """
    Computes the influence on each test instance if train
    instance i were upweighted/removed.

    NOTE: This uses the FastLeafInfluence (k=0) method by Sharchilev et al.
    NOTE: requires the label for the test instance.
    """

    # LeafInfluence settings
    if 'fast' in args.method:
        k = 0
        update_set = 'SinglePoint'

    # initialize Leaf Influence
    start = time.time()

    # save CatBoost model
    temp_dir = os.path.join('.catboost_info',
                            'leaf_influence_{}'.format(str(uuid.uuid4())))
    temp_fp = os.path.join(temp_dir, 'cb.json')
    os.makedirs(temp_dir, exist_ok=True)
    model.save_model(temp_fp, format='json')

    explainer = CBLeafInfluenceEnsemble(temp_fp,
                                        X_train,
                                        y_train,
                                        k=k,
                                        learning_rate=model.learning_rate_,
                                        update_set=update_set)
    train_time = time.time() - start

    if logger:
        logger.info(
            '\ncomputing influence of each training instance on the test loss...'
        )

    # compute influence of each training instance on the test instance
    with timeout(seconds=args.max_time):
        try:
            start = time.time()

            contributions = []
            buf = deepcopy(explainer)

            for i in range(X_train.shape[0]):
                explainer.fit(removed_point_idx=i, destination_model=buf)
                contributions.append(
                    buf.loss_derivative(X_test[test_ndx], y_test[test_ndx])[0])

                # display progress
                if logger and i % int(
                        X_train.shape[0] * frac_progress_update) == 0:
                    elapsed = time.time() - start
                    train_frac_complete = i / X_train.shape[0] * 100
                    logger.info('Train {:.1f}%...{:.3f}s'.format(
                        train_frac_complete, elapsed))

            contributions = np.array(contributions)
            test_time = time.time() - start

        except TimeoutError:
            if logger:
                logger.info('Leaf Influence test time exceeded!')
            shutil.rmtree(temp_dir)  # clean up before bailing out
            exit(0)

    # clean up
    shutil.rmtree(temp_dir)

    # result object
    result = {'train_time': train_time, 'test_time': test_time}

    return result
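
The timeout context manager used above is not defined in this snippet; the sketch below shows a common signal-based implementation of such a helper. It is an assumption about what the project provides, not its actual code, and it only works on Unix.

import signal
from contextlib import contextmanager

@contextmanager
def timeout(seconds):
    """Raise TimeoutError if the enclosed block runs longer than `seconds` (hypothetical helper)."""
    def _handler(signum, frame):
        raise TimeoutError('block exceeded {}s'.format(seconds))

    old_handler = signal.signal(signal.SIGALRM, _handler)
    signal.alarm(int(seconds))
    try:
        yield
    finally:
        signal.alarm(0)  # cancel the pending alarm
        signal.signal(signal.SIGALRM, old_handler)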
Example #5
File: cleaning.py Project: jjbrophy47/trex
def leaf_influence_method(args, model_noisy, y_train_noisy,
                          noisy_indices, n_check, n_checkpoint,
                          clf, X_train, y_train, X_test, y_test,
                          acc_noisy, auc_noisy, logger=None,
                          k=-1, update_set='AllPoints', out_dir='.',
                          frac_progress_update=0.1):
    """
    Computes the influence on train instance i if train
    instance i were upweighted/removed. This uses the FastLeafInfluence
    method by Sharchilev et al.

    Reference:
    https://github.com/kohpangwei/influence-release/blob/master/influence/experiments.py
    """
    assert args.model == 'cb', 'tree-ensemble is not a CatBoost model!'

    # LeafInfluence settings
    if 'fast' in args.method:
        k = 0
        update_set = 'SinglePoint'

    # save CatBoost model
    temp_dir = os.path.join('.catboost_info', 'leaf_influence_{}'.format(str(uuid.uuid4())))
    temp_fp = os.path.join(temp_dir, 'cb.json')
    os.makedirs(temp_dir, exist_ok=True)
    model_noisy.save_model(temp_fp, format='json')

    # initialize explainer
    explainer = CBLeafInfluenceEnsemble(temp_fp,
                                        X_train,
                                        y_train_noisy,
                                        k=k,
                                        learning_rate=model_noisy.learning_rate_,
                                        update_set=update_set)

    # display progress
    if logger:
        logger.info('\ncomputing self-influence of training instances...')
        start = time.time()

    # score container
    influence_scores = []

    # compute self-influence score for each training instance
    buf = deepcopy(explainer)
    for i in range(X_train.shape[0]):
        explainer.fit(removed_point_idx=i, destination_model=buf)
        influence_scores.append(buf.loss_derivative(X_train[[i]], y_train_noisy[[i]])[0])

        # display progress
        if logger and i % int(X_train.shape[0] * frac_progress_update) == 0:
            elapsed = time.time() - start
            logger.info('finished {:.1f}% train instances...{:.3f}s'.format((i / X_train.shape[0]) * 100, elapsed))

    # convert scores to a numpy array
    influence_scores = np.array(influence_scores)

    # sort by ascending order; the most negative train instances
    # are the ones that increase the log loss the most, and are the most harmful
    train_indices = np.argsort(influence_scores)
    result = fix_noisy_instances(train_indices, noisy_indices, n_check, n_checkpoint,
                                 clf, X_train, y_train, X_test, y_test,
                                 acc_noisy, auc_noisy, logger=logger)

    # clean up
    shutil.rmtree(temp_dir)

    return result
Example #6
def experiment(args, logger, out_dir, seed):
    """
    Main method that trains a tree ensemble, flips a percentage of train labels, prioritizes train
    instances using various methods, and computes how effective each method is at cleaning the data.
    """

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)

    X_train, X_test, y_train, y_test, label = data_util.get_data(args.dataset,
                                                                 random_state=seed,
                                                                 data_dir=args.data_dir)

    # reduce train size
    if 0.0 < args.train_frac < 1.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]
    data = X_train, y_train, X_test, y_test

    logger.info('no. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # add noise
    y_train_noisy, noisy_ndx = data_util.flip_labels(y_train, k=args.flip_frac, random_state=seed)
    noisy_ndx = np.array(sorted(noisy_ndx))
    logger.info('no. noisy labels: {:,}'.format(len(noisy_ndx)))

    # train a tree ensemble on the clean and noisy labels
    model = clone(clf).fit(X_train, y_train)
    model_noisy = clone(clf).fit(X_train, y_train_noisy)

    # show model performance before and after noise
    logger.info('\nBefore noise:')
    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)
    logger.info('\nAfter noise:')
    model_util.performance(model_noisy, X_train, y_train_noisy, X_test=X_test, y_test=y_test, logger=logger)

    # check accuracy before and after noise
    acc_test_clean = accuracy_score(y_test, model.predict(X_test))
    acc_test_noisy = accuracy_score(y_test, model_noisy.predict(X_test))

    # find how many corrupted/non-corrupted labels were incorrectly predicted
    if not args.true_label:
        logger.info('\nUsing predicted labels:')
        predicted_labels = model_noisy.predict(X_train).flatten()
        incorrect_ndx = np.where(y_train_noisy != predicted_labels)[0]
        incorrect_corrupted_ndx = np.intersect1d(noisy_ndx, incorrect_ndx)
        logger.info('incorrectly predicted corrupted labels: {:,}'.format(incorrect_corrupted_ndx.shape[0]))
        logger.info('total number of incorrectly predicted labels: {:,}'.format(incorrect_ndx.shape[0]))

    # number of checkpoints to record
    n_check = int(len(y_train) * args.check_pct)
    interval = (n_check / len(y_train)) / args.n_plot_points

    # random method
    logger.info('\nordering by random...')
    start = time.time()
    ckpt_ndx, fix_ndx = _random_method(noisy_ndx, y_train, interval,
                                       to_check=n_check,
                                       random_state=seed)
    check_pct, random_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'random.npy'), random_res)

    # save global lines
    np.save(os.path.join(out_dir, 'test_clean.npy'), acc_test_clean)
    np.save(os.path.join(out_dir, 'check_pct.npy'), check_pct)

    # tree loss method
    logger.info('\nordering by tree loss...')
    start = time.time()

    y_train_proba = model_noisy.predict_proba(X_train)
    ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
    _, tree_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'tree.npy'), tree_loss_res)

    # trex method
    if args.trex:
        logger.info('\nordering by TREX...')
        start = time.time()
        explainer = trex.TreeExplainer(model_noisy, X_train, y_train_noisy,
                                       tree_kernel=args.tree_kernel,
                                       random_state=seed,
                                       true_label=args.true_label,
                                       kernel_model=args.kernel_model,
                                       verbose=args.verbose,
                                       val_frac=args.val_frac,
                                       logger=logger)

        ckpt_ndx, fix_ndx, _ = _our_method(explainer, noisy_ndx, y_train, n_check, interval)
        check_pct, trex_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), trex_res)

        # trex loss method
        logger.info('\nordering by TREX loss...')
        start = time.time()

        y_train_proba = explainer.predict_proba(X_train)
        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, trex_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method_loss.npy'), trex_loss_res)

    # influence method
    if args.tree_type == 'cb' and args.inf_k is not None:
        logger.info('\nordering by leafinfluence...')
        start = time.time()

        model_path = '.model.json'
        model_noisy.save_model(model_path, format='json')

        if args.inf_k == -1:
            update_set = 'AllPoints'
        elif args.inf_k == 0:
            update_set = 'SinglePoint'
        else:
            update_set = 'TopKLeaves'

        leaf_influence = CBLeafInfluenceEnsemble(model_path, X_train, y_train_noisy, k=args.inf_k,
                                                 learning_rate=model_noisy.learning_rate_, update_set=update_set)
        ckpt_ndx, fix_ndx, _, _ = _influence_method(leaf_influence, noisy_ndx, X_train, y_train, y_train_noisy,
                                                    interval, to_check=n_check)
        _, leafinfluence_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), leafinfluence_res)

    # MAPLE method
    if args.maple:
        logger.info('\nordering by MAPLE...')
        start = time.time()

        train_label = y_train_noisy if args.true_label else model_noisy.predict(X_train)
        maple_exp = MAPLE(X_train, train_label, X_train, train_label, verbose=args.verbose, dstump=False)
        ckpt_ndx, fix_ndx, map_scores, map_order = _maple_method(maple_exp, X_train, noisy_ndx, interval,
                                                                 to_check=n_check)
        _, maple_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), maple_res)

    # TEKNN method
    if args.teknn:
        logger.info('\nordering by teknn...')
        start = time.time()

        # transform the data
        extractor = trex.TreeExtractor(model_noisy, tree_kernel=args.tree_kernel)
        X_train_alt = extractor.fit_transform(X_train)
        train_label = y_train if args.true_label else model_noisy.predict(X_train)

        # tune and train teknn
        knn_clf = exp_util.tune_knn(model_noisy, X_train, X_train_alt, train_label, args.val_frac,
                                    seed=seed, logger=logger)

        ckpt_ndx, fix_ndx, _ = _knn_method(knn_clf, X_train_alt, noisy_ndx, interval, to_check=n_check)
        _, teknn_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), teknn_res)

        # TEKNN loss method
        logger.info('\nordering by teknn loss...')
        start = time.time()
        y_train_proba = knn_clf.predict_proba(X_train_alt)

        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, teknn_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method_loss.npy'), teknn_loss_res)

    # MMD-Critic method
    if args.mmd:
        logger.info('\nordering by mmd-critic...')
        start = time.time()
        ckpt_ndx, fix_ndx = _mmd_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, mmd_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), mmd_res)

    # Prototype method
    if args.proto:
        logger.info('\nordering by proto...')
        start = time.time()
        ckpt_ndx, fix_ndx = _proto_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, proto_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), proto_res)