Code example #1 (score: 0)
File: cleaning_new.py — Project: jjbrophy47/trex
def tree_loss_method(args,
                     model_noisy,
                     y_train_noisy,
                     noisy_indices,
                     n_check,
                     n_checkpoint,
                     clf,
                     X_train,
                     y_train,
                     X_test,
                     y_test,
                     acc_noisy,
                     auc_noisy,
                     logger=None):
    """
    Rank training instances by the noisy model's per-instance log loss
    (largest loss first) and hand the ordering to `fix_noisy_instances`.
    """
    # per-instance negative log-likelihood under the noisy model
    proba = model_noisy.predict_proba(X_train)[:, 1]
    instance_loss = util.instance_log_loss(y_train_noisy, proba)

    # order instances from largest to smallest loss
    ordering = np.argsort(instance_loss)[::-1]

    # relabel/check instances following that ordering
    return fix_noisy_instances(ordering,
                               noisy_indices,
                               n_check,
                               n_checkpoint,
                               clf,
                               X_train,
                               y_train,
                               X_test,
                               y_test,
                               acc_noisy,
                               auc_noisy,
                               logger=logger)
Code example #2 (score: 0)
File: cleaning.py — Project: jjbrophy47/trex
def teknn_method(args, model_noisy, y_train_noisy,
                 noisy_indices, n_check, n_checkpoint,
                 clf, X_train, y_train, X_test, y_test,
                 acc_noisy, auc_noisy, logger=None,
                 frac_progress_update=0.1):
    """
    Count impact by the number of times a training sample shows up in
    one another's neighborhood, this can be weighted by 1 / distance.
    """

    # labels for the surrogate: the noisy labels for 'og' variants,
    # otherwise the noisy model's own predictions
    if 'og' in args.method:
        surrogate_labels = y_train_noisy
    else:
        surrogate_labels = model_noisy.predict(X_train)

    # fit the surrogate (KNN on the tree-kernel representation)
    surrogate = trex.train_surrogate(
        model=model_noisy,
        surrogate=args.method.split('_')[0],
        X_train=X_train,
        y_train=surrogate_labels,
        val_frac=args.tune_frac,
        metric=args.metric,
        seed=args.rs,
        params={'C': args.C,
                'n_neighbors': args.n_neighbors,
                'tree_kernel': args.tree_kernel},
        logger=logger)

    if 'loss' in args.method:
        # rank by per-instance surrogate log loss, largest first
        if logger:
            logger.info('\ncomputing KNN loss...')
        proba = surrogate.predict_proba(X_train)[:, 1]
        instance_loss = util.instance_log_loss(y_train_noisy, proba)
        ordering = np.argsort(instance_loss)[::-1]
    else:
        # rank by summed attribution toward the predicted training labels
        contributions = surrogate.compute_attributions(X_train, logger=logger)
        ordering = np.argsort(np.sum(contributions, axis=0))[::-1]

    # relabel/check instances following that ordering
    return fix_noisy_instances(ordering, noisy_indices, n_check, n_checkpoint,
                               clf, X_train, y_train, X_test, y_test,
                               acc_noisy, auc_noisy, logger=logger)
Code example #3 (score: 0)
File: cleaning.py — Project: jjbrophy47/trex
def trex_method(args, model_noisy, y_train_noisy,
                noisy_indices, n_check, n_checkpoint,
                clf, X_train, y_train, X_test, y_test,
                acc_noisy, auc_noisy, logger=None):
    """
    Order by largest absolute values of the instance coefficients
    from the KLR or SVM surrogate model.
    """

    # labels for the surrogate: the noisy labels for 'og' variants,
    # otherwise the noisy model's own predictions
    if 'og' in args.method:
        surrogate_labels = y_train_noisy
    else:
        surrogate_labels = model_noisy.predict(X_train)

    # fit the KLR/SVM surrogate on the tree-kernel representation
    surrogate = trex.train_surrogate(
        model=model_noisy,
        surrogate=args.method.split('_')[0],
        X_train=X_train,
        y_train=surrogate_labels,
        val_frac=args.tune_frac,
        metric=args.metric,
        seed=args.rs,
        params={'C': args.C,
                'n_neighbors': args.n_neighbors,
                'tree_kernel': args.tree_kernel},
        logger=logger)

    if 'loss' in args.method:
        # rank by per-instance surrogate log loss, largest first
        proba = surrogate.predict_proba(X_train)[:, 1]
        instance_loss = util.instance_log_loss(y_train_noisy, proba)
        ordering = np.argsort(instance_loss)[::-1]
    else:
        # rank by |alpha| (surrogate instance coefficients), largest first
        ordering = np.argsort(np.abs(surrogate.get_alpha()))[::-1]

    # relabel/check instances following that ordering
    return fix_noisy_instances(ordering, noisy_indices, n_check, n_checkpoint,
                               clf, X_train, y_train, X_test, y_test,
                               acc_noisy, auc_noisy, logger=logger)
Code example #4 (score: 0)
File: cleaning_new.py — Project: jjbrophy47/trex
def trex_method(args,
                model_noisy,
                y_train_noisy,
                noisy_indices,
                n_check,
                n_checkpoint,
                clf,
                X_train,
                y_train,
                X_test,
                y_test,
                acc_noisy,
                auc_noisy,
                logger=None,
                out_dir=None):
    """
    Order training instances for inspection using a KLR surrogate model.

    The ranking depends on `args.method`:
      * contains 'loss'  -> largest surrogate instance log loss first;
      * contains 'sim'   -> similarity density ('abs' variant: largest
                            absolute density first; otherwise most
                            negative signed density first);
      * contains 'alpha' -> largest absolute surrogate coefficient first.

    Args:
        args: experiment configuration (dataset, model, method, tune_frac,
            metric, rs, ...).
        model_noisy: tree-ensemble model trained on the noisy labels.
        y_train_noisy: possibly corrupted training labels.
        noisy_indices: indices of the label-flipped training instances.
        n_check: total number of instances to check.
        n_checkpoint: number of instances checked per checkpoint.
        clf: classifier retrained at each checkpoint.
        X_train, y_train: training data (y_train holds the clean labels).
        X_test, y_test: evaluation data.
        acc_noisy, auc_noisy: baseline performance of the noisy model.
        logger: optional logger; progress output is skipped when None.
        out_dir: optional directory; when given, a |alpha| vs.
            similarity-density scatter plot is saved for 'sim' methods.

    Returns:
        The result of `fix_noisy_instances` applied to the chosen ordering.

    Raises:
        ValueError: if `args.method` matches none of the known variants.
    """

    # train surrogate model on labels predicted by the noisy model
    # (or on the noisy labels themselves for the 'og' variants)
    params = util.get_selected_params(dataset=args.dataset,
                                      model=args.model,
                                      surrogate=args.method)
    train_label = y_train_noisy if 'og' in args.method else model_noisy.predict(
        X_train)

    surrogate = trex.train_surrogate(model=model_noisy,
                                     surrogate='klr',
                                     X_train=X_train,
                                     y_train=train_label,
                                     val_frac=args.tune_frac,
                                     metric=args.metric,
                                     seed=args.rs,
                                     params=params,
                                     logger=None)

    # sort by instance log loss using the surrogate model
    if 'loss' in args.method:
        y_train_proba = surrogate.predict_proba(X_train)[:, 1]
        y_train_noisy_loss = util.instance_log_loss(
            y_train_noisy, y_train_proba)  # negative log-likelihood
        train_indices = np.argsort(
            y_train_noisy_loss)[::-1]  # descending, largest log loss first

    # sort by sim. or alpha
    else:
        # BUGFIX: logger defaults to None; guard every use like the
        # sibling methods do instead of crashing with AttributeError.
        if logger:
            logger.info('\nsorting train instances...')

        # sort by similarity
        if 'sim' in args.method:
            start = time.time()

            # process the similarity matrix in ~10% chunks to bound memory;
            # BUGFIX: max(1, ...) avoids a zero range-step (ValueError)
            # when X_train has fewer than 10 rows.
            n_chunk = max(1, int(X_train.shape[0] * 0.1))

            # prioritize largest absolute similarity density
            if 'abs' in args.method:
                similarity_density = np.zeros(0, )
                for i in range(0, X_train.shape[0], n_chunk):
                    X_sub_sim = surrogate.similarity(X_train[i:i + n_chunk])
                    similarity_density_sub = np.sum(X_sub_sim, axis=1)
                    similarity_density = np.concatenate(
                        [similarity_density, similarity_density_sub])

                    if logger:
                        elapsed = time.time() - start
                        logger.info('{:.1f}%...{:.3f}s'.format(
                            i / X_train.shape[0] * 100, elapsed))

                similarity_density = np.abs(similarity_density)
                train_indices = np.argsort(similarity_density)[::-1]

            # sort train instances prioritizing largest negative similarity
            # density: similarity to same-label rows counts positively,
            # similarity to differently-labeled rows counts negatively
            else:
                similarity_density = np.zeros(0, )
                for i in range(0, X_train.shape[0], n_chunk):
                    X_sub_sim = surrogate.similarity(X_train[i:i + n_chunk])

                    # flip the sign of similarities to opposite-label rows
                    y_sub_mask = np.ones(X_sub_sim.shape)
                    for j in range(y_sub_mask.shape[0]):
                        y_sub_mask[j][np.where(
                            y_train_noisy[j + i] != y_train_noisy)] = -1
                    X_sub_sim = X_sub_sim * y_sub_mask

                    similarity_density_sub = np.sum(X_sub_sim, axis=1)
                    similarity_density = np.concatenate(
                        [similarity_density, similarity_density_sub])

                    if logger:
                        elapsed = time.time() - start
                        logger.info('{:.1f}%...{:.3f}s'.format(
                            i / X_train.shape[0] * 100, elapsed))

                # ascending: most negative density first
                train_indices = np.argsort(similarity_density)

            # plot |alpha| vs. similarity density
            if out_dir is not None:

                alpha = surrogate.get_alpha()
                alpha = np.abs(alpha)

                non_noisy_indices = np.setdiff1d(np.arange(y_train.shape[0]),
                                                 noisy_indices)

                fig, ax = plt.subplots()
                ax.scatter(alpha[non_noisy_indices],
                           similarity_density[non_noisy_indices],
                           alpha=0.1,
                           label='non-noisy',
                           color='green')
                ax.scatter(alpha[noisy_indices],
                           similarity_density[noisy_indices],
                           alpha=0.1,
                           label='noisy',
                           color='red')
                ax.set_xlabel('alpha')
                ax.set_ylabel('similarity_density')
                ax.legend()
                plt.savefig(os.path.join(out_dir, 'alpha_sim.png'))
                plt.close(fig)  # BUGFIX: release the figure (avoid leak)

        # sort by largest absolute surrogate coefficient
        elif 'alpha' in args.method:
            alpha = surrogate.get_alpha()
            magnitude = np.abs(alpha)
            train_indices = np.argsort(magnitude)[::-1]

        else:
            raise ValueError('unknown method {}'.format(args.method))

    # fix noisy instances
    result = fix_noisy_instances(train_indices,
                                 noisy_indices,
                                 n_check,
                                 n_checkpoint,
                                 clf,
                                 X_train,
                                 y_train,
                                 X_test,
                                 y_test,
                                 acc_noisy,
                                 auc_noisy,
                                 logger=logger)
    return result