import os
import time

import numpy as np
import matplotlib.pyplot as plt

import trex
import util


def tree_loss_method(args, model_noisy, y_train_noisy, noisy_indices,
                     n_check, n_checkpoint, clf, X_train, y_train, X_test, y_test,
                     acc_noisy, auc_noisy, logger=None):
    """
    Orders training instances by largest loss.
    """
    y_train_proba = model_noisy.predict_proba(X_train)[:, 1]
    y_train_noisy_loss = util.instance_log_loss(y_train_noisy, y_train_proba)  # negative log-likelihood
    train_indices = np.argsort(y_train_noisy_loss)[::-1]  # descending, largest log loss first

    result = fix_noisy_instances(train_indices, noisy_indices, n_check, n_checkpoint,
                                 clf, X_train, y_train, X_test, y_test,
                                 acc_noisy, auc_noisy, logger=logger)

    return result
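
# For reference, a minimal sketch of the per-instance loss that `util.instance_log_loss`
# is assumed to compute above (the project helper itself is not shown in this section;
# its real implementation may clip probabilities or handle edge cases differently).
def _instance_log_loss_sketch(y_true, y_proba, eps=1e-15):
    """Per-instance binary negative log-likelihood (illustrative sketch only)."""
    y_proba = np.clip(y_proba, eps, 1.0 - eps)
    return -(y_true * np.log(y_proba) + (1.0 - y_true) * np.log(1.0 - y_proba))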
def teknn_method(args, model_noisy, y_train_noisy, noisy_indices,
                 n_check, n_checkpoint, clf, X_train, y_train, X_test, y_test,
                 acc_noisy, auc_noisy, logger=None, frac_progress_update=0.1):
    """
    Counts impact by the number of times a training sample shows up in
    other samples' neighborhoods; appearances can be weighted by 1 / distance.
    """

    # train surrogate model
    params = {'C': args.C, 'n_neighbors': args.n_neighbors, 'tree_kernel': args.tree_kernel}
    train_label = y_train_noisy if 'og' in args.method else model_noisy.predict(X_train)
    surrogate = trex.train_surrogate(model=model_noisy,
                                     surrogate=args.method.split('_')[0],
                                     X_train=X_train,
                                     y_train=train_label,
                                     val_frac=args.tune_frac,
                                     metric=args.metric,
                                     seed=args.rs,
                                     params=params,
                                     logger=logger)

    # sort by instance log loss using the surrogate model
    if 'loss' in args.method:

        # display progress
        if logger:
            logger.info('\ncomputing KNN loss...')

        y_train_proba = surrogate.predict_proba(X_train)[:, 1]
        y_train_noisy_loss = util.instance_log_loss(y_train_noisy, y_train_proba)  # negative log-likelihood
        train_indices = np.argsort(y_train_noisy_loss)[::-1]  # descending, largest log loss first

    # sort by summed attributions
    else:

        # sort instances based on largest influence toward the predicted training labels
        attributions = surrogate.compute_attributions(X_train, logger=logger)
        attributions_sum = np.sum(attributions, axis=0)
        train_indices = np.argsort(attributions_sum)[::-1]

    # fix noisy instances
    result = fix_noisy_instances(train_indices, noisy_indices, n_check, n_checkpoint,
                                 clf, X_train, y_train, X_test, y_test,
                                 acc_noisy, auc_noisy, logger=logger)

    return result
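
# Illustrative sketch of the neighborhood-count idea in the docstring above: count how
# often each training sample appears in other samples' k-nearest-neighbor sets,
# optionally weighting each appearance by 1 / distance. This is a standalone sketch
# using scikit-learn's NearestNeighbors, not the surrogate's actual API; `k` and the
# weighting scheme are assumptions.
def _knn_influence_sketch(X_train, k=5, weighted=False):
    from sklearn.neighbors import NearestNeighbors

    nn = NearestNeighbors(n_neighbors=k + 1).fit(X_train)  # k + 1 to account for self
    dist, ind = nn.kneighbors(X_train)

    counts = np.zeros(X_train.shape[0])
    for row_dist, row_ind in zip(dist[:, 1:], ind[:, 1:]):  # drop self-neighbor (column 0)
        if weighted:
            np.add.at(counts, row_ind, 1.0 / np.maximum(row_dist, 1e-12))
        else:
            np.add.at(counts, row_ind, 1.0)

    return counts  # higher count => sample appears in more neighborhoods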
def trex_method(args, model_noisy, y_train_noisy, noisy_indices,
                n_check, n_checkpoint, clf, X_train, y_train, X_test, y_test,
                acc_noisy, auc_noisy, logger=None):
    """
    Orders instances by the largest absolute values of the instance
    coefficients from the KLR or SVM surrogate model.
    """

    # train surrogate model
    params = {'C': args.C, 'n_neighbors': args.n_neighbors, 'tree_kernel': args.tree_kernel}
    train_label = y_train_noisy if 'og' in args.method else model_noisy.predict(X_train)
    surrogate = trex.train_surrogate(model=model_noisy,
                                     surrogate=args.method.split('_')[0],
                                     X_train=X_train,
                                     y_train=train_label,
                                     val_frac=args.tune_frac,
                                     metric=args.metric,
                                     seed=args.rs,
                                     params=params,
                                     logger=logger)

    # sort by instance log loss using the surrogate model
    if 'loss' in args.method:
        y_train_proba = surrogate.predict_proba(X_train)[:, 1]
        y_train_noisy_loss = util.instance_log_loss(y_train_noisy, y_train_proba)  # negative log-likelihood
        train_indices = np.argsort(y_train_noisy_loss)[::-1]  # descending, largest log loss first

    # sort by absolute value of instance weights
    else:
        train_indices = np.argsort(np.abs(surrogate.get_alpha()))[::-1]

    # fix noisy instances
    result = fix_noisy_instances(train_indices, noisy_indices, n_check, n_checkpoint,
                                 clf, X_train, y_train, X_test, y_test,
                                 acc_noisy, auc_noisy, logger=logger)

    return result
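
# Hedged sketch of the |alpha| ordering using scikit-learn's SVC as a stand-in kernel
# surrogate (an assumption: the real `surrogate.get_alpha()` comes from the trex package;
# SVC assigns dual coefficients only to support vectors, so non-support instances get
# alpha = 0 here).
def _alpha_ordering_sketch(X_train, y_train):
    from sklearn.svm import SVC

    svm = SVC(kernel='rbf').fit(X_train, y_train)
    alpha = np.zeros(X_train.shape[0])
    alpha[svm.support_] = svm.dual_coef_[0]  # signed dual coefficients of support vectors
    return np.argsort(np.abs(alpha))[::-1]  # largest |alpha| first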
def trex_method(args, model_noisy, y_train_noisy, noisy_indices,
                n_check, n_checkpoint, clf, X_train, y_train, X_test, y_test,
                acc_noisy, auc_noisy, logger=None, out_dir=None):
    """
    Orders instances by the largest absolute instance coefficients (alpha)
    from the KLR surrogate model, by surrogate loss, or by similarity
    density, depending on args.method.
    """

    # train surrogate model
    params = util.get_selected_params(dataset=args.dataset, model=args.model, surrogate=args.method)
    train_label = y_train_noisy if 'og' in args.method else model_noisy.predict(X_train)
    surrogate = trex.train_surrogate(model=model_noisy,
                                     surrogate='klr',
                                     X_train=X_train,
                                     y_train=train_label,
                                     val_frac=args.tune_frac,
                                     metric=args.metric,
                                     seed=args.rs,
                                     params=params,
                                     logger=None)

    # sort by instance log loss using the surrogate model
    if 'loss' in args.method:
        y_train_proba = surrogate.predict_proba(X_train)[:, 1]
        y_train_noisy_loss = util.instance_log_loss(y_train_noisy, y_train_proba)  # negative log-likelihood
        train_indices = np.argsort(y_train_noisy_loss)[::-1]  # descending, largest log loss first

    # sort by similarity density or alpha
    else:
        if logger:
            logger.info('\nsorting train instances...')

        # sort by similarity
        if 'sim' in args.method:
            start = time.time()

            similarity_density = np.zeros(0)
            n_chunk = max(1, int(X_train.shape[0] * 0.1))  # guard against a zero-length chunk

            # prioritize largest absolute similarity density
            if 'abs' in args.method:

                for i in range(0, X_train.shape[0], n_chunk):
                    X_sub_sim = surrogate.similarity(X_train[i:i + n_chunk])
                    similarity_density_sub = np.sum(X_sub_sim, axis=1)
                    similarity_density = np.concatenate([similarity_density, similarity_density_sub])

                    if logger:
                        elapsed = time.time() - start
                        logger.info('{:.1f}%...{:.3f}s'.format(i / X_train.shape[0] * 100, elapsed))

                similarity_density = np.abs(similarity_density)
                train_indices = np.argsort(similarity_density)[::-1]

            # prioritize largest negative similarity density
            else:

                for i in range(0, X_train.shape[0], n_chunk):
                    X_sub_sim = surrogate.similarity(X_train[i:i + n_chunk])

                    # flip the sign of similarities to differently labeled instances
                    y_sub_mask = np.ones(X_sub_sim.shape)
                    for j in range(y_sub_mask.shape[0]):
                        y_sub_mask[j][np.where(y_train_noisy[j + i] != y_train_noisy)] = -1

                    X_sub_sim = X_sub_sim * y_sub_mask
                    similarity_density_sub = np.sum(X_sub_sim, axis=1)
                    similarity_density = np.concatenate([similarity_density, similarity_density_sub])

                    if logger:
                        elapsed = time.time() - start
                        logger.info('{:.1f}%...{:.3f}s'.format(i / X_train.shape[0] * 100, elapsed))

                train_indices = np.argsort(similarity_density)  # ascending, most negative first

            # plot |alpha| vs. similarity density
            if out_dir is not None:
                alpha = np.abs(surrogate.get_alpha())
                non_noisy_indices = np.setdiff1d(np.arange(y_train.shape[0]), noisy_indices)

                fig, ax = plt.subplots()
                ax.scatter(alpha[non_noisy_indices], similarity_density[non_noisy_indices],
                           alpha=0.1, label='non-noisy', color='green')
                ax.scatter(alpha[noisy_indices], similarity_density[noisy_indices],
                           alpha=0.1, label='noisy', color='red')
                ax.set_xlabel('alpha')
                ax.set_ylabel('similarity_density')
                ax.legend()
                plt.savefig(os.path.join(out_dir, 'alpha_sim.png'))

        # sort by largest absolute alpha
        elif 'alpha' in args.method:
            alpha = surrogate.get_alpha()
            magnitude = np.abs(alpha)
            train_indices = np.argsort(magnitude)[::-1]

        else:
            raise ValueError('unknown method {}'.format(args.method))

    # fix noisy instances
    result = fix_noisy_instances(train_indices, noisy_indices, n_check, n_checkpoint,
                                 clf, X_train, y_train, X_test, y_test,
                                 acc_noisy, auc_noisy, logger=logger)

    return result
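
# Self-contained sketch of the signed similarity-density ranking above, with an RBF
# kernel standing in for `surrogate.similarity` (an assumption: trex uses a tree-kernel
# similarity). Same-label similarities count positively and different-label similarities
# negatively, so instances surrounded by differently labeled neighbors get the most
# negative density and are checked first.
def _signed_similarity_density_sketch(X_train, y_train_noisy, chunk_frac=0.1):
    from sklearn.metrics.pairwise import rbf_kernel

    n = X_train.shape[0]
    n_chunk = max(1, int(n * chunk_frac))

    density = np.zeros(0)
    for i in range(0, n, n_chunk):
        sim = rbf_kernel(X_train[i:i + n_chunk], X_train)  # (chunk, n) similarity block
        sign = np.where(y_train_noisy[i:i + n_chunk, None] == y_train_noisy[None, :], 1.0, -1.0)
        density = np.concatenate([density, np.sum(sim * sign, axis=1)])

    return np.argsort(density)  # ascending: most negative density first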