import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import reliefe
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


def test_experiment_mcc():
    mat_obj = sio.loadmat("../data/mcc/chess.mat")
    x = mat_obj['input_space']
    y = mat_obj['target_space']  ## this is not one-hot for scc
    y = y.reshape(-1)

    # Doubling schedule of iteration counts, capped at the number of instances
    wrange = []
    for u in range(x.shape[0]):
        if 2 ** u <= x.shape[0]:
            wrange.append(int(2 ** u))
    wrange.append(int(x.shape[0]))

    relief_b3_instance = reliefe.ReliefE(embedding_based_distances=True,
                                         verbose=True,
                                         use_average_neighbour=True,
                                         determine_k_automatically=True,
                                         num_iter=wrange)
    relief_b3_instance.fit(x, y)
    print(relief_b3_instance.feature_importances_)
    all_weights = relief_b3_instance.all_weights
    print(all_weights)
    for k, v in relief_b3_instance.timed.items():
        print(k + "\t" + str(v))
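# A minimal sketch (not part of the test above): the doubling num_iter schedule from
# test_experiment_mcc, factored into a standalone helper. The name `doubling_schedule`
# is hypothetical and not part of the reliefe API.
def doubling_schedule(n_instances):
    schedule = [2 ** u for u in range(n_instances) if 2 ** u <= n_instances]
    schedule.append(n_instances)
    return schedule

# e.g. doubling_schedule(100) -> [1, 2, 4, 8, 16, 32, 64, 100]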
def test_experiment_arff_mlc():
    ova = "../data/test.arff"
    features = [0, 1, 2, 3, 4]
    targets = [5, 6]

    # Multi-label setting: both target columns are used
    x_train, y_train, _ = reliefe.utils.load_arff(ova, features, targets)
    neighbours = 2

    print("ReliefF ..")
    relief_b3_instance = reliefe.ReliefE(embedding_based_distances=False,
                                         verbose=True,
                                         k=neighbours)
    relief_b3_instance.fit(x_train, y_train)
    importances2 = relief_b3_instance.feature_importances_
    print(importances2)

    print("ReliefE ..")
    relief_b3_instance = reliefe.ReliefE(embedding_based_distances=True,
                                         verbose=True,
                                         k=neighbours,
                                         latent_dimension=2)
    relief_b3_instance.fit(x_train, y_train)
    importances2 = relief_b3_instance.feature_importances_
    print(importances2)

    # Single-target setting: only the first target column is used
    x_train, y_train, _ = reliefe.utils.load_arff(ova, features, targets[0])
    neighbours = 2

    print("ReliefF ..")
    relief_b3_instance = reliefe.ReliefE(embedding_based_distances=False,
                                         verbose=True,
                                         k=neighbours)
    relief_b3_instance.fit(x_train, y_train)
    importances2 = relief_b3_instance.feature_importances_
    print(importances2)

    print("ReliefE ..")
    relief_b3_instance = reliefe.ReliefE(embedding_based_distances=True,
                                         verbose=True,
                                         k=neighbours,
                                         latent_dimension=2)
    relief_b3_instance.fit(x_train, y_train)
    importances2 = relief_b3_instance.feature_importances_
    print(importances2)
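# A small follow-up sketch (illustrative only, not part of the test above): one way to
# quantify how similar two importance vectors are is the overlap of their top-k features.
# `top_k_overlap` is a hypothetical helper, not part of reliefe.
def top_k_overlap(importances_a, importances_b, k=3):
    top_a = set(np.argsort(importances_a)[::-1][:k])
    top_b = set(np.argsort(importances_b)[::-1][:k])
    return len(top_a & top_b) / k

# Toy usage with made-up importance vectors
print(top_k_overlap(np.array([0.4, 0.1, 0.3, 0.2, 0.0]),
                    np.array([0.5, 0.2, 0.1, 0.3, 0.0])))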
def test_simple_benchmark():
    sns.set_style("whitegrid")
    data_obj = load_breast_cancer()
    x = data_obj['data']
    y = data_obj['target']

    # Let's overfit, just for demo purposes
    reliefE_instance = reliefe.ReliefE(embedding_based_distances=True,
                                       verbose=True,
                                       use_average_neighbour=False,
                                       determine_k_automatically=False,
                                       mlc_distance="hamming")
    reliefE_instance.fit(x, y)
    reliefe_importances = reliefE_instance.feature_importances_

    rf_model = RandomForestClassifier()
    rf_model.fit(x, y)
    rf = rf_model.feature_importances_
    mutual_information = mutual_info_classif(x, y)

    sorted_rf = np.argsort(rf)
    sorted_mi = np.argsort(mutual_information)
    sorted_re = np.argsort(reliefe_importances)

    names = ["RF", "MI", "ReliefE"]
    indices = [sorted_rf, sorted_mi, sorted_re]
    output_struct = {}
    for name, indice_set in zip(names, indices):
        scores = []
        indice_set = indice_set.tolist()
        print("Computing evaluations for: {}".format(name))
        for j in tqdm.tqdm(range(len(indice_set))):
            selected_features = indice_set[0:j + 1]
            subset = x[:, selected_features]
            clf = LogisticRegression(max_iter=10000000, solver="lbfgs")
            score = np.mean(cross_val_score(clf, subset, y, cv=10))
            scores.append((score, j + 1))
        output_struct[name] = scores

    print("Plotting ..")
    for method_name, score_pairs in output_struct.items():
        top_k_values = []
        accuracies = []
        for score, top_k in score_pairs:
            top_k_values.append(top_k)
            accuracies.append(score)
        plt.plot(top_k_values, accuracies, label=method_name)
    plt.xlabel("Top features")
    plt.ylabel("Performance (Accuracy)")
    plt.legend()
    plt.show()
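# A possible extension of the benchmark above (a sketch; the importance vectors below
# are made up for illustration): instead of measuring downstream accuracy, the rankings
# produced by two methods can be compared directly with Spearman rank correlation.
from scipy.stats import spearmanr

rf_importances_demo = np.array([0.10, 0.40, 0.20, 0.30])       # illustrative values
reliefe_importances_demo = np.array([0.15, 0.50, 0.10, 0.25])  # illustrative values
rho, p_value = spearmanr(rf_importances_demo, reliefe_importances_demo)
print("Spearman rho between the two rankings: {:.3f}".format(rho))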
def test_experiment_mlc_autok(dataset):
    mat_obj = sio.loadmat(dataset)
    x = mat_obj['input_space']
    y = mat_obj['target_space']  ## multi-label target matrix
    assert y.shape[1] > 1

    reliefe_instance = reliefe.ReliefE(embedding_based_distances=False,
                                       verbose=True,
                                       use_average_neighbour=False,
                                       determine_k_automatically=True,
                                       num_iter=50)
    reliefe_instance.fit(x, y)
    assert len(reliefe_instance.feature_importances_) > 0
def test_custom_embedding():
    mat_obj = sio.loadmat("../data/mcc/chess.mat")
    x = mat_obj['input_space']
    y = mat_obj['target_space']  ## this is not one-hot for scc
    y = y.reshape(-1)

    reliefe_instance = reliefe.ReliefE(embedding_based_distances=True,
                                       verbose=True,
                                       use_average_neighbour=False,
                                       determine_k_automatically=False,
                                       num_iter=50)
    emb_custom = PCA()
    reliefe_instance.fit(x, y, embedding_method=emb_custom)
    assert len(reliefe_instance.feature_importances_) > 0
import reliefe
import scipy.io as sio

# Load the data first
mat_obj = sio.loadmat("../data/mlc/Science1.mat")
x = mat_obj['input_space']   ## scipy CSR sparse matrix (or numpy dense)
y = mat_obj['target_space']  ## scipy CSR sparse matrix (or numpy dense)

# Initialize ReliefE with embedding-based distances
reliefE_instance = reliefe.ReliefE(determine_k_automatically=True,
                                   embedding_based_distances=True,
                                   num_iter=128,
                                   verbose=True)
reliefE_instance.fit(x, y)  # Compute rankings
print(reliefE_instance.feature_importances_)  # rankings for features (same order as x)

# Initialize default ReliefE (without embedding-based distances)
reliefE_instance = reliefe.ReliefE(determine_k_automatically=True,
                                   num_iter=128,
                                   verbose=True)
reliefE_instance.fit(x, y)  # Compute rankings
print(reliefE_instance.feature_importances_)  # rankings for features (same order as x)
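# A short follow-up sketch (assumes the script above has been run): turn the importance
# vector into a ranked list of the top-k feature indices with plain numpy.
import numpy as np

top_k = 10
ranked = np.argsort(reliefE_instance.feature_importances_)[::-1]  # best features first
print("Top {} feature indices: {}".format(top_k, ranked[:top_k].tolist()))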
import reliefe
import scipy.io as sio
from sklearn.decomposition import TruncatedSVD

mat_obj = sio.loadmat("../data/mcc/chess.mat")
x = mat_obj['input_space']
y = mat_obj['target_space']  ## this is not one-hot for scc
y = y.reshape(-1)
print(x.shape, y.shape)

reliefe_instance = reliefe.ReliefE(embedding_based_distances=True, verbose=True)

# Simply provide a sklearn-like transform object
emb_custom = TruncatedSVD()  # Let's do SVD

# Provide it as the "embedding_method" parameter
reliefe_instance.fit(x, y, embedding_method=emb_custom)
assert len(reliefe_instance.feature_importances_) > 0
print(reliefe_instance.feature_importances_)
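# The same pattern presumably extends to other sklearn-style reducers with an explicit
# latent dimension (a sketch; the exact interface reliefe expects from the embedding
# object is not spelled out here, and n_components=8 assumes the data has more than
# eight features).
emb_custom_8d = TruncatedSVD(n_components=8)
reliefe_instance_8d = reliefe.ReliefE(embedding_based_distances=True, verbose=True)
reliefe_instance_8d.fit(x, y, embedding_method=emb_custom_8d)
print(reliefe_instance_8d.feature_importances_)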
import numpy as np
import pandas as pd
import reliefe


def test_importances(importances, method="ReliefE"):
    sorted_importances = np.argsort(importances)[::-1]
    ## If the first two features are amongst the top 3, this is a relatively good sign.
    if 0 in sorted_importances[0:3] and 1 in sorted_importances[0:3]:
        print("Successfully retrieved top features.")
        return 1, method
    else:
        return 0, method


df = pd.read_csv("synthetic.txt", sep=",")
print(df)

## Features a1 and a2 are crucial for performance here.
x = df[[f"a{i}" for i in range(1, 9)]].values
y = df.Class.astype(int).values

## In low-dimensional settings ReliefE falls back to the core (non-embedded) version:
## reducing the dimension further makes little sense there, and the core version is
## fast for this type of problem.
reliefE_instance = reliefe.ReliefE(
    embedding_based_distances=True,
    num_iter=0.05,  ## Few-shot?
    verbose=True,
    use_average_neighbour=True,
    determine_k_automatically=True)
reliefE_instance.fit(x, y)
importances = reliefE_instance.feature_importances_
test_importances(importances, method="ReliefE")
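# If "synthetic.txt" is not at hand, a comparable toy data set can be generated directly
# (a sketch under the assumption, stated above, that only the first two features should
# carry the class signal).
rng = np.random.RandomState(42)
n = 500
x_signal = rng.rand(n, 2)                         # a1, a2: informative features
x_noise = rng.rand(n, 6)                          # a3 .. a8: pure noise
x_syn = np.hstack([x_signal, x_noise])
y_syn = (x_signal.sum(axis=1) > 1.0).astype(int)  # class depends only on a1 + a2
print(x_syn.shape, y_syn.shape)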
import scipy.io as sio
import reliefe

mat_obj = sio.loadmat("../data/mlc/medical.mat")
x = mat_obj['input_space']
y = mat_obj['target_space']  ## multi-label target matrix

# Fully fledged MLC ReliefE (with all functionality)
reliefE_instance = reliefe.ReliefE(embedding_based_distances=True,
                                   verbose=True,
                                   use_average_neighbour=False,
                                   determine_k_automatically=False,
                                   mlc_distance="hamming")
reliefE_instance.fit(x, y)
print(reliefE_instance.feature_importances_)
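# A short follow-up sketch (illustrative, assumes the script above has been run and that
# the data has at least 50 features): keep only the top-ranked columns for downstream
# models. Integer-array column indexing works for dense arrays and scipy CSR matrices alike.
import numpy as np

top_k = 50
ranked = np.argsort(reliefE_instance.feature_importances_)[::-1]
x_reduced = x[:, ranked[:top_k]]
print(x_reduced.shape)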