Ejemplo n.º 1
0
def test_experiment_mcc():
    """Run ReliefE on the chess MCC dataset over a growing iteration schedule.

    The schedule is 1, 2, 4, ... (powers of two up to the sample count),
    followed by the full sample count itself.
    """
    mat_obj = sio.loadmat("../data/mcc/chess.mat")
    x = mat_obj['input_space']
    y = mat_obj['target_space']  ## this is not one hot for scc
    y = y.reshape(-1)

    # Build the iteration schedule. Stop as soon as 2**u exceeds n — the
    # previous version looped over all n rows, computing astronomically
    # large powers — and avoid appending n twice when n is itself a power
    # of two.
    n_samples = x.shape[0]
    wrange = []
    u = 0
    while 2 ** u <= n_samples:
        wrange.append(2 ** u)
        u += 1
    if not wrange or wrange[-1] != n_samples:
        wrange.append(n_samples)

    relief_b3_instance = reliefe.ReliefE(embedding_based_distances=True,
                                         verbose=True,
                                         use_average_neighbour=True,
                                         determine_k_automatically=True,
                                         num_iter=wrange)

    relief_b3_instance.fit(x, y)
    # Touch the rankings to ensure they were computed during fit.
    relief_b3_instance.feature_importances_
    all_weights = relief_b3_instance.all_weights
    print(all_weights)
    # Timing diagnostics collected during fit.
    for k, v in relief_b3_instance.timed.items():
        print(k + "\t" + str(v))
Ejemplo n.º 2
0
def test_experiment_arff_mlc():
    """Exercise ReliefF and ReliefE on an ARFF dataset, both with all target
    columns (multi-label) and with a single target column.
    """
    ova = "../data/test.arff"
    features = [0, 1, 2, 3, 4]
    targets = [5, 6]
    neighbours = 2

    # Multi-label case: both target columns.
    # NOTE: the original called a bare `load_arff` here but the fully
    # qualified `reliefe.utils.load_arff` below; use the qualified form
    # consistently.
    x_train, y_train, _ = reliefe.utils.load_arff(ova, features, targets)
    _fit_relieff_and_reliefe(x_train, y_train, neighbours)

    # Single-label case: only the first target column.
    x_train, y_train, _ = reliefe.utils.load_arff(ova, features, targets[0])
    _fit_relieff_and_reliefe(x_train, y_train, neighbours)


def _fit_relieff_and_reliefe(x_train, y_train, neighbours):
    """Fit plain ReliefF and embedding-based ReliefE; print importances."""
    print("ReliefF ..")
    relief_b3_instance = reliefe.ReliefE(embedding_based_distances=False,
                                         verbose=True,
                                         k=neighbours)
    relief_b3_instance.fit(x_train, y_train)
    importances2 = relief_b3_instance.feature_importances_
    print(importances2)

    print("ReliefE ..")
    relief_b3_instance = reliefe.ReliefE(embedding_based_distances=True,
                                         verbose=True,
                                         k=neighbours,
                                         latent_dimension=2)
    relief_b3_instance.fit(x_train, y_train)
    importances2 = relief_b3_instance.feature_importances_
    print(importances2)
Ejemplo n.º 3
0
def test_simple_benchmark():
    """Compare ReliefE rankings against random-forest importances and mutual
    information on the breast-cancer dataset via forward feature-addition
    accuracy curves (logistic regression, 10-fold CV), then plot them.
    """
    sns.set_style("whitegrid")
    data_obj = load_breast_cancer()
    x = data_obj['data']
    y = data_obj['target']

    # let's overfit, just for demo purposes

    reliefE_instance = reliefe.ReliefE(embedding_based_distances=True,
                                       verbose=True,
                                       use_average_neighbour=False,
                                       determine_k_automatically=False,
                                       mlc_distance="hamming")
    reliefE_instance.fit(x, y)
    reliefe_importances = reliefE_instance.feature_importances_

    rf_model = RandomForestClassifier()
    rf_model.fit(x, y)
    mutual_information = mutual_info_classif(x, y)

    # Feature orderings, ascending by importance (np.argsort order).
    method_names = ["RF", "MI", "ReliefE"]
    orderings = [np.argsort(rf_model.feature_importances_),
                 np.argsort(mutual_information),
                 np.argsort(reliefe_importances)]

    output_struct = {}
    for method, ordering in zip(method_names, orderings):
        scores = []
        ordering = ordering.tolist()
        print("Computing evaluations for: {}".format(method))
        for j in tqdm.tqdm(range(len(ordering))):
            selected_features = ordering[0:j + 1]
            subset = x[:, selected_features]
            clf = LogisticRegression(max_iter=10000000, solver="lbfgs")
            score = np.mean(cross_val_score(clf, subset, y, cv=10))
            scores.append((score, j + 1))
        output_struct[method] = scores

    print("Plotting ..")
    # NOTE: the original unpacked each (score, count) pair into `x, y`,
    # clobbering the dataset variables; use distinct names instead.
    for method, score_pairs in output_struct.items():
        counts = []
        accuracies = []
        for accuracy, count in score_pairs:
            counts.append(count)
            accuracies.append(accuracy)
        plt.plot(counts, accuracies, label=method)
    plt.xlabel("Top features")
    plt.ylabel("Performance  (Accuracy)")
    plt.legend()
    plt.show()
Ejemplo n.º 4
0
def test_experiment_mlc_autok(dataset):
    """Multi-label ReliefE run with automatic k determination on `dataset`."""
    data = sio.loadmat(dataset)
    x = data['input_space']
    y = data['target_space']  ## this is not one hot for scc
    # Must genuinely be multi-label (more than one target column).
    assert y.shape[1] > 1

    model = reliefe.ReliefE(embedding_based_distances=False,
                            verbose=True,
                            use_average_neighbour=False,
                            determine_k_automatically=True,
                            num_iter=50)
    model.fit(x, y)
    assert len(model.feature_importances_) > 0
Ejemplo n.º 5
0
def test_custom_embedding():
    """Fit ReliefE with a user-supplied sklearn-style embedding (PCA)."""
    data = sio.loadmat("../data/mcc/chess.mat")
    x = data['input_space']
    y = data['target_space'].reshape(-1)  ## this is not one hot for scc
    model = reliefe.ReliefE(embedding_based_distances=True,
                            verbose=True,
                            use_average_neighbour=False,
                            determine_k_automatically=False,
                            num_iter=50)

    # Any sklearn-like transformer works as the embedding method.
    model.fit(x, y, embedding_method=PCA())
    assert len(model.feature_importances_) > 0
Ejemplo n.º 6
0
import reliefe
import scipy.io as sio

# Load the multi-label Science1 dataset.
mat_obj = sio.loadmat("../data/mlc/Science1.mat")
x = mat_obj['input_space']  ## scipy csr sparse matrix (or numpy dense)
y = mat_obj['target_space']  ## scipy csr sparse matrix (or numpy dense)

# Embedding-based ReliefE with automatically determined k.
reliefE_instance = reliefe.ReliefE(determine_k_automatically=True,
                                   embedding_based_distances=True,
                                   num_iter=128,
                                   verbose=True)
reliefE_instance.fit(x, y)
# Rankings for features (same order as the columns of x).
print(reliefE_instance.feature_importances_)

# Same run again, without embedding-based distances.
reliefE_instance = reliefe.ReliefE(determine_k_automatically=True,
                                   num_iter=128,
                                   verbose=True)
reliefE_instance.fit(x, y)
print(reliefE_instance.feature_importances_)
Ejemplo n.º 7
0
import reliefe
import scipy.io as sio
from sklearn.decomposition import TruncatedSVD

# Chess is a multi-class dataset; targets are not one-hot encoded.
mat_obj = sio.loadmat("../data/mcc/chess.mat")
x = mat_obj['input_space']
y = mat_obj['target_space'].reshape(-1)
print(x.shape, y.shape)

reliefe_instance = reliefe.ReliefE(embedding_based_distances=True,
                                   verbose=True)

# Any sklearn-like transform object can serve as the embedding — here, SVD.
emb_custom = TruncatedSVD()

# Pass it to fit() via the "embedding_method" parameter.
reliefe_instance.fit(x, y, embedding_method=emb_custom)
assert len(reliefe_instance.feature_importances_) > 0
print(reliefe_instance.feature_importances_)
Ejemplo n.º 8
0
def test_importances(importances, method="ReliefE"):
    sorted_importances = np.argsort(importances)[::-1]

    ## If the first two features are amongst the top 3, this is a relatively OK sign.
    if 0 in sorted_importances[0:3] and 1 in sorted_importances[0:3]:
        print("Successfully retrieved top features.")
        return 1, method
    else:
        return 0, method


df = pd.read_csv("synthetic.txt", sep=",")
print(df)

## Features a1 and a2 are the crucial ones in this synthetic data.
feature_columns = [f"a{i}" for i in range(1, 9)]
x = df[feature_columns].values
y = df.Class.astype(int).values

## In low-dimensional settings ReliefE falls back to its fast core version;
## reducing the dimension makes no sense there, and the core version is fast
## for this type of problem.
reliefE_instance = reliefe.ReliefE(
    embedding_based_distances=True,
    num_iter=0.05,  ## Few-shot?
    verbose=True,
    use_average_neighbour=True,
    determine_k_automatically=True)

reliefE_instance.fit(x, y)
importances = reliefE_instance.feature_importances_
test_importances(importances, method="ReliefE")
Ejemplo n.º 9
0
import scipy.io as sio
import reliefe

# Fully fledged MLC - ReliefE (with all functionality) on the medical dataset.
mat_obj = sio.loadmat("../data/mlc/medical.mat")
x = mat_obj['input_space']
y = mat_obj['target_space']  ## this is not one hot for scc
# (an unused `wrange = []` declaration was removed here)

reliefE_instance = reliefe.ReliefE(embedding_based_distances=True,
                                   verbose=True,
                                   use_average_neighbour=False,
                                   determine_k_automatically=False,
                                   mlc_distance="hamming")
reliefE_instance.fit(x, y)
print(reliefE_instance.feature_importances_)