Example #1
    def test_dataset_works(self):
        data_home = dt.get_data_home(data_home=None, subdirectory='test')
        for set_name, variant in random.sample(ALL_SET_TEST_CASES, NUMBER_OF_SETS_TO_DOWNLOAD_IN_TEST):
            X, y, feature_names, label_names = dt.load_dataset(set_name=set_name, variant=variant, data_home=data_home)
            self.assertEqual(len(X.shape), 2)
            self.assertEqual(len(y.shape), 2)
            self.assertEqual(len(feature_names), X.shape[1])
            self.assertEqual(len(label_names), y.shape[1])
            self.assertEqual(X.shape[0], y.shape[0])

        dt.clear_data_home(data_home)
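ALL_SET_TEST_CASES and NUMBER_OF_SETS_TO_DOWNLOAD_IN_TEST are defined outside the snippet above. A minimal sketch of one plausible way to build them, assuming skmultilearn's available_data_sets() helper (the constant names come from the test; the values are illustrative):

import random
from skmultilearn.dataset import available_data_sets

# available_data_sets() returns a dict keyed by (set_name, variant) tuples,
# which matches the pairs the test iterates over.
ALL_SET_TEST_CASES = list(available_data_sets().keys())
NUMBER_OF_SETS_TO_DOWNLOAD_IN_TEST = 3  # illustrative value, not from the snippet
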
Example #2
    def test_dataset_works(self):
        data_home = dt.get_data_home(data_home=None, subdirectory='test')
        for set_name, variant in random.sample(
                ALL_SET_TEST_CASES, NUMBER_OF_SETS_TO_DOWNLOAD_IN_TEST):
            X, y, feature_names, label_names = dt.load_dataset(
                set_name=set_name, variant=variant, data_home=data_home)
            self.assertEqual(len(X.shape), 2)
            self.assertEqual(len(y.shape), 2)
            self.assertEqual(len(feature_names), X.shape[1])
            self.assertEqual(len(label_names), y.shape[1])
            self.assertEqual(X.shape[0], y.shape[0])

        dt.clear_data_home(data_home)
Example #3
    def test_dataset_works(self):
        data_home = dt.get_data_home(data_home=None, subdirectory='test')
        for set_name, variant in random.sample(
                ALL_SET_TEST_CASES, NUMBER_OF_SETS_TO_DOWNLOAD_IN_TEST):
            data_dump = dt.load_dataset(set_name=set_name,
                                        variant=variant,
                                        data_home=data_home)
            self.assertIsInstance(data_dump, dict)
            self.assertIn('X', data_dump)
            self.assertIn('y', data_dump)
            self.assertEqual(len(data_dump['X'].shape), 2)
            self.assertEqual(len(data_dump['y'].shape), 2)

        dt.clear_data_home(data_home)
Example #4
        num_instance = emb_feature.shape[0]
        # difference between the feature and label embeddings
        c1 = emb_feature - emb_label
        # Gram matrices of the two embeddings, pushed towards the identity
        c2 = tf.linalg.matmul(emb_feature, emb_feature,
                              transpose_b=True) - tf.eye(num_instance)
        c3 = tf.linalg.matmul(emb_label, emb_label,
                              transpose_b=True) - tf.eye(num_instance)
        # squared Frobenius norms via trace(A^T A), weighted by the regularization factor
        loss = tf.linalg.trace(tf.matmul(
            c1, c1,
            transpose_a=True)) + self.regularization_factor * tf.linalg.trace(
                tf.matmul(c2, c2, transpose_a=True) +
                tf.matmul(c3, c3, transpose_a=True))
        return loss
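Read off directly from the code, with F_x the feature embeddings, F_e the label embeddings, alpha the regularization_factor and I the identity, the returned loss is

$$\mathcal{L} = \operatorname{tr}\big((F_x - F_e)^\top (F_x - F_e)\big) + \alpha\,\Big[\operatorname{tr}\big((F_x F_x^\top - I)^\top (F_x F_x^\top - I)\big) + \operatorname{tr}\big((F_e F_e^\top - I)^\top (F_e F_e^\top - I)\big)\Big]$$

i.e. a squared Frobenius distance tying the two embeddings together plus penalties pushing each Gram matrix towards the identity.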


#%%
X, y, feature_names, label_names = load_dataset('tmc2007_500', 'train')
#%%
c2ae = C2AE([512, 512], [512], [22])
cl = CCA_Loss()
max_iter = 1000
# optimizer = K.optimizers.Adam(learning_rate=0.001)
optimizer = K.optimizers.RMSprop(learning_rate=0.001)
#%%
tmc2007 = tf.data.Dataset.from_tensor_slices(
    (np.array(X.todense(),
              dtype=np.float32), np.array(y.todense(), dtype=np.float32)))
#%%
BATCH_SIZE = 1000
tmc_data = tmc2007.shuffle(buffer_size=15000)
tmc_data_batch = tmc_data.batch(BATCH_SIZE)
feature, label = next(iter(tmc_data_batch))
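The cells above build the model, loss, optimizer and batched dataset but stop short of a training step. A minimal sketch of one follows; it assumes that calling c2ae(feature, label) returns the two latent embeddings consumed by CCA_Loss, an interface that is not shown in this snippet.

#%%
# Minimal training-loop sketch (assumed interfaces: c2ae(feature, label) -> (emb_feature, emb_label),
# cl(emb_feature, emb_label) -> scalar loss). Not taken from the original source.
for step, (feature, label) in enumerate(tmc_data_batch.repeat().take(max_iter)):
    with tf.GradientTape() as tape:
        emb_feature, emb_label = c2ae(feature, label)
        loss = cl(emb_feature, emb_label)
    grads = tape.gradient(loss, c2ae.trainable_variables)
    optimizer.apply_gradients(zip(grads, c2ae.trainable_variables))
    if step % 100 == 0:
        print(f"step {step}: loss {loss.numpy():.4f}")
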
Example #5
    def splitted_tmc(self,
                     N,
                     Km,
                     Ksm,
                     shuffle=False,
                     verbose=True,
                     focus="pref"):
        X_tr, y_tr, feature_names, label_names = load_dataset(
            'tmc2007_500', 'train')
        X_te, y_te, _, _ = load_dataset('tmc2007_500', 'test')

        X_tr = X_tr.todense()
        X_te = X_te.todense()
        y_tr = y_tr.todense()
        y_te = y_te.todense()
        if verbose:
            print("Shape of Train Data: ", X_tr.shape)
            print("Shape of Test Data: ", X_te.shape)
            print("Shape of Train Labels: ", y_tr.shape)
            print("Shape of Test Data: ", y_te.shape)

        X = np.concatenate((X_tr, X_te), axis=0)
        y = np.concatenate((y_tr, y_te), axis=0)

        most_feat, least_feat = self.partition_features(Km, X, y)
        if verbose:
            print("\nMost Relevant ", Km, " Features:", most_feat)
            #print("\nLeast Relevant ", X.shape[1]-Km ," Features:", least_feat)

        red_X = X.copy()
        red_X[:, most_feat] = 1

        red_most_feat, red_least_feat = self.partition_features(Ksm, red_X, y)
        if verbose:
            print("\nSecond Most Relevant ", Ksm, " Features:", red_most_feat)
            #print("\nLeast Relevant ", X.shape[1]-Ksm ," Features:", red_least_feat)

        if focus == "pref":
            pref = X[:, most_feat]
            context = X[:, red_most_feat]
        elif focus == "context":
            context = X[:, most_feat]
            pref = X[:, red_most_feat]

        if verbose:
            print("\n Preferences Shape: ", pref.shape)
            print("\n Contexts Shape: ", context.shape)

        if shuffle:
            c = list(zip(context, pref, y))
            np.random.shuffle(c)
            context, pref, y = zip(*c)
            context = np.array(context).squeeze(axis=1)
            pref = np.array(pref).squeeze(axis=1)
            y = np.array(y).squeeze(axis=1)
        else:
            context = np.array(context)
            pref = np.array(pref)
            y = np.array(y)
        if verbose:
            print("\n Contexts Shape: ", context.shape)
            print("\n Preferences Shape: ", pref.shape)
            print("\n Actions Shape: ", y.shape)

        if verbose:
            plt.rcParams["figure.figsize"] = 16, 4
            plt.bar(range(context.shape[1]),
                    np.asarray(context.sum(axis=0)),
                    label="Contexts")
            plt.legend(prop={'size': 20})
            plt.show()
            plt.bar(range(pref.shape[1]),
                    np.asarray(pref.sum(axis=0)),
                    label="Preferences")
            plt.legend(prop={'size': 20})
            plt.show()
            plt.bar(range(y.shape[1]),
                    np.asarray(y.sum(axis=0)),
                    label="Responses")
            plt.legend(prop={'size': 20})
            plt.show()

        sp_context = np.array_split(context, N)
        sp_pref = np.array_split(pref, N)
        sp_response = np.array_split(y, N)

        return sp_context, sp_pref, sp_response
Example #6
    def splitted_mediamill(self,
                           N,
                           red_K,
                           shuffle=False,
                           verbose=True,
                           focus="pref"):
        X_tr, y_tr, feature_names, label_names = load_dataset(
            'mediamill', 'train')
        X_te, y_te, _, _ = load_dataset('mediamill', 'test')

        X_tr = X_tr.todense()
        X_te = X_te.todense()
        y_tr = y_tr.todense()
        y_te = y_te.todense()
        if verbose:
            print("Shape of Train Data: ", X_tr.shape)
            print("Shape of Test Data: ", X_te.shape)
            print("Shape of Train Labels: ", y_tr.shape)
            print("Shape of Test Data: ", y_te.shape)

        X = np.concatenate((X_tr, X_te), axis=0)
        y = np.concatenate((y_tr, y_te), axis=0)
        y = y[:, np.asarray(y.sum(axis=0) > 100)[0]]
        if verbose:
            print("Shape of All Data:", X.shape)
            print("Shape of All Labels:", y.shape)

        K = y.shape[1]
        most_feat, least_feat = self.partition_features(K, X, y)
        if verbose:
            print("\nMost Relevant ", K, " Features:", most_feat)
            print("\nLeast Relevant ", X.shape[1] - K, " Features:",
                  least_feat)

        if focus == "pref":
            pref = (X[:, most_feat] > 0.45).astype(float)
            context = (X[:, least_feat] > 0.45).astype(float)
            if verbose:
                print("\n Preferences Shape: ", pref.shape)
                print("\n Contexts Shape: ", context.shape)

            pref = pref[:, np.asarray(pref.sum(axis=0) > 2400)[0]]
            y = y[:, np.asarray(y.sum(axis=0) > 450)[0]]
            context = context[:,
                              np.asarray(
                                  np.logical_and(
                                      context.sum(axis=0) > 2000,
                                      context.sum(axis=0) < 40000))[0]]

        elif focus == "context":
            context = (X[:, most_feat] > 0.45).astype(float)
            pref = (X[:, least_feat] > 0.45).astype(float)
            if verbose:
                print("\n Preferences Shape: ", pref.shape)
                print("\n Contexts Shape: ", context.shape)

            pref = pref[:, np.asarray(pref.sum(axis=0) > 500)[0]]
            y = y[:, np.asarray(y.sum(axis=0) > 450)[0]]
            context = context[:,
                              np.asarray(
                                  np.logical_and(
                                      context.sum(axis=0) > 9999,
                                      context.sum(axis=0) < 29000))[0]]

        if shuffle:
            c = list(zip(context, pref, y))
            np.random.shuffle(c)
            context, pref, y = zip(*c)
            context = np.array(context).squeeze(axis=1)
            pref = np.array(pref).squeeze(axis=1)
            y = np.array(y).squeeze(axis=1)
        else:
            context = np.array(context)
            pref = np.array(pref)
            y = np.array(y)
        if verbose:
            print("\n Contexts Shape: ", context.shape)
            print("\n Preferences Shape: ", pref.shape)
            print("\n Actions Shape: ", y.shape)

        matrix_clusterer = MatrixLabelSpaceClusterer(clusterer=KMeans(
            n_clusters=red_K))
        similar_ys = matrix_clusterer.fit_predict(context, y)
        if verbose:
            print("Silimar Labeles: ", similar_ys)

        y_red = np.zeros((y.shape[0], red_K))
        for k, lbs in enumerate(similar_ys):
            for lb in lbs:
                y_red[:, k] += y[:, lb]
        y_red = (y_red >= 1).astype(float)

        if verbose:
            plt.rcParams["figure.figsize"] = 16, 4
            plt.bar(range(context.shape[1]),
                    np.asarray(context.sum(axis=0)),
                    label="Contexts")
            plt.legend(prop={'size': 20})
            plt.show()
            plt.bar(range(pref.shape[1]),
                    np.asarray(pref.sum(axis=0)),
                    label="Preferences")
            plt.legend(prop={'size': 20})
            plt.show()
            plt.bar(range(y_red.shape[1]),
                    np.asarray(y_red.sum(axis=0)),
                    label="Responses")
            plt.legend(prop={'size': 20})
            plt.show()

        sp_context = np.array_split(context, N)
        sp_pref = np.array_split(pref, N)
        sp_response = np.array_split(y_red, N)

        return sp_context, sp_pref, sp_response
Example #7
# Adapted from http://scikit.ml/multilabeldnn.html

import numpy
import sklearn.metrics as metrics
from skmultilearn.dataset import load_dataset
from keras.models import Sequential
from keras.layers import Dense
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.ext import Keras
from sklearn.metrics import accuracy_score

X_train, y_train, feature_names, label_names = load_dataset(
    'emotions', 'train')
X_test, y_test, _, _ = load_dataset('emotions', 'test')


def create_model_single_class(input_dim, output_dim):
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=input_dim, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(output_dim, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


def create_model_multiclass(input_dim, output_dim):
    # create model; body follows the same pattern as the single-class builder above,
    # with a softmax output and categorical cross-entropy (as in the cited tutorial)
    model = Sequential()
    model.add(Dense(8, input_dim=input_dim, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
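The tutorial this example cites (http://scikit.ml/multilabeldnn.html) plugs such builders into skmultilearn's Keras wrapper roughly as sketched below; the KERAS_PARAMS values and the exact wrapper arguments are assumptions based on that page, not part of the snippet above.

KERAS_PARAMS = dict(epochs=10, batch_size=100, verbose=0)  # assumed fit parameters

# Binary Relevance: one single-output network per label
clf = BinaryRelevance(classifier=Keras(create_model_single_class, False, KERAS_PARAMS),
                      require_dense=[True, True])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
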
Example #8
import jax
import numpy as np
from sklearn.preprocessing import StandardScaler
from skmultilearn.dataset import load_dataset

from MCRegressor import MCRegressor
from StochasticILE import StochasticILEMLClassifier

dataset = "emotions"

# load dataset
X_train, y_train, _, _ = load_dataset(dataset, "undivided")

# need to transform to dense arrays
y_train = y_train.toarray()
X_train = X_train.toarray()

# normalise data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# add an additional column of 1s to the data for the bias
X_train = np.c_[X_train, np.ones(X_train.shape[0])]

# Bound parameters (these are checked before fitting)
alpha = 0.25
t = 0.5
kappa = 22

# training parameters
Example #9
from skmultilearn.problem_transform import LabelPowerset, BinaryRelevance, ClassifierChain
from skmultilearn.ensemble import RakelD, RakelO
from skmultilearn.adapt import MLkNN
from skmultilearn.dataset import load_dataset
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold

## Load the emotions data

Xtrain, Ytrain, nom_variable, nom_label = load_dataset(
    'emotions', 'train')  # training data
Xtest, Ytest, _, _ = load_dataset('emotions', 'test')  # test data
## Data description at the following link: http://scikit.ml/tutorial.html
# Xtrain, Xtest, Ytrain, Ytest are loaded as scipy sparse matrices
''' We could work directly with sparse matrices in sklearn and skmultilearn '''

## Transform the sparse matrices for data visualisation
datatrain = pd.DataFrame(Xtrain.toarray(), columns=nom_variable)
print(datatrain.shape)  # (391,72)
#print(datatrain.head(5))
#print(datatrain.info())
#print(datatrain.describe())
datatrainy = pd.DataFrame(Ytrain.toarray(), columns=nom_label)
print(datatrainy.shape)  # 6 outputs, i.e. 6 non-exclusive labels

## data preprocessing
Example #10
def import_testing_set(dataset_name):
    dataset = load_dataset(dataset_name, "test")

    return cons_multilabel_dataset_mock(dataset)
Example #11
def load_given_dataset(dataset):
    if dataset.lower() == "20ng":
        return load_custom_dataset("20ng")
    if dataset.lower() == "test":
        return load_custom_dataset("test")
    return load_dataset(dataset, 'undivided')
Example #12
def load_given_dataset(d):
    if d.lower() == "20ng":
        return load_custom_dataset(d.lower())
    return load_dataset(d, 'undivided')
Example #13
def get_dataset(dataset, ranked_features=None, reduce_dim=None, num_features=None):
    # Load a multi-label dataset from https://www.openml.org/d/40597
    # X, Y = fetch_mldata('yeast', version=4, return_X_y=True)
    if dataset == "yeast":
        data = fetch_mldata("yeast")
        X = data["data"]
        Y = data["target"].transpose().toarray()

        train_input, test_input, train_labels, test_labels = train_test_split(
            X, Y, test_size=0.2, random_state=0
        )

        if reduce_dim is not None and reduce_dim < train_input.shape[1]:
            from sklearn.decomposition import PCA

            pca = PCA(n_components=reduce_dim)
            pca.fit(train_input)
            train_input = pca.transform(train_input)

            # apply the PCA fitted on the training data to the test data
            test_input = pca.transform(test_input)

        return train_input, test_input, train_labels, test_labels
    elif dataset == "emotions":
        train_input, train_labels, feature_names, label_names = load_dataset(
            "emotions", "train"
        )
        test_input, test_labels, _, _ = load_dataset("emotions", "test")
        fimp = Fimp(
            f_name=str(Path(os.path.dirname(__file__)).parent)
                   + "/data/multilabel/emotions.fimp"
        )
        indices = np.asarray(fimp.get_attr_indices())[range(-1, 2)] - 1

        return (
            train_input.toarray(),
            test_input.toarray(),
            train_labels.toarray(),
            test_labels.toarray(),
        )

    elif ds_name == "xor":
        dataIn, dataOut = get_xor()
        return train_test_split(dataIn, dataOut, random_state=17)
    elif ds_name == "moons":
        dataIn, dataOut = make_moons(
            n_samples=20000, shuffle=True, noise=0.1, random_state=17
        )
        dataOut = np.vstack((dataOut, dataOut, dataOut)).T
        return train_test_split(dataIn, dataOut, random_state=17)
    elif ds_name == "sys_multilabel":
        dataIn, dataOut = get_sy_multilabel()
        return train_test_split(dataIn, dataOut, random_state=17)
    else:
        train_input, train_labels, test_input, test_labels = get_categorical_data(
            dataset
        )
        train_input = np.asarray(train_input, dtype=float)
        train_labels = np.asarray(train_labels, dtype=int)
        test_input = np.asarray(test_input, dtype=float)
        test_labels = np.asarray(test_labels, dtype=int)

        return train_input, test_input, train_labels, test_labels
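A hypothetical call, for illustration only (it assumes the emotions .fimp file referenced above is present on disk), showing the return order of the "emotions" branch:

X_tr, X_te, y_tr, y_te = get_dataset("emotions")
print(X_tr.shape, y_tr.shape)  # dense train feature and label matrices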