Example #1
def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        col_name = data_bunch.feature_names[col_idx]
        if col_name in data_bunch.categories:
            # XXX: This would be faster with np.take, although it does not
            # handle missing values fast (also not with mode='wrap')
            cat = data_bunch.categories[col_name]
            result = [cat[idx] if 0 <= idx < len(cat) else None for idx in
                      data_bunch.data[:, col_idx].astype(int)]
            return np.array(result, dtype='O')
        else:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]

    data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse is True:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'],
                                    sparse, None, False)
    data_downloaded = np.array(data_arff['data'], dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems with
        # missing values

        np.testing.assert_array_equal(data_downloaded[:, i],
                                      decode_column(data_bunch, i))
Example #2
def test_fetch_openml_notarget(monkeypatch, gzip_response):
    data_id = 61
    target_column = None
    expected_observations = 150
    expected_features = 5

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    data = fetch_openml(data_id=data_id, target_column=target_column,
                        cache=False)
    assert data.data.shape == (expected_observations, expected_features)
    assert data.target is None
Example #3
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache '
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())
    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)
def load_mnist(n_samples=None, class_0='0', class_1='8'):
    """Load MNIST, select two classes, shuffle and return only n_samples."""
    # Load data from http://openml.org/d/554
    mnist = fetch_openml('mnist_784', version=1)

    # take only two classes for binary classification
    mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)

    X, y = shuffle(mnist.data[mask], mnist.target[mask], random_state=42)
    if n_samples is not None:
        X, y = X[:n_samples], y[:n_samples]
    return X, y
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Load the data, then cache and memmap the train/test split"""
    print("Loading dataset...")
    data = fetch_openml('mnist_784')

    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    if shuffle:
        X, y = _shuffle(X, y, random_state=seed)

    # Normalize features
    X /= 255
    return X, y
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500, n_features=int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity).astype(int)
        col = np.random.uniform(0, small_size, sparsity).astype(int)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name).data
    return X
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_openml('mnist_784')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 60000
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs
from pykeops.torch import LazyTensor

use_cuda = torch.cuda.is_available()
print(use_cuda)
tensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

######################################################################
# Load the MNIST dataset: 70,000 images of shape (28,28).

try:
    from sklearn.datasets import fetch_openml
except ImportError:
    raise ImportError("This tutorial requires Scikit Learn version >= 0.20.")

mnist = fetch_openml('mnist_784', cache=False)

x = tensor(mnist.data.astype('float32'))
print(x.shape)
y = tensor(mnist.target.astype('int64'))
print(y.shape)

######################################################################
# Split it into a train and test set:

D = x.shape[1]
Ntrain, Ntest = (60000, 10000) if use_cuda else (1000, 100)
x_train, y_train = x[:Ntrain, :], y[:Ntrain]
x_test, y_test = x[Ntrain:Ntrain + Ntest, :], y[Ntrain:Ntrain + Ntest]

######################################################################
Example #10
    picture = Image.open(filepath)
    picture.save(file, "JPEG", optimize=True, quality=50)
    newsize = os.stat(os.path.join(os.getcwd(), file)).st_size
    percent = (oldsize - newsize) / float(oldsize) * 100
    if verbose:
        print("{0} compressed from {1} to {2} bytes ({3:.1f}% reduction)".format(
            file, oldsize, newsize, percent))

def tSNE(dataset):
    time_start = time.time()
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(dataset)
    print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))
    return tsne_results

current_directory = os.path.dirname(os.path.abspath(__file__))
mnist = fetch_openml(name="mnist_784")
# with old versions, fetch_mldata is faster when .mat file is available locally
# mnist = fetch_mldata("MNIST original")
X = mnist.data / 255.0
y = mnist.target.astype(float)
print(X.shape, y.shape)

feat_cols = ['pixel' + str(i) for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feat_cols)
df['y'] = y
df['label'] = df['y'].apply(lambda i: str(i))
X, y = None, None
print('Size of the dataframe: {}'.format(df.shape))

# For reproducibility of the results
from sklearn.metrics import accuracy_score
import matplotlib.patches as mpatches
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import random
import copy
# from sklearn.utils import resample (this is used for bootstrapping).
# from sklearn.model_selection import GridSearchCV (this is used to do grid search to find the optimal hyperparameters).

sns.set()

column_names = "DYRK1A_N	ITSN1_N	BDNF_N	NR1_N	NR2A_N	pAKT_N	pBRAF_N	pCAMKII_N	pCREB_N	pELK_N	pERK_N	pJNK_N	PKCA_N	pMEK_N	pNR1_N	pNR2A_N	pNR2B_N	pPKCAB_N	pRSK_N	AKT_N	BRAF_N	CAMKII_N	CREB_N	ELK_N	ERK_N	GSK3B_N	JNK_N	MEK_N	TRKA_N	RSK_N	APP_N	Bcatenin_N	SOD1_N	MTOR_N	P38_N	pMTOR_N	DSCR1_N	AMPKA_N	NR2B_N	pNUMB_N	RAPTOR_N	TIAM1_N	pP70S6_N	NUMB_N	P70S6_N	pGSK3B_N	pPKCG_N	CDK5_N	S6_N	ADARB1_N	AcetylH3K9_N	RRP1_N	BAX_N	ARC_N	ERBB4_N	nNOS_N	Tau_N	GFAP_N	GluR3_N	GluR4_N	IL1B_N	P3525_N	pCASP9_N	PSD95_N	SNCA_N	Ubiquitin_N	pGSK3B_Tyr216_N	SHH_N	BAD_N	BCL2_N	pS6_N	pCFOS_N	SYP_N	H3AcK18_N	EGR1_N	H3MeK4_N	CaNA_N"
column_names = column_names.split()
mice = fetch_openml(name='miceprotein', version=4)

parsing_targets = [0, 150, 300, 435, 570, 705, 840, 945, len(mice.target)]
parsing_groups = [
    'c-CS-m', 'c-SC-m', 'c-CS-s', 'c-SC-s', 't-CS-m', 't-SC-m', 't-CS-s',
    't-SC-s'
]


def Analysis_Choice(parsing_targets, parsing_groups):
    group_name = input('Enter group code: ')
    i = parsing_groups.index(group_name)
    return i


def Combined_Data(parsing_targets, parsing_groups, mice, column_names):
Example #12
@author: prbpedro
"""
import numpy
import pandas
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import seaborn
import matplotlib.pyplot
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, validation_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

bunch = fetch_openml('mnist_784', version=1)

pd_dataframe = pandas.DataFrame(data=numpy.c_[bunch['data'], bunch['target']],
                                columns=bunch['feature_names'] + ['target'])

pd_dataframe.info()
pd_dataframe.isnull().any().describe()

seaborn.set()
seaborn.countplot(x="target", data=pd_dataframe)
matplotlib.pyplot.show()

pd_data_dataframe = pd_dataframe.loc[:, pd_dataframe.columns != 'target']
sample_digit = pd_data_dataframe.iloc[2000]
sample_digit_image = sample_digit.values.reshape(28, 28).astype(numpy.float64)
matplotlib.pyplot.imshow(sample_digit_image,
Example #13
    #DH = DataHolder("C:/Users/Pontus/Desktop/Dippa/dataset")
    #subs = list(DH.subjects.keys())
    #D,i = DH.subjects.get(subs[3]).getDataAndInfoForSubject()

    #dataset = SmallDataset(D,i)
    #dataset.setAsPairwise()
    #print(len(dataset))
    #x1,x2,y  = dataset[0:10]
    #print(y)
    #%%
    #print(np.shape(D))
    #print(D[0,0,0,0])
    #print(np.expand_dims(D,1).shape)
    #print(type(i))
    #print(type(pd.core.frame.DataFrame()))
    X_d, y_d = fetch_openml('mnist_784', version=1, return_X_y=True)
    #%%
    randInd = np.random.choice(np.arange(0, 69000, 1), (1000))
    print(randInd)
    #%%
    X = X_d[randInd, :]
    y = y_d[randInd]

    #print(np.max(X))
    #print(X.shape," ", y.shape)
    X = np.reshape(X, (1000, 28, 28, 1))

    dataset2 = SmallDataset(X, y)

    #%%
    dataset2.setAsTriplet()
Example #14
    def __init__(self, dataset_name='cifar_10_small'):
        self.data = fetch_openml(name=dataset_name)
Example #15
def _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
                               expected_observations, expected_features,
                               expected_missing, expected_data_dtype,
                               expected_target_dtype, expect_sparse,
                               compare_default_target):
    # fetches a dataset in three different ways from OpenML, using the
    # fetch_openml function, and does various checks on the validity of the
    # result. Note that this function can be mocked (by invoking
    # _monkey_patch_webbased_functions before invoking this function)
    data_by_name_id = fetch_openml(name=data_name,
                                   version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    # Please note that cache=False is crucial, as the monkey patched files are
    # not consistent with reality
    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id,
                              cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations, expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
        assert data_by_id.target_names == [target_column]
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
        assert data_by_id.target_names == target_column
    assert data_by_id.data.dtype == np.float64
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, str)

    # TODO: pass in a list of expected nominal features
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target, data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(
            data_by_id.data)) == expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml,
                         data_id=data_id,
                         cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
def main():
    mnist = fetch_openml(name='mnist_784')
    echantillon = np.random.randint(70000, size=5000)
    data = mnist.data[echantillon]
    target = mnist.target[echantillon]

    xtrain, xtest, ytrain, ytest = train_test_split(data,
                                                    target,
                                                    train_size=0.7)

    mlp = neural_network.MLPClassifier(hidden_layer_sizes=(50))
    mlp.fit(xtrain, ytrain)
    score = mlp.score(xtest, ytest)
    print(f"Score avec mlp.score : {score}")

    # Class of image 4 and its predicted class.
    print(mnist.target[4])
    print(mlp.predict(mnist.data[4].reshape(
        (1, -1))))  # reshape the sample to 2D

    # Compute precision with the metrics.precision_score function
    ypredTest = mlp.predict(xtest)
    precision = metrics.precision_score(ytest, ypredTest, average='micro')
    print(f"Score avec la fonction precision_score : {precision}")

    # Vary the number of hidden layers, one at a time, from 2 to 100
    _50neuron_layer = []
    print("Variation du nombre de couches de 2 à 100 : ")
    for nb_layer in range(2, 101):
        mlp = neural_network.MLPClassifier(
            hidden_layer_sizes=tuple([50 for i in range(nb_layer)]))

        start_training = time.time()
        mlp.fit(xtrain, ytrain)
        final_training = time.time() - start_training

        start_prediction = time.time()
        ypred = mlp.predict(xtest)
        final_prediction = time.time() - start_prediction

        error = metrics.zero_one_loss(ytest, ypred)
        _50neuron_layer.append(
            (nb_layer, final_training, final_prediction, error))
        print(f"\t {_50neuron_layer[-1]}")

    _50neuron_layer_list = list(zip(*_50neuron_layer))

    plot_fig(_50neuron_layer_list)

    create_neural_network(tuple(range(60, 10, -1)), xtrain, ytrain, xtest,
                          ytest, "50 couches de 60 à 11 nerones")
    create_neural_network(
        tuple(list(range(60, 32, -3)) + list(range(31, 12, -2))), xtrain,
        ytrain, xtest, ytest, "50 couches -3 puis -2 nerones")
    create_neural_network((14, 36, 64), xtrain, ytrain, xtest, ytest,
                          "3 couches 14 36 64 nerones")
    create_neural_network(
        (14, 36, 64, 112, 176, 204, 226, 283), xtrain, ytrain, xtest, ytest,
        "8 couches 14, 36, 64, 112, 176, 204, 226, 283 nerones")
    create_neural_network((64, 92, 117, 208, 117, 92, 64), xtrain, ytrain,
                          xtest, ytest,
                          "7 couches 64, 92, 117, 208, 117, 92, 64 nerones")

    solving = []
    print("Modification du solver : ")
    for solver in ['lbfgs', 'sgd', 'adam']:
        mlp = neural_network.MLPClassifier(hidden_layer_sizes=(64, 92, 117,
                                                               208, 117, 92,
                                                               64),
                                           solver=solver)

        start_training = time.time()
        mlp.fit(xtrain, ytrain)
        final_training = time.time() - start_training

        start_prediction = time.time()
        ypred = mlp.predict(xtest)
        final_prediction = time.time() - start_prediction

        error = metrics.zero_one_loss(ytest, ypred)
        solving.append((solver, final_training, final_prediction, error))
        print(f"\t {solving[-1]}")

    solving_list = list(zip(*solving))

    plot_fig(solving_list)

    activ = []
    print("Variations de l'activation : ")
    for activation in ['identity', 'logistic', 'tanh', 'relu']:
        mlp = neural_network.MLPClassifier(hidden_layer_sizes=(64, 92, 117,
                                                               208, 117, 92,
                                                               64),
                                           activation=activation)

        start_training = time.time()
        mlp.fit(xtrain, ytrain)
        final_training = time.time() - start_training

        start_prediction = time.time()
        ypred = mlp.predict(xtest)
        final_prediction = time.time() - start_prediction

        error = metrics.zero_one_loss(ytest, ypred)
        activ.append((activation, final_training, final_prediction, error))
        print(f"\t {activ[-1]}")

    activ_liste = list(zip(*activ))

    plot_fig(activ_liste)

    regul = []
    print("Evolution de la régularisation : ")
    for regularisation in np.arange(0.0001, 0.01, 0.001):
        mlp = neural_network.MLPClassifier(hidden_layer_sizes=(64, 92, 117,
                                                               208, 117, 92,
                                                               64),
                                           alpha=regularisation)

        start_training = time.time()
        mlp.fit(xtrain, ytrain)
        final_training = time.time() - start_training

        start_prediction = time.time()
        ypred = mlp.predict(xtest)
        final_prediction = time.time() - start_prediction

        error = metrics.zero_one_loss(ytest, ypred)
        regul.append((regularisation, final_training, final_prediction, error))
        print(f"\t {regul[-1]}")

    regul_liste = list(zip(*regul))

    plot_fig(regul_liste)

    best_layer = (64, 92, 117, 208, 117, 92, 64)
    best_solver = "adam"
    best_activation = "relu"
    best_regularisation = 0.008

    best_mlp = neural_network.MLPClassifier(hidden_layer_sizes=best_layer,
                                            solver=best_solver,
                                            activation=best_activation,
                                            alpha=best_regularisation)
    start_training = time.time()
    best_mlp.fit(xtrain, ytrain)
    best_final_entrainement = time.time() - start_training

    start_prediction = time.time()
    ypred = best_mlp.predict(xtest)
    best_final_prediction = time.time() - start_prediction

    cross_val = model_selection.cross_val_score(best_mlp, data, target, cv=10)
    best_error = 1 - np.mean(cross_val)

    print(f"Durée de l'entraînement : {best_final_entrainement}")
    print(f"Durée de la prédiction : {best_final_prediction}")
    print(f"Erreur : {best_error}")

    cm = confusion_matrix(ytest, ypred)
    df_cm = pd.DataFrame(cm, columns=np.unique(ytest), index=np.unique(ytest))
    df_cm.index.name = 'Valeur réelle'
    df_cm.columns.name = 'Valeur prédite'
    plt.figure(figsize=(16, 9))
    sn.heatmap(df_cm, cmap="Blues", annot=True)
    plt.show()
'''
Created on Mar 18, 2020

@author: alexk101
'''
from __future__ import print_function
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

mnist = fetch_openml("mnist_784", version=1)
output = open(r"TSNE_2D.txt","w")
standardized_data = StandardScaler().fit_transform(mnist.data)

tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=1000)
tsne_results = tsne.fit_transform(standardized_data)

output.write("70000 2 \n")
np.savetxt(output, tsne_results)  # write to the open handle so the header line is kept
output.close()
print("Test Complete")
Example #18
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import datasets
from sklearn import manifold

# %matplotlib inline
data = datasets.fetch_openml("mnist_784", version=1, return_X_y=True)
pixel_values, targets = data
targets = targets.astype(int)
# %%
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# %%
# Load data from https://www.openml.org/d/40945
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Alternatively X and y can be obtained directly from the frame attribute:
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

# %%
# Use ``ColumnTransformer`` by selecting column by names
#
# We will train our classifier with the following features:
#
# Numeric Features:
#
# * ``age``: float;
# * ``fare``: float.
#
def fetch_bank_marketing(*,
                         cache=True,
                         data_home=None,
                         as_frame=False,
                         return_X_y=False):
    """Load the UCI bank marketing dataset (binary classification).

    Download it if necessary.

    ==============   ====================
    Samples total                   45211
    Dimensionality                     17
    Features         numeric, categorical
    Classes                             2
    ==============   ====================

    Source: UCI Repository [3]_ Paper: Moro et al., 2014 [4]_

    The data is related with direct marketing campaigns of a Portuguese
    banking institution. The marketing campaigns were based on phone calls.
    Often, more than one contact to the same client was required
    in order to assess whether the product (bank term deposit) would be subscribed or not.

    The classification goal is to predict whether the client will subscribe to a
    term deposit (variable y).


    Parameters
    ----------
    cache : bool, default=True
        Whether to cache downloaded datasets using joblib.

    data_home : str, default=None
        Specify another download and cache folder for the datasets.
        By default, all data is stored in '~/.fairlearn-data'
        subfolders.

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric, string or categorical). The target is
        a pandas DataFrame or Series depending on the number of target_columns.
        The Bunch will contain a ``frame`` attribute with the target and the
        data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas
        DataFrames or Series as described above.

    return_X_y : bool, default=False
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

    Returns
    -------
    dataset : :obj:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray, shape (45211, 17)
            Each row corresponding to the 17 feature values in order.
            If ``as_frame`` is True, ``data`` is a pandas object.
        target : numpy array of shape (45211,)
            Each value represents whether the client subscribed a
            term deposit which is 'yes' if the client subscribed and 'no'
            otherwise.
            If ``as_frame`` is True, ``target`` is a pandas object.
        feature_names : list of length 17
            Array of ordered feature names used in the dataset.
        DESCR : string
            Description of the UCI bank marketing dataset.

    (data, target) : tuple of (numpy.ndarray, numpy.ndarray) or (pandas.DataFrame, pandas.Series)
        if ``return_X_y`` is True and ``as_frame`` is False

    (data, target) : tuple of (pandas.DataFrame, pandas.Series)
        if ``return_X_y`` is True and ``as_frame`` is True

    References
    ----------
    .. [3] S. Moro, P. Cortez, and P. Rita, UCI Machine Learning Repository:
       Bank Marketing Data Set, 14-Feb-2014. [Online]. Available:
       https://archive.ics.uci.edu/ml/datasets/Bank+Marketing.

    .. [4] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict
       the Success of Bank Telemarketing. Decision Support Systems,
       Elsevier, 62:22-31, June 2014

    """
    if not data_home:
        data_home = pathlib.Path().home() / _DOWNLOAD_DIRECTORY_NAME

    return fetch_openml(
        data_id=1461,
        data_home=data_home,
        cache=cache,
        as_frame=as_frame,
        return_X_y=return_X_y,
    )
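# Usage sketch (an addition, not part of the original source): fetch the bank
# marketing data as pandas objects and inspect the class balance described in
# the docstring above.
if __name__ == "__main__":
    X, y = fetch_bank_marketing(as_frame=True, return_X_y=True)
    print(X.shape)            # roughly (45211, n_features), per the table above
    print(y.value_counts())   # 'yes' / 'no' term-deposit subscriptions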
from sklearn import datasets
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from keras.models import Sequential
import keras

mnist = fetch_openml(
    'mnist_784',
    version=1,
)
#(X_train,Y_train),(X_test, Y_test) = mnist.load_data()
n = len(mnist.data)
N = 10000
indices = np.random.permutation(range(n))[:N]
#print(indices)
X = mnist.data[indices]
y = mnist.target[indices]
Y = np.eye(10)[y.astype(int)]
print(X, Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)
'''
Model configuration
'''
n_in = len(X[0])
n_hidden = 2000
n_out = len(Y[0])

model = Sequential()
Example #22
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    # fetches a dataset in three different ways from OpenML, using the
    # fetch_openml function, and does various checks on the validity of the
    # result. Note that this function can be mocked (by invoking
    # _monkey_patch_webbased_functions before invoking this function)
    data_by_name_id = fetch_openml(name=data_name, version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations, expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
    assert data_by_id.data.dtype == np.float64
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, string_types)

    # TODO: pass in a list of expected nominal features
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target, data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(data_by_id.data)) ==
                expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
features in the dataset. Therefore the first layer weight matrix has the shape
(784, hidden_layer_sizes[0]). We can thus visualize a single column of
the weight matrix as a 28x28 pixel image.

To make the example run faster, we use very few hidden units, and train only
for a very short time. Training longer would result in weights with a much
smoother spatial appearance.
"""
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.neural_network import MLPClassifier

print(__doc__)

# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

# rescale the data, use the traditional train/test split
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
#                     solver='sgd', verbose=10, tol=1e-4, random_state=1)
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.1)

mlp.fit(X_train, y_train)
print("Training set score: %f" % mlp.score(X_train, y_train))
print("Test set score: %f" % mlp.score(X_test, y_test))
Example #24
# Author: Adam Kleczewski
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

print(__doc__)

# Load a multi-label dataset from https://www.openml.org/d/40597
X, Y = fetch_openml('yeast', version=4, return_X_y=True)
Y = Y == 'TRUE'
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=.2,
                                                    random_state=0)

# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper.
base_lr = LogisticRegression()
ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')

# Fit an ensemble of logistic regression classifier chains and take the
Example #25
"""

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import KFold

from celer import LassoCV
from celer.plot_utils import configure_plt

print(__doc__)
configure_plt()

print("Loading data...")
dataset = fetch_openml("leukemia")
X = np.asfortranarray(dataset.data.astype(float))
y = 2 * ((dataset.target == "AML") - 0.5)

kf = KFold(shuffle=True, n_splits=3, random_state=0)
model = LassoCV(cv=kf, n_jobs=3)
model.fit(X, y)

print("Estimated regularization parameter alpha: %s" % model.alpha_)

###############################################################################
# Display results

plt.figure(figsize=(7, 3.5), constrained_layout=True)
plt.semilogx(model.alphas_, model.mse_path_, ':')
plt.semilogx(model.alphas_,
Example #26
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn versions
from sklearn import datasets
from skimage.feature import hog
from sklearn.svm import LinearSVC
import numpy as np
from collections import Counter

receivedData = datasets.fetch_openml('mnist_784')

features = np.array(receivedData.data, 'int16')
labels = np.array(receivedData.target, 'int')

list_hog_fd = []

for feature in features:
    fd = hog(feature.reshape((28, 28)),
             orientations=9,
             pixels_per_cell=(14, 14),
             cells_per_block=(1, 1),
             visualize=False)
    list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')

print("Count of digits in receivedData: ", Counter(labels))

clf = LinearSVC()

clf.fit(hog_features, labels)

joblib.dump(clf, "digits_cls.pkl", compress=3)
Example #27
    t0 = time.time()
    if fit_with_y:
        X_reduced = transformer.fit_transform(X, y)
    else:
        X_reduced = transformer.fit_transform(X)
    t1 = time.time()
    reducer_name = "+".join([
        type(step[1]).__name__ for step in transformer.get_params()['steps']
    ]) if is_pipeline else type(transformer).__name__
    print("{} took {:.1f}s (on {} MNIST images)".format(
        reducer_name, t1 - t0, len(X)))
    plot_digits(X_reduced, y, images=X, figsize=(35, 25))
    plt.show()


mnist = fetch_openml("mnist_784")

X = mnist['data']
y = mnist['target']

random_indices = np.random.permutation(60000)

X = X[random_indices]
y = y[random_indices]

plot_2_dims(PCA(n_components=2, random_state=42), X[:2000], y[:2000])  # 0.1s

plot_2_dims(LocallyLinearEmbedding(n_components=2, random_state=42), X[:2000],
            y[:2000])  # 12.6s

plot_2_dims(MDS(n_components=2, random_state=42), X[:2000], y[:2000])  # 365.3s
Example #28
def fetch_creditcard(*,
                     cache=True,
                     data_home=None,
                     as_frame=False,
                     return_X_y=False):
    """
    Load the creditcard dataset. Download it if necessary.

    Note that internally this is using `fetch_openml` from scikit-learn, which is experimental.

    ==============   ==============
    Samples total            284807
    Dimensionality               29
    Features                   real
    Target                 int 0, 1
    ==============   ==============

    The dataset contains transactions made by credit cards in September 2013 by European
    cardholders. It presents transactions that occurred over two days, with 492 frauds
    out of 284,807 transactions. The dataset is highly unbalanced: the positive
    class (frauds) accounts for 0.172% of all transactions.

    Please cite:
        Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi.
        Calibrating Probability with Undersampling for Unbalanced Classification.
        In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

    :param version: integer or 'active', default='active'
        Version of the dataset. Can only be provided if also ``name`` is given.
        If 'active' the oldest version that's still active is used. Since
        there may be more than one active version of a dataset, and those
        versions may fundamentally be different from one another, setting an
        exact version is highly recommended.
    :param cache: boolean, default=True
        Whether to cache downloaded datasets using joblib.
    :param data_home: optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
    :param as_frame: boolean, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric, string or categorical). The target is
        a pandas DataFrame or Series depending on the number of target_columns.
        The Bunch will contain a ``frame`` attribute with the target and the
        data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas
        DataFrames or Series as described above.
    :param return_X_y: boolean, default=False.
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

    Returns
    -------
    :return:
        Dictionary-like object, with the following attributes.

         * data
            ndarray, shape (284807, 29); if ``as_frame`` is True, ``data`` is a pandas object.
         * target
            ndarray, shape (284807,); if ``as_frame`` is True, ``target`` is a pandas object.
         * feature_names
            Array of ordered feature names used in the dataset.
         * DESCR
            Description of the creditcard dataset. Best to use print.

    Notes
    -----
    This dataset consists of 284807 samples and 29 features.
    """
    return fetch_openml(
        data_id=1597,
        data_home=data_home,
        cache=cache,
        as_frame=as_frame,
        return_X_y=return_X_y,
    )
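# Usage sketch (an addition, not part of the original source): load the credit
# card data and verify the class imbalance quoted in the docstring above.
if __name__ == "__main__":
    X, y = fetch_creditcard(return_X_y=True, as_frame=True)
    print(X.shape)                           # (284807, 29) per the table above
    print(y.value_counts(normalize=True))    # frauds should be roughly 0.172%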
Example #29
  category support <categorical_support_gbdt>` of the
  :class:`~ensemble.HistGradientBoostingRegressor` estimator.

We will work with the Ames Iowa Housing dataset, which consists of numerical
and categorical features, where the houses' sale prices are the target.

"""

# %%
# Load Ames Housing dataset
# -------------------------
# First, we load the Ames Housing data as a pandas dataframe. The features
# are either categorical or numerical:
from sklearn.datasets import fetch_openml

X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)

# Select only a subset of features of X to make the example faster to run
categorical_columns_subset = [
    "BldgType",
    "GarageFinish",
    "LotConfig",
    "Functional",
    "MasVnrType",
    "HouseStyle",
    "FireplaceQu",
    "ExterCond",
    "ExterQual",
    "PoolQC",
]
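# Sketch (an addition, not from the original snippet): feed this categorical
# subset to the native category support mentioned above. The fillna step and
# the plain ordinal encoding are illustrative assumptions, not the original
# pipeline.
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

X_cat = X[categorical_columns_subset].astype(object).fillna("missing")
model = make_pipeline(
    OrdinalEncoder(),
    HistGradientBoostingRegressor(
        # after encoding, every column of this subset is categorical
        categorical_features=list(range(len(categorical_columns_subset))),
        random_state=0,
    ),
)
model.fit(X_cat, y)
print("R^2 on the training subset:", model.score(X_cat, y))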
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.cluster import KMeans
import numpy as np
data_dir = 'data'
mnist = fetch_openml('mnist_784', version=1, data_home=data_dir, as_frame=False)

print("Shape of mnist", mnist.data.shape)
k = 10
N = 10000

X = mnist.data[np.random.choice(mnist.data.shape[0], N)]
kmeans = KMeans(n_clusters=k).fit(X)
predict = kmeans.predict(X)
#change some of the libraries
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

#import pandas as pd
import numpy as np
from pathlib import Path
#import matplotlib.pyplot as plt

datadir = Path('B-1')
mnist = fetch_openml('mnist_784')

print(mnist.data.shape)
#print(mnist.COL_NAMES)
print(mnist.target.shape)
print(np.unique(mnist.target))

#change this part so that it is our training data
# test_size: what proportion of original data is used for test set
train_img, test_img, train_lbl, test_lbl = train_test_split(mnist.data,
                                                            mnist.target,
                                                            test_size=1 / 7.0,
                                                            random_state=122)

scaler = StandardScaler()
# Fit on training set only.
#!/usr/bin/env python
# coding: utf-8

# In[1]:

## Load the data

import sys, os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')  # load the data with sklearn
"""
Description of the mnist keys
data : 70,000 images of 28*28, automatically stored as flattened 784-dimensional vectors, type : float64, shape : (70000, 784)
target : labels (0-9) for data, type : object, shape : (70000,)
"""

# In[2]:

mnist_x = mnist.data  # X
mnist_y = mnist.target  # Y
mnist_y = mnist_y.astype("int32")  # the labels are stored as strings; convert them to int

# In[3]:

# randomly draw 10,000 indices from the full data to use as the test set
np.random.seed(seed=50)
test_idx = np.random.choice(mnist_x.shape[0], 10000, replace=False)

# In[4]:
# Author: Adam Kleczewski
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

print(__doc__)

# Load a multi-label dataset from https://www.openml.org/d/40597
X, Y = fetch_openml("yeast", version=4, return_X_y=True)
Y = Y == "TRUE"
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper.
base_lr = LogisticRegression()
ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average="samples")

# Fit an ensemble of logistic regression classifier chains and take the
# %%
# Load Data and train model
# -------------------------
# For this example, we load a blood transfusion service center data set from
# `OpenML <https://www.openml.org/d/1464>`_. This is a binary classification
# problem where the target is whether an individual donated blood. Then the
# data is split into a train and test dataset and a logistic regression is
# fitted with the train dataset.
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = fetch_openml(data_id=1464, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
clf.fit(X_train, y_train)

# %%
# Create :class:`ConfusionMatrixDisplay`
##############################################################################
# With the fitted model, we compute the predictions of the model on the test
# dataset. These predictions are used to compute the confusion matrix which
# is plotted with the :class:`ConfusionMatrixDisplay`.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

y_pred = clf.predict(X_test)
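# Sketch of the step described above (the original snippet stops at the
# predictions): build the confusion matrix and plot it with
# ConfusionMatrixDisplay. matplotlib is assumed to be available.
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()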
# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

plt.figure()
for dataset_name in datasets:
    # loading and vectorization
    print('loading data')
    if dataset_name in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
                                 random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dataset_name == 'shuttle':
        dataset = fetch_openml('shuttle')
        X = dataset.data
        y = dataset.target
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dataset_name == 'forestcover':
        dataset = fetch_covtype()
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
import numpy as np
import matplotlib.pyplot as plt
from deslib.dcs import MCB
from deslib.dcs import OLA
from deslib.dcs import Rank
from deslib.des import DESP
from deslib.des import KNORAE
from deslib.des import KNORAU
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_openml

rng = np.random.RandomState(123456)

data = fetch_openml(name='diabetes', cache=False)
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

# Normalizing the dataset to have 0 mean and unit variance.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Training a pool of classifiers using the bagging technique.
pool_classifiers = BaggingClassifier(DecisionTreeClassifier(random_state=rng),
                                     random_state=rng)
pool_classifiers.fit(X_train, y_train)
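# Sketch (an addition, not from the original snippet): fit one of the dynamic
# selection methods imported above on the pool and report its accuracy. Using
# the training data as the DSEL here is a simplification for illustration.
knorau = KNORAU(pool_classifiers)
knorau.fit(X_train, y_train)
print("KNORA-U test accuracy:", knorau.score(X_test, y_test))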

###############################################################################
#          Olivier Grisel <*****@*****.**>
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

##############################################################################
# The French Motor Third-Party Liability Claims dataset
# -----------------------------------------------------
#
# Let's load the motor claim dataset from OpenML:
# https://www.openml.org/d/41214

from sklearn.datasets import fetch_openml

df = fetch_openml(data_id=41214, as_frame=True).frame
df

# %%
# The number of claims (``ClaimNb``) is a positive integer that can be modeled
# as a Poisson distribution. It is then assumed to be the number of discrete
# events occurring with a constant rate in a given time interval (``Exposure``,
# in units of years).
#
# Here we want to model the frequency ``y = ClaimNb / Exposure`` conditionally
# on ``X`` via a (scaled) Poisson distribution, and use ``Exposure`` as
# ``sample_weight``.

df["Frequency"] = df["ClaimNb"] / df["Exposure"]

print("Average Frequency = {}".format(
from neural_network.neuralnetwork import NeuralNetwork
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import datasets
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt


print("[INFO] loading MNIST (sample) dataset...")
dataset = fetch_openml('mnist_784')
data = dataset.data.astype("float") / 255.0

(trainX, testX, trainY, testY) = train_test_split(data, dataset.target, test_size = 0.25)
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

model = Sequential()
model.add(Dense(256, input_shape=(784,), activation="sigmoid"))
model.add(Dense(128, activation="sigmoid"))
model.add(Dense(10, activation="softmax"))

print("[INFO] training network...")
sgd = SGD(0.01)
model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])
H = model.fit(trainX, trainY, validation_data=(testX, testY), epochs=100, batch_size=128)
# Author: Adam Kleczewski
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

print(__doc__)

# Load a multi-label dataset from https://www.openml.org/d/40597
X, Y = fetch_openml('yeast', version=4, return_X_y=True)
Y = Y == 'TRUE'
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,
                                                    random_state=0)

# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper.
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')

# Fit an ensemble of logistic regression classifier chains and take the
# take the average prediction of all the chains.
chains = [ClassifierChain(LogisticRegression(), order='random', random_state=i)
          for i in range(10)]
Example #40
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate
import functools
import sklearn.metrics as skm
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# %%
# Next, we import the data:

data = fetch_openml(data_id=1590, as_frame=True)
X_raw = data.data
Y = (data.target == '>50K') * 1

# %%
# For purposes of clarity, we consolidate the 'race' column to have
# three unique values:


def race_transform(input_str):
    """Reduce values to White, Black and Other."""
    result = 'Other'
    if input_str == 'White' or input_str == 'Black':
        result = input_str
    return result
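# Continuation sketch (an addition, not from the original snippet): apply the
# mapping above to obtain the consolidated sensitive-feature column.
A = X_raw["race"].map(race_transform)
print(A.value_counts())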
Example #41
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
seed = 0
np.random.seed(seed)
import tensorflow as tf
tf.random.set_seed(seed)
import os

data = fetch_openml('hls4ml_lhc_jets_hlf')
X, y = data['data'], data['target']

le = LabelEncoder()
y = le.fit_transform(y)
y = to_categorical(y, 5)
X_train_val, X_test, y_train_val, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1
from qkeras.qlayers import QDense, QActivation
from qkeras.quantizers import quantized_bits, quantized_relu
from callbacks import all_callbacks

model = Sequential()
Example #42
    plt.ylim([0, 1])

def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

mnist = fetch_openml('mnist_784')
X = mnist['data']
y = mnist['target']

## visualise digit(s)
target_index = 11
some_digit = X[target_index]
plt.imshow(some_digit.reshape(28, 28), cmap=plt.cm.binary)
plt.axis('off')
plt.title('target = ' + y[target_index])
plt.show()


## training/test split
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
randomise_set = np.random.permutation(60000)