def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        col_name = data_bunch.feature_names[col_idx]
        if col_name in data_bunch.categories:
            # XXX: This would be faster with np.take, although it does not
            # handle missing values fast (also not with mode='wrap')
            cat = data_bunch.categories[col_name]
            result = [cat[idx] if 0 <= idx < len(cat) else None
                      for idx in data_bunch.data[:, col_idx].astype(int)]
            return np.array(result, dtype='O')
        else:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]

    data_bunch = fetch_openml(data_id=data_id, cache=False,
                              target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse is True:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'],
                                    sparse, None, False)
    data_downloaded = np.array(data_arff['data'], dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems
        # with missing values
        np.testing.assert_array_equal(data_downloaded[:, i],
                                      decode_column(data_bunch, i))
def test_fetch_openml_notarget(monkeypatch, gzip_response):
    data_id = 61
    target_column = None
    expected_observations = 150
    expected_features = 5

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    data = fetch_openml(data_id=data_id, target_column=target_column,
                        cache=False)
    assert data.data.shape == (expected_observations, expected_features)
    assert data.target is None
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache '
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())

    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)
def load_mnist(n_samples=None, class_0='0', class_1='8'):
    """Load MNIST, select two classes, shuffle and return only n_samples."""
    # Load data from http://openml.org/d/554
    mnist = fetch_openml('mnist_784', version=1)

    # take only two classes for binary classification
    mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)

    X, y = shuffle(mnist.data[mask], mnist.target[mask], random_state=42)
    if n_samples is not None:
        X, y = X[:n_samples], y[:n_samples]
    return X, y
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Load MNIST, optionally shuffle it, and scale features to [0, 1]."""
    print("Loading dataset...")
    data = fetch_openml('mnist_784')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    if shuffle:
        X, y = _shuffle(X, y, random_state=seed)

    # Normalize features
    X /= 255
    return X, y
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3)
              for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        # np.int was removed in NumPy 1.24; the builtin int works everywhere
        X = make_low_rank_matrix(n_samples=500, n_features=int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, sparsity // 10)
        data = np.repeat(data, 10)
        # sparse matrix coordinates must be integers
        row = np.random.randint(0, small_size, sparsity)
        col = np.random.randint(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)),
                                 shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name).data
    return X
def load_data(dtype=np.float32, order='F'):
    """Load MNIST, normalize it and create the train/test split."""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_openml('mnist_784')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 60000
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs
from pykeops.torch import LazyTensor

use_cuda = torch.cuda.is_available()
print(use_cuda)
tensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

######################################################################
# Load the MNIST dataset: 70,000 images of shape (28,28).

try:
    from sklearn.datasets import fetch_openml
except ImportError:
    raise ImportError("This tutorial requires Scikit Learn version >= 0.20.")

mnist = fetch_openml('mnist_784', cache=False)

x = tensor(mnist.data.astype('float32'))
print(x.shape)  # .shape is an attribute, not a method
y = tensor(mnist.target.astype('int64'))
print(y.shape)

######################################################################
# Split it into a train and test set:

D = x.shape[1]
Ntrain, Ntest = (60000, 10000) if use_cuda else (1000, 100)
x_train, y_train = x[:Ntrain, :], y[:Ntrain]
x_test, y_test = x[Ntrain:Ntrain + Ntest, :], y[Ntrain:Ntrain + Ntest]

######################################################################
picture = Image.open(filepath)
picture.save(file, "JPEG", optimize=True, quality=50)
newsize = os.stat(os.path.join(os.getcwd(), file)).st_size
percent = (oldsize - newsize) / float(oldsize) * 100
if verbose:
    print(file + " compressed from {0} to {1} or {2}%".format(
        oldsize, newsize, percent))


def tSNE(dataset):
    time_start = time.time()
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(dataset)
    print('t-SNE done! Time elapsed: {} seconds'.format(
        time.time() - time_start))
    return tsne_results


current_directory = os.path.dirname(os.path.abspath(__file__))

mnist = fetch_openml(name="mnist_784")
# with old versions, fetch_mldata is faster when .mat file is available locally
# mnist = fetch_mldata("MNIST original")
X = mnist.data / 255.0
# np.float was removed in NumPy 1.24; use the builtin float instead
y = mnist['target'].astype(float)
print(X.shape, y.shape)

feat_cols = ['pixel' + str(i) for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feat_cols)
df['y'] = y
df['label'] = df['y'].apply(lambda i: str(i))
X, y = None, None
print('Size of the dataframe: {}'.format(df.shape))

# For reproducibility of the results
from sklearn.metrics import accuracy_score
import matplotlib.patches as mpatches
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import random
import copy
# from sklearn.utils import resample (this is used for bootstrapping).
# from sklearn.model_selection import GridSearchCV (this is used to do grid search to find the optimal hyperparameters).

sns.set()

column_names = "DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N pAKT_N pBRAF_N pCAMKII_N pCREB_N pELK_N pERK_N pJNK_N PKCA_N pMEK_N pNR1_N pNR2A_N pNR2B_N pPKCAB_N pRSK_N AKT_N BRAF_N CAMKII_N CREB_N ELK_N ERK_N GSK3B_N JNK_N MEK_N TRKA_N RSK_N APP_N Bcatenin_N SOD1_N MTOR_N P38_N pMTOR_N DSCR1_N AMPKA_N NR2B_N pNUMB_N RAPTOR_N TIAM1_N pP70S6_N NUMB_N P70S6_N pGSK3B_N pPKCG_N CDK5_N S6_N ADARB1_N AcetylH3K9_N RRP1_N BAX_N ARC_N ERBB4_N nNOS_N Tau_N GFAP_N GluR3_N GluR4_N IL1B_N P3525_N pCASP9_N PSD95_N SNCA_N Ubiquitin_N pGSK3B_Tyr216_N SHH_N BAD_N BCL2_N pS6_N pCFOS_N SYP_N H3AcK18_N EGR1_N H3MeK4_N CaNA_N"
column_names = column_names.split()

mice = fetch_openml(name='miceprotein', version=4)

parsing_targets = [0, 150, 300, 435, 570, 705, 840, 945, len(mice.target)]
parsing_groups = [
    'c-CS-m', 'c-SC-m', 'c-CS-s', 'c-SC-s',
    't-CS-m', 't-SC-m', 't-CS-s', 't-SC-s'
]


def Analysis_Choice(parsing_targets, parsing_groups):
    group_name = input('Enter group code: ')
    i = parsing_groups.index(group_name)
    return i


def Combined_Data(parsing_targets, parsing_groups, mice, column_names):
@author: prbpedro
"""
import numpy
import pandas
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import seaborn
import matplotlib.pyplot
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, validation_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

bunch = fetch_openml('mnist_784', version=1)

pd_dataframe = pandas.DataFrame(data=numpy.c_[bunch['data'], bunch['target']],
                                columns=bunch['feature_names'] + ['target'])
pd_dataframe.info()
pd_dataframe.isnull().any().describe()

seaborn.set()
seaborn.countplot(x="target", data=pd_dataframe)
matplotlib.pyplot.show()

pd_data_dataframe = pd_dataframe.loc[:, pd_dataframe.columns != 'target']
sample_digit = pd_data_dataframe.iloc[2000]
# numpy.float was removed in NumPy 1.24; use the builtin float
sample_digit_image = sample_digit.values.reshape(28, 28).astype(float)
matplotlib.pyplot.imshow(sample_digit_image,
# DH = DataHolder("C:/Users/Pontus/Desktop/Dippa/dataset")
# subs = list(DH.subjects.keys())
# D, i = DH.subjects.get(subs[3]).getDataAndInfoForSubject()
# dataset = SmallDataset(D, i)
# dataset.setAsPairwise()
# print(len(dataset))
# x1, x2, y = dataset[0:10]
# print(y)
# %%
# print(np.shape(D))
# print(D[0,0,0,0])
# print(np.expand_dims(D,1).shape)
# print(type(i))
# print(type(pd.core.frame.DataFrame()))

X_d, y_d = fetch_openml('mnist_784', version=1, return_X_y=True)
# %%
randInd = np.random.choice(np.arange(0, 69000, 1), (1000))
print(randInd)
# %%
X = X_d[randInd, :]
y = y_d[randInd]
# print(np.max(X))
# print(X.shape, " ", y.shape)
X = np.reshape(X, (1000, 28, 28, 1))
dataset2 = SmallDataset(X, y)
# %%
dataset2.setAsTriplet()
def __init__(self, dataset_name='cifar_10_small'):
    self.data = fetch_openml(name=dataset_name)
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    # fetches a dataset in three different ways from OpenML, using the
    # fetch_openml function, and does various checks on the validity of the
    # result. Note that this function can be mocked (by invoking
    # _monkey_patch_webbased_functions before invoking this function)
    data_by_name_id = fetch_openml(name=data_name, version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    # Please note that cache=False is crucial, as the monkey patched files are
    # not consistent with reality
    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations, expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
        assert data_by_id.target_names == [target_column]
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
        assert data_by_id.target_names == target_column
    # use the expected dtype passed in, rather than hard-coding np.float64
    assert data_by_id.data.dtype == expected_data_dtype
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, str)

    # TODO: pass in a list of expected nominal features
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target,
                                  data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(data_by_id.data)) ==
                expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
def main():
    mnist = fetch_openml(name='mnist_784')
    echantillon = np.random.randint(70000, size=5000)
    data = mnist.data[echantillon]
    target = mnist.target[echantillon]
    xtrain, xtest, ytrain, ytest = train_test_split(data, target,
                                                    train_size=0.7)

    mlp = neural_network.MLPClassifier(hidden_layer_sizes=(50))
    mlp.fit(xtrain, ytrain)
    score = mlp.score(xtest, ytest)
    print(f"Score with mlp.score: {score}")

    # Class of image 4 and its predicted class.
    print(mnist.target[4])
    # reshape the single sample to a 2D array
    print(mlp.predict(mnist.data[4].reshape((1, -1))))

    # Compute precision with metrics.precision_score
    ypredTest = mlp.predict(xtest)
    precision = metrics.precision_score(ytest, ypredTest, average='micro')
    print(f"Score with the precision_score function: {precision}")

    # Vary the number of layers between 2 and 100
    _50neuron_layer = []
    print("Varying the number of layers from 2 to 100:")
    for nb_layer in range(2, 101):
        mlp = neural_network.MLPClassifier(
            hidden_layer_sizes=tuple([50 for i in range(nb_layer)]))
        start_training = time.time()
        mlp.fit(xtrain, ytrain)
        final_training = time.time() - start_training
        start_prediction = time.time()
        ypred = mlp.predict(xtest)
        final_prediction = time.time() - start_prediction
        error = metrics.zero_one_loss(ytest, ypred)
        _50neuron_layer.append(
            (nb_layer, final_training, final_prediction, error))
        print(f"\t {_50neuron_layer[-1]}")
    _50neuron_layer_list = list(zip(*_50neuron_layer))
    plot_fig(_50neuron_layer_list)

    create_neural_network(tuple(range(60, 10, -1)), xtrain, ytrain,
                          xtest, ytest,
                          "50 layers from 60 down to 11 neurons")
    create_neural_network(
        tuple(list(range(60, 32, -3)) + list(range(31, 12, -2))),
        xtrain, ytrain, xtest, ytest,
        "50 layers, steps of -3 then -2 neurons")
    create_neural_network((14, 36, 64), xtrain, ytrain, xtest, ytest,
                          "3 layers of 14, 36, 64 neurons")
    create_neural_network(
        (14, 36, 64, 112, 176, 204, 226, 283), xtrain, ytrain, xtest, ytest,
        "8 layers of 14, 36, 64, 112, 176, 204, 226, 283 neurons")
    create_neural_network((64, 92, 117, 208, 117, 92, 64), xtrain, ytrain,
                          xtest, ytest,
                          "7 layers of 64, 92, 117, 208, 117, 92, 64 neurons")

    solving = []
    print("Changing the solver:")
    for solver in ['lbfgs', 'sgd', 'adam']:
        mlp = neural_network.MLPClassifier(
            hidden_layer_sizes=(64, 92, 117, 208, 117, 92, 64),
            solver=solver)
        start_training = time.time()
        mlp.fit(xtrain, ytrain)
        final_training = time.time() - start_training
        start_prediction = time.time()
        ypred = mlp.predict(xtest)
        final_prediction = time.time() - start_prediction
        error = metrics.zero_one_loss(ytest, ypred)
        solving.append((solver, final_training, final_prediction, error))
        print(f"\t {solving[-1]}")
    solving_list = list(zip(*solving))
    plot_fig(solving_list)

    activ = []
    print("Varying the activation:")
    for activation in ['identity', 'logistic', 'tanh', 'relu']:
        mlp = neural_network.MLPClassifier(
            hidden_layer_sizes=(64, 92, 117, 208, 117, 92, 64),
            activation=activation)
        start_training = time.time()
        mlp.fit(xtrain, ytrain)
        final_training = time.time() - start_training
        start_prediction = time.time()
        ypred = mlp.predict(xtest)
        final_prediction = time.time() - start_prediction
        error = metrics.zero_one_loss(ytest, ypred)
        activ.append((activation, final_training, final_prediction, error))
        print(f"\t {activ[-1]}")
    activ_liste = list(zip(*activ))
    plot_fig(activ_liste)

    regul = []
    print("Effect of regularization:")
    for regularisation in np.arange(0.0001, 0.01, 0.001):
        mlp = neural_network.MLPClassifier(
            hidden_layer_sizes=(64, 92, 117, 208, 117, 92, 64),
            alpha=regularisation)
        start_training = time.time()
        mlp.fit(xtrain, ytrain)
        final_training = time.time() - start_training
        start_prediction = time.time()
        ypred = mlp.predict(xtest)
        final_prediction = time.time() - start_prediction
        error = metrics.zero_one_loss(ytest, ypred)
        regul.append((regularisation, final_training, final_prediction,
                      error))
        print(f"\t {regul[-1]}")
    regul_liste = list(zip(*regul))
    plot_fig(regul_liste)

    best_layer = (64, 92, 117, 208, 117, 92, 64)
    best_solver = "adam"
    best_activation = "relu"
    best_regularisation = 0.008
    best_mlp = neural_network.MLPClassifier(hidden_layer_sizes=best_layer,
                                            solver=best_solver,
                                            activation=best_activation,
                                            alpha=best_regularisation)
    start_training = time.time()
    best_mlp.fit(xtrain, ytrain)
    best_final_entrainement = time.time() - start_training
    start_prediction = time.time()
    ypred = best_mlp.predict(xtest)
    best_final_prediction = time.time() - start_prediction
    cross_val = model_selection.cross_val_score(best_mlp, data, target,
                                                cv=10)
    best_error = 1 - np.mean(cross_val)
    print(f"Training time: {best_final_entrainement}")
    print(f"Prediction time: {best_final_prediction}")
    print(f"Error: {best_error}")

    cm = confusion_matrix(ytest, ypred)
    df_cm = pd.DataFrame(cm, columns=np.unique(ytest),
                         index=np.unique(ytest))
    df_cm.index.name = 'Actual value'
    df_cm.columns.name = 'Predicted value'
    plt.figure(figsize=(16, 9))
    sn.heatmap(df_cm, cmap="Blues", annot=True)
    plt.show()
'''
Created on Mar 18, 2020

@author: alexk101
'''
from __future__ import print_function
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

mnist = fetch_openml("mnist_784", version=1)
output = open(r"TSNE_2D.txt", "w")

standardized_data = StandardScaler().fit_transform(mnist.data)
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=1000)
tsne_results = tsne.fit_transform(standardized_data)

output.write("70000 2 \n")
# write into the already-open handle; passing the filename to np.savetxt
# would reopen and truncate the file, dropping the header line above
np.savetxt(output, tsne_results)
output.close()
print("Test Complete")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn import manifold

# %matplotlib inline

data = datasets.fetch_openml("mnist_784", version=1, return_X_y=True)
pixel_values, targets = data
targets = targets.astype(int)
# %%
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# %%
# Load data from https://www.openml.org/d/40945
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Alternatively X and y can be obtained directly from the frame attribute:
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

# %%
# Use ``ColumnTransformer`` by selecting column by names
#
# We will train our classifier with the following features:
#
# Numeric Features:
#
# * ``age``: float;
# * ``fare``: float.
#
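# %%
# A minimal sketch of how a snippet like the one above typically continues
# (the exact pipelines below are an assumption, not part of the original
# excerpt): numeric columns are imputed and scaled, categorical columns are
# imputed and one-hot encoded, and ``ColumnTransformer`` routes each group
# of columns by name into its own preprocessing pipeline.
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# the preprocessor plugs into a full prediction pipeline
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression()),
])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))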
def fetch_bank_marketing(*, cache=True, data_home=None,
                         as_frame=False, return_X_y=False):
    """Load the UCI bank marketing dataset (binary classification).

    Download it if necessary.

    ==============   ====================
    Samples total                   45211
    Dimensionality                     17
    Features         numeric, categorical
    Classes                             2
    ==============   ====================

    Source: UCI Repository [3]_
    Paper: Moro et al., 2014 [4]_

    The data is related with direct marketing campaigns of a Portuguese
    banking institution. The marketing campaigns were based on phone calls.
    Often, more than one contact to the same client was required, in order
    to assess if the product (bank term deposit) would be (or not)
    subscribed.

    The classification goal is to predict if the client will subscribe a
    term deposit (variable y).

    Parameters
    ----------
    cache : bool, default=True
        Whether to cache downloaded datasets using joblib.

    data_home : str, default=None
        Specify another download and cache folder for the datasets.
        By default, all data is stored in '~/.fairlearn-data' subfolders.

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric, string or categorical). The target is
        a pandas DataFrame or Series depending on the number of
        target_columns. The Bunch will contain a ``frame`` attribute with
        the target and the data. If ``return_X_y`` is True, then
        ``(data, target)`` will be pandas DataFrames or Series as
        described above.

    return_X_y : bool, default=False
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

    Returns
    -------
    dataset : :obj:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray, shape (45211, 17)
            Each row corresponding to the 17 feature values in order.
            If ``as_frame`` is True, ``data`` is a pandas object.
        target : numpy array of shape (45211,)
            Each value represents whether the client subscribed a term
            deposit, 'yes' if the client subscribed and 'no' otherwise.
            If ``as_frame`` is True, ``target`` is a pandas object.
        feature_names : list of length 17
            Array of ordered feature names used in the dataset.
        DESCR : string
            Description of the UCI bank marketing dataset.

    (data, target) : tuple of (numpy.ndarray, numpy.ndarray)
        if ``return_X_y`` is True and ``as_frame`` is False

    (data, target) : tuple of (pandas.DataFrame, pandas.Series)
        if ``return_X_y`` is True and ``as_frame`` is True

    References
    ----------
    .. [3] S. Moro, P. Cortez, and P. Rita, UCI Machine Learning Repository:
       Bank Marketing Data Set, 14-Feb-2014. [Online]. Available:
       https://archive.ics.uci.edu/ml/datasets/Bank+Marketing.

    .. [4] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict
       the Success of Bank Telemarketing. Decision Support Systems, Elsevier,
       62:22-31, June 2014
    """
    if not data_home:
        data_home = pathlib.Path().home() / _DOWNLOAD_DIRECTORY_NAME
    return fetch_openml(
        data_id=1461,
        data_home=data_home,
        cache=cache,
        as_frame=as_frame,
        return_X_y=return_X_y,
    )
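# A hedged usage sketch for the loader above (not part of the original
# module): fetch the features and the target directly, as the docstring
# describes for the ``as_frame`` and ``return_X_y`` options.
X, y = fetch_bank_marketing(as_frame=True, return_X_y=True)
print(X.shape)
print(y.value_counts())  # distribution of the 'yes'/'no' target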
from sklearn import datasets
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from keras.models import Sequential
import keras

mnist = fetch_openml(
    'mnist_784',
    version=1,
)
# (X_train, Y_train), (X_test, Y_test) = mnist.load_data()

n = len(mnist.data)
N = 10000
indices = np.random.permutation(range(n))[:N]
# print(indices)

X = mnist.data[indices]
y = mnist.target[indices]
Y = np.eye(10)[y.astype(int)]

print(X, Y)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)

'''
Model configuration
'''
n_in = len(X[0])
n_hidden = 2000
n_out = len(Y[0])

model = Sequential()
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    # fetches a dataset in three different ways from OpenML, using the
    # fetch_openml function, and does various checks on the validity of the
    # result. Note that this function can be mocked (by invoking
    # _monkey_patch_webbased_functions before invoking this function)
    data_by_name_id = fetch_openml(name=data_name, version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations, expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
    # use the expected dtype passed in, rather than hard-coding np.float64
    assert data_by_id.data.dtype == expected_data_dtype
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, string_types)

    # TODO: pass in a list of expected nominal features
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target,
                                  data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(data_by_id.data)) ==
                expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
features in the dataset. Therefore the first layer weight matrix has the
shape (784, hidden_layer_sizes[0]). We can therefore visualize a single
column of the weight matrix as a 28x28 pixel image.

To make the example run faster, we use very few hidden units, and train
only for a very short time. Training longer would result in weights with a
much smoother spatial appearance.
"""
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.neural_network import MLPClassifier

print(__doc__)

# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

# rescale the data, use the traditional train/test split
X = X / 255.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
#                     solver='sgd', verbose=10, tol=1e-4, random_state=1)
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.1)

mlp.fit(X_train, y_train)
print("Training set score: %f" % mlp.score(X_train, y_train))
print("Test set score: %f" % mlp.score(X_test, y_test))
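# A sketch of the visualization described in the prose above (assuming the
# fitted ``mlp`` from this snippet): each column of the first-layer weight
# matrix ``mlp.coefs_[0]`` is reshaped to a 28x28 image and drawn in a grid.
fig, axes = plt.subplots(4, 4)
# use global min / max to ensure all weights are shown on the same scale
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray,
               vmin=0.5 * vmin, vmax=0.5 * vmax)
    ax.set_xticks(())
    ax.set_yticks(())
plt.show()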
# Author: Adam Kleczewski
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

print(__doc__)

# Load a multi-label dataset from https://www.openml.org/d/40597
X, Y = fetch_openml('yeast', version=4, return_X_y=True)
Y = Y == 'TRUE'
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,
                                                    random_state=0)

# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper.
base_lr = LogisticRegression()
ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')

# Fit an ensemble of logistic regression classifier chains and take the
""" import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import fetch_openml from sklearn.model_selection import KFold from celer import LassoCV from celer.plot_utils import configure_plt print(__doc__) configure_plt() print("Loading data...") dataset = fetch_openml("leukemia") X = np.asfortranarray(dataset.data.astype(float)) y = 2 * ((dataset.target == "AML") - 0.5) kf = KFold(shuffle=True, n_splits=3, random_state=0) model = LassoCV(cv=kf, n_jobs=3) model.fit(X, y) print("Estimated regularization parameter alpha: %s" % model.alpha_) ############################################################################### # Display results plt.figure(figsize=(7, 3.5), constrained_layout=True) plt.semilogx(model.alphas_, model.mse_path_, ':') plt.semilogx(model.alphas_,
# sklearn.externals.joblib was removed in scikit-learn 0.23; import joblib
# directly instead
import joblib
from sklearn import datasets
from skimage.feature import hog
from sklearn.svm import LinearSVC
import numpy as np
from collections import Counter

receivedData = datasets.fetch_openml('mnist_784')
features = np.array(receivedData.data, 'int16')
labels = np.array(receivedData.target, 'int')

list_hog_fd = []
for feature in features:
    fd = hog(feature.reshape((28, 28)), orientations=9,
             pixels_per_cell=(14, 14), cells_per_block=(1, 1),
             visualize=False)
    list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')

print("Count of digits in receivedData: ", Counter(labels))

clf = LinearSVC()
clf.fit(hog_features, labels)
joblib.dump(clf, "digits_cls.pkl", compress=3)
t0 = time.time()
if fit_with_y:
    X_reduced = transformer.fit_transform(X, y)
else:
    X_reduced = transformer.fit_transform(X)
t1 = time.time()
reducer_name = "+".join([
    type(step[1]).__name__
    for step in transformer.get_params()['steps']
]) if is_pipeline else type(transformer).__name__
print("{} took {:.1f}s (on {} MNIST images)".format(
    reducer_name, t1 - t0, len(X)))
plot_digits(X_reduced, y, images=X, figsize=(35, 25))
plt.show()


mnist = fetch_openml("mnist_784")
X = mnist['data']
y = mnist['target']

random_indices = np.random.permutation(60000)
X = X[random_indices]
y = y[random_indices]

plot_2_dims(PCA(n_components=2, random_state=42),
            X[:2000], y[:2000])  # 0.1s
plot_2_dims(LocallyLinearEmbedding(n_components=2, random_state=42),
            X[:2000], y[:2000])  # 12.6s
plot_2_dims(MDS(n_components=2, random_state=42),
            X[:2000], y[:2000])  # 365.3s
def fetch_creditcard(*, cache=True, data_home=None,
                     as_frame=False, return_X_y=False):
    """
    Load the creditcard dataset. Download it if necessary.

    Note that internally this is using `fetch_openml` from scikit-learn,
    which is experimental.

    ==============   ==============
    Samples total            284807
    Dimensionality               29
    Features                   real
    Target                 int 0, 1
    ==============   ==============

    The dataset contains transactions made by credit cards in September 2013
    by European cardholders. It presents transactions that occurred over two
    days, with 492 frauds out of 284,807 transactions. The dataset is highly
    unbalanced: the positive class (frauds) accounts for 0.172% of all
    transactions.

    Please cite:
        Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca
        Bontempi. Calibrating Probability with Undersampling for Unbalanced
        Classification. In Symposium on Computational Intelligence and Data
        Mining (CIDM), IEEE, 2015

    :param cache: boolean, default=True
        Whether to cache downloaded datasets using joblib.
    :param data_home: optional, default: None
        Specify another download and cache folder for the datasets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.
    :param as_frame: boolean, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric, string or categorical). The target is
        a pandas DataFrame or Series depending on the number of
        target_columns. The Bunch will contain a ``frame`` attribute with
        the target and the data. If ``return_X_y`` is True, then
        ``(data, target)`` will be pandas DataFrames or Series as described
        above.
    :param return_X_y: boolean, default=False
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

    :return: Dictionary-like object, with the following attributes.

        * data : ndarray, shape (284807, 29)
          If ``as_frame`` is True, ``data`` is a pandas object.
        * target : ndarray, shape (284807,)
          If ``as_frame`` is True, ``target`` is a pandas object.
        * feature_names : Array of ordered feature names used in the dataset.
        * DESCR : Description of the creditcard dataset. Best to use print.

    Notes
    -----
    This dataset consists of 284807 samples and 29 features.
    """
    return fetch_openml(
        data_id=1597,
        data_home=data_home,
        cache=cache,
        as_frame=as_frame,
        return_X_y=return_X_y,
    )
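# A hedged usage sketch for the loader above (not part of the original
# module): load the Bunch as a frame and check the class imbalance the
# docstring quotes.
data = fetch_creditcard(as_frame=True)
print(data.data.shape)              # (284807, 29) per the docstring
print((data.target == '1').mean())  # fraud rate, roughly 0.00172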
category support <categorical_support_gbdt>` of the
:class:`~ensemble.HistGradientBoostingRegressor` estimator.

We will work with the Ames Iowa Housing dataset, which consists of
numerical and categorical features, where the houses' sale prices are the
target.
"""

# %%
# Load Ames Housing dataset
# -------------------------
# First, we load the Ames Housing data as a pandas dataframe. The features
# are either categorical or numerical:
from sklearn.datasets import fetch_openml

X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)

# Select only a subset of features of X to make the example faster to run
categorical_columns_subset = [
    "BldgType",
    "GarageFinish",
    "LotConfig",
    "Functional",
    "MasVnrType",
    "HouseStyle",
    "FireplaceQu",
    "ExterCond",
    "ExterQual",
    "PoolQC",
]
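# %%
# A minimal sketch (an assumption, not the original example's pipeline) of
# the native category support the text above refers to: each categorical
# column is mapped to integer codes (missing values become NaN, which the
# estimator handles natively), and ``categorical_features`` flags those
# columns by index.
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

X_subset = X[categorical_columns_subset]
X_codes = X_subset.apply(
    lambda col: col.astype("category").cat.codes).replace(-1, np.nan)
hgbr = HistGradientBoostingRegressor(
    categorical_features=list(range(X_codes.shape[1])), random_state=42)
hgbr.fit(X_codes, y)
print(hgbr.score(X_codes, y))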
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.cluster import KMeans
import numpy as np

data_dir = 'data'
mnist = fetch_openml('mnist_784', version=1, data_home=data_dir,
                     as_frame=False)
print("Shape of mnist", mnist.data.shape)

k = 10
N = 10000
X = mnist.data[np.random.choice(mnist.data.shape[0], N)]
kmeans = KMeans(n_clusters=k).fit(X)
predict = kmeans.predict(X)
# change some of the libraries
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# import pandas as pd
import numpy as np
from pathlib import Path
# import matplotlib.pyplot as plt

datadir = Path('B-1')

mnist = fetch_openml('mnist_784')
print(mnist.data.shape)
# print(mnist.COL_NAMES)
print(mnist.target.shape)
print(np.unique(mnist.target))

# change this part so that it is our training data
# test_size: what proportion of original data is used for test set
train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist.data, mnist.target, test_size=1 / 7.0, random_state=122)

scaler = StandardScaler()
# Fit on training set only.
#!/usr/bin/env python
# coding: utf-8

# In[1]:

## Load the data
import sys, os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784')  # load the data via sklearn
"""
mnist keys
data   : 70,000 images of 28*28 pixels, automatically flattened to
         784-dimensional vectors, type: float64, shape: (70000, 784)
target : labels for data (0-9), type: object, shape: (70000,)
"""

# In[2]:

mnist_x = mnist.data    # X
mnist_y = mnist.target  # Y
mnist_y = mnist_y.astype("int32")  # labels are stored as strings; convert to int

# In[3]:

# Randomly pick 10,000 indices from the full data to use as test data
np.random.seed(seed=50)
test_idx = np.random.choice(mnist_x.shape[0], 10000, replace=False)

# In[4]:
# Author: Adam Kleczewski
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

print(__doc__)

# Load a multi-label dataset from https://www.openml.org/d/40597
X, Y = fetch_openml("yeast", version=4, return_X_y=True)
Y = Y == "TRUE"
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)

# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper.
base_lr = LogisticRegression()
ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average="samples")

# Fit an ensemble of logistic regression classifier chains and take the
# %%
# Load Data and train model
# -------------------------
# For this example, we load a blood transfusion service center data set from
# `OpenML <https://www.openml.org/d/1464>`_. This is a binary classification
# problem where the target is whether an individual donated blood. Then the
# data is split into a train and test dataset and a logistic regression is
# fitted with the train dataset.
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = fetch_openml(data_id=1464, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
clf.fit(X_train, y_train)

# %%
# Create :class:`ConfusionMatrixDisplay`
##############################################################################
# With the fitted model, we compute the predictions of the model on the test
# dataset. These predictions are used to compute the confusion matrix which
# is plotted with the :class:`ConfusionMatrixDisplay`
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

y_pred = clf.predict(X_test)
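# %%
# A minimal continuation sketch (assumed, based on the comment above): build
# the confusion matrix from the test predictions and plot it with the
# display object.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=clf.classes_)
disp.plot()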
# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

plt.figure()
for dataset_name in datasets:
    # loading and vectorization
    print('loading data')
    if dataset_name in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
                                 random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dataset_name == 'shuttle':
        dataset = fetch_openml('shuttle')
        X = dataset.data
        y = dataset.target
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dataset_name == 'forestcover':
        dataset = fetch_covtype()
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
import matplotlib.pyplot as plt
from deslib.dcs import MCB
from deslib.dcs import OLA
from deslib.dcs import Rank
from deslib.des import DESP
from deslib.des import KNORAE
from deslib.des import KNORAU
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_openml

rng = np.random.RandomState(123456)

data = fetch_openml(name='diabetes', cache=False)
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

# Normalizing the dataset to have 0 mean and unit variance.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Training a pool of classifiers using the bagging technique.
pool_classifiers = BaggingClassifier(DecisionTreeClassifier(random_state=rng),
                                     random_state=rng)
pool_classifiers.fit(X_train, y_train)

###############################################################################
# Olivier Grisel <*****@*****.**>
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


##############################################################################
# The French Motor Third-Party Liability Claims dataset
# -----------------------------------------------------
#
# Let's load the motor claim dataset from OpenML:
# https://www.openml.org/d/41214
from sklearn.datasets import fetch_openml

df = fetch_openml(data_id=41214, as_frame=True).frame
df

# %%
# The number of claims (``ClaimNb``) is a positive integer that can be
# modeled as a Poisson distribution. It is then assumed to be the number of
# discrete events occurring with a constant rate in a given time interval
# (``Exposure``, in units of years).
#
# Here we want to model the frequency ``y = ClaimNb / Exposure`` conditionally
# on ``X`` via a (scaled) Poisson distribution, and use ``Exposure`` as
# ``sample_weight``.

df["Frequency"] = df["ClaimNb"] / df["Exposure"]

print("Average Frequency = {}".format(
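# %%
# A minimal sketch (an assumption, not the original example's full pipeline)
# of fitting the frequency model described above: a Poisson GLM on a small,
# hypothetical subset of the numeric columns, weighted by ``Exposure``.
from sklearn.linear_model import PoissonRegressor

features = ["DrivAge", "BonusMalus"]  # hypothetical feature subset
glm = PoissonRegressor(alpha=1e-3)
glm.fit(df[features], df["Frequency"], sample_weight=df["Exposure"])
print(glm.coef_)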
from neural_network.neuralnetwork import NeuralNetwork
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import datasets
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt

print("[INFO] loading MNIST (sample) dataset...")
dataset = fetch_openml('mnist_784')
data = dataset.data.astype("float") / 255.0
(trainX, testX, trainY, testY) = train_test_split(data, dataset.target,
                                                  test_size=0.25)

lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

model = Sequential()
model.add(Dense(256, input_shape=(784,), activation="sigmoid"))
model.add(Dense(128, activation="sigmoid"))
model.add(Dense(10, activation="softmax"))

print("[INFO] training network...")
sgd = SGD(0.01)
# pass the configured optimizer object; the string 'sgd' would silently
# ignore the 0.01 learning rate set above
model.compile(loss="categorical_crossentropy", optimizer=sgd,
              metrics=["accuracy"])
H = model.fit(trainX, trainY, validation_data=(testX, testY),
              epochs=100, batch_size=128)
# Author: Adam Kleczewski
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
# jaccard_similarity_score was removed in scikit-learn 0.23;
# jaccard_score(..., average='samples') is its multilabel replacement
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

print(__doc__)

# Load a multi-label dataset from https://www.openml.org/d/40597
X, Y = fetch_openml('yeast', version=4, return_X_y=True)
Y = Y == 'TRUE'
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,
                                                    random_state=0)

# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper.
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')

# Fit an ensemble of logistic regression classifier chains and take the
# average prediction of all the chains.
chains = [ClassifierChain(LogisticRegression(), order='random',
                          random_state=i)
          for i in range(10)]
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate
import functools
import sklearn.metrics as skm
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# %%
# Next, we import the data:

data = fetch_openml(data_id=1590, as_frame=True)
X_raw = data.data
Y = (data.target == '>50K') * 1

# %%
# For purposes of clarity, we consolidate the 'race' column to have
# three unique values:


def race_transform(input_str):
    """Reduce values to White, Black and Other."""
    result = 'Other'
    if input_str == 'White' or input_str == 'Black':
        result = input_str
    return result
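# %%
# An assumed continuation (not in the excerpt): apply the consolidation to
# the raw 'race' column and verify that only three values remain.
X_raw['race'] = X_raw['race'].astype(str).map(race_transform)
print(X_raw['race'].value_counts())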
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

seed = 0
np.random.seed(seed)
import tensorflow as tf

tf.random.set_seed(seed)
import os

data = fetch_openml('hls4ml_lhc_jets_hlf')
X, y = data['data'], data['target']

le = LabelEncoder()
y = le.fit_transform(y)
y = to_categorical(y, 5)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1
from qkeras.qlayers import QDense, QActivation
from qkeras.quantizers import quantized_bits, quantized_relu
from callbacks import all_callbacks

model = Sequential()
plt.ylim([0, 1])


def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])


def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)


mnist = fetch_openml('mnist_784')
X = mnist['data']
y = mnist['target']

## visualise digit(s)
target_index = 11
some_digit = X[target_index]
plt.imshow(some_digit.reshape(28, 28), cmap=matplotlib.cm.binary)
plt.axis('off')
plt.title('target = ' + y[target_index])
plt.show()  # show() must be called, not just referenced

## training/test split
# the test set is the last 10000 samples; slicing X[:10000] would reuse a
# subset of the training data as the test set
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
randomise_set = np.random.permutation(60000)