def load_credit_data(data_home=None):
    """Load (and cache) the UCI 'default of credit card clients' dataset.

    The Excel file is downloaded into the data home on first use and read
    from the cached copy on subsequent calls.

    Parameters
    ----------
    data_home : str, optional (default=None)
        Directory used to cache the downloaded file.  When None, the
        default scikit-learn data home (``get_data_home()``) is used.
        Added for consistency with the other ``fetch_*`` helpers in this
        project; calling with no arguments behaves exactly as before.

    Returns
    -------
    dataset : Bunch
        ``dataset.data`` holds the feature columns as a pandas DataFrame
        (the 'default payment next month' column dropped);
        ``dataset.target`` is a numpy array of that label column.
    """
    sk_data_dir = get_data_home(data_home)
    archive = RemoteFileMetadata(
        filename='default of credit card clients.xls',
        url='https://archive.ics.uci.edu/ml/machine-learning-databases/'
            '00350/default%20of%20credit%20card%20clients.xls',
        checksum=('30c6be3abd8dcfd3e6096c828bad8c2f'
                  '011238620f5369220bd60cfc82700933'))

    # Download only once; later calls reuse the cached spreadsheet.
    if not exists(join(sk_data_dir, archive.filename)):
        _fetch_remote(archive, dirname=sk_data_dir)

    # Row 0 of the sheet is a merged header, hence header=1.
    data = pd.read_excel(join(sk_data_dir, archive.filename),
                         sheet_name='Data', header=1)

    dataset = Bunch(
        data=data.drop('default payment next month', axis=1),
        target=np.array(data['default payment next month']))
    return dataset
from sklearn.utils import Bunch, check_random_state
from skimage.io import imread
from skimage.transform import resize as imresize, estimate_transform
import Polygon

from .poly_utils import isSelfIntersecting

# CONSTANTS
################################################################################
################################################################################
# Release assets of the SmartDoc15-ch1 dataset, hosted on GitHub.
ARCHIVE_BASE_URL = ('https://github.com/jchazalon/smartdoc15-ch1-dataset'
                    '/releases/download/v2.0.0')

# Archive of the reference model images.
ARCHIVE_MODELS_FILENAME = 'models.tar.gz'
ARCHIVE_MODELS = RemoteFileMetadata(
    filename=ARCHIVE_MODELS_FILENAME,
    url=ARCHIVE_BASE_URL + '/' + ARCHIVE_MODELS_FILENAME,
    checksum=('6f9068624073f76b20f88352b2bac60b'
              '9e5de5a59819fc9db37fba1ee07cce8a'))

# Archive of the video frames and their metadata.
ARCHIVE_FRAMES_FILENAME = 'frames.tar.gz'
ARCHIVE_FRAMES = RemoteFileMetadata(
    filename=ARCHIVE_FRAMES_FILENAME,
    url=ARCHIVE_BASE_URL + '/' + ARCHIVE_FRAMES_FILENAME,
    checksum=('3acb8be143fc86c507d90d298097cba7'
              '62e91a3abf7e2d35ccd5303e13a79eae'))

# Maps a downloadable part name to (remote metadata, size, description).
DATASET_CONTENT = {
    "models": (ARCHIVE_MODELS, "390MB", "Model images"),
    "frames": (ARCHIVE_FRAMES, "972MB", "Dataset content and metadata"),
}
@author: David Diaz Vico @license: MIT """ import os from scipy.io import loadmat from sklearn.datasets.base import (_fetch_remote, get_data_home, Bunch, RemoteFileMetadata) DATASETS = { 'banana', 'breast_cancer', 'diabetis', 'flare_solar', 'german', 'heart', 'image', 'ringnorm', 'splice', 'thyroid', 'titanic', 'twonorm', 'waveform' } ARCHIVE = RemoteFileMetadata( filename='benchmarks.mat', url= 'https://github.com/tdiethe/gunnar_raetsch_benchmark_datasets/raw/master/benchmarks.mat', checksum=( '47c19e4bc4716edc4077cfa5ea61edf4d02af4ec51a0ecfe035626ae8b561c75')) def fetch_raetsch(name, data_home=None): """Fetch Gunnar Raetsch's dataset. Fetch a Gunnar Raetsch's benchmark dataset by name. Availabe datasets are 'banana', 'breast_cancer', 'diabetis', 'flare_solar', 'german', 'heart', 'image', 'ringnorm', 'splice', 'thyroid', 'titanic', 'twonorm' and 'waveform'. More info at https://github.com/tdiethe/gunnar_raetsch_benchmark_datasets. Parameters ----------
from os.path import dirname, exists, join from os import makedirs, remove import numpy as np from scipy.io.matlab import loadmat from sklearn.datasets.base import get_data_home from sklearn.datasets.base import _fetch_remote from sklearn.datasets.base import RemoteFileMetadata from sklearn.datasets.base import _pkl_filepath import joblib as _joblib from sklearn.utils import check_random_state, Bunch GLASS = RemoteFileMetadata( filename='glass.data', url= 'https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data', checksum=('dd67373f4baf2807345df02cbfef2093' 'd342e61ad0d82a4fb79af43ef8ce449d')) def fetch_uci_glass_outlier(data_home=None, shuffle=False, random_state=0, download_if_missing=True): """Load the UCI glass data-set from AT&T (classification). Download it if necessary. ================= ===================== Classes 6 Samples total 214 Dimensionality 9 Features real
from os.path import exists, join from os import makedirs, remove import tarfile import arff import pandas as pd import numpy as np from sklearn.datasets.base import RemoteFileMetadata from sklearn.datasets.base import get_data_home, _fetch_remote Lymphography = RemoteFileMetadata( filename = 'Lymphography.tar.gz', url='http://www.dbs.ifi.lmu.de/research/outlier-evaluation/input/Lymphography.tar.gz', checksum='1ecb8fc1cc86960bbbe604d8fbf058f01bf4035af1165cc32470f9dced77a8f8' ) def fetch_or_load_lymphography(): global Lymphography data_home = get_data_home() if not exists(data_home): makedirs(data_home) file_path = join(data_home, 'Lymphography', 'Lymphography_withoutdupl_idf.arff') if not exists(file_path): data_archive_path = _fetch_remote(Lymphography) tf = tarfile.open(data_archive_path) tf.extractall(data_home) remove(data_archive_path) f_descriptor = open(file_path, 'r') dataset = arff.load(f_descriptor) df = pd.DataFrame(dataset['data']) feature = df.iloc[:,1:19].to_numpy() ground_truth = np.ones(148) for i in [43, 44, 45, 103, 132, 147]:
from os.path import dirname, exists, join from os import makedirs, remove import numpy as np from sklearn.datasets.base import get_data_home from sklearn.datasets.base import _fetch_remote from sklearn.datasets.base import RemoteFileMetadata from sklearn.datasets.base import _pkl_filepath from sklearn.utils import _joblib from sklearn.utils import check_random_state, Bunch LIBRAS = RemoteFileMetadata( filename='libras.data', url= 'https://archive.ics.uci.edu/ml/machine-learning-databases/libras/movement_libras.data', checksum=('97ebdaa6a9b28ab4a2cdd84b14f19a95' 'a7456a46137c362b65a0669eca3c3c4d')) def fetch_uci_libras(data_home=None, shuffle=False, random_state=0, download_if_missing=True): """Load the UCI libra data-set from AT&T (classification). Download it if necessary. ================= ===================== Classes 15 Samples total 360 Dimensionality 90
from sklearn.datasets.base import _pkl_filepath
from sklearn.datasets.base import _fetch_remote
from sklearn.datasets.base import RemoteFileMetadata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.utils import deprecated
from sklearn.utils import _joblib
from sklearn.utils import check_random_state, Bunch

logger = logging.getLogger(__name__)

# The original data can be found at:
# https://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz
ARCHIVE = RemoteFileMetadata(
    filename='20news-bydate.tar.gz',
    url='https://ndownloader.figshare.com/files/5975967',
    checksum=('8f1b2514ca22a5ade8fbb9cfa5727df9'
              '5fa587f4c87b786e15c759fa66d95610'))

# On-disk cache file and the train/test folder names inside the archive.
CACHE_NAME = "20news-bydate.pkz"
TRAIN_FOLDER = "20news-bydate-train"
TEST_FOLDER = "20news-bydate-test"


@deprecated("Function 'download_20newsgroups' was renamed to "
            "'_download_20newsgroups' in version 0.20 and will be removed in "
            "release 0.22.")
def download_20newsgroups(target_dir, cache_path):
    """Deprecated public alias; delegates to ``_download_20newsgroups``."""
    return _download_20newsgroups(target_dir, cache_path)