Example #1
0
def load_credit_data():
    """Download (if needed) and load the UCI 'default of credit card
    clients' dataset.

    Returns
    -------
    Bunch
        ``data`` holds the feature DataFrame (target column dropped) and
        ``target`` the 'default payment next month' column as an ndarray.
    """
    data_dir = get_data_home()
    remote = RemoteFileMetadata(
        filename='default of credit card clients.xls',
        url='https://archive.ics.uci.edu/ml/machine-learning-databases/'
        '00350/default%20of%20credit%20card%20clients.xls',
        checksum=('30c6be3abd8dcfd3e6096c828bad8c2f'
                  '011238620f5369220bd60cfc82700933'))

    local_path = join(data_dir, remote.filename)
    # Fetch the spreadsheet only when it is not already cached locally.
    if not exists(local_path):
        _fetch_remote(remote, dirname=data_dir)

    # header=1: the first spreadsheet row is a title, real headers follow.
    frame = pd.read_excel(local_path, sheet_name='Data', header=1)

    target_column = 'default payment next month'
    return Bunch(data=frame.drop(target_column, axis=1),
                 target=np.array(frame[target_column]))
Example #2
0
from sklearn.utils import Bunch, check_random_state
from skimage.io import imread
from skimage.transform import resize as imresize, estimate_transform
import Polygon

from .poly_utils import isSelfIntersecting

# CONSTANTS
################################################################################
################################################################################
# Base URL of the SmartDoc15-CH1 dataset release hosted on GitHub.
ARCHIVE_BASE_URL = 'https://github.com/jchazalon/smartdoc15-ch1-dataset/releases/download/v2.0.0'

# Tarball containing the document model (reference) images.
ARCHIVE_MODELS_FILENAME = 'models.tar.gz'
ARCHIVE_MODELS = RemoteFileMetadata(
    filename=ARCHIVE_MODELS_FILENAME,
    url=ARCHIVE_BASE_URL + '/' + ARCHIVE_MODELS_FILENAME,
    # 64-hex-digit checksum used to verify the download (presumably
    # SHA-256 — confirm against the fetching helper).
    checksum=(
        '6f9068624073f76b20f88352b2bac60b9e5de5a59819fc9db37fba1ee07cce8a'))

# Tarball containing the captured frames and their metadata.
ARCHIVE_FRAMES_FILENAME = 'frames.tar.gz'
ARCHIVE_FRAMES = RemoteFileMetadata(
    filename=ARCHIVE_FRAMES_FILENAME,
    url=ARCHIVE_BASE_URL + '/' + ARCHIVE_FRAMES_FILENAME,
    checksum=(
        '3acb8be143fc86c507d90d298097cba762e91a3abf7e2d35ccd5303e13a79eae'))

# Maps a dataset part name to (remote metadata, approximate download size,
# human-readable description).
DATASET_CONTENT = {
    "models": (ARCHIVE_MODELS, "390MB", "Model images"),
    "frames": (ARCHIVE_FRAMES, "972MB", "Dataset content and metadata")
}
Example #3
0
@author: David Diaz Vico
@license: MIT
"""

import os
from scipy.io import loadmat
from sklearn.datasets.base import (_fetch_remote, get_data_home, Bunch,
                                   RemoteFileMetadata)

# Names of the benchmark datasets contained in benchmarks.mat.
DATASETS = {
    'banana', 'breast_cancer', 'diabetis', 'flare_solar', 'german', 'heart',
    'image', 'ringnorm', 'splice', 'thyroid', 'titanic', 'twonorm', 'waveform'
}
# Remote location and checksum of Gunnar Raetsch's benchmark archive
# (a single .mat file holding all datasets listed above).
ARCHIVE = RemoteFileMetadata(
    filename='benchmarks.mat',
    url=
    'https://github.com/tdiethe/gunnar_raetsch_benchmark_datasets/raw/master/benchmarks.mat',
    checksum=(
        '47c19e4bc4716edc4077cfa5ea61edf4d02af4ec51a0ecfe035626ae8b561c75'))


def fetch_raetsch(name, data_home=None):
    """Fetch Gunnar Raetsch's dataset.

    Fetch a Gunnar Raetsch's benchmark dataset by name. Availabe datasets are
    'banana', 'breast_cancer', 'diabetis', 'flare_solar', 'german', 'heart',
    'image', 'ringnorm', 'splice', 'thyroid', 'titanic', 'twonorm' and
    'waveform'. More info at
    https://github.com/tdiethe/gunnar_raetsch_benchmark_datasets.

    Parameters
    ----------
Example #4
0
from os.path import dirname, exists, join
from os import makedirs, remove

import numpy as np
from scipy.io.matlab import loadmat

from sklearn.datasets.base import get_data_home
from sklearn.datasets.base import _fetch_remote
from sklearn.datasets.base import RemoteFileMetadata
from sklearn.datasets.base import _pkl_filepath
import joblib as _joblib
from sklearn.utils import check_random_state, Bunch

# Remote location and checksum of the UCI glass identification dataset.
GLASS = RemoteFileMetadata(
    filename='glass.data',
    url=
    'https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data',
    checksum=('dd67373f4baf2807345df02cbfef2093'
              'd342e61ad0d82a4fb79af43ef8ce449d'))


def fetch_uci_glass_outlier(data_home=None,
                            shuffle=False,
                            random_state=0,
                            download_if_missing=True):
    """Load the UCI glass data-set from AT&T (classification).
    Download it if necessary.
    =================   =====================
    Classes                                6
    Samples total                         214
    Dimensionality                       9
    Features                            real
from os.path import exists, join
from os import makedirs, remove
import tarfile
import arff
import pandas as pd
import numpy as np
from sklearn.datasets.base import RemoteFileMetadata
from sklearn.datasets.base import get_data_home, _fetch_remote

# Remote location and checksum of the LMU outlier-evaluation Lymphography
# archive (contains the deduplicated, idf-normalised ARFF file).
# Fix: PEP 8 E251 — no spaces around '=' in keyword arguments, matching the
# style of every other RemoteFileMetadata literal in this file.
Lymphography = RemoteFileMetadata(
    filename='Lymphography.tar.gz',
    url='http://www.dbs.ifi.lmu.de/research/outlier-evaluation/input/Lymphography.tar.gz',
    checksum='1ecb8fc1cc86960bbbe604d8fbf058f01bf4035af1165cc32470f9dced77a8f8',
)
def fetch_or_load_lymphography():
    global Lymphography
    data_home = get_data_home()
    if not exists(data_home):
        makedirs(data_home)
    file_path = join(data_home, 'Lymphography', 'Lymphography_withoutdupl_idf.arff')
    if not exists(file_path):
        data_archive_path = _fetch_remote(Lymphography)
        tf = tarfile.open(data_archive_path)
        tf.extractall(data_home)
        remove(data_archive_path)
    f_descriptor = open(file_path, 'r')
    dataset = arff.load(f_descriptor)
    df = pd.DataFrame(dataset['data'])
    feature = df.iloc[:,1:19].to_numpy()
    ground_truth = np.ones(148)
    for i in [43, 44, 45, 103, 132, 147]:
from os.path import dirname, exists, join
from os import makedirs, remove

import numpy as np

from sklearn.datasets.base import get_data_home
from sklearn.datasets.base import _fetch_remote
from sklearn.datasets.base import RemoteFileMetadata
from sklearn.datasets.base import _pkl_filepath
from sklearn.utils import _joblib
from sklearn.utils import check_random_state, Bunch

# Remote location and checksum of the UCI Libras Movement dataset.
LIBRAS = RemoteFileMetadata(
    filename='libras.data',
    url=
    'https://archive.ics.uci.edu/ml/machine-learning-databases/libras/movement_libras.data',
    checksum=('97ebdaa6a9b28ab4a2cdd84b14f19a95'
              'a7456a46137c362b65a0669eca3c3c4d'))


def fetch_uci_libras(data_home=None,
                     shuffle=False,
                     random_state=0,
                     download_if_missing=True):
    """Load the UCI libra  data-set from AT&T (classification).
    Download it if necessary.

    =================   =====================
    Classes                                15
    Samples total                         360
    Dimensionality                       90
Example #7
0
from sklearn.datasets.base import _pkl_filepath
from sklearn.datasets.base import _fetch_remote
from sklearn.datasets.base import RemoteFileMetadata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.utils import deprecated
from sklearn.utils import _joblib
from sklearn.utils import check_random_state, Bunch

# Module-level logger, following the standard `logging` convention.
logger = logging.getLogger(__name__)

# The original data can be found at:
# https://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz
# Remote location and checksum of the 20 Newsgroups (bydate) archive,
# mirrored on figshare.
ARCHIVE = RemoteFileMetadata(
    filename='20news-bydate.tar.gz',
    url='https://ndownloader.figshare.com/files/5975967',
    checksum=('8f1b2514ca22a5ade8fbb9cfa5727df9'
              '5fa587f4c87b786e15c759fa66d95610'))

# Filename of the local cache for the processed dataset.
CACHE_NAME = "20news-bydate.pkz"

# Sub-directory names inside the archive for the train/test splits.
TRAIN_FOLDER = "20news-bydate-train"
TEST_FOLDER = "20news-bydate-test"


@deprecated("Function 'download_20newsgroups' was renamed to "
            "'_download_20newsgroups' in version 0.20 and will be removed in "
            "release 0.22.")
def download_20newsgroups(target_dir, cache_path):
    """Deprecated public alias; delegates to ``_download_20newsgroups``.

    Kept only for backward compatibility; emits a deprecation warning via
    the ``@deprecated`` decorator on every call.
    """
    return _download_20newsgroups(target_dir, cache_path)