Example #1
def load_credit_data():
    sk_data_dir = get_data_home()
    archive = RemoteFileMetadata(
        filename='default of credit card clients.xls',
        url='https://archive.ics.uci.edu/ml/machine-learning-databases/'
        '00350/default%20of%20credit%20card%20clients.xls',
        checksum=('30c6be3abd8dcfd3e6096c828bad8c2f'
                  '011238620f5369220bd60cfc82700933'))

    if not exists(join(sk_data_dir, archive.filename)):
        _fetch_remote(archive, dirname=sk_data_dir)

    data = pd.read_excel(join(sk_data_dir, archive.filename),
                         sheet_name='Data',
                         header=1)

    dataset = Bunch(data=(data.drop('default payment next month', axis=1)),
                    target=np.array(data['default payment next month']))
    return dataset
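A minimal usage sketch for the loader above, assuming the same module context as the snippet (pandas, numpy and scikit-learn's Bunch/fetch helpers already imported):

# Hypothetical usage: the Bunch exposes the raw feature DataFrame and the
# binary default-payment target extracted above.
credit = load_credit_data()
X, y = credit.data, credit.target
print(X.shape, y.mean())   # dataset size and fraction of defaults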
def fetch_or_load_lymphography():
    # Download and cache the Lymphography outlier-detection ARFF file on first use.
    global Lymphography
    data_home = get_data_home()
    if not exists(data_home):
        makedirs(data_home)
    file_path = join(data_home, 'Lymphography', 'Lymphography_withoutdupl_idf.arff')
    if not exists(file_path):
        data_archive_path = _fetch_remote(Lymphography)
        tf = tarfile.open(data_archive_path)
        tf.extractall(data_home)
        tf.close()
        remove(data_archive_path)
    with open(file_path, 'r') as f_descriptor:
        dataset = arff.load(f_descriptor)
    df = pd.DataFrame(dataset['data'])
    # Keep columns 1..18 as the numeric feature matrix.
    feature = df.iloc[:, 1:19].to_numpy()
    # Mark the six known outlier samples with -1, all other samples with +1.
    ground_truth = np.ones(148)
    for i in [43, 44, 45, 103, 132, 147]:
        ground_truth[i] = -1
    return (feature, ground_truth)
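A short usage sketch, assuming the helper runs in the same module context (the arff reader, pandas, numpy and the Lymphography RemoteFileMetadata object defined elsewhere):

# Hypothetical usage: features are the 18 numeric columns, ground_truth is
# +1 for inliers and -1 for the six hand-labelled outliers.
X, y = fetch_or_load_lymphography()
print(X.shape)                # expected (148, 18)
print(int((y == -1).sum()))   # 6 outliers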
Example #3
def fetch_raetsch(name, data_home=None):
    """Fetch Gunnar Raetsch's dataset.

    Fetch a Gunnar Raetsch's benchmark dataset by name. Availabe datasets are
    'banana', 'breast_cancer', 'diabetis', 'flare_solar', 'german', 'heart',
    'image', 'ringnorm', 'splice', 'thyroid', 'titanic', 'twonorm' and
    'waveform'. More info at
    https://github.com/tdiethe/gunnar_raetsch_benchmark_datasets.

    Parameters
    ----------
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    """
    if name not in DATASETS:
        raise ValueError('Available datasets are ' + str(list(DATASETS)))
    dirname = os.path.join(get_data_home(data_home=data_home), 'raetsch')
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    filename = _fetch_remote(ARCHIVE, dirname=dirname)
    X, y, train_splits, test_splits = loadmat(filename)[name][0][0]
    cv = ((X[tr - 1], y[tr - 1], X[ts - 1], y[ts - 1])
          for tr, ts in zip(train_splits, test_splits))
    return Bunch(data=X,
                 target=y,
                 data_test=None,
                 target_test=None,
                 inner_cv=None,
                 outer_cv=cv,
                 DESCR=name)
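A brief usage sketch for the benchmark loader above; the outer_cv generator yields the pre-defined train/test splits stored in the downloaded .mat file:

# Hypothetical usage: evaluate something on each of the dataset's fixed splits.
raetsch = fetch_raetsch('banana')
for X_train, y_train, X_test, y_test in raetsch.outer_cv:
    pass  # fit a model on (X_train, y_train), score it on (X_test, y_test)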
Example #4
def _download_20newsgroups(target_dir, cache_path):
    """Download the 20 newsgroups data and stored it as a zipped pickle."""
    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    logger.info("Downloading dataset from %s (14 MB)", ARCHIVE.url)
    archive_path = _fetch_remote(ARCHIVE, dirname=target_dir)
    logger.debug("Decompressing %s", archive_path)
    with tarfile.open(archive_path, "r:gz") as archive_file:
        archive_file.extractall(path=target_dir)
    # The downloaded archive is intentionally kept on disk (no os.remove here).

    # Store a zipped pickle
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)

    # shutil.rmtree(target_dir) #do not remove all the source files and directories
    return cache
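A hedged usage sketch; the target and cache paths below are chosen purely for illustration, and TRAIN_FOLDER, TEST_FOLDER and ARCHIVE are module-level constants assumed by the snippet:

import os
home = os.path.expanduser('~/scikit_learn_data/20news_home')      # illustrative path
cache = _download_20newsgroups(target_dir=home,
                               cache_path=os.path.join(home, '20news.pkz'))
print(len(cache['train'].data), len(cache['test'].data))          # documents per split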
Example #5
def __download_open_dataset(data_home=None, download_if_missing=True):
    """Helper function to download any missing SD15-CH1 data.

    The dataset will be stored like this:
        ${data_home}/smartdoc15-ch1_home/frames:
        ├── background01
        │   ├── datasheet001
        │   │   ├── frame_0001.jpeg
        │   │   ├── [...]
        │   │   └── frame_0235.jpeg
        │   ├── [...]
        │   └── tax005
        │       └── [...]
        ├── background02
        │   └── [...]
        ├── background03
        │   └── [...]
        ├── background04
        │   └── [...]
        ├── background05
        │   └── [...]
        └── metadata.csv.gz

        ${data_home}/smartdoc15-ch1_home/models:
        ├── 01-original
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 02-edited
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 03-captured-nexus
        │   ├── datasheet001.jpg # JPG images here
        │   ├── [...]
        │   └── tax005.jpg
        ├── 04-corrected-nexus
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 05-corrected-nexus-scaled33
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── correct_perspective.m
        └── original_datasets_files.txt
    """
    data_home = get_data_home(data_home=data_home)
    sd15ch1_home = os.path.join(data_home, SD15CH1_DIRNAME)

    if not os.path.exists(sd15ch1_home):
        os.makedirs(sd15ch1_home)

    data_dirs = {}
    for subdir, (archive, size, description) in six.iteritems(DATASET_CONTENT):
        data_folder_path = os.path.join(sd15ch1_home, subdir)
        data_dirs[subdir] = data_folder_path

        if not os.path.exists(data_folder_path):
            archive_path = os.path.join(sd15ch1_home, archive.filename)
            # FIXME: this is a naive test for existing files
            if not os.path.exists(archive_path):
                if download_if_missing:
                    __info("Downloading file %s (%s): %s" %
                           (archive.filename, size, archive.url))
                    _fetch_remote(archive, dirname=sd15ch1_home)
                else:
                    __err("%s is missing" % archive_path, IOError)

            __info("Decompressing the data archive to %s" %
                   (data_folder_path, ))
            tarfile.open(archive_path,
                         "r:gz").extractall(path=data_folder_path)
            os.remove(archive_path)

    return data_dirs
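A minimal usage sketch, assuming the module-level constants (SD15CH1_DIRNAME, DATASET_CONTENT) and helpers (__info, __err) referenced above are defined:

# Hypothetical usage: the helper returns a dict mapping each DATASET_CONTENT
# subdir (e.g. 'frames', 'models') to its extracted folder path.
data_dirs = __download_open_dataset(download_if_missing=True)
for subdir, path in data_dirs.items():
    print(subdir, '->', path)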
Example #6
def fetch_uci_glass_outlier(data_home=None,
                            shuffle=False,
                            random_state=0,
                            download_if_missing=True):
    """Load the UCI glass data-set from AT&T (classification).
    Download it if necessary.
    =================   =====================
    Classes                                6
    Samples total                         214
    Dimensionality                       9
    Features                            real
    =================   =====================
    
    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
    shuffle : boolean, optional
        If True the order of the dataset is shuffled to avoid having
        images of the same person grouped.
    random_state : int, RandomState instance or None (default=0)
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.
    download_if_missing : optional, True by default
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.
    Returns
    -------    
    data : numpy array of shape (214, 9)
        Each row corresponds to a glass feature of 9 dimension
    target : numpy array of shape (214, )
        Labels associated to each glas. Those labels are from
        [1,2,3,5,6,7] and correspond to the Subject IDs.
    """
    global GLASS
    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)
    filepath = _pkl_filepath(data_home, 'uci_glass_outlier.pkz')
    if not exists(filepath):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")

        print('downloading UCI GLASS from %s to %s' % (GLASS.url, data_home))
        data_path = _fetch_remote(GLASS, dirname=data_home)

        glass = np.genfromtxt(data_path, delimiter=",")
        # the class 6 (minority) as outlier and all other classes as inliers
        glass[:, -1] = 2 * (glass[:, -1] != 6) - 1
        _joblib.dump(glass, filepath, compress=6)
        remove(data_path)

    else:
        glass = _joblib.load(filepath)

    feature = glass[:, 1:-1]   # drop the leading id column and the trailing label
    target = glass[:, -1]
    if shuffle:
        random_state = check_random_state(random_state)
        order = random_state.permutation(len(glass))
        feature = feature[order]
        target = target[order]

    return (feature, target)
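A short usage sketch for the outlier variant above; target values follow the relabelling applied when the local cache is built:

# Hypothetical usage: class 6 is the outlier class (-1), everything else +1.
X, y = fetch_uci_glass_outlier(shuffle=True, random_state=0)
print(X.shape)                                     # (214, 9)
print(int((y == -1).sum()), int((y == 1).sum()))   # outlier / inlier counts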
def _ensure_dataset_is_downloaded(sd15ch1_home, download_if_missing):
    """Helper function to download any missing SD15-CH1 data.

    The dataset will be stored like this:
        ${data_home}/smartdoc15-ch1_home/frames:
        ├── background01
        │   ├── datasheet001
        │   │   ├── frame_0001.jpeg
        │   │   ├── [...]
        │   │   └── frame_0235.jpeg
        │   ├── [...]
        │   └── tax005
        │       └── [...]
        ├── background02
        │   └── [...]
        ├── background03
        │   └── [...]
        ├── background04
        │   └── [...]
        ├── background05
        │   └── [...]
        └── metadata.csv.gz

        ${data_home}/smartdoc15-ch1_home/models:
        ├── 01-original
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 02-edited
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 03-captured-nexus
        │   ├── datasheet001.jpg # JPG images here
        │   ├── [...]
        │   └── tax005.jpg
        ├── 04-corrected-nexus
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 05-corrected-nexus-scaled33
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── correct_perspective.m
        └── original_datasets_files.txt
    """

    if not os.path.exists(sd15ch1_home):
        os.makedirs(sd15ch1_home)

    data_dirs = {}
    for subdir, (archive, size, description) in six.iteritems(DATASET_CONTENT):
        data_folder_path = os.path.join(sd15ch1_home, subdir)
        data_dirs[subdir] = data_folder_path

        # The presence of the indicator file marks a completed installation
        install_successful_filename = os.path.join(data_folder_path,
                                                   "_INSTALL_SUCCESSFUL_")
        if not os.path.exists(install_successful_filename):
            archive_path = os.path.join(sd15ch1_home, archive.filename)
            # FIXME we should verify the checksum of the archive file
            if not os.path.exists(archive_path):
                if download_if_missing:
                    print("Downloading file %s (%s): %s" %
                          (archive.filename, size, archive.url))
                    _fetch_remote(archive, dirname=sd15ch1_home)
                else:
                    msg = "%s is missing" % archive_path
                    print(msg)
                    raise IOError(msg)

            print("Decompressing the data archive to %s" %
                  (data_folder_path, ))
            tarfile.open(archive_path,
                         "r:gz").extractall(path=data_folder_path)
            os.remove(archive_path)
            # Touch indicator file
            with open(install_successful_filename, 'a') as f:
                f.write("OK\n")