def load_credit_data():
    """Load the UCI 'default of credit card clients' dataset, downloading it if needed."""
    sk_data_dir = get_data_home()
    archive = RemoteFileMetadata(
        filename='default of credit card clients.xls',
        url='https://archive.ics.uci.edu/ml/machine-learning-databases/'
            '00350/default%20of%20credit%20card%20clients.xls',
        checksum=('30c6be3abd8dcfd3e6096c828bad8c2f'
                  '011238620f5369220bd60cfc82700933'))

    if not exists(join(sk_data_dir, archive.filename)):
        _fetch_remote(archive, dirname=sk_data_dir)

    data = pd.read_excel(join(sk_data_dir, archive.filename),
                         sheet_name='Data', header=1)

    dataset = Bunch(data=data.drop('default payment next month', axis=1),
                    target=np.array(data['default payment next month']))
    return dataset
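# Usage sketch (illustrative, not part of the loader above): the returned Bunch
# holds the features as a pandas DataFrame in ``data`` and the binary default
# labels as a numpy array in ``target``, so it can go straight into a
# scikit-learn train/test split.
from sklearn.model_selection import train_test_split

credit = load_credit_data()
X_train, X_test, y_train, y_test = train_test_split(
    credit.data, credit.target, test_size=0.25, random_state=0)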
def fetch_or_load_lymphography():
    """Fetch the Lymphography (ARFF) dataset and return (features, outlier ground truth)."""
    global Lymphography
    data_home = get_data_home()
    if not exists(data_home):
        makedirs(data_home)
    file_path = join(data_home, 'Lymphography',
                     'Lymphography_withoutdupl_idf.arff')
    if not exists(file_path):
        data_archive_path = _fetch_remote(Lymphography)
        tf = tarfile.open(data_archive_path)
        tf.extractall(data_home)
        remove(data_archive_path)
    with open(file_path, 'r') as f_descriptor:
        dataset = arff.load(f_descriptor)
    df = pd.DataFrame(dataset['data'])
    feature = df.iloc[:, 1:19].to_numpy()
    # Mark the six known outlier samples with -1, all other samples with +1.
    ground_truth = np.ones(148)
    for i in [43, 44, 45, 103, 132, 147]:
        ground_truth[i] = -1
    return (feature, ground_truth)
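# Usage sketch (illustrative): the loader returns a feature matrix and a
# ground-truth vector of +1 (inlier) / -1 (outlier) labels, which matches the
# prediction convention of scikit-learn outlier detectors such as
# LocalOutlierFactor.
from sklearn.neighbors import LocalOutlierFactor

X, y_true = fetch_or_load_lymphography()
y_pred = LocalOutlierFactor(n_neighbors=20).fit_predict(X)
n_errors = (y_pred != y_true).sum()
print("mislabelled samples: %d / %d" % (n_errors, len(y_true)))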
def fetch_raetsch(name, data_home=None):
    """Fetch Gunnar Raetsch's dataset.

    Fetch one of Gunnar Raetsch's benchmark datasets by name. Available
    datasets are 'banana', 'breast_cancer', 'diabetis', 'flare_solar',
    'german', 'heart', 'image', 'ringnorm', 'splice', 'thyroid', 'titanic',
    'twonorm' and 'waveform'. More info at
    https://github.com/tdiethe/gunnar_raetsch_benchmark_datasets.

    Parameters
    ----------
    name : string
        Dataset name.

    data_home : string or None, default None
        Specify another download and cache folder for the data sets.
        By default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.
    """
    if name not in DATASETS:
        raise Exception('Available datasets are ' + str(list(DATASETS)))
    dirname = os.path.join(get_data_home(data_home=data_home), 'raetsch')
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    filename = _fetch_remote(ARCHIVE, dirname=dirname)
    X, y, train_splits, test_splits = loadmat(filename)[name][0][0]
    cv = ((X[tr - 1], y[tr - 1], X[ts - 1], y[ts - 1])
          for tr, ts in zip(train_splits, test_splits))
    return Bunch(data=X, target=y, data_test=None, target_test=None,
                 inner_cv=None, outer_cv=cv, DESCR=name)
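# Usage sketch (illustrative): ``outer_cv`` is a generator yielding one
# (X_train, y_train, X_test, y_test) tuple per pre-defined benchmark split,
# so a classifier can be scored on every split in turn.
from sklearn.svm import SVC

banana = fetch_raetsch('banana')
for X_tr, y_tr, X_te, y_te in banana.outer_cv:
    clf = SVC(gamma='scale').fit(X_tr, y_tr.ravel())
    print("split accuracy: %.3f" % clf.score(X_te, y_te.ravel()))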
def _download_20newsgroups(target_dir, cache_path):
    """Download the 20 newsgroups data and store it as a zipped pickle."""
    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    logger.info("Downloading dataset from %s (14 MB)", ARCHIVE.url)
    archive_path = _fetch_remote(ARCHIVE, dirname=target_dir)

    logger.debug("Decompressing %s", archive_path)
    tarfile.open(archive_path, "r:gz").extractall(path=target_dir)
    # os.remove(archive_path)  # do not remove the downloaded archive

    # Store a zipped pickle
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)

    # shutil.rmtree(target_dir)  # do not remove the extracted source files
    return cache
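# Usage sketch (illustrative; the helper name below is hypothetical): the cache
# written above is a zlib-compressed pickle, so a caller can restore it without
# re-downloading the archive.
import codecs
import pickle


def _load_20newsgroups_cache(cache_path):
    # Reverse of the encoding used in _download_20newsgroups.
    with open(cache_path, 'rb') as f:
        compressed_content = f.read()
    uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
    return pickle.loads(uncompressed_content)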
def __download_open_dataset(data_home=None, download_if_missing=True):
    """Helper function to download any missing SD15-CH1 data.

    The dataset will be stored like this:

    ${data_home}/smartdoc15-ch1_home/frames:
    ├── background01
    │   ├── datasheet001
    │   │   ├── frame_0001.jpeg
    │   │   ├── [...]
    │   │   └── frame_0235.jpeg
    │   ├── [...]
    │   └── tax005
    │       └── [...]
    ├── background02
    │   └── [...]
    ├── background03
    │   └── [...]
    ├── background04
    │   └── [...]
    ├── background05
    │   └── [...]
    └── metadata.csv.gz

    ${data_home}/smartdoc15-ch1_home/models:
    ├── 01-original
    │   ├── datasheet001.png
    │   ├── [...]
    │   └── tax005.png
    ├── 02-edited
    │   ├── datasheet001.png
    │   ├── [...]
    │   └── tax005.png
    ├── 03-captured-nexus
    │   ├── datasheet001.jpg  # JPG images here
    │   ├── [...]
    │   └── tax005.jpg
    ├── 04-corrected-nexus
    │   ├── datasheet001.png
    │   ├── [...]
    │   └── tax005.png
    ├── 05-corrected-nexus-scaled33
    │   ├── datasheet001.png
    │   ├── [...]
    │   └── tax005.png
    ├── correct_perspective.m
    └── original_datasets_files.txt
    """
    data_home = get_data_home(data_home=data_home)
    sd15ch1_home = os.path.join(data_home, SD15CH1_DIRNAME)

    if not os.path.exists(sd15ch1_home):
        os.makedirs(sd15ch1_home)

    data_dirs = {}
    for subdir, (archive, size, description) in six.iteritems(DATASET_CONTENT):
        data_folder_path = os.path.join(sd15ch1_home, subdir)
        data_dirs[subdir] = data_folder_path

        if not os.path.exists(data_folder_path):
            archive_path = os.path.join(sd15ch1_home, archive.filename)
            # (later) FIXME this is a naive test for existing files
            if not os.path.exists(archive_path):
                if download_if_missing:
                    __info("Downloading file %s (%s): %s"
                           % (archive.filename, size, archive.url))
                    _fetch_remote(archive, dirname=sd15ch1_home)
                else:
                    __err("%s is missing" % archive_path, IOError)

            __info("Decompressing the data archive to %s" % (data_folder_path, ))
            tarfile.open(archive_path, "r:gz").extractall(path=data_folder_path)
            os.remove(archive_path)

    return data_dirs
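# Usage sketch (illustrative; assumes DATASET_CONTENT uses the 'frames' and
# 'models' keys suggested by the docstring tree above): the helper returns a
# mapping from each archive entry to its extraction directory, so callers can
# locate the data without rebuilding the paths themselves.
data_dirs = __download_open_dataset(download_if_missing=True)
frames_dir = data_dirs['frames']
models_dir = data_dirs['models']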
def fetch_uci_glass_outlier(data_home=None, shuffle=False, random_state=0,
                            download_if_missing=True):
    """Load the UCI Glass Identification dataset for outlier detection.

    Download it if necessary.

    =================   =====================
    Classes                                 6
    Samples total                         214
    Dimensionality                          9
    Features                             real
    =================   =====================

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    shuffle : boolean, optional
        If True, the order of the samples is shuffled.

    random_state : int, RandomState instance or None (default=0)
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    data : numpy array of shape (214, 9)
        Each row corresponds to the 9 glass features of one sample.

    target : numpy array of shape (214,)
        Label of each sample: -1 for the outlier class (original class 6)
        and +1 for all other (inlier) classes.
    """
    global GLASS

    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)
    filepath = _pkl_filepath(data_home, 'uci_glass_outlier.pkz')
    if not exists(filepath):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")
        print('downloading UCI GLASS from %s to %s' % (GLASS.url, data_home))
        data_path = _fetch_remote(GLASS, dirname=data_home)
        glass = np.genfromtxt(data_path, delimiter=",")
        # Relabel: class 6 (minority) becomes the outlier (-1),
        # all other classes become inliers (+1).
        glass[:, -1] = 2 * (glass[:, -1] != 6) - 1
        _joblib.dump(glass, filepath, compress=6)
        remove(data_path)
    else:
        glass = _joblib.load(filepath)

    feature = glass[:, 1:-1]
    target = glass[:, -1]
    if shuffle:
        random_state = check_random_state(random_state)
        order = random_state.permutation(len(glass))
        feature = feature[order]
        target = target[order]

    return (feature, target)
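# Usage sketch (illustrative): with targets encoded as +1 (inlier) / -1
# (outlier), the loader pairs naturally with scikit-learn outlier detectors
# such as IsolationForest.
from sklearn.ensemble import IsolationForest

X, y = fetch_uci_glass_outlier(shuffle=True, random_state=0)
contamination = float((y == -1).mean())
pred = IsolationForest(contamination=contamination,
                       random_state=0).fit_predict(X)
print("detected outliers: %d (true outliers: %d)"
      % ((pred == -1).sum(), (y == -1).sum()))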
def _ensure_dataset_is_downloaded(sd15ch1_home, download_if_missing):
    """Helper function to download any missing SD15-CH1 data.

    The dataset will be stored like this:

    ${data_home}/smartdoc15-ch1_home/frames:
    ├── background01
    │   ├── datasheet001
    │   │   ├── frame_0001.jpeg
    │   │   ├── [...]
    │   │   └── frame_0235.jpeg
    │   ├── [...]
    │   └── tax005
    │       └── [...]
    ├── background02
    │   └── [...]
    ├── background03
    │   └── [...]
    ├── background04
    │   └── [...]
    ├── background05
    │   └── [...]
    └── metadata.csv.gz

    ${data_home}/smartdoc15-ch1_home/models:
    ├── 01-original
    │   ├── datasheet001.png
    │   ├── [...]
    │   └── tax005.png
    ├── 02-edited
    │   ├── datasheet001.png
    │   ├── [...]
    │   └── tax005.png
    ├── 03-captured-nexus
    │   ├── datasheet001.jpg  # JPG images here
    │   ├── [...]
    │   └── tax005.jpg
    ├── 04-corrected-nexus
    │   ├── datasheet001.png
    │   ├── [...]
    │   └── tax005.png
    ├── 05-corrected-nexus-scaled33
    │   ├── datasheet001.png
    │   ├── [...]
    │   └── tax005.png
    ├── correct_perspective.m
    └── original_datasets_files.txt
    """
    if not os.path.exists(sd15ch1_home):
        os.makedirs(sd15ch1_home)

    data_dirs = {}
    for subdir, (archive, size, description) in six.iteritems(DATASET_CONTENT):
        data_folder_path = os.path.join(sd15ch1_home, subdir)
        data_dirs[subdir] = data_folder_path

        # The existence of the indicator file marks a complete installation
        install_successful_filename = os.path.join(data_folder_path,
                                                   "_INSTALL_SUCCESSFUL_")
        if not os.path.exists(install_successful_filename):
            archive_path = os.path.join(sd15ch1_home, archive.filename)
            # FIXME we should check the checksum of the archive file
            if not os.path.exists(archive_path):
                if download_if_missing:
                    print("Downloading file %s (%s): %s"
                          % (archive.filename, size, archive.url))
                    _fetch_remote(archive, dirname=sd15ch1_home)
                else:
                    msg = "%s is missing" % archive_path
                    print(msg)
                    raise IOError(msg)

            print("Decompressing the data archive to %s" % (data_folder_path, ))
            tarfile.open(archive_path, "r:gz").extractall(path=data_folder_path)
            os.remove(archive_path)

            # Touch indicator file
            with open(install_successful_filename, 'a') as f:
                f.write("OK\n")