def fetch_tgz(dataname, urlname, data_home=None):
    """Fetch zipped dataset.

    Fetch a tgz file from a given url, uncompress it and store it in a given
    directory.

    Parameters
    ----------
    dataname: string
        Dataset name.
    urlname: string
        Dataset url.
    data_home: string, default=None
        Dataset directory.

    Returns
    -------
    data_home: string
        Directory.

    """
    # fetch file
    filename = fetch_file(dataname, urlname, data_home=data_home)
    data_home = get_data_home(data_home=data_home)
    data_home = join(data_home, dataname)
    # unzip file
    try:
        with tarfile.open(filename, 'r:gz') as tar_file:
            tar_file.extractall(data_home)
    except Exception:
        remove(filename)
        raise
    return data_home
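# A minimal, self-contained sketch of the extract-or-clean-up pattern used in
# fetch_tgz above. The throwaway archive built here is only for illustration.
import os
import tarfile
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    archive = os.path.join(tmp, "demo.tgz")
    payload = os.path.join(tmp, "payload.txt")
    with open(payload, "w") as fh:
        fh.write("hello")
    with tarfile.open(archive, "w:gz") as tar:      # build a tiny .tgz
        tar.add(payload, arcname="payload.txt")
    out_dir = os.path.join(tmp, "extracted")
    try:
        with tarfile.open(archive, "r:gz") as tar:  # same mode as fetch_tgz
            tar.extractall(out_dir)
    except Exception:
        os.remove(archive)                          # drop a broken download
        raise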
def fetch_libsvm(collection, name, data_home=None):
    """Fetch LIBSVM dataset.

    Fetch a LIBSVM dataset by collection and name. More info at
    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets.

    Parameters
    ----------
    collection : string
        Collection name.
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    """
    if collection not in COLLECTIONS:
        raise Exception('Available collections are ' + str(list(COLLECTIONS)))
    dirname = os.path.join(get_data_home(data_home=data_home), 'libsvm',
                           collection, name.replace('/', '-'))
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    X, y, X_test, y_test, cv, X_remaining, y_remaining = _load(
        collection, name, dirname=dirname)
    data = Bunch(data=X, target=y, data_test=X_test, target_test=y_test,
                 inner_cv=cv, outer_cv=None, data_remaining=X_remaining,
                 target_remaining=y_remaining, DESCR=name)
    return data
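# Hedged usage sketch for fetch_libsvm above. The collection/name pair is only
# illustrative, data is downloaded on first call, and the returned matrices may
# be scipy.sparse depending on the dataset.
if __name__ == "__main__":
    bunch = fetch_libsvm(collection='binary', name='australian')
    X, y = bunch.data, bunch.target
    if bunch.data_test is not None:   # some datasets ship a separate test split
        X_test, y_test = bunch.data_test, bunch.target_test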
def fetch_uci(name, data_home=None):
    """Fetch UCI dataset.

    Fetch a UCI dataset by name. More info at
    https://archive.ics.uci.edu/ml/datasets.html.

    Parameters
    ----------
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    """
    dirname = os.path.join(get_data_home(data_home=data_home), 'uci', name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    X, y, X_test, y_test, DESCR = _fetch(name, dirname=dirname)
    data = Bunch(data=X, target=y, data_test=X_test, target_test=y_test,
                 inner_cv=None, outer_cv=None, DESCR=DESCR)
    return data
def fetch_indoor_pos(data_home=None, is_train=True, return_X_y=False,
                     remove_dup=False):
    """Load the code rally 2019 dataset.

    =================   ====================================
    Samples total       25811
    Dimensionality      12
    Features            continuous (float)
    =================   ====================================

    Parameters
    ----------
    data_home : string, optional
        Specify the folder for the datasets.

    is_train : bool, default=True
        Whether to load the train dataset.

    return_X_y : bool, default=False
        If True, returns ``(data, target_x, target_y, orig_data)`` instead of
        a Bunch object.

    remove_dup : bool, default=False
        If True, remove duplicate rows that share the same features, keeping
        only one row per unique feature combination.

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        - 'data', the data to learn, with duplicate entries removed.
        - 'target_x', the regression target x for each sample.
        - 'target_y', the regression target y for each sample.
        - 'orig_data', the original data to learn.

    (data, target_x, target_y, orig_data) : tuple if ``return_X_y`` is True
    """
    data_home = get_data_home(data_home=data_home)
    indoorpos = _fetch_brute_indoor_pos(data_home=data_home,
                                        is_train=is_train,
                                        remove_dup=remove_dup)

    data = indoorpos.data
    target_x = indoorpos.target_x
    target_y = indoorpos.target_y
    orig_data = indoorpos.orig_data

    if return_X_y:
        return data, target_x, target_y, orig_data
    return Bunch(data=data, target_x=target_x, target_y=target_y,
                 orig_data=orig_data)
def fetch_mnist(data_home=None):
    mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    data_home = get_data_home(data_home=data_home)
    data_home = os.path.join(data_home, 'mldata')
    if not os.path.exists(data_home):
        os.makedirs(data_home)
    mnist_save_path = os.path.join(data_home, "mnist-original.mat")
    if not os.path.exists(mnist_save_path):
        mnist_url = urllib.request.urlopen(mnist_alternative_url)
        with open(mnist_save_path, "wb") as matlab_file:
            copyfileobj(mnist_url, matlab_file)
def __init__(self, data_home=None, download_if_missing=True,
             frame_scale_factor=None, shuffle=False, random_state=None):
    self.data_home = get_data_home(data_home=data_home)
    self.download_if_missing = download_if_missing
    self.sd15ch1_home = os.path.join(self.data_home, SD15CH1_DIRNAME)
    self.framesdir = os.path.join(self.sd15ch1_home, "frames")

    self._scale_factor = None
    if frame_scale_factor is not None:
        if not (0 < frame_scale_factor <= 20):
            self.__err("frame_scale_factor parameter must be > 0 and <= 20.",
                       ValueError)
        self._scale_factor = float(frame_scale_factor)

    # Open or try download or raise an exception if dataset is unavailable
    _ensure_dataset_is_downloaded(self.sd15ch1_home, self.download_if_missing)

    # Read metadata file
    frames_metadata_path = os.path.join(self.framesdir, "metadata.csv.gz")
    self._info('Loading frames metadata from %s' % (frames_metadata_path, ))
    self._rawdata = pd.read_csv(frames_metadata_path)

    if shuffle:
        self._rawdata = self._rawdata.sample(
            frac=1, axis=0, random_state=random_state).reset_index(drop=True)

    for _rid, rseries in self._rawdata.iterrows():
        dfi_dict = {}
        # Copy content
        for colname in self._rawdata:  # .keys()
            dfi_dict[colname] = rseries[colname]
        # Add extra entries
        dfi_dict["image_path_absolute"] = os.path.join(
            self.framesdir, rseries["image_path"])  # string (path)
        dfi_dict["_scale_factor"] = self._scale_factor  # float
        # dfi_dict["frame_uid"] = rid  # int  # does not survive shuffling
        # hint: use df.reindex(np.random.permutation(df.index)) and keep original uids
        # Store
        self.append(Frame(dfi_dict))

    self._unique_background_ids = None
    self._unique_background_names = None
    self._unique_model_ids = None
    self._unique_model_names = None
    self._unique_modeltype_ids = None
    self._unique_modeltype_names = None
def fetch_keel(collection, name, data_home=None, nfolds=None, dobscv=False):
    """Fetch Keel dataset.

    Fetch a Keel dataset by collection and name. More info at
    http://sci2s.ugr.es/keel.

    Parameters
    ----------
    collection : string
        Collection name.
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.
    nfolds : int, default=None
        Number of folds. Depending on the dataset, valid values are
        {None, 1, 5, 10}.
    dobscv : bool, default=False
        If folds are in {5, 10}, indicates that the cv folds are distribution
        optimally balanced stratified. Only available for some datasets.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    """
    if collection not in COLLECTIONS:
        raise Exception('Available collections are ' + str(list(COLLECTIONS)))
    dirname = os.path.join(get_data_home(data_home=data_home), 'keel',
                           collection, name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    nattrs, DESCR = _load_descr(collection, name, dirname=dirname)
    X, y, cv = _load_folds(collection, name, nfolds, dobscv, nattrs,
                           dirname=dirname)
    data = Bunch(data=X, target=y, data_test=None, target_test=None,
                 inner_cv=None, outer_cv=cv, DESCR=DESCR)
    return data
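# Hedged usage sketch for fetch_keel above: the collection and dataset names
# are illustrative only, data is downloaded on first call, and outer_cv holds
# the pre-defined KEEL folds when nfolds is given.
if __name__ == "__main__":
    zoo = fetch_keel(collection='classification', name='zoo', nfolds=10)
    print(zoo.data.shape, zoo.target.shape)
    print(zoo.DESCR[:200])   # beginning of the KEEL description header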
def fetch_mnist(data_home=None):
    # where to store the data
    data_home = get_data_home(data_home=data_home)
    data_home = os.path.join(data_home, 'mldata')
    if not os.path.exists(data_home):
        os.makedirs(data_home)
    mnist_save_path = os.path.join(data_home, "mnist-original.mat")
    # download if needed
    if not os.path.exists(mnist_save_path):
        print("Download MNIST to", mnist_save_path)
        mnist_url = urlopen(
            "http://home.htw-berlin.de/~hezel/files/data/mnist-original.mat")
        with open(mnist_save_path, "wb") as matlab_file:
            copyfileobj(mnist_url, matlab_file)
    # Note: fetch_mldata relied on the retired mldata.org service and has been
    # removed from recent scikit-learn releases; fetch_openml('mnist_784') is
    # the usual replacement.
    return fetch_mldata('MNIST original')
def fetch_file(dataname, urlname, subfolder=None, data_home=None):
    """Fetch dataset.

    Fetch a file from a given url and store it in a given directory.

    Parameters
    ----------
    dataname: string
        Dataset name.
    urlname: string
        Dataset url.
    subfolder: string, default=None
        Optional subfolder of the data directory in which to store the file.
    data_home: string, default=None
        Dataset directory.

    Returns
    -------
    filename: pathlib.Path
        Path of the downloaded file.

    """
    # check if this data set has been already downloaded
    data_home = pathlib.Path(get_data_home(data_home=data_home))
    if subfolder:
        data_home = data_home / subfolder
    data_home = data_home / dataname
    if not data_home.exists():
        data_home.mkdir(parents=True)
    filename = data_home / basename(normpath(urlname))
    # if the file does not exist, download it
    if not filename.exists():
        try:
            data_url = urlopen(urlname)
        except HTTPError as e:
            if e.code == 404:
                e.msg = "Dataset '%s' not found." % dataname
            raise
        # store file
        try:
            with open(filename, 'w+b') as data_file:
                copyfileobj(data_url, data_file)
        except Exception:
            filename.unlink()
            raise
        data_url.close()
    return filename
def fetch_mnist(data_home=None):
    '''
    Download the original MNIST data set if it does not already exist locally.

    :param data_home: optional cache directory; defaults to scikit-learn's
        data home.
    :return: None. The file mnist-original.mat is stored on disk.
    '''
    mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    data_home = get_data_home(data_home=data_home)
    data_home = os.path.join(data_home, 'mldata')
    if not os.path.exists(data_home):
        os.makedirs(data_home)
    mnist_save_path = os.path.join(data_home, "mnist-original.mat")
    if not os.path.exists(mnist_save_path):
        mnist_url = urllib.request.urlopen(mnist_alternative_url)
        with open(mnist_save_path, "wb") as matlab_file:
            copyfileobj(mnist_url, matlab_file)
def load_credit_data():
    sk_data_dir = get_data_home()
    archive = RemoteFileMetadata(
        filename='default of credit card clients.xls',
        url='https://archive.ics.uci.edu/ml/machine-learning-databases/'
            '00350/default%20of%20credit%20card%20clients.xls',
        checksum=('30c6be3abd8dcfd3e6096c828bad8c2f'
                  '011238620f5369220bd60cfc82700933'))

    if not exists(join(sk_data_dir, archive.filename)):
        _fetch_remote(archive, dirname=sk_data_dir)

    data = pd.read_excel(join(sk_data_dir, archive.filename),
                         sheet_name='Data', header=1)

    dataset = Bunch(
        data=(data.drop('default payment next month', axis=1)),
        target=np.array(data['default payment next month']))
    return dataset
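# Hedged usage sketch: the Bunch returned by load_credit_data above plugs
# directly into standard scikit-learn tooling (the Excel file is downloaded on
# the first call).
if __name__ == "__main__":
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    credit = load_credit_data()
    X_tr, X_te, y_tr, y_te = train_test_split(credit.data, credit.target,
                                              test_size=0.2, random_state=0)
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_tr, y_tr)
    print("held-out accuracy:", clf.score(X_te, y_te))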
def fetch_file(dataname, urlname, data_home=None):
    """Fetch dataset.

    Fetch a file from a given url and store it in a given directory.

    Parameters
    ----------
    dataname: string
        Dataset name.
    urlname: string
        Dataset url.
    data_home: string, default=None
        Dataset directory.

    Returns
    -------
    filename: string
        Name of the file.

    """
    # check if this data set has been already downloaded
    data_home = get_data_home(data_home=data_home)
    data_home = join(data_home, dataname)
    if not exists(data_home):
        makedirs(data_home)
    filename = join(data_home, basename(normpath(urlname)))
    # if the file does not exist, download it
    if not exists(filename):
        try:
            data_url = urlopen(urlname)
        except HTTPError as e:
            if e.code == 404:
                e.msg = "Dataset '%s' not found." % dataname
            raise
        # store file
        try:
            with open(filename, 'w+b') as data_file:
                copyfileobj(data_url, data_file)
        except Exception:
            remove(filename)
            raise
        data_url.close()
    return filename
def load_data(subset):
    '''
    Fetch 20newsgroup data and return it in tf-idf matrix form.

    Parameters
    ----------
    subset: str
        Subset of data to fetch; its value can be 'train', 'test', or 'all'.

    Returns
    -------
    A: array, shape (number of documents, number of words)
        tf-idf matrix of the documents.
    vocab_matrix: array, shape (number of words, number of words)
        tf-idf representation of each word in the dictionary.
    vocabulary: list
        List containing the vocabulary, i.e. the actual word strings.
    '''
    # Data preprocessing
    # ================================================================
    # Fetch 20newsgroups data, removing its header, footer and quotes.
    docs = datasets.fetch_20newsgroups(
        subset=subset,
        remove=('headers', 'footers', 'quotes')
    )
    # ================================================================

    # Convert the raw documents data into tf-idf matrix
    # ================================================================
    data_home = get_data_home()
    vocabulary_path = os.path.join(data_home, 'vocabulary.txt')
    vocabulary = load_vocabulary(vocabulary_path)
    vectorizer = TfidfVectorizer(
        vocabulary=vocabulary,
        norm='l2',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True
    )
    docs_tfidf = vectorizer.fit_transform(docs.data)
    vocab_tfidf = vectorizer.transform(vocabulary)
    # ================================================================
    return docs_tfidf, vocab_tfidf, vocabulary
def fetch_or_load_lymphography():
    global Lymphography
    data_home = get_data_home()
    if not exists(data_home):
        makedirs(data_home)
    file_path = join(data_home, 'Lymphography',
                     'Lymphography_withoutdupl_idf.arff')
    if not exists(file_path):
        data_archive_path = _fetch_remote(Lymphography)
        with tarfile.open(data_archive_path) as tf:
            tf.extractall(data_home)
        remove(data_archive_path)
    with open(file_path, 'r') as f_descriptor:
        dataset = arff.load(f_descriptor)
    df = pd.DataFrame(dataset['data'])
    feature = df.iloc[:, 1:19].to_numpy()
    # Samples at these indices are the known outliers; label them -1.
    ground_truth = np.ones(148)
    for i in [43, 44, 45, 103, 132, 147]:
        ground_truth[i] = -1
    return (feature, ground_truth)
def check_fetch_lfw(data_home=None, funneled=True, download_if_missing=True):
    """Helper function to download any missing LFW data"""
    data_home = get_data_home(data_home=data_home)
    lfw_home = join(data_home, "lfw_home")

    if funneled:
        archive_path = join(lfw_home, FUNNELED_ARCHIVE_NAME)
        data_folder_path = join(lfw_home, "lfw_funneled")
        archive_url = BASE_URL + FUNNELED_ARCHIVE_NAME
    else:
        archive_path = join(lfw_home, ARCHIVE_NAME)
        data_folder_path = join(lfw_home, "lfw")
        archive_url = BASE_URL + ARCHIVE_NAME

    if not exists(lfw_home):
        makedirs(lfw_home)

    for target_filename in TARGET_FILENAMES:
        target_filepath = join(lfw_home, target_filename)
        if not exists(target_filepath):
            if download_if_missing:
                url = BASE_URL + target_filename
                logger.warning("Downloading LFW metadata: %s", url)
                urllib.urlretrieve(url, target_filepath)
            else:
                raise IOError("%s is missing" % target_filepath)

    if not exists(data_folder_path):
        if not exists(archive_path):
            if download_if_missing:
                logger.warning("Downloading LFW data (~200MB): %s", archive_url)
                urllib.urlretrieve(archive_url, archive_path)
            else:
                raise IOError("%s is missing" % archive_path)

        import tarfile
        logger.info("Decompressing the data archive to %s", data_folder_path)
        tarfile.open(archive_path, "r:gz").extractall(path=lfw_home)
        remove(archive_path)

    return lfw_home, data_folder_path
def __init__(self, data_home=None, download_if_missing=True,
             variant="05-corrected-nexus-scaled33"):
    self.data_home = get_data_home(data_home=data_home)
    self.download_if_missing = download_if_missing
    self.sd15ch1_home = os.path.join(self.data_home, SD15CH1_DIRNAME)
    self.modelsdir = os.path.join(self.sd15ch1_home, "models")

    # Open or try download or raise an exception if dataset is unavailable
    _ensure_dataset_is_downloaded(self.sd15ch1_home, self.download_if_missing)

    # Read metadata file
    models_metadata_path = os.path.join(self.modelsdir, "metadata.csv.gz")
    self._info('Loading models metadata from %s' % (models_metadata_path, ))
    df = pd.read_csv(models_metadata_path)

    # Filter the variant we want to load
    if variant not in Models.VARIANTS:
        self._err("Unknown model variant: '%s'." % variant, ValueError)
    self._rawdata = df[df["model_cat"] == variant]

    for _rid, rseries in self._rawdata.iterrows():
        mdli_dict = {}
        # Copy content
        for colname in self._rawdata:  # .keys()
            mdli_dict[colname] = rseries[colname]
        # Add extra entries
        mdli_dict["image_path_absolute"] = os.path.join(
            self.modelsdir, rseries["image_path"])  # string (path)
        # Store
        self.append(Model(mdli_dict))

    self._unique_model_ids = None
    self._unique_model_names = None
    self._unique_modeltype_ids = None
    self._unique_modeltype_names = None
def fetch_raetsch(name, data_home=None):
    """Fetch Gunnar Raetsch's dataset.

    Fetch a Gunnar Raetsch's benchmark dataset by name. Available datasets
    are 'banana', 'breast_cancer', 'diabetis', 'flare_solar', 'german',
    'heart', 'image', 'ringnorm', 'splice', 'thyroid', 'titanic', 'twonorm'
    and 'waveform'. More info at
    https://github.com/tdiethe/gunnar_raetsch_benchmark_datasets.

    Parameters
    ----------
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    """
    if name not in DATASETS:
        raise Exception('Available datasets are ' + str(list(DATASETS)))
    dirname = os.path.join(get_data_home(data_home=data_home), 'raetsch')
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    filename = _fetch_remote(ARCHIVE, dirname=dirname)
    X, y, train_splits, test_splits = loadmat(filename)[name][0][0]
    cv = ((X[tr - 1], y[tr - 1], X[ts - 1], y[ts - 1])
          for tr, ts in zip(train_splits, test_splits))
    return Bunch(data=X, target=y, data_test=None, target_test=None,
                 inner_cv=None, outer_cv=cv, DESCR=name)
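# Hedged usage sketch for fetch_raetsch above (requires a network connection
# and the module-level DATASETS/ARCHIVE definitions not shown here). outer_cv
# yields the benchmark's pre-defined (X_train, y_train, X_test, y_test) splits.
if __name__ == "__main__":
    banana = fetch_raetsch('banana')
    for X_tr, y_tr, X_ts, y_ts in banana.outer_cv:
        print(X_tr.shape, X_ts.shape)   # one line per pre-defined split
        break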
def __init__(self, N=6, ds_size=80, ds="BAS"):
    self.ds = ds
    if ds == "BAS":
        self.BAS = BAS(N)
        self.ds_size = ds_size
        self.S = self.BAS.getSample(size=ds_size)
    elif ds == "MNIST":
        if (N != 28):
            raise ValueError("Please use N = 28 for the MNIST data set")
        try:
            mnist = fetch_openml('mnist_784')
        except Exception:
            print("Could not download MNIST data from OpenML, "
                  "trying alternative...")
            mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
            data_home = get_data_home(data_home=None)
            data_home = os.path.join(data_home, 'mldata')
            if not os.path.exists(data_home):
                os.makedirs(data_home)
            mnist_save_path = os.path.join(data_home, "mnist-original.mat")
            if not os.path.exists(mnist_save_path):
                print("Downloading from ", mnist_alternative_url)
                urllib.request.urlretrieve(mnist_alternative_url,
                                           mnist_save_path)
            print("Now calling fetch_openml once more")
            mnist = fetch_openml('mnist_784')
        label = np.asarray(list(map(int, mnist['target'])))
        mnist = mnist.data
        mnist = ((mnist / 255.0) + 0.5).astype(int)
        images = []
        for i in range(ds_size):
            digit = i % 10
            u = np.where(label == digit)[0]
            images.append(mnist[u[i // 10], None, :])
        self.S = np.concatenate(images, axis=0)
        self.ds_size = ds_size
    else:
        raise ValueError("Unknown data set name")
def _get_latest_version_offline(package_name):
    """
    Get the latest downloaded version of the package.

    Returns None if not found.
    """
    home = pathlib.Path(get_data_home())  # Should allow providing data home?

    downloaded_packages = tuple(home.glob(package_name + "_*.tar.gz"))

    if downloaded_packages:
        versions = [
            LooseVersion(p.name[(len(package_name) + 1):-len(".tar.gz")])
            for p in downloaded_packages]
        versions.sort()
        latest_version = versions[-1]
        return str(latest_version)
    else:
        return None
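# Self-contained check of the version-sorting logic used above; the package
# and file names below are made up for illustration.
if __name__ == "__main__":
    from distutils.version import LooseVersion

    names = ["pkg_0.9.tar.gz", "pkg_1.2.tar.gz", "pkg_1.10.tar.gz"]
    versions = sorted(LooseVersion(n[len("pkg_"):-len(".tar.gz")])
                      for n in names)
    # LooseVersion compares component-wise, so 1.10 > 1.2 (not lexicographic).
    assert str(versions[-1]) == "1.10"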
def fetch_classic(data_home=None, subset='all', categories=None,
                  shuffle=True, random_state=42, remove=(),
                  download_if_missing=True):
    """Load the filenames and data from the classic dataset.

    Parameters
    ----------
    subset: 'train' or 'test', 'all', optional
        Select the dataset to load: 'train' for the training set, 'test'
        for the test set, 'all' for both, with shuffled ordering.

    data_home: optional, default: None
        Specify a download and cache folder for the datasets. If None,
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    categories: None or collection of string or unicode
        If None (default), load all the categories.
        If not None, list of category names to load (other categories
        ignored).

    shuffle: bool, optional
        Whether or not to shuffle the data: might be important for models that
        make the assumption that the samples are independent and identically
        distributed (i.i.d.), such as stochastic gradient descent.

    random_state: numpy random number generator or seed integer
        Used to shuffle the dataset.

    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    remove: tuple
        May contain any subset of ('headers', 'footers', 'quotes'). Each of
        these are kinds of text that will be detected and removed from the
        posts, preventing classifiers from overfitting on metadata.
        'headers' removes headers, 'footers' removes blocks at the ends of
        posts that look like signatures, and 'quotes' removes lines that
        appear to be quoting another post.
        'headers' follows an exact standard; the other filters are not always
        correct.
    """
    data_home = get_data_home(data_home=data_home)
    cache_path = _pkl_filepath(data_home, CACHE_NAME)
    classic_home = os.path.join(data_home, CLASSIC_HOME)
    cache = None
    if os.path.exists(cache_path):
        try:
            with open(cache_path, 'rb') as f:
                compressed_content = f.read()
            uncompressed_content = codecs.decode(
                compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        except Exception as e:
            print(80 * '_')
            print('Cache loading failed')
            print(80 * '_')
            print(e)

    if cache is None:
        if download_if_missing:
            cache = download_classic(target_dir=classic_home,
                                     cache_path=cache_path)
        else:
            raise IOError('classic dataset not found')

    if subset in ('train', 'test'):
        data = cache[subset]
    elif subset == 'all':
        data_lst = list()
        target = list()
        filenames = list()
        for subset in ('train', 'test'):
            data = cache[subset]
            data_lst.extend(data.data)
            target.extend(data.target)
            filenames.extend(data.filenames)

        data.data = data_lst
        data.target = np.array(target)
        data.filenames = np.array(filenames)
    else:
        raise ValueError(
            "subset can only be 'train', 'test' or 'all', got '%s'" % subset)

    data.description = 'the classic dataset'

    # if 'headers' in remove:
    #     data.data = [strip_newsgroup_header(text) for text in data.data]
    # if 'footers' in remove:
    #     data.data = [strip_newsgroup_footer(text) for text in data.data]
    # if 'quotes' in remove:
    #     data.data = [strip_newsgroup_quoting(text) for text in data.data]

    if categories is not None:
        labels = [(data.target_names.index(cat), cat) for cat in categories]
        # Sort the categories to have the ordering of the labels
        labels.sort()
        labels, categories = zip(*labels)
        mask = np.in1d(data.target, labels)
        data.filenames = data.filenames[mask]
        data.target = data.target[mask]
        # searchsorted to have continuous labels
        data.target = np.searchsorted(labels, data.target)
        data.target_names = list(categories)
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[mask]
        data.data = data_lst.tolist()

    if shuffle:
        random_state = check_random_state(random_state)
        indices = np.arange(data.target.shape[0])
        random_state.shuffle(indices)
        data.filenames = data.filenames[indices]
        data.target = data.target[indices]
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[indices]
        data.data = data_lst.tolist()

    return data
def __init__(self, N=28, input=100, ds_size=80, ds="MNIST",
             speed_threshold=50):
    self.ds = ds
    if ds == "BAS":
        bas_size = 30
        data_home = get_data_home(data_home=None)
        data_home = os.path.join(data_home, 'BAS')
        datafile = os.path.join(data_home, 'BAS.pkl')
        if not os.path.exists(data_home):
            os.makedirs(data_home)
        if not os.path.exists(datafile):
            _BAS = BAS(N)
            BAS_data = _BAS.getSample(size=bas_size)
            with open(datafile, "wb") as f:
                pickle.dump(BAS_data, f)
        else:
            with open(datafile, "rb") as f:
                BAS_data = pickle.load(f)
        S = BAS_data[0:ds_size, None, :]
        self.S = np.concatenate(S, axis=0)
        self.ds_size = ds_size
        self.test_data = self.S
        self.ground_truth = self.S
    elif ds == "4bits":
        self.S = np.array([[1, 0, 0, 0],
                           [0, 1, 0, 0],
                           [0, 0, 1, 0],
                           [0, 0, 0, 1]])
        self.ds_size = 4
        self.test_data = self.S
        self.ground_truth = self.S
    elif ds == "traffic":
        traffic_data = Traffic_data(input, 0, 50)
        self.S = traffic_data.train
        self.test_data = traffic_data.test
        self.ground_truth = traffic_data.ground_truth
        self.ds_size = traffic_data.ds_size
    elif ds == "small_traffic":
        with open("small_traffic_dataset.pkl", "rb") as pickle_file:
            data = pickle.load(pickle_file)
        self.S = data["train"] > speed_threshold
        self.ds_size = self.S.shape[0]
        self.test_data = data["test"] > speed_threshold
    elif ds == "MNIST":
        if (N != 28):
            raise ValueError("Please use N = 28 for the MNIST data set")
        try:
            # fetch_mldata relied on the retired mldata.org service and has
            # been removed from recent scikit-learn releases.
            custom_data_home = "C:/Users/Hamco/scikit_learn_data"
            mnist = fetch_mldata('MNIST original', data_home=custom_data_home)
        except Exception:
            print("Could not download MNIST data from mldata.org, "
                  "trying alternative...")
            mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
            data_home = get_data_home(data_home=None)
            data_home = os.path.join(data_home, 'mldata')
            if not os.path.exists(data_home):
                os.makedirs(data_home)
            mnist_save_path = os.path.join(data_home, "mnist-original.mat")
            if not os.path.exists(mnist_save_path):
                print("Downloading from ", mnist_alternative_url)
                urllib.request.urlretrieve(mnist_alternative_url,
                                           mnist_save_path)
            print("Now calling fetch_mldata once more")
            mnist = fetch_mldata('MNIST original')
        label = mnist['target']
        mnist = mnist.data
        mnist = ((mnist / 255.0) + 0.5).astype(int)
        images = []
        for i in range(ds_size):
            digit = i % 10
            u = np.where(label == digit)[0]
            images.append(mnist[u[i // 10], None, :])
        self.S = np.concatenate(images, axis=0)
        self.resize(20, 20)
        self.ds_size = ds_size
    else:
        raise ValueError("Unknown data set name")
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-n", "--name", default='MNIST_A_default')
parser.add_argument("-i", "--iteration", type=int, default=1000)
tp = lambda x: list(map(int, x.split(',')))
parser.add_argument("-nl", '--number_list', type=tp,
                    default="0,1,2,3,4,5,6,7,8,9")
parser.add_argument("-pxl", '--pos_x_list', type=tp, default="0,36")
parser.add_argument("-pyl", '--pos_y_list', type=tp, default="0,36")
parser.add_argument("-al", '--angle_list', type=tp, default="-45,0,45")
parser.add_argument("-sl", '--scale_list', type=tp, default="28,16")
args = parser.parse_args()

mnist_path = join(get_data_home(), "mldata/mnist-original.mat")
if not exists(os.path.dirname(mnist_path)):
    os.makedirs(os.path.dirname(mnist_path))
# Only download the .mat file when it is not cached yet.
if not exists(mnist_path):
    mnist_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    urllib.request.urlretrieve(mnist_url, mnist_path)
mnist = fetch_mldata('MNIST original', data_home=get_data_home())

mnist_data = mnist["data"] / 255.
mnist_target = mnist["target"].astype(int)
train_X, test_X, train_y, test_y = train_test_split(
    mnist_data, mnist_target, random_state=42, test_size=10000)
test_X, valid_X, test_y, valid_y = train_test_split(
    test_X, test_y, random_state=42, test_size=2000)


def change_scale(image, scale):
    image = resize(image, (scale, scale), mode='constant')
    pad_size = int((28 - scale) / 2)
def fetch_species_distributions(data_home=None, download_if_missing=True):
    """Loader for species distribution dataset from Phillips et. al. (2006)

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing: optional, True by default
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Notes
    -----
    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et. al. (2006).

    The two species are:

    - `"Bradypus variegatus"
      <http://www.iucnredlist.org/apps/redlist/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus"
      <http://www.iucnredlist.org/apps/redlist/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in Peru,
      Colombia, Ecuador, and Venezuela.

    The data is returned as a Bunch object with the following attributes:

    coverages : array, shape = [14, 1592, 1212]
        These represent the 14 features measured at each point of the map
        grid. The latitude/longitude values for the grid are discussed below.
        Missing data is represented by the value -9999.

    train : record array, shape = (1623,)
        The training points for the data. Each point has three fields:

        - train['species'] is the species name
        - train['dd long'] is the longitude, in degrees
        - train['dd lat'] is the latitude, in degrees

    test : record array, shape = (619,)
        The test points for the data. Same format as the training data.

    Nx, Ny : integers
        The number of longitudes (x) and latitudes (y) in the grid

    x_left_lower_corner, y_left_lower_corner : floats
        The (x,y) position of the lower-left corner, in degrees

    grid_size : float
        The spacing between points of the grid, in degrees

    References
    ----------
    * `"Maximum entropy modeling of species geographic distributions"
      <http://www.cs.princeton.edu/~schapire/papers/ecolmod.pdf>`_
      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
      190:231-259, 2006.

    Notes
    -----
    * See examples/applications/plot_species_distribution_modeling.py
      for an example of using this dataset with scikit-learn
    """
    data_home = get_data_home(data_home)
    if not exists(data_home):
        makedirs(data_home)

    # Define parameters for the data files. These should not be changed
    # unless the data model changes. They will be saved in the npz file
    # with the downloaded data.
    extra_params = dict(x_left_lower_corner=-94.8,
                        Nx=1212,
                        y_left_lower_corner=-56.05,
                        Ny=1592,
                        grid_size=0.05)
    dtype = np.int16

    if not exists(join(data_home, DATA_ARCHIVE_NAME)):
        print('Downloading species data from %s to %s' % (SAMPLES_URL,
                                                          data_home))
        X = np.load(BytesIO(urlopen(SAMPLES_URL).read()))

        for f in X.files:
            fhandle = BytesIO(X[f])
            if 'train' in f:
                train = _load_csv(fhandle)
            if 'test' in f:
                test = _load_csv(fhandle)

        print('Downloading coverage data from %s to %s' % (COVERAGES_URL,
                                                           data_home))
        X = np.load(BytesIO(urlopen(COVERAGES_URL).read()))

        coverages = []
        for f in X.files:
            fhandle = BytesIO(X[f])
            print(' - converting', f)
            coverages.append(_load_coverage(fhandle))
        coverages = np.asarray(coverages, dtype=dtype)

        bunch = Bunch(coverages=coverages,
                      test=test,
                      train=train,
                      **extra_params)
        joblib.dump(bunch, join(data_home, DATA_ARCHIVE_NAME), compress=9)
    else:
        bunch = joblib.load(join(data_home, DATA_ARCHIVE_NAME))

    return bunch
def fetch_species_distributions(data_home=None, download_if_missing=True):
    """Loader for species distribution dataset from Phillips et. al. (2006)

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing: optional, True by default
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Notes
    -----
    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et. al. (2006).

    The two species are:

    - `"Bradypus variegatus"
      <http://www.iucnredlist.org/apps/redlist/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus"
      <http://www.iucnredlist.org/apps/redlist/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in Peru,
      Colombia, Ecuador, and Venezuela.

    The data is returned as a Bunch object with the following attributes:

    coverages : array, shape = [14, 1592, 1212]
        These represent the 14 features measured at each point of the map
        grid. The latitude/longitude values for the grid are discussed below.
        Missing data is represented by the value -9999.

    train : record array, shape = (1623,)
        The training points for the data. Each point has three fields:

        - train['species'] is the species name
        - train['dd long'] is the longitude, in degrees
        - train['dd lat'] is the latitude, in degrees

    test : record array, shape = (619,)
        The test points for the data. Same format as the training data.

    Nx, Ny : integers
        The number of longitudes (x) and latitudes (y) in the grid

    x_left_lower_corner, y_left_lower_corner : floats
        The (x,y) position of the lower-left corner, in degrees

    grid_size : float
        The spacing between points of the grid, in degrees

    References
    ----------
    * `"Maximum entropy modeling of species geographic distributions"
      <http://www.cs.princeton.edu/~schapire/papers/ecolmod.pdf>`_
      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
      190:231-259, 2006.

    Notes
    -----
    * See examples/applications/plot_species_distribution_modeling.py
      for an example of using this dataset with scikit-learn
    """
    data_home = get_data_home(data_home)
    if not exists(data_home):
        makedirs(data_home)

    # Define parameters for the data files. These should not be changed
    # unless the data model changes. They will be saved in the npz file
    # with the downloaded data.
    extra_params = dict(x_left_lower_corner=-94.8,
                        Nx=1212,
                        y_left_lower_corner=-56.05,
                        Ny=1592,
                        grid_size=0.05)
    dtype = np.int16

    # Python 2 variant of the loader above (urllib2/StringIO, print statements).
    if not exists(join(data_home, DATA_ARCHIVE_NAME)):
        print 'Downloading species data from %s to %s' % (SAMPLES_URL, data_home)
        X = np.load(StringIO(urllib2.urlopen(SAMPLES_URL).read()))

        for f in X.files:
            fhandle = StringIO(X[f])
            if 'train' in f:
                train = _load_csv(fhandle)
            if 'test' in f:
                test = _load_csv(fhandle)

        print 'Downloading coverage data from %s to %s' % (COVERAGES_URL, data_home)
        X = np.load(StringIO(urllib2.urlopen(COVERAGES_URL).read()))

        coverages = []
        for f in X.files:
            fhandle = StringIO(X[f])
            print ' - converting', f
            coverages.append(_load_coverage(fhandle))
        coverages = np.asarray(coverages, dtype=dtype)

        bunch = Bunch(coverages=coverages,
                      test=test,
                      train=train,
                      **extra_params)
        joblib.dump(bunch, join(data_home, DATA_ARCHIVE_NAME))
    else:
        bunch = joblib.load(join(data_home, DATA_ARCHIVE_NAME))

    return bunch
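# Hedged usage sketch: scikit-learn ships an equivalent loader, so the same
# cached Bunch can be obtained directly (network needed on the first call).
if __name__ == "__main__":
    from sklearn.datasets import fetch_species_distributions

    species = fetch_species_distributions()
    print(species.coverages.shape)                   # (14, 1592, 1212) feature grids
    print(species.train.shape, species.test.shape)   # (1623,) and (619,) sample points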
def _fetch_brute_indoor_pos(data_home=None, is_train=True, remove_dup=False):
    """Load the indoor position dataset

    Parameters
    ----------
    data_home : string, optional
        Specify the folder for the datasets.

    Returns
    -------
    dataset : dict-like object with the following attributes:

        dataset.data : numpy array of shape (xxx, 12)
            Each row corresponds to the 12 features in the dataset.

        dataset.target_x, dataset.target_y : numpy arrays of shape (xxx,)
            The x and y position for each sample.
    """
    data_home = get_data_home(data_home=data_home)
    if is_train:
        indoorpos_trainfile = join(data_home, "train.csv")
    else:
        indoorpos_trainfile = join(data_home, "test.csv")
    available = exists(indoorpos_trainfile)

    if available:
        dt = [('x', int), ('y', int),
              ('2.1G(10)', float), ('2.1G(11)', float), ('2.1G(12)', float),
              ('2.1G(4)', float), ('2.1G(7)', float), ('2.1G(8)', float),
              ('3.5G(10)', float), ('3.5G(11)', float), ('3.5G(12)', float),
              ('3.5G(4)', float), ('3.5G(7)', float), ('3.5G(8)', float)]
        DT = np.dtype(dt)

        Xy = []
        with open(indoorpos_trainfile, mode='r') as file_:
            linenum = 0
            for line in file_.readlines():
                if linenum > 0:  # skip the header line
                    Xy.append(line.replace('\n', '').split(','))
                linenum = linenum + 1
        Xy = np.asarray(Xy, dtype=object)

        if is_train:
            for j in range(len(dt)):
                Xy[:, j] = Xy[:, j].astype(DT[j])
        else:
            for j in range(2, len(dt)):
                Xy[:, j] = Xy[:, j].astype(DT[j])

        X = Xy[:, 2:14]
        if not is_train:
            return Bunch(data=X, target_x=[], target_y=[], orig_data=X)

        seen = {}
        indexes = []
        X1 = []
        y1 = Xy[:, 0:1]
        y2 = Xy[:, 1:2]
        # duplicate entries processing
        for i in range(len(X)):
            if not remove_dup:
                indexes.append(i)
                X1.append(X[i])
                continue
            str1 = ''.join(str(e) for e in X[i])
            if str1 in seen:
                continue
            seen[str1] = i
            indexes.append(i)
            X1.append(X[i])

        new_y1 = []
        new_y2 = []
        y11 = np.asarray(np.ravel(y1), dtype=int)
        y21 = np.asarray(np.ravel(y2), dtype=int)
        for i in indexes:
            new_y1 = np.append(new_y1, y11[i])
            new_y2 = np.append(new_y2, y21[i])
    elif not available:
        raise IOError("Data not found")

    return Bunch(data=X1, target_x=new_y1, target_y=new_y2, orig_data=X)
from sklearn.datasets.base import get_data_home
get_data_home()

from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')

import pandas as pd
pixels = pd.DataFrame(mnist.data)
labels = pd.DataFrame(mnist.target)
pixels.loc[0].values
labels.loc[0].values

import numpy as np
#%matplotlib inline
import matplotlib.pyplot as plt

label = labels.loc[0]
pixel = pixels.loc[0]
pixel = np.array(pixel, dtype='uint8')
pixel = pixel.reshape((28, 28))
plt.title('Label is {label}'.format(label=label))
plt.imshow(pixel, cmap='gray')
plt.show()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    mnist.data, mnist.target, test_size=1/7.0)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
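# Hedged note: fetch_mldata was removed from scikit-learn after mldata.org shut
# down. A minimal sketch of the same MNIST load with the current fetch_openml:
from sklearn.datasets import fetch_openml

mnist_openml = fetch_openml('mnist_784', version=1, as_frame=False)
X_openml = mnist_openml.data                    # shape (70000, 784) pixel array
y_openml = mnist_openml.target.astype(int)      # string labels '0'..'9' -> ints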
def __download_open_dataset(data_home=None, download_if_missing=True):
    """Helper function to download any missing SD15-CH1 data.

    The dataset will be stored like this:

        ${data_home}/smartdoc15-ch1_home/frames:
        ├── background01
        │   ├── datasheet001
        │   │   ├── frame_0001.jpeg
        │   │   ├── [...]
        │   │   └── frame_0235.jpeg
        │   ├── [...]
        │   └── tax005
        │       └── [...]
        ├── background02
        │   └── [...]
        ├── background03
        │   └── [...]
        ├── background04
        │   └── [...]
        ├── background05
        │   └── [...]
        └── metadata.csv.gz

        ${data_home}/smartdoc15-ch1_home/models:
        ├── 01-original
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 02-edited
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 03-captured-nexus
        │   ├── datasheet001.jpg  # JPG images here
        │   ├── [...]
        │   └── tax005.jpg
        ├── 04-corrected-nexus
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 05-corrected-nexus-scaled33
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── correct_perspective.m
        └── original_datasets_files.txt
    """
    data_home = get_data_home(data_home=data_home)
    sd15ch1_home = os.path.join(data_home, SD15CH1_DIRNAME)

    if not os.path.exists(sd15ch1_home):
        os.makedirs(sd15ch1_home)

    data_dirs = {}
    for subdir, (archive, size, description) in six.iteritems(DATASET_CONTENT):
        data_folder_path = os.path.join(sd15ch1_home, subdir)
        data_dirs[subdir] = data_folder_path

        if not os.path.exists(data_folder_path):
            archive_path = os.path.join(sd15ch1_home, archive.filename)
            # (later) FIXME this is a naive test for existing files
            if not os.path.exists(archive_path):
                if download_if_missing:
                    __info("Downloading file %s (%s): %s"
                           % (archive.filename, size, archive.url))
                    _fetch_remote(archive, dirname=sd15ch1_home)
                else:
                    __err("%s is missing" % archive_path, IOError)

            __info("Decompressing the data archive to %s"
                   % (data_folder_path, ))
            tarfile.open(archive_path, "r:gz").extractall(path=data_folder_path)
            os.remove(archive_path)

    return data_dirs
def get_sd15ch1_basedir_models(data_home=None):
    data_home = get_data_home(data_home=data_home)
    sd15ch1_home = os.path.join(data_home, SD15CH1_DIRNAME)
    basedir = os.path.join(sd15ch1_home, "models")
    return basedir
# coding: utf-8

# In[12]:

### If fetch_mldata('MNIST original') fails with a RemoteDisconnected error:
# download mnist-original.mat from
# https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat

from sklearn.datasets.base import get_data_home
print(get_data_home())  # place mnist-original.mat in the path printed here

# In[13]:

### Rewritten because fetch_mldata('MNIST original') raises RemoteDisconnected
# from sklearn.datasets import fetch_mldata
# mnist = fetch_mldata('MNIST original', data_home=".")  # data_home is the folder to save to

from sklearn.datasets import fetch_openml
mnist = fetch_openml(
    'mnist_784',
    version=1,
)

# Preprocess the MNIST data
# Scale the 0-255 values of each 28x28-pixel image to 0-1
X = mnist.data / 255

# Get the target labels
y = mnist.target
def fetch_uci_glass_outlier(data_home=None, shuffle=False, random_state=0,
                            download_if_missing=True):
    """Load the UCI glass dataset, relabelled for outlier detection.

    Download it if necessary.

    =================   =====================
    Classes             6
    Samples total       214
    Dimensionality      9
    Features            real
    =================   =====================

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    shuffle : boolean, optional
        If True the order of the dataset is shuffled to avoid having samples
        of the same class grouped.

    random_state : int, RandomState instance or None (default=0)
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    download_if_missing : optional, True by default
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    data : numpy array of shape (214, 9)
        Each row corresponds to the 9 glass features.

    target : numpy array of shape (214, )
        Label of each sample: +1 for inliers and -1 for outliers (the
        original class 6 is treated as the outlier class).
    """
    global GLASS

    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)
    filepath = _pkl_filepath(data_home, 'uci_glass_outlier.pkz')
    if not exists(filepath):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")
        print('downloading UCI GLASS from %s to %s' % (GLASS.url, data_home))
        data_path = _fetch_remote(GLASS, dirname=data_home)
        glass = np.genfromtxt(data_path, delimiter=",")
        # the class 6 (minority) as outlier and all other classes as inliers
        glass[:, -1] = 2 * (glass[:, -1] != 6) - 1
        _joblib.dump(glass, filepath, compress=6)
        remove(data_path)
    else:
        glass = _joblib.load(filepath)

    feature = glass[:, 1:-1]
    target = glass[:, -1]
    if shuffle:
        random_state = check_random_state(random_state)
        order = random_state.permutation(len(glass))
        feature = feature[order]
        target = target[order]

    return (feature, target)
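# Hedged usage sketch for fetch_uci_glass_outlier above (downloads the UCI
# glass data on first call; GLASS and the helpers must be defined at module
# level as in the snippet).
if __name__ == "__main__":
    feature, target = fetch_uci_glass_outlier(shuffle=True, random_state=0)
    print(feature.shape)       # (214, 9)
    print(np.unique(target))   # -1 marks the outlier class (original class 6)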
    max_for_C = (accuracy["C"][argmax], accuracy["avg_accuracy"][argmax])
    plt.annotate(f"C={max_for_C[0]} Avg. Accuracy={max_for_C[1]:.3f}",
                 xy=max_for_C)
    # plt.xlim((eta_0_values[0], eta_0_values[-1]))
    plt.xticks(np.arange(len(c_values)), c_values)
    plt.xlabel("C")
    plt.xscale('log')
    plt.ylabel("Accuracy")
    plt.savefig(f"section-2-b-{label}.png")
    plt.show()
    return max_for_C[0]


if __name__ == "__main__":
    from sklearn.datasets.base import get_data_home
    print(get_data_home())
    train_data, train_labels, validation_data, validation_labels, test_data, test_labels = helper()
    # section_1_a()
    # section_1_b()
    # section_1_c()
    # section_1_d()
    #
    # section_2_a()
    # section_2_b()
    # section_2_c()
    section_2_d()