def test_data_home(data_home):
    # get_data_home will point to a pre-existing folder
    result = get_data_home(data_home=data_home)
    assert result == data_home
    assert os.path.exists(result)

    # clear_data_home will delete both the content and the folder itself
    clear_data_home(data_home=result)
    assert not os.path.exists(result)

    # if the folder is missing it will be created again
    result = get_data_home(data_home=data_home)
    assert os.path.exists(result)
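# The `data_home` argument above is supplied by a pytest fixture. A minimal
# sketch of such a fixture, assuming pytest's built-in tmp_path_factory; the
# fixture body is illustrative, not taken from the original source:
import pytest

@pytest.fixture
def data_home(tmp_path_factory):
    # Fresh temporary directory to serve as the data home for one test
    return str(tmp_path_factory.mktemp("mrex_data_home"))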
def setup_working_with_text_data():
    if IS_PYPY and os.environ.get('CI', None):
        raise SkipTest('Skipping too slow test with PyPy on CI')
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.
    """
    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        # Download the dataset (the original used a no-op string literal
        # here; a comment is the correct form).
        print("downloading dataset (once and for all) into %s" % data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                sys.stdout.write(
                    '\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb))

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urlretrieve(DOWNLOAD_URL, filename=archive_path,
                    reporthook=progress)
        if _not_in_sphinx():
            sys.stdout.write('\r')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc
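# A minimal usage sketch for the generator above (the first call needs
# network access, since it triggers the download):
for doc in stream_reuters_documents():
    print(doc['title'], doc['topics'])
    break  # stop after the first document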
import os

import numpy as np
from joblib import Memory

from mrex.datasets import fetch_openml, get_data_home
from mrex.ensemble import ExtraTreesClassifier
from mrex.ensemble import RandomForestClassifier
from mrex.dummy import DummyClassifier
from mrex.kernel_approximation import Nystroem
from mrex.kernel_approximation import RBFSampler
from mrex.metrics import zero_one_loss
from mrex.pipeline import make_pipeline
from mrex.svm import LinearSVC
from mrex.tree import DecisionTreeClassifier
from mrex.utils import check_array
from mrex.linear_model import LogisticRegression
from mrex.neural_network import MLPClassifier

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_openml('mnist_784')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255

    # Create the standard MNIST train-test split (first 60000 samples for
    # training, remaining 10000 for testing)
    n_train = 60000
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
    return X_train, X_test, y_train, y_test
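# Sketch of how the memoized loader is consumed: the first call downloads and
# caches the arrays under <data_home>/mnist_benchmark_data; later calls reload
# them as read-only memory maps without refetching.
X_train, X_test, y_train, y_test = load_data(order='F')
print("Train:", X_train.shape, "Test:", X_test.shape)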
def setup_twenty_newsgroups():
    data_home = get_data_home()
    cache_path = _pkl_filepath(data_home, CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
def setup_rcv1():
    check_skip_network()
    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = join(get_data_home(), "RCV1")
    if not exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
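# To keep the rcv1.rst doctests from being skipped, the dataset must be
# fetched beforehand. A hedged sketch, assuming mrex mirrors scikit-learn's
# fetch_rcv1 API and stores its files under <data_home>/RCV1:
from mrex.datasets import fetch_rcv1
fetch_rcv1(download_if_missing=True)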
def setup_labeled_faces():
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
import os

import numpy as np
from joblib import Memory

from mrex.datasets import fetch_covtype, get_data_home
from mrex.svm import LinearSVC
from mrex.linear_model import SGDClassifier, LogisticRegression
from mrex.naive_bayes import GaussianNB
from mrex.tree import DecisionTreeClassifier
from mrex.ensemble import RandomForestClassifier, ExtraTreesClassifier
from mrex.ensemble import GradientBoostingClassifier
from mrex.metrics import zero_one_loss
from mrex.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    # Binarize the target: class 1 vs. the rest (np.int is removed from
    # recent NumPy releases, so use the builtin int)
    y = (data['target'] != 1).astype(int)

    # Create train-test split (first 522911 samples for training, the
    # conventional split for this benchmark)
    n_train = 522911
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
    return X_train, X_test, y_train, y_test
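# Sketch of how the cached loader feeds a benchmark run: fit one of the
# imported classifiers and report the zero-one loss on the held-out split.
X_train, X_test, y_train, y_test = load_data()
clf = GaussianNB()
clf.fit(X_train, y_train)
print("zero-one loss: %.4f" % zero_one_loss(y_test, clf.predict(X_test)))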