Beispiel #1
0
def movielens100k(processed=False):
    """Load the Movielens-100k dataset: 100,000 ratings (1-5) given by 943 users to 1682 movies.

    F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on
    Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
    DOI=http://dx.doi.org/10.1145/2827872

    Parameters
    ----------
    processed: bool, default False,
        if True, the pre-computed user-item ratings matrix stored in ``ratings_mat.npy`` is loaded (documented
        shape (943, 1682)). If False, the raw tab-separated ``u.data`` file is parsed line by line.

    Returns
    -------
    ratings: np.ndarray when `processed` is True. Otherwise a list whose first element is the header
        ``['userId', 'movieId', 'rating']``, followed by one ``parse_line`` result per line of the raw file.
    """
    dataset_dir = get_ml_data_dir_path() / 'movielens-100k'

    # Processed variant: the matrix is stored ready to use, nothing to parse.
    if processed:
        return np.load(str(dataset_dir / 'ratings_mat.npy'))

    # Raw variant: start with the header row, then append every parsed line.
    ratings = [['userId', 'movieId', 'rating']]
    with open(str(dataset_dir / 'u.data')) as raw_file:
        for raw_line in raw_file:
            ratings.append(parse_line(raw_line, sep='\t'))
    return ratings
Beispiel #2
0
def STL10(unlabeled=False):
    """Data Loader for the STL10 dataset.

    If the ``STL10`` folder does not exist inside the ml data directory, it is created and the archive is
    downloaded and extracted into it before the binary files are read.

    Parameters
    ----------
    unlabeled: bool, default to False,
        if `True` returns also the unlabeled part of the dataset

    Returns
    -------
    (X_train, y_train) : tuple of np.ndarray of np.uint8, of shape (5000, 3, 96, 96) and (5000,)
        train STL10 images and labels.
    (X_test, y_test) : tuple of np.ndarray of np.uint8, of shape (8000, 3, 96, 96) and (8000,)
        test STL10 images and labels.
    X_unlabeled: np.ndarray of np.uint8, of shape (100000, 3, 96, 96),
        unlabeled images from STL10. Only returned when `unlabeled` is True.
    """
    url = 'http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz'

    data_home = get_ml_data_dir_path()
    dataset_dir = data_home / 'STL10'
    file_path = dataset_dir / url.split('/')[-1]

    if not dataset_dir.is_dir():
        os.mkdir(str(dataset_dir))
        print('Downloading {}'.format(url))
        download(url, str(dataset_dir))
        # Use a context manager so the archive handle is closed after extraction.
        with tarfile.open(str(file_path), 'r:gz') as archive:
            archive.extractall(str(dataset_dir))

    binary_data_path = dataset_dir / 'stl10_binary'

    def _read_uint8(file_name):
        # Read a whole binary file as a flat array of unsigned bytes.
        with open(str(binary_data_path / file_name), 'rb') as f:
            return np.fromfile(f, dtype=np.uint8)

    def _read_images(file_name):
        # Image files are reshaped to (n_images, 3, 96, 96).
        return np.reshape(_read_uint8(file_name), (-1, 3, 96, 96))

    X_train = _read_images('train_X.bin')
    X_test = _read_images('test_X.bin')
    y_train = _read_uint8('train_y.bin')
    y_test = _read_uint8('test_y.bin')

    if unlabeled:
        X_unlabeled = _read_images('unlabeled_X.bin')
        return (X_train, y_train), (X_test, y_test), X_unlabeled

    return (X_train, y_train), (X_test, y_test)
Beispiel #3
0
def CIFAR10():
    """Data Loader for the CIFAR10 dataset.

    If the ``CIFAR10`` folder does not exist inside the ml data directory, it is created and the archive is
    downloaded and extracted into it before the pickled batches are read.

    Returns
    -------
    (X_train, y_train) : tuple of np.ndarray, of shape (50000, 3, 32, 32) and (50000,)
        train CIFAR10 images and labels (the five training batches concatenated in order).
    (X_test, y_test) : tuple of np.ndarray, of shape (10000, 3, 32, 32) and (10000,)
        test CIFAR10 images and labels.
    """

    url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    data_home = get_ml_data_dir_path()
    dataset_dir = data_home / 'CIFAR10'
    file_path = dataset_dir / url.split('/')[-1]

    if not dataset_dir.is_dir():
        os.mkdir(str(dataset_dir))
        print('Downloading {}'.format(url))
        download(url, str(dataset_dir))
        # Use a context manager so the archive handle is closed after extraction.
        with tarfile.open(str(file_path), 'r:gz') as archive:
            archive.extractall(str(dataset_dir))

    binary_data_path = dataset_dir / 'cifar-10-batches-py'
    batches = [
        'data_batch_1', 'data_batch_2', 'data_batch_3', 'data_batch_4',
        'data_batch_5'
    ]

    def _load_batch(batch_file):
        # NOTE(security): pickle.load can execute arbitrary code; this is only
        # acceptable because the files come from the dataset archive above.
        with open(str(binary_data_path / batch_file), 'rb') as fo:
            return pickle.load(fo, encoding='latin1')

    train_images = []
    train_labels = []
    for batch_file in batches:
        batch = _load_batch(batch_file)
        train_images.append(batch['data'])
        train_labels.append(batch['labels'])

    test_batch = _load_batch('test_batch')
    test_images = test_batch['data']
    test_labels = test_batch['labels']

    # noinspection PyArgumentList
    X_train = np.concatenate(train_images, axis=0).reshape(-1, 3, 32, 32)
    X_test = test_images.reshape(-1, 3, 32, 32)

    y_train = np.concatenate(train_labels)
    y_test = np.asarray(test_labels)

    return (X_train, y_train), (X_test, y_test)
Beispiel #4
0
def CIFAR100():
    """Data Loader for the CIFAR100 dataset.

    If the ``CIFAR100`` folder does not exist inside the ml data directory, it is created and the archive is
    downloaded and extracted into it before the pickled files are read. Labels are the ``fine_labels``
    entries of the pickled dictionaries.

    Returns
    -------
    (X_train, y_train) : tuple of np.ndarray, of shape (50000, 3, 32, 32) and (50000,)
        train CIFAR100 images and fine labels.
    (X_test, y_test) : tuple of np.ndarray, of shape (10000, 3, 32, 32) and (10000,)
        test CIFAR100 images and fine labels.
    """

    url = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
    data_home = get_ml_data_dir_path()
    dataset_dir = data_home / 'CIFAR100'
    file_path = dataset_dir / url.split('/')[-1]

    if not dataset_dir.is_dir():
        os.mkdir(str(dataset_dir))
        print('Downloading {}'.format(url))
        download(url, str(dataset_dir))
        # Use a context manager so the archive handle is closed after extraction.
        with tarfile.open(str(file_path), 'r:gz') as archive:
            archive.extractall(str(dataset_dir))

    binary_data_paths = [
        str(data_home / 'CIFAR100/cifar-100-python/train'),
        str(data_home / 'CIFAR100/cifar-100-python/test')
    ]

    def _load_split(path):
        # NOTE(security): pickle.load can execute arbitrary code; this is only
        # acceptable because the files come from the dataset archive above.
        with open(path, 'rb') as fo:
            return pickle.load(fo, encoding='latin1')

    train_data = _load_split(binary_data_paths[0])
    train_images = train_data['data']
    train_labels = train_data['fine_labels']

    test_data = _load_split(binary_data_paths[1])
    test_images = test_data['data']
    test_labels = test_data['fine_labels']

    X_train = train_images.reshape(-1, 3, 32, 32)
    X_test = test_images.reshape(-1, 3, 32, 32)

    y_train = np.asarray(train_labels)
    y_test = np.asarray(test_labels)

    return (X_train, y_train), (X_test, y_test)
Beispiel #5
0
def FashionMNIST():
    """Data Loader for the FashionMNIST dataset.

    If the ``FashionMNIST`` folder does not exist inside the ml data directory, it is created and the four
    gzip files are downloaded into it before being read.

    Returns
    -------
    (X_train, y_train) : tuple of np.ndarray of np.uint8, of shape (60000, 28, 28) and (60000,)
        train FashionMNIST images and labels.
    (X_test, y_test) : tuple of np.ndarray of np.uint8, of shape (10000, 28, 28) and (10000,)
        test FashionMNIST images and labels.
    """
    urls = [
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
    ]

    data_home = get_ml_data_dir_path()
    dataset_dir = data_home / 'FashionMNIST'

    if not dataset_dir.is_dir():
        os.mkdir(str(dataset_dir))
        for url in urls:
            print('Downloading {}'.format(url))
            download(url, str(dataset_dir))

    # Local file paths, in the same order as `urls`.
    paths = [str(dataset_dir / url.split('/')[-1]) for url in urls]

    def _read_gz(path, offset):
        # Decompress the file and view its bytes as uint8, skipping the first
        # `offset` bytes; the gzip handle is closed as soon as reading is done.
        with gzip.open(path, 'rb') as f:
            return np.frombuffer(f.read(), np.uint8, offset=offset)

    # Label files skip 8 bytes, image files skip 16 bytes.
    y_train = _read_gz(paths[1], 8)
    X_train = _read_gz(paths[0], 16).reshape(-1, 28, 28)
    y_test = _read_gz(paths[3], 8)
    X_test = _read_gz(paths[2], 16).reshape(-1, 28, 28)
    return (X_train, y_train), (X_test, y_test)
Beispiel #6
0
def movielens20m(processed=False, id_to_movie=False):
    """Load the Movielens-20m dataset: 20000263 ratings (1-5) given by 138493 users to 27278 movies.

    F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on
    Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
    DOI=http://dx.doi.org/10.1145/2827872

    Parameters
    ----------
    processed: bool, default False,
        if True, the pre-computed user-item ratings matrix stored in ``ratings_mat.npy`` is loaded (documented
        shape (138493, 27278)). If False, ``ratings.csv`` is parsed line by line.
    id_to_movie: bool, default False,
        if True, ``movies.csv`` is parsed as well and returned as the movieId-to-name mapping.

    Returns
    -------
    ratings: np.ndarray when `processed` is True, otherwise a list with one ``parse_line`` result per line of
        ``ratings.csv``.
    id_to_movie_mapping: None or list,
        one ``parse_line`` result per line of ``movies.csv`` when `id_to_movie` is True, None otherwise.
    """
    dataset_dir = get_ml_data_dir_path() / 'movielens-20m'

    if processed:
        ratings = np.load(str(dataset_dir / 'ratings_mat.npy'))
    else:
        with open(str(dataset_dir / 'ratings.csv')) as ratings_file:
            ratings = list(map(parse_line, ratings_file))

    # Without the mapping request there is nothing more to read.
    if not id_to_movie:
        return ratings, None

    with open(str(dataset_dir / 'movies.csv')) as movies_file:
        id_to_movie_mapping = list(map(parse_line, movies_file))
    return ratings, id_to_movie_mapping
Beispiel #7
0
def SignMNIST():
    """Load the SignMNIST dataset from its two CSV files. Each training and test case carries
    a label (0-25) as a one-to-one map for each alphabetic letter A-Z.

    https://www.kaggle.com/datamunge/sign-language-mnist/home

    The dataset is not downloaded automatically: the ``SignMNIST`` folder must already exist inside the
    ml data directory. In each CSV the header line is skipped, the first column is used as the label and
    the remaining columns as the image values.

    Returns
    -------
    (X_train, y_train) : tuple of np.ndarray of np.uint8, of shape (27455, 784) and (27455,)
        train flattened SignMNIST images and labels.
    (X_test, y_test) : tuple of np.ndarray of np.uint8, of shape (7172, 784) and (7172,)
        test flattened SignMNIST images and labels.

    Raises
    ------
    FileNotFoundError
        if the ``SignMNIST`` folder is missing from the ml data directory.
    """
    data_home = get_ml_data_dir_path()
    dataset_dir = data_home / 'SignMNIST'

    if not dataset_dir.is_dir():
        message = 'Download and unzip the dataset from {} in a folder SignMNIST inside {}'
        raise FileNotFoundError(
            message.format(
                'https://www.kaggle.com/datamunge/sign-language-mnist/home',
                data_home))

    splits = []
    for csv_name in ('sign_mnist_train.csv', 'sign_mnist_test.csv'):
        table = np.genfromtxt(str(dataset_dir / csv_name),
                              dtype=np.uint8,
                              skip_header=True,
                              delimiter=',')
        # Column 0 is the label; everything after it is image data.
        splits.append((table[:, 1:], table[:, 0]))

    train_split, test_split = splits
    return train_split, test_split