Example #1
# Assumed imports (the snippet uses these names unqualified):
from itertools import product
from numpy import array, ceil, floor, nan, set_printoptions, sqrt
# printflush: assumed project helper that prints to stdout and flushes immediately

def print_digit_labels(digit_labels):
    # lay the labels out row by row in a roughly square nb_rows x nb_cols grid
    nb_digits = len(digit_labels)
    nb_rows = int(floor(sqrt(nb_digits)))
    nb_cols = int(ceil(nb_digits / nb_rows))
    a = array([nb_cols * [''] for _ in range(nb_rows)], dtype=object)
    k = -1
    for i, j in product(range(nb_rows), range(nb_cols)):
        k += 1
        if k < nb_digits:
            a[i, j] = digit_labels[k]
    # print the full array without truncation
    # (note: newer NumPy versions reject threshold=nan; use sys.maxsize there)
    set_printoptions(threshold=nan)
    printflush(a)
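A minimal usage sketch (hypothetical call, assuming the function and imports above are in scope and that printflush behaves like print):

# ten labels -> a 3 x 4 object array, with the last two cells left empty
print_digit_labels(['zero', 'one', 'two', 'three', 'four',
                    'five', 'six', 'seven', 'eight', 'nine'])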
Example #2
# Assumed imports (the snippet uses these names unqualified):
from os.path import join
from pandas import concat, read_csv
# printflush: assumed project helper that prints to stdout and flushes immediately

def parse_movielens_20m_data(
        data_path='https://raw.githubusercontent.com/ChicagoBoothML/DATA___MovieLens___20M/master'):

    printflush('Parsing Movies...', end=' ')
    movies = read_csv(join(data_path, 'movies.csv'))
    printflush('done!')

    NB_RATINGS_FILES = 10
    printflush('Parsing %i Ratings Files...' % NB_RATINGS_FILES, end=' ')
    # the ratings are split across ratings01.csv ... ratings10.csv; parse and concatenate them
    ratings = []
    for i in range(NB_RATINGS_FILES):
        ratings_file_name = 'ratings%02d.csv' % (i + 1)
        ratings.append(read_csv(join(data_path, ratings_file_name)))
        printflush(ratings_file_name, end=', ')
    printflush('done!')
    ratings = concat(ratings, ignore_index=True)

    return dict(movies=movies, ratings=ratings)
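A usage sketch, assuming the function above is in scope; with the default data_path it downloads the CSV files from the GitHub repository:

data = parse_movielens_20m_data()           # fetches movies.csv and ratings01.csv ... ratings10.csv
movies, ratings = data['movies'], data['ratings']
print(movies.head())                        # typically movieId, title, genres columns
print(ratings.shape)                        # one row per (user, movie) rating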
Example #3
# Assumed imports (the snippet uses these names unqualified):
from array import array as py_array
from os import chmod, remove, rmdir, walk
from os.path import join
from stat import S_IWUSR
from struct import unpack
from tempfile import mkdtemp
from git import Repo              # GitPython
from numpy import arange, array, int8, uint8, zeros
from requests import head
# printflush: assumed project helper that prints to stdout and flushes immediately

def parse_MNIST_digits(
    data_path='https://github.com/ChicagoBoothML/DATA___LeCun___MNISTDigits',
    digits=arange(10)):

    # if data_path is a reachable URL, clone the repository to a temporary folder;
    # a local path (or an unreachable URL) is read in place instead
    try:
        status_code = head(data_path).status_code
        valid_url = status_code < 400
    except Exception:   # e.g. requests raises MissingSchema for a local path
        valid_url = False

    if valid_url:
        printflush('Cloning Data Repository ',
                   data_path,
                   ' to Temporary Folder... ',
                   sep='',
                   end='')
        temp_dir = mkdtemp()
        Repo.clone_from(data_path, temp_dir)
        data_path = temp_dir
        to_delete = True
        printflush('done!')
    else:
        to_delete = False

    printflush('Parsing Data Set "MNIST Hand-Written Digits"... ', end='')

    # IDX image file: big-endian header of four uint32s
    # (magic number, image count, row count, column count), followed by raw pixel bytes
    f = open(join(data_path, 'train-images.idx3-ubyte'), 'rb')
    magic_number, nb_digits, nb_rows, nb_cols = unpack(">IIII", f.read(16))
    nb_pixels = nb_rows * nb_cols
    images = array(py_array("B", f.read()))
    f.close()

    # IDX label file: big-endian header of two uint32s (magic number, label count),
    # followed by one byte per label
    f = open(join(data_path, 'train-labels.idx1-ubyte'), 'rb')
    magic_number, nb_digits = unpack(">II", f.read(8))
    labels = array(py_array("b", f.read()))
    f.close()

    # keep only the requested digits
    indices = [i for i in range(nb_digits) if labels[i] in digits]
    nb_digits = len(indices)

    train_images = zeros((nb_digits, nb_rows, nb_cols), dtype=uint8)
    train_labels = zeros((nb_digits, ), dtype=int8)
    for i in range(nb_digits):
        index = indices[i]
        train_images[i] = array(
            images[(index * nb_pixels):((index + 1) * nb_pixels)]).reshape(
                (nb_rows, nb_cols))
        train_labels[i] = labels[index]

    f = open(join(data_path, 't10k-images.idx3-ubyte'), 'rb')
    magic_number, nb_digits, nb_rows, nb_cols = unpack(">IIII", f.read(16))
    nb_pixels = nb_rows * nb_cols
    images = array(py_array("B", f.read()))
    f.close()

    f = open(join(data_path, 't10k-labels.idx1-ubyte'), 'rb')
    magic_number, nb_digits = unpack(">II", f.read(8))
    labels = array(py_array("b", f.read()))
    f.close()

    indices = [i for i in range(nb_digits) if labels[i] in digits]
    nb_digits = len(indices)

    test_images = zeros((nb_digits, nb_rows, nb_cols), dtype=uint8)
    test_labels = zeros((nb_digits, ), dtype=int8)
    for i in range(nb_digits):
        index = indices[i]
        test_images[i] = array(
            images[(index * nb_pixels):((index + 1) * nb_pixels)]).reshape(
                (nb_rows, nb_cols))
        test_labels[i] = labels[index]

    printflush('done!')

    if to_delete:
        printflush('Deleting Temporary Data Folder... ', end='')
        for root, dirs, files in walk(data_path, topdown=False):
            for file in files:
                file_path = join(root, file)
                chmod(file_path, S_IWUSR)   # make the file writable (Git's object files are read-only) before removing it
                remove(file_path)
            for dir in dirs:
                rmdir(join(root, dir))
        rmdir(data_path)
        printflush('done!')

    return dict(TrainImages=train_images,
                TrainLabels=train_labels,
                TestImages=test_images,
                TestLabels=test_labels)
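A usage sketch, assuming the function above is in scope; passing a local clone of the data repository (a hypothetical path here) skips the git-clone step, because requests.head raises on a non-URL path and the files are then read in place:

mnist = parse_MNIST_digits(data_path='path/to/DATA___LeCun___MNISTDigits',
                           digits=[0, 1])
print(mnist['TrainImages'].shape)   # (n_selected, 28, 28) uint8 images
print(mnist['TrainLabels'][:10])    # int8 labels drawn from the requested digits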
Example #4
# Assumed imports (the snippet uses these names unqualified):
from collections import Counter
from os.path import join
from numpy import isnan
from pandas import DataFrame, read_csv
# printflush: assumed project helper that prints to stdout and flushes immediately

def parse_human_activity_recog_data(
        data_path='https://raw.githubusercontent.com/ChicagoBoothML/DATA___UCI___HumanActivityRecognitionUsingSmartphones/master',
        X_names_file_name='features.txt',
        train_subfolder_name='train', X_train_file_name='X_train.txt', y_train_file_name='y_train.txt',
        test_subfolder_name='test', X_test_file_name='X_test.txt', y_test_file_name='y_test.txt'):
    """
    :param data_path: path of folder containing "Human Activity Recognition Using Smartphones" data set
    :param X_names_file_name: name of file containing X features' names
    :param train_subfolder_name: name of folder containing Training data
    :param X_train_file_name: name of the file containing the training X matrix
    :param y_train_file_name: name of the file containing the integer training labels (y)
    :param test_subfolder_name: name of folder containing Test data
    :param X_test_file_name: name of the file containing the test X matrix
    :param y_test_file_name: name of the file containing the integer test labels (y)
    :return: dict with the following keys:
        X_train: data frame of input features for model training
        y_train: categorical series of class labels for model training
        X_test: data frame of input features for model testing
        y_test: categorical series of class labels for model testing
    """

    printflush('Parsing Data Set "UCI Human Activity Recognition Using Smartphones"...')

    printflush("   Parsing Unique Input Features' (X's) Names... ", end='')
    X_names_with_duplicates = read_csv(
        join(data_path, X_names_file_name),
        delim_whitespace=True, header=None, index_col=0).iloc[:, 0]
    X_name_counts = Counter(X_names_with_duplicates)
    X_unique_names = list(X_name_counts)
    X_unique_names.sort()
    printflush('done!')

    printflush('   Parsing Train & Test Input Feature Data Sets... ', end='')
    X_train_with_duplicates = read_csv(
        join(data_path, train_subfolder_name, X_train_file_name),
        delim_whitespace=True, header=None, index_col=False,
        names=X_names_with_duplicates,
        dtype=float, error_bad_lines=False, warn_bad_lines=True)   # note: these flags were removed in pandas 2.x (use on_bad_lines instead)
    X_test_with_duplicates = read_csv(
        join(data_path, test_subfolder_name, X_test_file_name),
        delim_whitespace=True, header=None, index_col=False,
        names=X_names_with_duplicates,
        dtype=float, error_bad_lines=False, warn_bad_lines=True)
    X_train = DataFrame(index=X_train_with_duplicates.index)
    X_test = DataFrame(index=X_test_with_duplicates.index)
    # for feature names that occur more than once, keep only the first matching column
    for x_name in X_unique_names:
        if X_name_counts[x_name] == 1:
            X_train[x_name] = X_train_with_duplicates[x_name]
            X_test[x_name] = X_test_with_duplicates[x_name]
        else:
            X_train[x_name] = X_train_with_duplicates[x_name].iloc[:, 0]
            X_test[x_name] = X_test_with_duplicates[x_name].iloc[:, 0]
    printflush('done!')

    printflush('   Removing Input Feature Data Rows with Missing (NaN) Values... ', end='')
    
    # a row sums to NaN iff it contains at least one missing value; drop such rows
    for i in X_train.index:
        if isnan(X_train.loc[i].sum()):
            X_train.drop(i, inplace=True)
    for i in X_test.index:
        if isnan(X_test.loc[i].sum()):
            X_test.drop(i, inplace=True)
    printflush('done!')

    printflush('   Parsing Train & Test Labels (y)... ', end='')

    # raw labels are the integer activity codes 1-6; map them to readable category names
    y_class_labels = 'Walking', 'WalkingUpstairs', 'WalkingDownstairs', 'Sitting', 'Standing', 'Laying'

    y_train = read_csv(
        join(data_path, train_subfolder_name, y_train_file_name),
        sep=' ', header=None, dtype=int).iloc[:, 0].astype('category')
    # note: the inplace option of rename_categories was removed in pandas 2.x; assign the result instead there
    y_train.cat.rename_categories(y_class_labels, inplace=True)
    y_train = y_train.loc[X_train.index]

    y_test = read_csv(
        join(data_path, test_subfolder_name, y_test_file_name),
        sep=' ', header=None, dtype=int).iloc[:, 0].astype('category')
    y_test.cat.rename_categories(y_class_labels, inplace=True)
    y_test = y_test.loc[X_test.index]
    printflush('done!')

    return dict(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
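A usage sketch, assuming the function above is in scope; with the default data_path it downloads the UCI HAR files from the GitHub repository:

har = parse_human_activity_recog_data()
print(har['X_train'].shape)               # rows x unique input features
print(har['y_train'].value_counts())      # observations per activity category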
Example #6
# Assumed imports (the snippet uses these names unqualified):
from os.path import join
from pandas import read_csv
# printflush: assumed project helper that prints to stdout and flushes immediately

def parse_book_crossing_data(
        data_path='https://raw.githubusercontent.com/ChicagoBoothML/DATA___BookCrossing/master'):

    # Common NAs:
    na_strings = [
        '',
        'na', 'n.a', 'n.a.',
        'nan', 'n.a.n', 'n.a.n.',
        'NA', 'N.A', 'N.A.',
        'NaN', 'N.a.N', 'N.a.N.',
        'NAN', 'N.A.N', 'N.A.N.',
        'nil', 'Nil', 'NIL',
        'null', 'Null', 'NULL']

    printflush('Parsing Books...', end=' ')
    books = read_csv(
        join(data_path, 'BX-Books.csv'),
        sep=';',
        dtype=str,
        na_values=na_strings,
        usecols=['ISBN', 'Book-Title', 'Book-Author'],
        error_bad_lines=False)   # note: removed in pandas 2.x; use on_bad_lines='skip' there
    printflush('done!')

    printflush('Parsing Users...', end=' ')
    users = read_csv(
        join(data_path, 'BX-Users.csv'),
        sep=';',
        dtype=str,
        na_values=na_strings,
        error_bad_lines=False)
    users['User-ID'] = users['User-ID'].astype(int)
    users['Age'] = users['Age'].astype(float)
    printflush('done!')

    printflush('Parsing Ratings...', end=' ')
    ratings = read_csv(
        join(data_path, 'BX-Book-Ratings.csv'),
        sep=';',
        dtype=str,
        na_values=na_strings,
        error_bad_lines=False)
    ratings['User-ID'] = ratings['User-ID'].astype(int)
    ratings['Book-Rating'] = ratings['Book-Rating'].astype(float)
    printflush('done!')

    return dict(books=books, users=users, ratings=ratings)
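A usage sketch, assuming the function above is in scope; with the default data_path it downloads the three Book-Crossing CSV files from the GitHub repository:

bx = parse_book_crossing_data()
print(bx['books'].head())                       # ISBN, Book-Title, Book-Author
print(bx['ratings']['Book-Rating'].describe())  # ratings on a 0-10 scale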