def print_digit_labels(digit_labels):
    """Print a sequence of digit labels arranged in a near-square grid.

    :param digit_labels: sequence of labels to display
    :return: None; prints the grid via printflush
    """
    from sys import maxsize   # local import: keeps this fix self-contained

    nb_digits = len(digit_labels)
    if not nb_digits:
        # edge case: empty input used to crash with ZeroDivisionError below
        printflush(array([], dtype=object))
        return
    # near-square layout: rows = floor(sqrt(n)), cols = enough to fit all
    nb_rows = int(floor(sqrt(nb_digits)))
    nb_cols = int(ceil(nb_digits / nb_rows))
    grid = array([nb_cols * [''] for _ in range(nb_rows)], dtype=object)
    # fill row-major; trailing cells beyond nb_digits stay as empty strings
    for k, (i, j) in enumerate(product(range(nb_rows), range(nb_cols))):
        if k < nb_digits:
            grid[i, j] = digit_labels[k]
    # BUG FIX: NumPy >= 1.14 rejects threshold=nan with ValueError; a large
    # int achieves the same "never truncate the printed array" intent
    set_printoptions(threshold=maxsize)
    printflush(grid)
def parse_movielens_20m_data(
        data_path='https://raw.githubusercontent.com/ChicagoBoothML/DATA___MovieLens___20M/master'):
    """Parse the MovieLens 20M data set.

    :param data_path: folder/URL containing movies.csv and ratings01.csv ... ratings10.csv
    :return: dict with 'movies' data frame and a single concatenated 'ratings' data frame
    """
    printflush('Parsing Movies...', end=' ')
    movies = read_csv(join(data_path, 'movies.csv'))
    printflush('done!')

    # the ratings are split across 10 numbered CSV files
    NB_RATINGS_FILES = 10
    printflush('Parsing %i Ratings Files...' % NB_RATINGS_FILES, end=' ')
    rating_frames = []
    for file_nb in range(1, NB_RATINGS_FILES + 1):
        ratings_file_name = 'ratings%02d.csv' % file_nb
        rating_frames.append(read_csv(join(data_path, ratings_file_name)))
        printflush(ratings_file_name, end=', ')
    printflush('done!')

    return dict(
        movies=movies,
        ratings=concat(rating_frames, ignore_index=True))
def parse_MNIST_digits(
        data_path='https://github.com/ChicagoBoothML/DATA___LeCun___MNISTDigits',
        digits=arange(10)):
    """Parse the MNIST hand-written-digits data set.

    :param data_path: URL of a Git repository containing the 4 IDX files,
        or a local folder path holding them
    :param digits: digits to keep (default: all of 0-9)
    :return: dict(TrainImages, TrainLabels, TestImages, TestLabels)
    """
    # if data_path is a reachable URL, clone the repo to a temp folder first
    try:
        valid_url = head(data_path).status_code < 400
    except Exception:   # narrow from bare except:; URL/network failure => local path
        valid_url = False
    if valid_url:
        printflush('Cloning Data Repository ', data_path, ' to Temporary Folder... ', sep='', end='')
        temp_dir = mkdtemp()
        Repo.clone_from(data_path, temp_dir)
        data_path = temp_dir
        to_delete = True
        printflush('done!')
    else:
        to_delete = False

    printflush('Parsing Data Set "MNIST Hand-Written Digits"... ', end='')
    # the train & test halves use identical IDX layouts; parse via one helper
    train_images, train_labels = _parse_mnist_idx_pair(
        join(data_path, 'train-images.idx3-ubyte'),
        join(data_path, 'train-labels.idx1-ubyte'),
        digits)
    test_images, test_labels = _parse_mnist_idx_pair(
        join(data_path, 't10k-images.idx3-ubyte'),
        join(data_path, 't10k-labels.idx1-ubyte'),
        digits)
    printflush('done!')

    if to_delete:
        printflush('Deleting Temporary Data Folder... ', end='')
        _remove_tree(data_path)
        printflush('done!')

    return dict(
        TrainImages=train_images, TrainLabels=train_labels,
        TestImages=test_images, TestLabels=test_labels)


def _parse_mnist_idx_pair(images_file_path, labels_file_path, digits):
    """Read one IDX images/labels file pair, keeping only the given digits."""
    # "with" ensures the files are closed even if parsing raises
    with open(images_file_path, 'rb') as f:
        # big-endian header: magic number, #images, #rows, #cols
        _, nb_digits, nb_rows, nb_cols = unpack(">IIII", f.read(16))
        images = array(py_array("B", f.read()))
    nb_pixels = nb_rows * nb_cols
    with open(labels_file_path, 'rb') as f:
        _, nb_digits = unpack(">II", f.read(8))
        labels = array(py_array("b", f.read()))
    indices = [i for i in range(nb_digits) if labels[i] in digits]
    nb_kept = len(indices)
    kept_images = zeros((nb_kept, nb_rows, nb_cols), dtype=uint8)
    kept_labels = zeros((nb_kept,), dtype=int8)
    for i, index in enumerate(indices):
        kept_images[i] = \
            images[(index * nb_pixels):((index + 1) * nb_pixels)].reshape((nb_rows, nb_cols))
        kept_labels[i] = labels[index]
    return kept_images, kept_labels


def _remove_tree(folder_path):
    """Force-delete a folder tree, clearing read-only bits first.

    Git checkouts can contain read-only files (e.g. on Windows); chmod to
    user-writable before removal so remove() does not fail.
    """
    for root, dir_names, file_names in walk(folder_path, topdown=False):
        for file_name in file_names:
            file_path = join(root, file_name)
            chmod(file_path, S_IWUSR)
            remove(file_path)
        for dir_name in dir_names:
            rmdir(join(root, dir_name))
    rmdir(folder_path)
def parse_human_activity_recog_data(
        data_path='https://raw.githubusercontent.com/ChicagoBoothML/DATA___UCI___HumanActivityRecognitionUsingSmartphones/master',
        X_names_file_name='features.txt',
        train_subfolder_name='train',
        X_train_file_name='X_train.txt',
        y_train_file_name='y_train.txt',
        test_subfolder_name='test',
        X_test_file_name='X_test.txt',
        y_test_file_name='y_test.txt'):
    """
    :param data_path: path of folder containing "Human Activity Recognition Using Smartphones" data set
    :param X_names_file_name: name of file containing X features' names
    :param train_subfolder_name: name of folder containing Training data
    :param X_train_file_name: train_X matrix file
    :param y_train_file_name: train_y labels in integer
    :param test_subfolder_name: name of folder containing Test data
    :param X_test_file_name: test_X matrix file
    :param y_test_file_name: test_y labels in integer
    :return: dict with the following:
        X_train: data frame containing the X features for model training
        y_train: categorical series of class labels for model training
        X_test: data frame containing X features for model testing
        y_test: categorical series of class labels for model testing
    """
    printflush('Parsing Data Set "UCI Human Activity Recognition Using Smartphones"...')

    printflush(" Parsing Unique Input Features' (X's) Names... ", end='')
    # column 0 is a running index; column 1 holds feature names (with duplicates)
    X_names_with_duplicates = read_csv(
        join(data_path, X_names_file_name),
        sep=r'\s+',        # equivalent to the deprecated delim_whitespace=True
        header=None,
        index_col=0).iloc[:, 0]
    X_name_counts = Counter(X_names_with_duplicates)
    X_unique_names = sorted(X_name_counts)
    printflush('done!')

    printflush(' Parsing Train & Test Input Feature Data Sets... ', end='')
    X_train_with_duplicates = read_csv(
        join(data_path, train_subfolder_name, X_train_file_name),
        sep=r'\s+',
        header=None,
        index_col=False,
        names=X_names_with_duplicates,
        dtype=float,
        # replaces error_bad_lines=False / warn_bad_lines=True,
        # both removed in pandas 2.0 (requires pandas >= 1.3)
        on_bad_lines='warn')
    X_test_with_duplicates = read_csv(
        join(data_path, test_subfolder_name, X_test_file_name),
        sep=r'\s+',
        header=None,
        index_col=False,
        names=X_names_with_duplicates,
        dtype=float,
        on_bad_lines='warn')
    X_train = DataFrame(index=X_train_with_duplicates.index)
    X_test = DataFrame(index=X_test_with_duplicates.index)
    for x_name in X_unique_names:
        if X_name_counts[x_name] == 1:
            X_train[x_name] = X_train_with_duplicates[x_name]
            X_test[x_name] = X_test_with_duplicates[x_name]
        else:
            # duplicated feature name: keep the first of the duplicate columns
            X_train[x_name] = X_train_with_duplicates[x_name].iloc[:, 0]
            X_test[x_name] = X_test_with_duplicates[x_name].iloc[:, 0]
    printflush('done!')

    printflush(' Removing Input Feature Data Rows with Missing (NaN) Values... ', end='')
    # BUG FIX: the old per-row isnan(row.sum()) check misses rows with NaN
    # because Series.sum() skips NaN by default; dropna() implements the
    # stated intent directly (and without an O(rows) drop-in-loop)
    X_train.dropna(inplace=True)
    X_test.dropna(inplace=True)
    printflush('done!')

    printflush(' Parsing Train & Test Labels (y)... ', end='')
    y_class_labels = 'Walking', 'WalkingUpstairs', 'WalkingDownstairs', 'Sitting', 'Standing', 'Laying'
    y_train = read_csv(
        join(data_path, train_subfolder_name, y_train_file_name),
        sep=' ',
        header=None,
        dtype=int).iloc[:, 0].astype('category')
    # assignment form: .cat.rename_categories(..., inplace=True) was removed in pandas 2.0
    y_train = y_train.cat.rename_categories(y_class_labels)
    # keep labels only for the rows that survived the NaN filtering above
    y_train = y_train.loc[X_train.index]
    y_test = read_csv(
        join(data_path, test_subfolder_name, y_test_file_name),
        sep=' ',
        header=None,
        dtype=int).iloc[:, 0].astype('category')
    y_test = y_test.cat.rename_categories(y_class_labels)
    y_test = y_test.loc[X_test.index]
    printflush('done!')

    return dict(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
# NOTE(review): this function is defined TWICE in this module (an earlier,
# near-identical copy exists above); this later definition shadows the earlier
# one at import time — consider deleting one of the two copies.
def parse_MNIST_digits(
        data_path='https://github.com/ChicagoBoothML/DATA___LeCun___MNISTDigits',
        digits=arange(10)):
    """Parse the MNIST hand-written-digits data set.

    :param data_path: URL of a Git repository containing the 4 IDX files,
        or a local folder path holding them
    :param digits: digits to keep (default: all of 0-9)
    :return: dict(TrainImages, TrainLabels, TestImages, TestLabels)
    """
    # if data_path is a reachable URL, clone the repo to a temp folder first
    try:
        valid_url = head(data_path).status_code < 400
    except Exception:   # narrow from bare except:; URL/network failure => local path
        valid_url = False
    if valid_url:
        printflush('Cloning Data Repository ', data_path, ' to Temporary Folder... ', sep='', end='')
        temp_dir = mkdtemp()
        Repo.clone_from(data_path, temp_dir)
        data_path = temp_dir
        to_delete = True
        printflush('done!')
    else:
        to_delete = False

    printflush('Parsing Data Set "MNIST Hand-Written Digits"... ', end='')
    # the train & test halves use identical IDX layouts; parse via one helper
    train_images, train_labels = _parse_mnist_idx_pair(
        join(data_path, 'train-images.idx3-ubyte'),
        join(data_path, 'train-labels.idx1-ubyte'),
        digits)
    test_images, test_labels = _parse_mnist_idx_pair(
        join(data_path, 't10k-images.idx3-ubyte'),
        join(data_path, 't10k-labels.idx1-ubyte'),
        digits)
    printflush('done!')

    if to_delete:
        printflush('Deleting Temporary Data Folder... ', end='')
        _remove_tree(data_path)
        printflush('done!')

    return dict(
        TrainImages=train_images, TrainLabels=train_labels,
        TestImages=test_images, TestLabels=test_labels)


def _parse_mnist_idx_pair(images_file_path, labels_file_path, digits):
    """Read one IDX images/labels file pair, keeping only the given digits."""
    # "with" ensures the files are closed even if parsing raises
    with open(images_file_path, 'rb') as f:
        # big-endian header: magic number, #images, #rows, #cols
        _, nb_digits, nb_rows, nb_cols = unpack(">IIII", f.read(16))
        images = array(py_array("B", f.read()))
    nb_pixels = nb_rows * nb_cols
    with open(labels_file_path, 'rb') as f:
        _, nb_digits = unpack(">II", f.read(8))
        labels = array(py_array("b", f.read()))
    indices = [i for i in range(nb_digits) if labels[i] in digits]
    nb_kept = len(indices)
    kept_images = zeros((nb_kept, nb_rows, nb_cols), dtype=uint8)
    kept_labels = zeros((nb_kept,), dtype=int8)
    for i, index in enumerate(indices):
        kept_images[i] = \
            images[(index * nb_pixels):((index + 1) * nb_pixels)].reshape((nb_rows, nb_cols))
        kept_labels[i] = labels[index]
    return kept_images, kept_labels


def _remove_tree(folder_path):
    """Force-delete a folder tree, clearing read-only bits first.

    Git checkouts can contain read-only files (e.g. on Windows); chmod to
    user-writable before removal so remove() does not fail.
    """
    for root, dir_names, file_names in walk(folder_path, topdown=False):
        for file_name in file_names:
            file_path = join(root, file_name)
            chmod(file_path, S_IWUSR)
            remove(file_path)
        for dir_name in dir_names:
            rmdir(join(root, dir_name))
    rmdir(folder_path)
def parse_book_crossing_data(
        data_path='https://raw.githubusercontent.com/ChicagoBoothML/DATA___BookCrossing/master'):
    """Parse the Book-Crossing data set.

    :param data_path: folder/URL containing the three semicolon-separated BX-*.csv files
    :return: dict with 'books', 'users' & 'ratings' data frames
    """
    # strings commonly used to denote missing values
    na_strings = [
        '', 'na', 'n.a', 'n.a.', 'nan', 'n.a.n', 'n.a.n.',
        'NA', 'N.A', 'N.A.', 'NaN', 'N.a.N', 'N.a.N.',
        'NAN', 'N.A.N', 'N.A.N.',
        'nil', 'Nil', 'NIL', 'null', 'Null', 'NULL']
    # identical parsing options for all three files
    shared_read_csv_kwargs = dict(
        sep=';',
        dtype=str,
        na_values=na_strings,
        error_bad_lines=False)

    printflush('Parsing Books...', end=' ')
    books = read_csv(
        join(data_path, 'BX-Books.csv'),
        usecols=['ISBN', 'Book-Title', 'Book-Author'],
        **shared_read_csv_kwargs)
    printflush('done!')

    printflush('Parsing Users...', end=' ')
    users = read_csv(join(data_path, 'BX-Users.csv'), **shared_read_csv_kwargs)
    users['User-ID'] = users['User-ID'].astype(int)
    users['Age'] = users['Age'].astype(float)
    printflush('done!')

    printflush('Parsing Ratings...', end=' ')
    ratings = read_csv(join(data_path, 'BX-Book-Ratings.csv'), **shared_read_csv_kwargs)
    ratings['User-ID'] = ratings['User-ID'].astype(int)
    ratings['Book-Rating'] = ratings['Book-Rating'].astype(float)
    printflush('done!')

    return dict(books=books, users=users, ratings=ratings)