def prepare_dataset(vocabulary_size, folder):
    """Downloads and preprocesses the dataset, or loads it from a pickle file."""
    pickle_file = get_pickle_filename(folder)
    if os.path.isfile(pickle_file):
        print('Pickle file already exists, assuming everything is in there.\n')
        return read_from_pickle(pickle_file)
    else:
        print('Downloading archive (if necessary)...')
        data_file = get_local_filenames(folder)
        utils.maybe_download(data_url, data_file, 31344016)
        print('Reading data from archive...')
        letters = read_data(data_file)
        words = letters.split()
        print('Converting letters to integers...')
        letters = [utils.char2id(chr(c)) for c in letters]
        print('Building dataset using a vocabulary of {0} words...'.format(
            vocabulary_size))
        data, count, dictionary, reverse_dictionary = build_dataset(
            words, vocabulary_size)
        del words
        print('Saving dataset...')
        save_to_pickle(letters, data, count, dictionary, reverse_dictionary,
                       pickle_file)
        return letters, data, count, dictionary, reverse_dictionary
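
# Illustrative usage sketch (not part of the original module): a minimal
# example of how the text-corpus prepare_dataset above might be called.
# The vocabulary size and folder are assumed values chosen for the example;
# the unpacked names follow the function's return order.
def example_text_dataset_usage():
    letters, data, count, dictionary, reverse_dictionary = prepare_dataset(
        vocabulary_size=50000, folder='data')
    print('Most common words (including the UNK bucket):', count[:5])
    print('First encoded words:', [reverse_dictionary[i] for i in data[:8]])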
def prepare_dataset(train_size, valid_size, folder):
    """Downloads and preprocesses the dataset, or loads it from a pickle file."""
    pickle_file = get_pickle_filename(folder)
    if os.path.isfile(pickle_file):
        print('Pickle file already exists, assuming everything is in there.\n')
        return read_from_pickle(pickle_file)
    else:
        train_file, test_file = get_local_filenames(folder)
        print('Downloading / checking archives...')
        utils.maybe_download(train_data_url, train_file, train_data_file_size)
        utils.maybe_download(test_data_url, test_file, test_data_file_size)
        print('Extracting dataset...')
        train_folders = extract_file(train_file)
        test_folders = extract_file(test_file)
        print('Creating numpy dataset...')
        train_dataset, train_labels = load_from_folders(
            train_folders, min_train_images, max_train_images)
        test_dataset, test_labels = load_from_folders(
            test_folders, min_test_images, max_test_images)
        print('Randomising input...')
        train_dataset, train_labels = utils.randomize(train_dataset, train_labels)
        test_dataset, test_labels = utils.randomize(test_dataset, test_labels)
        valid_dataset = train_dataset[:valid_size, :, :]
        valid_labels = train_labels[:valid_size]
        train_dataset = train_dataset[valid_size:valid_size + train_size, :, :]
        train_labels = train_labels[valid_size:valid_size + train_size]
        print('Saving dataset to pickle file...\n')
        save_to_pickle(train_dataset, train_labels, valid_dataset, valid_labels,
                       test_dataset, test_labels, pickle_file)
        return (train_dataset, train_labels, valid_dataset, valid_labels,
                test_dataset, test_labels)
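
# Illustrative sketch (assumed, not taken from this repository): one possible
# shape for the save_to_pickle / read_from_pickle helpers that the image
# prepare_dataset above relies on. The real helpers may use different keys
# or a different on-disk layout.
import pickle


def sketch_save_to_pickle(train_dataset, train_labels, valid_dataset,
                          valid_labels, test_dataset, test_labels, pickle_file):
    # Bundle every split into one dictionary and write a single pickle file.
    with open(pickle_file, 'wb') as f:
        pickle.dump({
            'train_dataset': train_dataset, 'train_labels': train_labels,
            'valid_dataset': valid_dataset, 'valid_labels': valid_labels,
            'test_dataset': test_dataset, 'test_labels': test_labels,
        }, f, pickle.HIGHEST_PROTOCOL)


def sketch_read_from_pickle(pickle_file):
    # Restore the splits in the same order prepare_dataset returns them.
    with open(pickle_file, 'rb') as f:
        save = pickle.load(f)
    return (save['train_dataset'], save['train_labels'],
            save['valid_dataset'], save['valid_labels'],
            save['test_dataset'], save['test_labels'])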