Example #1
0
def prepare_dataset(vocabulary_size, folder):
    """Load the dataset from a cached pickle, or download and build it.

    Args:
        vocabulary_size: number of most-frequent words to keep when
            building the vocabulary.
        folder: directory holding the downloaded archive and pickle cache.

    Returns:
        Tuple of (letters, data, count, dictionary, reverse_dictionary).
    """
    pickle_file = get_pickle_filename(folder)

    # Fast path: a previous run already cached everything on disk.
    if os.path.isfile(pickle_file):
        print('Pickle file already exists, assuming everything is in there.\n')
        return read_from_pickle(pickle_file)

    print('Downloading archive (if necessary)...')
    data_file = get_local_filenames(folder)
    utils.maybe_download(data_url, data_file, 31344016)

    print('Reading data from archive...')
    letters = read_data(data_file)
    words = letters.split()

    print('Converting letters to integers...')
    # NOTE(review): chr(c) suggests read_data() yields bytes (ints when
    # iterated) — confirm against read_data's return type.
    letters = [utils.char2id(chr(c)) for c in letters]

    print('Building dataset using a vocabulary of {0} words...'.format(
        vocabulary_size))
    data, count, dictionary, reverse_dictionary = build_dataset(
        words, vocabulary_size)
    del words  # free the raw word list before pickling

    print('Saving dataset...')
    save_to_pickle(letters, data, count, dictionary, reverse_dictionary,
                   pickle_file)

    return letters, data, count, dictionary, reverse_dictionary
Example #2
0
def prepare_dataset(train_size, valid_size, folder):
    """Load the dataset from a cached pickle, or download and build it.

    Args:
        train_size: number of training examples to keep after carving off
            the validation split.
        valid_size: number of examples taken from the front of the shuffled
            training data to form the validation split.
        folder: directory holding the downloaded archives and pickle cache.

    Returns:
        Tuple of (train_dataset, train_labels, valid_dataset, valid_labels,
        test_dataset, test_labels).
    """
    pickle_file = get_pickle_filename(folder)

    # Fast path: a previous run already cached everything on disk.
    if os.path.isfile(pickle_file):
        print('Pickle file already exists, assuming everything is in there.\n')
        return read_from_pickle(pickle_file)

    train_file, test_file = get_local_filenames(folder)

    print('Downloading / checking archives...')
    utils.maybe_download(train_data_url, train_file, train_data_file_size)
    utils.maybe_download(test_data_url, test_file, test_data_file_size)

    print('Extracting dataset...')
    train_folders = extract_file(train_file)
    test_folders = extract_file(test_file)

    print('Creating numpy dataset...')
    train_dataset, train_labels = load_from_folders(
        train_folders, min_train_images, max_train_images)
    test_dataset, test_labels = load_from_folders(
        test_folders, min_test_images, max_test_images)

    print('Randomising input...')
    # Shuffle before splitting so the validation set is a random sample.
    train_dataset, train_labels = utils.randomize(train_dataset, train_labels)
    test_dataset, test_labels = utils.randomize(test_dataset, test_labels)

    # Carve the validation split off the front, training split right after.
    split = valid_size + train_size
    valid_dataset = train_dataset[:valid_size, :, :]
    valid_labels = train_labels[:valid_size]
    train_dataset = train_dataset[valid_size:split, :, :]
    train_labels = train_labels[valid_size:split]

    print('Saving dataset to pickle file...\n')
    save_to_pickle(train_dataset, train_labels, valid_dataset, valid_labels,
                   test_dataset, test_labels, pickle_file)

    return (train_dataset, train_labels, valid_dataset, valid_labels,
            test_dataset, test_labels)
Example #3
0
def prepare_dataset(train_size, valid_size, folder):
    """Return the cached dataset, or download, preprocess and cache it.

    Args:
        train_size: number of training examples kept after the validation
            split has been removed.
        valid_size: number of examples reserved for validation, taken from
            the front of the shuffled training data.
        folder: directory used for the downloaded archives and the pickle.

    Returns:
        Tuple of (train_dataset, train_labels, valid_dataset, valid_labels,
        test_dataset, test_labels).
    """
    pickle_file = get_pickle_filename(folder)

    if not os.path.isfile(pickle_file):
        train_file, test_file = get_local_filenames(folder)

        print('Downloading / checking archives...')
        for url, path, size in ((train_data_url, train_file,
                                 train_data_file_size),
                                (test_data_url, test_file,
                                 test_data_file_size)):
            utils.maybe_download(url, path, size)

        print('Extracting dataset...')
        train_folders = extract_file(train_file)
        test_folders = extract_file(test_file)

        print('Creating numpy dataset...')
        train_dataset, train_labels = load_from_folders(
            train_folders, min_train_images, max_train_images)
        test_dataset, test_labels = load_from_folders(
            test_folders, min_test_images, max_test_images)

        print('Randomising input...')
        # Shuffle first so the validation slice is a random sample.
        train_dataset, train_labels = utils.randomize(
            train_dataset, train_labels)
        test_dataset, test_labels = utils.randomize(
            test_dataset, test_labels)

        # Front of the shuffled data becomes validation; next chunk training.
        stop = valid_size + train_size
        valid_dataset = train_dataset[:valid_size, :, :]
        valid_labels = train_labels[:valid_size]
        train_dataset = train_dataset[valid_size:stop, :, :]
        train_labels = train_labels[valid_size:stop]

        print('Saving dataset to pickle file...\n')
        save_to_pickle(train_dataset, train_labels, valid_dataset,
                       valid_labels, test_dataset, test_labels, pickle_file)

        return (train_dataset, train_labels, valid_dataset, valid_labels,
                test_dataset, test_labels)

    print('Pickle file already exists, assuming everything is in there.\n')
    return read_from_pickle(pickle_file)