def test_multiple_split_out_of_memory_list_request(self):
    dataset = H5PYDataset(self.h5file, which_sets=('train', 'test'),
                          load_in_memory=False)
    handle = dataset.open()
    assert_equal(dataset.get_data(handle, list(range(30))),
                 (self.features[:30], self.targets[:30]))
    dataset.close(handle)
def load_ucf101_stream(batch_size, train_size=16500, validation_size=500,
                       test_size=100, shuffle=False):
    fuel_root = fuel.config.data_path[0]
    # Location where the dataset file is stored
    hdf5_filepath = os.path.join(fuel_root, 'UCF101', 'hdf5_dataset',
                                 'hdf5_dataset.hdf5')
    valid_end = train_size + validation_size
    test_end = valid_end + test_size
    indices_train = range(0, train_size)
    indices_valid = range(train_size, valid_end)
    indices_test = range(valid_end, test_end)
    h5py_file = h5py.File(hdf5_filepath, 'r')
    dataset = H5PYDataset(h5py_file, which_sets=('train',))
    scheme_class = ShuffledScheme if shuffle else SequentialScheme
    scheme_train = scheme_class(indices_train, batch_size=batch_size)
    scheme_valid = scheme_class(indices_valid, batch_size=batch_size)
    scheme_test = scheme_class(indices_test, batch_size=batch_size)
    stream_train = DataStream(dataset, iteration_scheme=scheme_train)
    stream_valid = DataStream(dataset, iteration_scheme=scheme_valid)
    stream_test = DataStream(dataset, iteration_scheme=scheme_test)
    # Warm up each stream by pulling the first batch.
    next(stream_train.get_epoch_iterator())
    next(stream_valid.get_epoch_iterator())
    next(stream_test.get_epoch_iterator())
    return stream_train, stream_valid, stream_test
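# A minimal usage sketch for load_ucf101_stream above. It assumes Fuel's data
# path contains UCF101/hdf5_dataset/hdf5_dataset.hdf5 with a 'train' split at
# least as large as the requested index ranges; the sizes here are just the
# loader's defaults, not values verified against any real file.
train_stream, valid_stream, test_stream = load_ucf101_stream(
    batch_size=32, train_size=16500, validation_size=500, test_size=100,
    shuffle=True)
first_batch = next(train_stream.get_epoch_iterator())
# first_batch is a tuple ordered like the dataset's sources.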
def test_out_of_memory_example_scheme(self):
    dataset = H5PYDataset(
        self.h5file, which_sets=('train',), load_in_memory=False)
    iter_ = dataset.get_example_stream().get_epoch_iterator()
    assert_equal(next(iter_), (self.features[0], self.targets[0]))
    assert_equal(next(iter_), (self.features[1], self.targets[1]))
def load_stream(batch_size=64, source=None):
    if source is None:
        raise ValueError('Source not set.')
    train_data = H5PYDataset(source, which_sets=('train',))
    train_scheme = ShuffledScheme(examples=train_data.num_examples,
                                  batch_size=batch_size)
    train_stream = OneHotEncoding(
        DataStream(train_data, iteration_scheme=train_scheme), N_WORDS)
    return train_stream, train_data.num_examples
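# A hypothetical call to the loader above. It assumes the surrounding module
# defines OneHotEncoding and N_WORDS, and that 'words.hdf5' (a placeholder
# path) is a Fuel-formatted file with a 'train' split.
stream, num_examples = load_stream(batch_size=128, source='words.hdf5')
print('training examples:', num_examples)
first_batch = next(stream.get_epoch_iterator())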
def generate():
    generator = get_model(get_config("model-file"))
    data_path = get_config("data-file")
    shape = get_config("img-shape")
    feature_dim = get_config("feature-dim") or 40
    attribute = read_feature(get_config("feature-file"))
    r, c = 5, 5
    train_set = H5PYDataset(data_path, which_sets=('train',))
    handle = train_set.open()
    _, real_features = train_set.get_data(handle, slice(0, 50000))
    with open("feature.txt", "r") as fin:
        feature = fin.readline().strip().split(",")
    feature = choose_feature(real_features, feature)
    feature = np.array(np.repeat([feature], r * c, axis=0), dtype='float64')
    # key = feature_map["Young"]
    # for i in range(10):
    #     feature[i][key-1] = 0.1 * i
    show_feature(feature[0])
    noise = get_noise(r * c)
    imgs = generator.predict([noise, feature])
    imgs = [convert_to_img(img) for img in imgs]
    figure = fill_figure(r, c, shape, imgs)
    save_img("result", figure)
def test_index_split_in_memory(self):
    features = numpy.arange(50, dtype='uint8').reshape((10, 5))
    h5file = h5py.File('index_split.hdf5', mode='w', driver='core',
                       backing_store=False)
    h5file['features'] = features
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['train_features_subset'] = numpy.arange(0, 10, 2)
    h5file['test_features_subset'] = numpy.arange(1, 10, 2)
    train_ref = h5file['train_features_subset'].ref
    test_ref = h5file['test_features_subset'].ref
    split_dict = {'train': {'features': (-1, -1, train_ref, '.')},
                  'test': {'features': (-1, -1, test_ref, '')}}
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    dataset = H5PYDataset(h5file, which_sets=('train',), load_in_memory=True)
    handle = dataset.open()
    request = slice(0, 5)
    assert_equal(dataset.get_data(handle, request)[0], features[0:10:2])
    assert_equal(dataset.num_examples, 5)
    dataset.close(handle)
def load_imgs_raw(ntrain=None, ntest=None, data_dir=None):
    t = time()
    print("LOADING DATASET...")
    path = os.path.join(data_dir)
    tr_data = H5PYDataset(path, which_sets=('train',))
    te_data = H5PYDataset(path, which_sets=('test',))
    if ntrain is None:
        ntrain = tr_data.num_examples
    if ntest is None:
        ntest = te_data.num_examples
    print('name = %s, ntrain = %d, ntest = %d\n' % (data_dir, ntrain, ntest))
    print('%.2f seconds to load data' % (time() - t))
    return tr_data, te_data, ntrain, ntest
def test_h5py_dataset_out_of_memory():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        targets = h5file.create_dataset('targets', (10, 1), dtype='float32')
        targets[...] = numpy.arange(10, dtype='float32').reshape((10, 1))
        split_dict = {'train': {'features': (0, 5), 'targets': (0, 5)},
                      'test': {'features': (5, 10), 'targets': (5, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(
            path='tmp.hdf5', which_set='test', load_in_memory=False)
        handle = dataset.open()
        assert_equal(
            dataset.get_data(state=handle, request=slice(3, 5))[1],
            numpy.arange(10).reshape((10, 1))[8:10])
    finally:
        dataset.close(handle)
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
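# The test above writes a small split file and reads it back. The same
# pattern as a standalone, runnable sketch with its imports, using the
# which_sets tuple argument found in newer Fuel releases; 'tiny.hdf5' is a
# placeholder filename, not part of the original tests.
import numpy
import h5py
from fuel.datasets.hdf5 import H5PYDataset

with h5py.File('tiny.hdf5', mode='w') as h5file:
    h5file['features'] = numpy.arange(50, dtype='float32').reshape((10, 5))
    h5file['targets'] = numpy.arange(10, dtype='float32').reshape((10, 1))
    split_dict = {'train': {'features': (0, 5), 'targets': (0, 5)},
                  'test': {'features': (5, 10), 'targets': (5, 10)}}
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

# Read the 'test' split back without loading it into memory.
dataset = H5PYDataset('tiny.hdf5', which_sets=('test',), load_in_memory=False)
handle = dataset.open()
features, targets = dataset.get_data(handle, slice(0, 2))
dataset.close(handle)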
def test_out_of_memory(self):
    dataset = H5PYDataset(
        self.h5file, which_set='test', load_in_memory=False)
    handle = dataset.open()
    assert_equal(dataset.get_data(handle, slice(3, 5)),
                 (self.features[23:25], self.targets[23:25]))
    dataset.close(handle)
def test_out_of_memory_unsorted_indices(self):
    dataset = H5PYDataset(
        self.h5file, which_set='train', load_in_memory=False,
        sort_indices=False)
    handle = dataset.open()
    assert_raises(TypeError, dataset.get_data, handle, [7, 4, 6, 2, 5])
    dataset.close(handle)
def get_dataset_iterator(dataset, split, include_features=True,
                         include_targets=False, unit_scale=True):
    """Get an iterator over a dataset split, optionally including targets
    (labels) and scaling uint8 pixels from [0, 255] to [0, 1]."""
    sources = []
    if include_features:
        sources.append('features')
    if include_targets:
        sources.append('targets')
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split,)
    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    h5_dataset = H5PYDataset(dataset_fname, which_sets=splits,
                             sources=sources)
    if unit_scale:
        h5_dataset.default_transformers = uint8_pixels_to_floatX(
            ('features',))
    stream = DataStream.default_stream(
        dataset=h5_dataset,
        iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples))
    return stream.get_epoch_iterator()
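# Hypothetical usage of get_dataset_iterator: iterate over MNIST training
# examples one at a time. Assumes mnist.hdf5 has been placed on the Fuel data
# path (e.g. via fuel-download/fuel-convert); the dataset name is an
# assumption, not something the helper checks.
it = get_dataset_iterator('mnist', 'train', include_targets=True)
features, target = next(it)
print(features.shape, target)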
def get_datastream(path, which_set, batch_size=1, norm_path=None,
                   use_ivectors=False, truncate_ivectors=False,
                   ivector_dim=100, shuffled=True):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set,))
    if shuffled:
        iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                         examples=wsj_dataset.num_examples)
    else:
        iterator_scheme = SequentialScheme(batch_size=batch_size,
                                           examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    if norm_path:
        data_mean_std = numpy.load(norm_path)
        base_stream = Normalize(data_stream=base_stream,
                                means=data_mean_std['mean'],
                                stds=data_mean_std['std'])
    if use_ivectors:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'ivectors', 'targets'])
        if truncate_ivectors:
            fs = TruncateTransformer(fs, 'ivectors', ivector_dim)
        # fs = ConcatenateTransformer(fs, ['features', 'ivectors'], 'features')
    else:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'targets'])
    return Padding(fs)
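# Hypothetical usage of get_datastream above; 'wsj.hdf5' and the split name
# are placeholders, and Normalize/TruncateTransformer are custom transformers
# from the surrounding project (not used in this call).
stream = get_datastream('wsj.hdf5', which_set='train_si84', batch_size=16,
                        shuffled=True)
for batch in stream.get_epoch_iterator(as_dict=True):
    # Padding adds a '<source>_mask' entry for each padded source.
    features = batch['features']
    features_mask = batch['features_mask']
    break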
def test_multiple_split_in_memory(self):
    dataset = H5PYDataset(self.h5file, which_sets=('train', 'test'),
                          load_in_memory=True)
    handle = dataset.open()
    assert_equal(dataset.get_data(handle, slice(0, 30)),
                 (self.features[:30], self.targets[:30]))
    dataset.close(handle)
def get_uttid_stream(path, which_set='test_eval92', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set,))
    print(path, which_set)
    iterator_scheme = SequentialScheme(examples=wsj_dataset.num_examples,
                                       batch_size=batch_size)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['uttids'])
    return fs
def load_swbd_dataset(setname, ratio=1.0, min_count=3):
    logger.info('loading data from %s' % setname)
    logger.info('Ratio of labeled / unlabeled: %f' % ratio)
    assert 0 < ratio <= 1
    filename = '/home/tawara/work/ttic/data/icassp15.0/' + setname + '.hdf5'
    swbd = H5PYDataset(filename, which_set="train", load_in_memory=True)
    x_train = swbd.data_sources[0].astype(numpy.float32)
    y_train = swbd.data_sources[1][:, 0].astype(numpy.int32)
    ndata = x_train.shape[0]
    if ratio == 1.0:
        logger.info("Using all data: %d" % ndata)
        i_labeled = numpy.array(range(0, ndata))
    else:
        n_classes = y_train.max() + 1
        min_samples = sys.maxsize
        max_samples = 0
        avg_samples = 0
        indices = numpy.array(range(0, ndata))
        i_labeled = []
        n_labeled_all = 0
        for c in range(n_classes):
            # Keep at least min_count labeled examples per class.
            n_samples = max(int(round(sum(y_train == c) * ratio)), min_count)
            i = (indices[y_train == c])[:n_samples]
            i_labeled += list(i)
            avg_samples += n_samples
            min_samples = min(min_samples, n_samples)
            max_samples = max(max_samples, n_samples)
            n_labeled_all += n_samples
        logger.info('Minimum number of labeled samples: %d' % min_samples)
        logger.info('Maximum number of labeled samples: %d' % max_samples)
        logger.info('Average number of labeled samples: %d' %
                    int(avg_samples / n_classes))
        logger.info('Total number of labeled samples: %d' % n_labeled_all)
        logger.info('Total number of unlabeled samples: %d' %
                    int(ndata - n_labeled_all))
    swbd = H5PYDataset(filename, which_set="dev", load_in_memory=True)
    x_valid = swbd.data_sources[0].astype(numpy.float32)
    y_valid = swbd.data_sources[1][:, 0].astype(numpy.int32)
    swbd = H5PYDataset(filename, which_set="test", load_in_memory=True)
    x_test = swbd.data_sources[0].astype(numpy.float32)
    y_test = swbd.data_sources[1][:, 0].astype(numpy.int32)
    return [(x_train, y_train), (x_valid, y_valid), (x_test, y_test),
            i_labeled]
def test_index_subset_sorted(self):
    dataset = H5PYDataset(
        self.h5file, which_sets=('train',), subset=[0, 2, 4])
    handle = dataset.open()
    request = slice(0, 3)
    assert_equal(dataset.get_data(handle, request),
                 (self.features[[0, 2, 4]], self.targets[[0, 2, 4]]))
    dataset.close(handle)
def load_stream(batch_size=None, source=None):
    logger.info('Loading data from `{}`'.format(source))
    train_data = H5PYDataset(source, which_sets=('train',))
    test_data = H5PYDataset(source, which_sets=('test',))
    num_train = train_data.num_examples
    num_test = test_data.num_examples
    logger.debug('Number of test examples: {}'.format(num_test))
    logger.debug('Number of training examples: {}'.format(num_train))
    train_scheme = ShuffledScheme(examples=num_train, batch_size=batch_size)
    train_stream = DataStream(train_data, iteration_scheme=train_scheme)
    test_scheme = ShuffledScheme(examples=num_test, batch_size=batch_size)
    test_stream = DataStream(test_data, iteration_scheme=test_scheme)
    return train_stream, num_train
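# Hypothetical usage of the load_stream variant above; 'dataset.hdf5' is a
# placeholder path with 'train' and 'test' splits. Note that only the
# training stream and its example count are returned, even though a test
# stream is also constructed.
train_stream, num_train = load_stream(batch_size=64, source='dataset.hdf5')
batch = next(train_stream.get_epoch_iterator())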
def data():
    try:
        hf["target"].shape
    except (NameError, KeyError):
        hf = h5py.File('faces.hdf5', 'r+')
    num_samples = hf["input"].shape[0]
    print("number of samples in dataset : %i" % num_samples)
    split_dict = {
        'train': {'input': (2000, num_samples),
                  'target': (2000, num_samples)},
        'test': {'input': (0, 1000), 'target': (0, 1000)},
        'val': {'input': (1000, 2000), 'target': (1000, 2000)}}
    hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    train_set = H5PYDataset('faces.hdf5', which_sets=('train',))
    test_set = H5PYDataset('faces.hdf5', which_sets=('test',))
    val_set = H5PYDataset('faces.hdf5', which_sets=('val',))
    batch_size = 128
    # TODO: use ShuffledScheme instead? It seems slower; the chunk size in
    # the HDF5 file may need tuning.
    tr_scheme = SequentialScheme(examples=train_set.num_examples,
                                 batch_size=batch_size)
    tr_stream = DataStream(train_set, iteration_scheme=tr_scheme)
    val_scheme = SequentialScheme(examples=val_set.num_examples,
                                  batch_size=batch_size)
    val_stream = DataStream(val_set, iteration_scheme=val_scheme)
    test_scheme = SequentialScheme(examples=test_set.num_examples,
                                   batch_size=batch_size)
    test_stream = DataStream(test_set, iteration_scheme=test_scheme)
    hf.close()
    return (num_samples, train_set, test_set, val_set, tr_scheme, tr_stream,
            val_scheme, val_stream, test_scheme, test_stream)
def get_spkid_stream(path, which_set, batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set,))
    iterator_scheme = SequentialScheme(examples=wsj_dataset.num_examples,
                                       batch_size=batch_size)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['spks'])
    return fs
def test_in_memory(self):
    dataset = H5PYDataset(
        self.h5file, which_set='train', load_in_memory=True)
    handle = dataset.open()
    request = slice(0, 10)
    assert_equal(dataset.get_data(handle, request),
                 (self.features[request], self.targets[request]))
    dataset.close(handle)
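# With load_in_memory=True the requested split is read into numpy arrays up
# front. A small sketch, reusing the hypothetical tiny.hdf5 file built in the
# earlier sketch, that accesses those arrays directly via data_sources.
from fuel.datasets.hdf5 import H5PYDataset

dataset = H5PYDataset('tiny.hdf5', which_sets=('train',),
                      load_in_memory=True)
features, targets = dataset.data_sources
print(features.shape, targets.shape)  # (5, 5) and (5, 1) for that file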
def test_subset_step_gt_1(self):
    dataset = H5PYDataset(self.h5file, which_sets=('train',),
                          subset=slice(0, 10, 2))
    handle = dataset.open()
    assert_equal(
        dataset.get_data(handle, [0, 1, 2, 3, 4]),
        (self.features[slice(0, 10, 2)], self.targets[slice(0, 10, 2)]))
    dataset.close(handle)
def test_index_subset_unsorted(self):
    # A subset should have the same ordering no matter how you specify it.
    dataset = H5PYDataset(
        self.h5file, which_sets=('train',), subset=[0, 4, 2])
    handle = dataset.open()
    request = slice(0, 3)
    assert_equal(dataset.get_data(handle, request),
                 (self.features[[0, 2, 4]], self.targets[[0, 2, 4]]))
    dataset.close(handle)
def test_vlen_in_memory_example_scheme(self):
    dataset = H5PYDataset(
        self.vlen_h5file, which_sets=('train',), load_in_memory=True,
        sort_indices=False)
    iter_ = dataset.get_example_stream().get_epoch_iterator()
    assert_equal(next(iter_),
                 (self.vlen_features[0], self.vlen_targets[0]))
    assert_equal(next(iter_),
                 (self.vlen_features[1], self.vlen_targets[1]))
def test_out_of_memory_sorted_indices(self):
    dataset = H5PYDataset(
        self.h5file, which_set='train', load_in_memory=False,
        sort_indices=True)
    handle = dataset.open()
    request = [7, 4, 6, 2, 5]
    assert_equal(dataset.get_data(handle, request),
                 (self.features[request], self.targets[request]))
    dataset.close(handle)
def test_dataset_get_data_without_open(self):
    dataset = H5PYDataset(self.h5file, which_sets=('train',),
                          load_in_memory=False)
    try:
        dataset.get_data(request=slice(0, 2))
    except IOError:
        assert False, "get_data raised IOError without an explicit open()"
    dataset.close(None)
def get_datastream(path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set,))
    print(path, which_set)
    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
def __init__(self, path, which_set='train', load_size=None, crop_size=None,
             dtype=numpy.float32):
    self._dtype = dtype
    self._load_size = load_size
    self._crop_size = crop_size
    self._data_set = H5PYDataset(path, which_sets=(which_set,))
def load_dataset(data_file, load_in_memory=False):
    """
    See ANNMINERvA/fuel_up_convdata.py for an HDF5 builder that sets up an
    appropriate data file.
    """
    if not os.path.exists(data_file):
        raise Exception('Data file {} not found!'.format(data_file))
    train_set = H5PYDataset(data_file, which_sets=('train',),
                            load_in_memory=load_in_memory)
    valid_set = H5PYDataset(data_file, which_sets=('valid',),
                            load_in_memory=load_in_memory)
    test_set = H5PYDataset(data_file, which_sets=('test',),
                           load_in_memory=load_in_memory)
    return train_set, valid_set, test_set
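# Hypothetical usage of load_dataset above, wrapping the training split in a
# shuffled DataStream; 'convdata.hdf5' is a placeholder path assumed to
# contain 'train'/'valid'/'test' splits.
from fuel.schemes import ShuffledScheme
from fuel.streams import DataStream

train_set, valid_set, test_set = load_dataset('convdata.hdf5')
scheme = ShuffledScheme(examples=train_set.num_examples, batch_size=128)
stream = DataStream(train_set, iteration_scheme=scheme)
for batch in stream.get_epoch_iterator():
    pass  # each batch is a tuple ordered like train_set.sources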
def train1(model=None):
    if model is not None:
        trainset = H5PYDataset('svhn_format_2.hdf5', which_sets=('train',),
                               sources=('features', 'targets'))
        testset = H5PYDataset('svhn_format_2.hdf5', which_sets=('test',),
                              sources=('features', 'targets'))
        batch_size = 500
        epochs_to_wait_for_improve = 1
        csv_logger = keras.callbacks.CSVLogger('traininglog.csv')
        check_point = keras.callbacks.ModelCheckpoint(
            "model3epochweights.h5", monitor='val_loss', verbose=0,
            save_best_only=False, save_weights_only=True, mode='auto',
            period=1)
        early_stopping = keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=epochs_to_wait_for_improve)
        history = model.fit_generator(
            dataset_generator(trainset, batch_size),
            steps_per_epoch=int(np.ceil(trainset.num_examples / batch_size)),
            epochs=15, verbose=2,
            callbacks=[csv_logger, check_point, early_stopping],
            validation_data=dataset_generator(testset, batch_size),
            validation_steps=int(np.ceil(testset.num_examples / batch_size)))
        # print accuracy
        return history
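# The dataset_generator helper used by train1 is not shown in the snippet
# above. A minimal sketch of what such a generator might look like, assuming
# the model consumes raw 'features'/'targets' batches; any reshaping or label
# encoding the real project performs is omitted.
import numpy as np
from fuel.schemes import ShuffledScheme
from fuel.streams import DataStream

def dataset_generator(dataset, batch_size):
    """Yield (features, targets) batches from an H5PYDataset indefinitely,
    as Keras' fit_generator expects."""
    scheme = ShuffledScheme(examples=dataset.num_examples,
                            batch_size=batch_size)
    stream = DataStream(dataset, iteration_scheme=scheme)
    while True:
        for batch in stream.get_epoch_iterator(as_dict=True):
            yield np.asarray(batch['features']), np.asarray(batch['targets'])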
def __init__(self, bs, sources, filename, which_sets, **kwargs):
    self.bs = bs
    self.provides_sources = sources
    self.filename = filename
    super(UniformDataset, self).__init__(**kwargs)
    self.train_set = H5PYDataset(self.filename, which_sets=which_sets,
                                 load_in_memory=True)
    self.handle = self.train_set.open()
    self.num_examples = self.train_set.num_examples