def setUp(self): self.features = numpy.arange(3600, dtype='uint16').reshape((100, 36)) self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1)) h5file = h5py.File( 'file.hdf5', mode='w', driver='core', backing_store=False) h5file['features'] = self.features h5file['features'].dims[0].label = 'batch' h5file['features'].dims[1].label = 'feature' h5file['targets'] = self.targets h5file['targets'].dims[0].label = 'batch' h5file['targets'].dims[1].label = 'index' split_dict = {'train': {'features': (0, 20, None), 'targets': (0, 20)}, 'test': {'features': (20, 30), 'targets': (20, 30)}, 'unlabeled': {'features': (30, 100, None, '.')}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) self.h5file = h5file vlen_h5file = h5py.File( 'test_vl.hdf5', mode='w', driver='core', backing_store=False) self.vlen_features = [ numpy.arange(12, dtype='uint8').reshape((3, 2, 2)), numpy.arange(48, dtype='uint8').reshape((3, 4, 4)), numpy.arange(60, dtype='uint8').reshape((3, 5, 4)), numpy.arange(18, dtype='uint8').reshape((3, 2, 3))] self.vlen_targets = numpy.arange(4, dtype='uint8').reshape((4, 1)) dtype = h5py.special_dtype(vlen=numpy.dtype('uint8')) features = vlen_h5file.create_dataset('features', (4,), dtype=dtype) features[...] = [d.flatten() for d in self.vlen_features] features.dims[0].label = 'batch' features_shapes = vlen_h5file.create_dataset( 'features_shapes', (4, 3), dtype='uint8') features_shapes[...] = numpy.array( [d.shape for d in self.vlen_features]) features.dims.create_scale(features_shapes, 'shapes') features.dims[0].attach_scale(features_shapes) features_shape_labels = vlen_h5file.create_dataset( 'features_shape_labels', (3,), dtype='S7') features_shape_labels[...] = [ 'channel'.encode('utf8'), 'height'.encode('utf8'), 'width'.encode('utf8')] features.dims.create_scale(features_shape_labels, 'shape_labels') features.dims[0].attach_scale(features_shape_labels) targets = vlen_h5file.create_dataset('targets', (4, 1), dtype='uint8') targets[...] = self.vlen_targets targets.dims[0].label = 'batch' targets.dims[1].label = 'index' split_dict = {'train': {'features': (0, 4), 'targets': (0, 4)}} vlen_h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) self.vlen_h5file = vlen_h5file
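# A small inspection sketch for the split attribute written in setUp above: the
# 'split' attrs entry created by H5PYDataset.create_split_array is a structured
# array with one row per (split, source) pair. Field names below follow Fuel's
# documented layout; treat this as a sketch rather than part of the test suite.
def example_inspect_split(h5file):
    # fields: split, source, start, stop, indices, available, comment
    return [(row['split'], row['source'], row['start'], row['stop'])
            for row in h5file.attrs['split']]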
def createH5Dataset(hdf5_out, corpus_path, sequence_length): with open(corpus_path) as f: corpus = f.read().split(",") (indices, vocab) = pd.factorize(list(corpus)) instances_num = len(corpus) // (sequence_length + 1) f = h5py.File(hdf5_out, mode='w') train_data_x = np.zeros((instances_num, sequence_length), dtype=np.uint8) train_data_y = np.zeros((instances_num, sequence_length), dtype=np.uint8) for j in range(instances_num): for i in range(sequence_length): train_data_x[j][i] = indices[i + j * (sequence_length + 1)] train_data_y[j][i] = indices[i + j * (sequence_length + 1) + 1] char_in = f.create_dataset('inchar', train_data_x.shape, dtype='uint8') char_out = f.create_dataset('outchar', train_data_y.shape, dtype='uint8') char_in[...] = train_data_x char_out[...] = train_data_y split_dict = { 'train': {'inchar': (0, instances_num), 'outchar': (0, instances_num)}} f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.attrs["vocab"] = json.dumps(list(vocab)) f.flush() f.close()
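# A minimal usage sketch for createH5Dataset above. 'corpus.txt' is a
# hypothetical comma-separated corpus file and 'chars.hdf5' a placeholder output
# path; the resulting file is read back through Fuel's H5PYDataset.
from fuel.datasets.hdf5 import H5PYDataset

def example_read_char_dataset():
    createH5Dataset('chars.hdf5', 'corpus.txt', sequence_length=50)
    train_set = H5PYDataset('chars.hdf5', which_sets=('train',),
                            sources=('inchar', 'outchar'))
    handle = train_set.open()
    inchar, outchar = train_set.get_data(handle, slice(0, 2))
    train_set.close(handle)
    return inchar.shape, outchar.shape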
def build_raw_hdf5_dataset(wav_name, hdf5_name, window_size): [rate, signal] = wav.read(wav_name) num_steps = signal.shape[0] num_seqs = num_steps-window_size output_path = '{}.hdf5'.format(hdf5_name) output_path = os.path.join(output_path) signal = signal.reshape(num_steps,1) with h5py.File(output_path, mode='w') as h5file: input_feature = h5file.create_dataset(name='input_feature' , shape=(num_seqs, window_size, 1), dtype='int16') target_feature = h5file.create_dataset(name='target_feature', shape=(num_seqs, window_size, 1), dtype='int16') print ' num of sequences : {}'.format(num_seqs) for s in xrange(num_seqs): input_feature[s] = signal[s:s+window_size] target_feature[s] = signal[(s+1):(s+1)+window_size] # label each dataset axis input_feature.dims[0].label = 'batch' input_feature.dims[1].label = 'time' input_feature.dims[2].label = 'feature' target_feature.dims[0].label = 'batch' target_feature.dims[1].label = 'time' target_feature.dims[2].label = 'feature' split_dict = {'train': {'input_feature' : ( 0, num_seqs), 'target_feature': ( 0, num_seqs)}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close() return num_seqs
def save_h5py(tn, start, stop): cf = train_features[start:stop] ct = train_targets[start:stop] np.save(pjoin(numpy_path, prefix+tn+'_features.npy'), cf) np.save(pjoin(numpy_path, prefix+tn+'_targets.npy'), ct) h5 = h5py.File(pjoin(fuel_path, prefix+tn+'.hdf5'), mode='w') h5_features = h5.create_dataset( 'features', (cf.shape[0], cf.shape[1]*mult) , dtype='float32') lenf = stop - start with ProgressBar(maxval=lenf) as progbar: for i in range(lenf): arr = [] for j in range(-concat[0], concat[0]+1, concat[1]): arr.extend(cf[(i-j)%lenf]) h5_features[i] = np.asarray(arr) progbar.update(i) h5_targets = h5.create_dataset( 'targets', ct.shape, dtype='uint16') h5_targets[...] = ct h5_features.dims[0].label = 'batch' h5_features.dims[1].label = 'feature' h5_targets.dims[0].label = 'batch' h5_targets.dims[1].label = 'index' split_dict = { tn: {'features': (0, stop-start), 'targets': (0, stop-start)}, #'validate': {'features': (tr_n, len(fet)), 'targets': (tr_n, len(fet))}, } h5.attrs['split'] = H5PYDataset.create_split_array(split_dict) h5.flush() h5.close()
def CreateHDF5(): sizes = numpy.random.randint(3,9, size=(100,)) train_image_features = [ numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[:90]] test_image_features = [ numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[90:]] train_vector_features = numpy.random.normal(size=(90,10)).astype('float32') test_vector_features = numpy.random.normal(size=(10,10)).astype('float32') train_targets = numpy.random.randint(10, size=(90,1)).astype('uint8') test_targets = numpy.random.randint(10, size=(10,1)).astype('uint8') f = h5py.File('dataset.hdf5', mode='w') vector_features = f.create_dataset( 'vector_features', (100, 10), dtype='float32') targets = f.create_dataset( 'targets', (100, 1), dtype='uint8') vector_features[...] = numpy.vstack( [train_vector_features, test_vector_features]) targets[...] = numpy.vstack([train_targets, test_targets]) vector_features.dims[0].label = 'batch' vector_features.dims[1].label = 'feature' targets.dims[0].label = 'batch' targets.dims[1].label = 'index' all_image_features = train_image_features + test_image_features dtype = h5py.special_dtype(vlen=numpy.dtype('uint8')) image_features = f.create_dataset('image_features', (100,), dtype=dtype) image_features[...] = [image.flatten() for image in all_image_features] image_features.dims[0].label='batch' image_features_shapes = f.create_dataset( 'image_features_shapes', (100, 3), dtype='int32') image_features_shapes[...] = numpy.array( [image.shape for image in all_image_features]) image_features.dims.create_scale(image_features_shapes, 'shapes') image_features.dims[0].attach_scale(image_features_shapes) image_features_shape_labels = f.create_dataset( 'image_features_shape_labels', (3,), dtype='S7') image_features_shape_labels[...] = [ 'channel'.encode('utf8'), 'height'.encode('utf8'), 'width'.encode('utf8')] image_features.dims.create_scale( image_features_shape_labels, 'shape_labels') image_features.dims[0].attach_scale(image_features_shape_labels) split_dict = { 'train': {'vector_features': (0, 90), 'image_features': (0, 90), 'targets': (0, 90)}, 'test': {'vector_features': (90, 100), 'image_features': (90, 100), 'targets': (90, 100)}} f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
def test_h5py_dataset_split(): try: h5file = h5py.File(name='tmp.hdf5', mode="w") features = h5file.create_dataset('features', (10, 5), dtype='float32') features[...] = numpy.arange(50, dtype='float32').reshape((10, 5)) split_dict = {'train': {'features': (0, 8)}, 'test': {'features': (8, 10)}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close() train_set = H5PYDataset(path='tmp.hdf5', which_set='train') test_set = H5PYDataset(path='tmp.hdf5', which_set='test') train_handle = train_set.open() test_handle = test_set.open() assert_equal( train_set.get_data(state=train_handle, request=slice(0, 8))[0], numpy.arange(50).reshape((10, 5))[:8]) assert_equal( test_set.get_data(state=test_handle, request=slice(0, 2))[0], numpy.arange(50).reshape((10, 5))[8:]) train_set.close(train_handle) test_set.close(test_handle) finally: if os.path.exists('tmp.hdf5'): os.remove('tmp.hdf5')
def add_sets(args): with h5py.File(args.h5file, "a") as h5file: sources = [] for dataset in h5file: if dataset.endswith("_indices") or dataset.endswith("_shapes") or dataset.endswith("_shape_labels"): continue sources.append(dataset) uttid2idx = {uttid: idx for (idx, uttid) in enumerate(h5file["uttids"])} split_dict = {} for subset in args.sets: name, uttids_fname = subset.split("=") idxs = [] with open(uttids_fname) as uf: for l in uf: uttid = l.strip().split()[0] idxs.append(uttid2idx[uttid]) indices_name = "{}_indices".format(name) if indices_name in h5file: del h5file[indices_name] # # Note: ideally, we would sort the indeces and do: # h5file[indices_name] = numpy.array(sorted(idxs)) # but this would cause incompatibility with Kaldi, which keeps utterances sorted by uttid! # h5file[indices_name] = numpy.array(idxs) indices_ref = h5file[indices_name].ref split_dict[name] = {source: (-1, -1, indices_ref) for source in sources} h5file.attrs["split"] = H5PYDataset.create_split_array(split_dict)
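# An invocation sketch for add_sets above, assuming the HDF5 file already
# contains the per-utterance sources plus a 'uttids' dataset. Each entry of
# args.sets has the form '<split_name>=<file with one uttid per line>'; the
# argparse wiring and file names here are illustrative assumptions.
import argparse

def example_add_sets_cli():
    parser = argparse.ArgumentParser()
    parser.add_argument('h5file')
    parser.add_argument('sets', nargs='+')
    args = parser.parse_args(['corpus.h5',
                              'train=train_uttids.txt',
                              'dev=dev_uttids.txt'])
    add_sets(args)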
def data(): try: hf["target"].shape except: hf = h5py.File('faces.hdf5','r+') num_samples = hf["input"].shape[0] print "number of samples in dataset : %i" %num_samples split_dict = { 'train': {'input': (2000, num_samples), 'target': (2000, num_samples)}, 'test': {'input': (0, 1000), 'target': (0, 1000)}, 'val': {'input': (1000, 2000), 'target': (1000, 2000)} } hf.attrs['split'] = H5PYDataset.create_split_array(split_dict) train_set = H5PYDataset('faces.hdf5', which_sets=('train',)) test_set = H5PYDataset('faces.hdf5', which_sets=('test',)) val_set = H5PYDataset('faces.hdf5', which_sets=('val',)) batch_size = 128 #TODO : use shuffledscheme instead? Seems slower, might have screwed up the chunksize in the HDF5 files? tr_scheme = SequentialScheme(examples=train_set.num_examples, batch_size=batch_size) tr_stream = DataStream(train_set, iteration_scheme=tr_scheme) val_scheme = SequentialScheme(examples=val_set.num_examples, batch_size=batch_size) val_stream = DataStream(val_set, iteration_scheme=val_scheme) test_scheme = SequentialScheme(examples=test_set.num_examples, batch_size=batch_size) test_stream = DataStream(test_set, iteration_scheme=test_scheme) hf.close() return num_samples, train_set, test_set, val_set, tr_scheme, tr_stream, val_scheme, val_stream, test_scheme, test_stream
def save_hd5py(dataset, destfile, indices_dict): f = h5py.File(destfile, mode='w') images = f.create_dataset('images', dataset.shape, dtype='uint8') images[...] = dataset split_dict = dict((k, {'images':v}) for k,v in indices_dict.iteritems()) f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
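# A minimal usage sketch for save_hd5py above: indices_dict maps split names to
# half-open (start, stop) ranges over the first axis. The array contents and the
# output file name are made up.
import numpy
from fuel.datasets.hdf5 import H5PYDataset

def example_save_images():
    images = numpy.random.randint(0, 256, size=(100, 3, 32, 32)).astype('uint8')
    save_hd5py(images, 'images.hdf5', {'train': (0, 90), 'test': (90, 100)})
    train_set = H5PYDataset('images.hdf5', which_sets=('train',))
    assert train_set.num_examples == 90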
def test_value_error_on_unequal_sources(self): def get_subsets(): return H5PYDataset(self.h5file, which_sets=('train',)).subsets split_dict = {'train': {'features': (0, 20), 'targets': (0, 15)}, 'test': {'features': (20, 30), 'targets': (20, 30)}, 'unlabeled': {'features': (30, 100, None, '.')}} self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) assert_raises(ValueError, get_subsets)
def biblefile_to_hdf5(open_file): # TODO REMOVE LINES WITH THE BOOK OF BLABLA """Everything in one function because we have variable-length sequences, so no intermediate arrays...""" char_to_ind = {"<S>": 0, "</S>": 1} current_char_ind = 2 # starts at 2 because 0, 1 are reserved for "end/start-of-sequence" character all_verses = [] # TODO I still don't know what the readout initial_output really does; maybe we need to put <S> into every sequence current_verse = [] for line in open_file: # first we need to check if a new verse begins somewhere in the line (not just beginning...) verse_marker_pos = find_verse_marker(line) if len(line.split()) > 0 and verse_marker_pos > -1: # if so, save the verse up to the verse marker and start a new one from the rest of the line current_verse += list(line[:verse_marker_pos]) # also replace all characters by integers, creating more mappings if necessary for (ind, char) in enumerate(current_verse): if char not in char_to_ind: char_to_ind[char] = current_char_ind current_char_ind += 1 current_verse[ind] = char_to_ind[char] current_verse.append(1) # for sequence generator we need to explicitly append this end-of-sequence char all_verses.append(numpy.array(current_verse, dtype="int32")) current_verse = list(line[verse_marker_pos:]) # otherwise, just put everything into the current verse else: current_verse += list(line) all_verses = numpy.array(all_verses) # I think this conversion is necessary for the indexing below? # at this point we have all our verses =) now we build our .hdf5 dataset # make a little validation set val_indices = numpy.random.choice(a=len(all_verses), replace=False, size=1500) test_set = list(all_verses[val_indices]) train_set = list(numpy.delete(all_verses, val_indices, 0)) # if you don't get what's happening here, check the Fuel tutorial on variable-length data (only the 1D part) f = h5py.File(name="bible.hdf5", mode="w") dtype_varlen_int = h5py.special_dtype(vlen=numpy.dtype("int32")) character_seqs = f.create_dataset("character_seqs", (len(all_verses),), dtype=dtype_varlen_int) character_seqs[...] = train_set + test_set split_dict = {"train": {"character_seqs": (0, len(train_set))}, "valid": {"character_seqs": (len(train_set), len(all_verses))}} f.attrs["split"] = H5PYDataset.create_split_array(split_dict) f.flush() f.close() # we also save the current_char_ind (equal to dimensionality of our one-hot character vectors) to a file numpy.save("onehot_size.npy", current_char_ind) # also the word-to-index dict cPickle.dump(char_to_ind, open("char_to_ind.pkl", mode="w")) # make a quick dirty reverse dict (actually a list) to map from indices to characters, so we can get readable output # later ind_to_char = [""]*len(char_to_ind) ind_to_char[0] = "<S>" ind_to_char[1] = "</S>" for char in char_to_ind: ind_to_char[char_to_ind[char]] = char cPickle.dump(ind_to_char, open("ind_to_char.pkl", mode="w"))
def build_hdf5_dataset(input_filename, output_filename, batch_size=64):
    """
    Builds an HDF5 dataset from the input one. The output file will have
    training, valid, and test splits.
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")
    data = input_file["features"][:]
    data_length = data.shape[1]
    #print "Sample from data: {}".format(data[70])
    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    batch_index_train = int(0.9 * train_valid_length / float(batch_size))
    batch_index_valid = int(train_valid_length / float(batch_size))
    batch_index_test = int(data_length / float(batch_size))
    print "batch indices in order : {}".format(
        (batch_index_train, batch_index_valid, batch_index_test))
    assert train_valid_length == batch_index_valid * batch_size
    data = data.reshape(data_length)[:batch_index_test * batch_size]
    data = data.reshape(batch_index_test, batch_size, 1)
    print data.shape
    print "values lost: {}".format(data_length - data.size)
    test_length = data_length - train_valid_length
    features = output_file.create_dataset(
        name='features', shape=data.shape, dtype='int16', data=data)
    features.dims[0].label = 'batch'
    features.dims[1].label = 'time'
    features.dims[2].label = 'feature'
    # Fuel split bounds are half-open [start, stop), so consecutive splits
    # share their boundary index; starting the next split at index + 1 would
    # silently leave one row out of every split.
    split_dict = {
        'train': {'features': (0, batch_index_train)},
        'valid': {'features': (batch_index_train, batch_index_valid)},
        'test': {'features': (batch_index_valid, batch_index_test)}}
    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
def save_hd5py(dataset_dict, destfile, indices_dict): f = h5py.File(destfile, mode='w') for name, dataset in dataset_dict.iteritems(): dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype)) dat[...] = dataset split_dict = dict((k, dict((name, v) for name in dataset_dict.iterkeys())) for k,v in indices_dict.iteritems()) f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
def build_raw_interval_hdf5_dataset(youtube_id, hdf5_name, interval_size, window_size): data_stream = YouTubeAudio(youtube_id).get_example_stream() data_stream = Window(offset=interval_size, source_window=interval_size*window_size, target_window=interval_size*window_size, overlapping=True, data_stream=data_stream) data_iterator = data_stream.get_epoch_iterator() num_sequences = 0 for data in data_iterator: num_sequences = num_sequences + 1 output_path = '{}.hdf5'.format(hdf5_name) output_path = os.path.join(output_path) print 'total num sequences : ', num_sequences with h5py.File(output_path, mode='w') as h5file: input_feature = h5file.create_dataset(name='input_feature' , shape=(num_sequences, window_size, interval_size), dtype='int16') target_feature = h5file.create_dataset(name='target_feature', shape=(num_sequences, window_size, interval_size), dtype='int16') data_iterator = data_stream.get_epoch_iterator() # for each batch for s_idx, sequence_data in enumerate(data_iterator): # get data source_data = sequence_data[0] target_data = sequence_data[1] # save data input_feature[s_idx] = source_data.reshape(window_size, interval_size) target_feature[s_idx] = target_data.reshape(window_size, interval_size) # label each dataset axis input_feature.dims[0].label = 'batch' input_feature.dims[1].label = 'time' input_feature.dims[2].label = 'feature' target_feature.dims[0].label = 'batch' target_feature.dims[1].label = 'time' target_feature.dims[2].label = 'feature' num_trains = int(num_sequences*0.8) split_dict = {'train': {'input_feature' : ( 0, num_trains), 'target_feature': ( 0, num_trains)}, 'valid': {'input_feature' : ( num_trains, num_sequences), 'target_feature': ( num_trains, num_sequences)}, } h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close() return num_sequences
def build_hdf5_dataset_single_dim(input_filename, output_filename):
    """
    Builds an HDF5 dataset from the input one. The output file will have
    training, valid, and test splits. This function outputs a single
    dimension for the datasets. Adapted to monk_music.
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")
    data = input_file["features"][:]
    data_length = data.shape[1]
    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    index_train = int(0.9 * train_valid_length)
    index_valid = int(train_valid_length)
    index_test = int(data_length)
    print "batch indices in order : {}".format((index_train, index_valid, index_test))
    data = data.reshape((data_length))
    print "Train example: {}".format(data[index_train-100:index_train])
    print "Valid example: {}".format(data[index_valid-100:index_valid])
    print "Test example: {}".format(data[index_test-100:index_test])
    features = output_file.create_dataset(
        name='features', shape=data.shape, dtype='int16', data=data)
    #features.dims[0].label = 'batch'
    #features.dims[0].label = 'time'
    features.dims[0].label = 'feature'
    # Fuel split bounds are half-open [start, stop), so consecutive splits
    # share their boundary index; starting at index + 1 would drop one sample
    # at each boundary.
    split_dict = {
        'train': {'features': (0, index_train)},
        'valid': {'features': (index_train, index_valid)},
        'test': {'features': (index_valid, index_test)}}
    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
def test_pickling(self): try: features = numpy.arange(360, dtype='uint8').reshape((10, 36)) h5file = h5py.File('file.hdf5', mode='w') h5file['features'] = features split_dict = {'train': {'features': (0, 10, '.')}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) dataset = cPickle.loads( cPickle.dumps(H5PYDataset(h5file, which_set='train'))) assert dataset.data_sources is None finally: os.remove('file.hdf5')
def train_model():
    data_path = get_config("data-file")
    batchs = get_config("batchs")
    half_batch = batchs // 2
    quarter_batch = half_batch // 2
    feature_dim = get_config("feature-dim") or 40
    noise_dim = get_config("noise-dim") or 10
    input_dim = feature_dim + noise_dim
    train_set = H5PYDataset(data_path, which_sets=('train',))
    handle = train_set.open()
    # train_set must be created before it is used as the fallback batch count
    n_batchs = (get_config("train-datasets") or train_set.num_examples) // batchs
    generator, discriminator, gan = build_net(input_dim)
    save_result(0, generator, np.zeros(shape=(1, feature_dim)))
    for i in range(get_config("epochs")):
        for j in range(n_batchs):
            imgs, features = get_batch(train_set, handle, j * batchs)
            idx = np.random.randint(0, imgs.shape[0], half_batch)
            real_imgs = imgs[idx]
            real_features = features[idx]
            gen_features = get_features(features)[np.random.randint(
                0, imgs.shape[0], half_batch)]
            noise = np.random.normal(0, 1, (half_batch, noise_dim))
            gen_imgs = generator.predict([noise, gen_features])
            # real feature and real img
            d_loss_real = discriminator.train_on_batch(
                [real_imgs], [np.ones((half_batch, 1)), real_features])
            # fake feature and fake img
            d_loss_fake = discriminator.train_on_batch(
                [gen_imgs], [np.zeros((half_batch, 1)), gen_features])
            d_loss = np.add(d_loss_real, d_loss_fake) * 0.5
            # train Generator
            noise = np.random.normal(0, 1, (batchs, noise_dim))
            gen_features = get_features(features)
            g_loss = gan.train_on_batch([noise, gen_features],
                                         [np.ones((batchs, 1)), gen_features])
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] %f%%" %
                  (i, d_loss[0], 100 * d_loss[1], g_loss[0], j * 100 / n_batchs))
        if i % 10 == 0 and get_config("env") == "GPU":
            save_result(i, generator, gen_features[0:1, :feature_dim])
def load_all_datasubsets(data_file, slice_to_load): """ Always load data in memory - get all of 'train', 'valid', and 'test' subsets """ if os.path.exists(data_file): dset = H5PYDataset(data_file, which_sets=('train', 'valid', 'test'), subset=slice(slice_to_load[0], slice_to_load[1]), load_in_memory=True) else: raise Exception('Data file', data_file, 'not found!') return dset
def test_h5py_dataset_pickles(): try: h5file = h5py.File(name='tmp.hdf5', mode="w") features = h5file.create_dataset('features', (10, 5), dtype='float32') features[...] = numpy.arange(50, dtype='float32').reshape((10, 5)) split_dict = {'train': {'features': (0, 10)}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close() dataset = H5PYDataset(path='tmp.hdf5', which_set='train') pickle.loads(pickle.dumps(dataset)) finally: if os.path.exists('tmp.hdf5'): os.remove('tmp.hdf5')
def test_vlen_reshape_out_of_memory_unordered(self): dataset = H5PYDataset(self.vlen_h5file, which_sets=('train', ), load_in_memory=False) expected_features = numpy.empty((4, ), dtype=numpy.object) for i, j in enumerate([0, 3, 1, 2]): expected_features[i] = self.vlen_features[j] expected_targets = self.vlen_targets[[0, 3, 1, 2]] handle = dataset.open() rval = dataset.get_data(handle, [0, 3, 1, 2]) for val, truth in zip(rval[0], expected_features): assert_equal(val, truth) assert_equal(rval[1], expected_targets) dataset.close(handle)
def output_hdf5(path_list, output_root_dir): num_data = len(path_list) shapes = [] dirs = output_root_dir.split('\\') file_name = dirs[-1] + '.hdf5' output_root_dir = os.path.join(output_root_dir, file_name) f = h5py.File(output_root_dir, mode='w') dtype = h5py.special_dtype(vlen=np.dtype('uint8')) image_features = f.create_dataset('image_features', (num_data,), dtype=dtype) image_features.dims[0].label = 'batch' try: for i in tqdm.tqdm(range(num_data)): image = io.imread(path_list[i]) shapes.append(image.shape) image_features[i] = image.flatten() shapes = np.array(shapes).astype(np.int32) image_features_shapes = f.create_dataset('image_features_shapes', (num_data, 3), dtype=np.int32) image_features_shapes[...] = shapes image_features.dims.create_scale(image_features_shapes, 'shapes') image_features.dims[0].attach_scale(image_features_shapes) image_features_shape_labels = f.create_dataset( 'image_features_shape_labels', (3,), dtype='S7') image_features_shape_labels[...] = [ 'height'.encode('utf8'), 'width'.encode('utf8'), 'channel'.encode('utf8')] image_features.dims.create_scale( image_features_shape_labels, 'shape_labels') image_features.dims[0].attach_scale(image_features_shape_labels) # specify the splits split_train = (0, num_data) split_dict = dict(train=dict(image_features=split_train)) f.attrs["split"] = H5PYDataset.create_split_array(split_dict) except KeyboardInterrupt: print "割り込み停止が実行されました" f.flush() f.close()
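# A reading sketch for the file written by output_hdf5 above: because the
# 'shapes' and 'shape_labels' dimension scales are attached, H5PYDataset hands
# back each row of 'image_features' already reshaped to (height, width, channel).
# The path argument is a placeholder.
from fuel.datasets.hdf5 import H5PYDataset

def example_read_vlen_images(hdf5_path):
    dataset = H5PYDataset(hdf5_path, which_sets=('train',),
                          sources=('image_features',), load_in_memory=False)
    handle = dataset.open()
    images, = dataset.get_data(handle, slice(0, 2))
    dataset.close(handle)
    return [img.shape for img in images]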
def _binarized_mnist_loader(self): examples, dataset_splits = [], [] for split in ["train", "test", "valid"]: dataset = H5PYDataset(self.loader_path, which_sets=(split,)) data_stream = dataset.get_example_stream() data = list(data_stream.get_epoch_iterator()) examples += data dataset_splits.append(len(data)) random.seed(self.ann_seed) random.shuffle(examples) # guarantees consistency for ttv splits it = iter(examples) return (self._reshape_samples(list(islice(it, 0, i))) for i in dataset_splits)
def test_vlen_reshape_out_of_memory(self): dataset = H5PYDataset( self.vlen_h5file, which_sets=('train',), subset=slice(1, 3), load_in_memory=False) expected_features = numpy.empty((2,), dtype=numpy.object) for i, f in enumerate(self.vlen_features[1:3]): expected_features[i] = f expected_targets = self.vlen_targets[1:3] handle = dataset.open() rval = dataset.get_data(handle, slice(0, 2)) for val, truth in zip(rval[0], expected_features): assert_equal(val, truth) assert_equal(rval[1], expected_targets) dataset.close(handle)
def make_gen(batch_size, examples=4): file_path_f = file_path names_select = i_names train_set = H5PYDataset(file_path_f, which_sets=('train', 'test')) scheme = SequentialScheme(examples=examples, batch_size=batch_size) data_stream_train = DataStream(dataset=train_set, iteration_scheme=scheme) stand_stream_train = ScaleAndShift(data_stream=data_stream_train, scale=scale, shift=shift, which_sources=(names_select[-1],)) return stand_stream_train, train_set, data_stream_train
def fuel_converter(fuel_dataset, embeddings_train, labels_train, embeddings_test, labels_test): f = h5py.File(fuel_dataset, mode='w') labels_train = np.expand_dims(labels_train, axis=1) labels_test = np.expand_dims(labels_test, axis=1) train_sz = embeddings_train.shape[0] - embeddings_train.shape[0] % 100 test_sz = embeddings_test.shape[0] - embeddings_test.shape[0] % 100 feat_sz = embeddings_train.shape[1] dataset_sz = train_sz + test_sz print("Actual Train size : ", embeddings_train.shape[0]) print("Train size in Fuel : ", train_sz) print("Actual Test size : ", embeddings_test.shape[0]) print("Test size in Fuel : ", test_sz) vector_features = f.create_dataset('features', (dataset_sz, feat_sz), dtype='float64') targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8') ## put the data loaded into these objects vector_features[...] = np.vstack( [embeddings_train[0:train_sz], embeddings_test[0:test_sz]]) targets[...] = np.vstack( [labels_train[0:train_sz], labels_test[0:test_sz]]) ## label the dims with names vector_features.dims[0].label = 'batch' vector_features.dims[1].label = 'feature' targets.dims[0].label = 'batch' targets.dims[1].label = 'index' ## split attribute -- way to recover the splits # creating the split using an API split_dict = { 'train': { 'features': (0, train_sz), 'targets': (0, train_sz) }, 'test': { 'features': (train_sz, dataset_sz), 'targets': (train_sz, dataset_sz) } } f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
def prepare_data(conf): """ Extract strided crops from a set of images and assemble into a 2D matrix. Save into an HDF5 file. Args: conf: dictionary containing data parameters Returns: tr_stream: DataStream for training set te_stream: DataStream for testing set """ preproc.store_hdf5(conf) #, compression='lzf') path_h5 = conf['path_h5'] tr_set = H5PYDataset(path_h5, ('train', ), sources=('LR', 'HR'), load_in_memory=conf['load_in_memory']) tr_scheme = ShuffledScheme(examples=tr_set.num_examples, batch_size=FLAGS.num_gpus * conf['mb_size']) tr_stream = DataStream(dataset=tr_set, iteration_scheme=tr_scheme) te_set = H5PYDataset(path_h5, ('test', ), sources=('LR', 'HR'), load_in_memory=conf['load_in_memory']) te_scheme = SequentialScheme(examples=te_set.num_examples, batch_size=FLAGS.num_gpus * conf['mb_size']) te_stream = DataStream(dataset=te_set, iteration_scheme=te_scheme) if conf['load_in_memory']: print("training set: %d mb" % ((tr_set.data_sources[0].nbytes + \ tr_set.data_sources[1].nbytes) / 1e6)) print("testing set: %d mb" % ((te_set.data_sources[0].nbytes + \ te_set.data_sources[1].nbytes) / 1e6)) time.sleep(2) return tr_stream, te_stream
def get_datastream(path, norm_path, which_set='train_si84', batch_size=1): wsj_dataset = H5PYDataset(path, which_sets=(which_set, )) data_mean_std = numpy.load(norm_path) iterator_scheme = ShuffledScheme(batch_size=batch_size, examples=wsj_dataset.num_examples) base_stream = DataStream(dataset=wsj_dataset, iteration_scheme=iterator_scheme) base_stream = Normalize(data_stream=base_stream, means=data_mean_std['mean'], stds=data_mean_std['std']) fs = FilterSources(data_stream=base_stream, sources=['features', 'targets']) padded_stream = Padding(data_stream=fs) return padded_stream
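# An iteration sketch for get_datastream above: Padding adds a '<source>_mask'
# companion for each padded source, so every batch carries features, targets and
# their masks. The paths are placeholders.
def example_iterate_wsj(path, norm_path):
    stream = get_datastream(path, norm_path, which_set='train_si84', batch_size=8)
    batch = next(stream.get_epoch_iterator(as_dict=True))
    # expected keys: 'features', 'features_mask', 'targets', 'targets_mask'
    return sorted(batch.keys())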
def test_h5py_flatten_out_of_memory(): try: h5file = h5py.File(name='tmp.hdf5', mode="w") features = h5file.create_dataset('features', (10, 2, 3), dtype='float32') features[...] = numpy.arange(60, dtype='float32').reshape((10, 2, 3)) targets = h5file.create_dataset('targets', (10, ), dtype='uint8') targets[...] = numpy.arange(10, dtype='uint8') split_dict = {'train': {'features': (0, 10), 'targets': (0, 10)}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close() dataset = H5PYDataset(path='tmp.hdf5', load_in_memory=False, which_set='train', flatten=['features']) handle = dataset.open() assert_equal( dataset.get_data(state=handle, request=slice(0, 10))[0], numpy.arange(60).reshape((10, 6))) dataset.close(handle) finally: if os.path.exists('tmp.hdf5'): os.remove('tmp.hdf5')
def train2(model=None, num_epochs=1, epoch_weights="modelepochweights.h5", \ weights="modelweights.h5", model_save="model.json",\ log_save="modeltraininglog.csv"): if model is not None: dataset_size = 73257# + 531131 #this includes train (73257) and extra (531131) #use 20% as validation validation_size = int(0.2*dataset_size) train_size = dataset_size - validation_size #sequence of 1s and 0s for splitting dataset seq = np.hstack((np.zeros(validation_size),np.ones(train_size))) #randomize np.random.seed(1234) np.random.shuffle(seq) train_idx = np.where(seq==1)[0].tolist() validation_idx = np.where(seq==0)[0].tolist() trainset = H5PYDataset('svhn_format_2.hdf5', which_sets=('train',), sources=('features', 'targets'), subset=train_idx) validationset = H5PYDataset('svhn_format_2.hdf5', which_sets=('train',), sources=('features', 'targets'), subset=validation_idx) batch_size = 500 epochs_to_wait_for_improve = 15 csv_logger = keras.callbacks.CSVLogger(log_save) check_point = keras.callbacks.ModelCheckpoint(epoch_weights, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='auto', period=1) early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=epochs_to_wait_for_improve) history = model.fit_generator(dataset_generator(trainset, batch_size), steps_per_epoch=np.ceil(trainset.num_examples/batch_size), epochs=num_epochs, verbose=2, callbacks=[csv_logger, check_point, early_stopping], validation_data=dataset_generator(validationset, batch_size), validation_steps=np.ceil(validationset.num_examples/batch_size)) save_model(model, weights, model_save) #print accuracy return history
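# train2 and test_no_aug rely on a dataset_generator helper that is not shown in
# this collection. The version below is only a guess at its behavior, mirroring
# the per-batch preprocessing done in train() further down: standardize each
# image, move channels last, one-hot the targets, and loop forever as Keras'
# fit_generator expects.
import numpy as np
import keras

def dataset_generator(dataset, batch_size):
    handle = dataset.open()
    while True:
        for start in range(0, dataset.num_examples, batch_size):
            stop = min(start + batch_size, dataset.num_examples)
            images, labels = dataset.get_data(handle, slice(start, stop))
            m = images.mean(axis=(2, 3), keepdims=True)
            s = images.std(axis=(2, 3), keepdims=True)
            images = np.transpose((images - m) / s, (0, 2, 3, 1))
            yield images, keras.utils.to_categorical(labels, 10)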
def setUp(self): self.features = numpy.arange(3600, dtype='uint8').reshape((100, 36)) self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1)) self.h5file = h5py.File( 'file.hdf5', mode='w', driver='core', backing_store=False) self.h5file['features'] = self.features self.h5file['features'].dims[0].label = 'batch' self.h5file['features'].dims[1].label = 'feature' self.h5file['targets'] = self.targets self.h5file['targets'].dims[0].label = 'batch' self.h5file['targets'].dims[1].label = 'index' split_dict = {'train': {'features': (0, 20, '.'), 'targets': (0, 20)}, 'test': {'features': (20, 30, ''), 'targets': (20, 30)}, 'unlabeled': {'features': (30, 100)}} self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
def load_datasubset(data_file, subset, slice_to_load): """ Always load data in memory subset = 'train', 'valid', or 'test' slice_to_load = a tuple (not a slice object) with start, stop event #'s """ if os.path.exists(data_file): dset = H5PYDataset(data_file, which_sets=(subset, ), subset=slice(slice_to_load[0], slice_to_load[1]), load_in_memory=True) else: raise Exception('Data file', data_file, 'not found!') return dset
def emboot_converter_traintrain(emboot_dataset): train_vector_features, train_targets, test_vector_features, test_targets = load_emboot_np( ) f = h5py.File(emboot_dataset, mode='w') train_sz = train_vector_features.shape[0] test_sz = test_vector_features.shape[0] feat_sz = train_vector_features.shape[1] dataset_sz = ( train_sz + test_sz - 16 ) * 2 ## NOTE: 13900 * 2 (copy over the train data to the test dataset) vector_features = f.create_dataset('features', (dataset_sz, feat_sz), dtype='float64') ## train + test targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8') ## put the data loaded into these objects train_vector_features_aug = np.vstack( [train_vector_features, test_vector_features])[:13900] train_targets_aug = np.vstack([train_targets, test_targets])[:13900] vector_features[...] = np.vstack( [train_vector_features_aug, train_vector_features_aug]) targets[...] = np.vstack([train_targets_aug, train_targets_aug]) ## label the dims with names vector_features.dims[0].label = 'batch' vector_features.dims[1].label = 'feature' targets.dims[0].label = 'batch' targets.dims[1].label = 'index' ## split attribute -- way to recover the splits # creating the split using an API split_dict = { 'train': { 'features': (0, dataset_sz / 2), 'targets': (0, dataset_sz / 2) }, 'test': { 'features': (dataset_sz / 2, dataset_sz), 'targets': (dataset_sz / 2, dataset_sz) } } f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
def add_phonemes(): data_path = os.environ["FUEL_DATA_PATH"] data_path = os.path.join(data_path, "blizzard/") save_name = "sp_blizzard_80h_phon.hdf5" phon_file = "tbptt_blizzard_80h.hdf5" data_file = "sp_blizzard_80h.hdf5" save_path = os.path.join(data_path, save_name) phon_path = os.path.join(data_path, phon_file) data_path = os.path.join(data_path, data_file) resulth5 = h5py.File(save_path, mode="w") phonh5 = h5py.File(phon_path, mode="r") datah5 = h5py.File(data_path, mode="r") sp_h5 = resulth5.create_dataset("sp", (TOTAL_ROWS, 512, 257), dtype="float32") f0_h5 = resulth5.create_dataset("f0", (TOTAL_ROWS, 512), dtype="float32") phon_h5 = resulth5.create_dataset("phonemes", (TOTAL_ROWS, 512), dtype="int16") f0_h5[:] = datah5["f0"][:] phon_h5[:] = phonh5["phonemes"][:, ::64] n_times = 100 idx = chunkIt(range(TOTAL_ROWS), n_times) for num_indx, indx in enumerate(idx): print num_indx, 100 sp_h5[indx] = datah5["sp"][indx] cont = TOTAL_ROWS end_train = int(0.9 * cont) end_valid = int(0.95 * cont) end_test = cont split_dict = { "train": {"sp": (0, end_train), "f0": (0, end_train), "phonemes": (0, end_train)}, "valid": {"sp": (end_train, end_valid), "f0": (end_train, end_valid), "phonemes": (end_train, end_valid)}, "test": {"sp": (end_valid, end_test), "f0": (end_valid, end_test), "phonemes": (end_valid, end_test)}, } resulth5.attrs["split"] = H5PYDataset.create_split_array(split_dict) resulth5.flush() resulth5.close() phonh5.close() datah5.close()
def get_dataset_iterator(dataset, split, include_features=True, include_targets=False, unit_scale=True, label_transforms=False, return_length=False): """Get iterator for dataset, split, targets (labels) and scaling (from 255 to 1.0)""" sources = [] sources = sources + ['features'] if include_features else sources sources = sources + ['targets'] if include_targets else sources if split == "all": splits = ('train', 'valid', 'test') elif split == "nontrain": splits = ('valid', 'test') else: splits = (split, ) dataset_fname = find_in_data_path("{}.hdf5".format(dataset)) h5_dataset = H5PYDataset(dataset_fname, which_sets=splits, sources=sources) if unit_scale: h5_dataset.default_transformers = uint8_pixels_to_floatX( ('features', )) datastream = DataStream.default_stream( dataset=h5_dataset, iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples)) if label_transforms: # TODO: maybe refactor this common bit with get_custom_streams below datastream = AddLabelUncertainty(datastream, chance=0, which_sources=('targets', )) datastream = RandomLabelStrip(datastream, chance=0, which_sources=('targets', )) # HACK: allow variable stretch datastream = StretchLabels(datastream, length=128, which_sources=('targets', )) it = datastream.get_epoch_iterator() if return_length: return it, h5_dataset.num_examples else: return it
def save_hd5py(dataset_dict, destfile, indices_dict_or_numfolds): indices_dict = indices_dict_or_numfolds if isinstance(indices_dict, int): folds = indices_dict n = max(len(it) for it in dataset_dict.values()) fold_n = n // folds indices_dict = dict(("fold_{}".format(i), (i * fold_n, (i + 1) * fold_n)) for i in range(folds)) print indices_dict f = h5py.File(destfile, mode="w") for name, dataset in dataset_dict.iteritems(): dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype)) dat[...] = dataset split_dict = dict((k, dict((name, v) for name in dataset_dict.iterkeys())) for k, v in indices_dict.iteritems()) f.attrs["split"] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
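# A fold-based usage sketch for save_hd5py above: passing an integer instead of
# an explicit indices dict produces equally sized 'fold_i' splits. The array and
# file name are made up.
import numpy
from fuel.datasets.hdf5 import H5PYDataset

def example_folds():
    data = {'images': numpy.zeros((100, 3, 8, 8), dtype='uint8')}
    save_hd5py(data, 'folds.hdf5', 5)
    fold0 = H5PYDataset('folds.hdf5', which_sets=('fold_0',))
    assert fold0.num_examples == 20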
def test_h5py_dataset_axis_labels(): try: h5file = h5py.File(name='tmp.hdf5', mode="w") features = h5file.create_dataset('features', (10, 5), dtype='float32') features.dims[0].label = 'batch' features.dims[1].label = 'feature' features[...] = numpy.arange(50, dtype='float32').reshape((10, 5)) split_dict = {'train': {'features': (0, 10)}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close() dataset = H5PYDataset(path='tmp.hdf5', which_set='train') assert dataset.axis_labels == {'features': ('batch', 'feature')} finally: if os.path.exists('tmp.hdf5'): os.remove('tmp.hdf5')
def train(model=None): if model is not None: trainset = H5PYDataset('svhn_format_2.hdf5', which_sets=('train',), sources=('features', 'targets')) trainstream = DataStream(trainset, iteration_scheme=SequentialScheme(examples=trainset.num_examples, batch_size=500)) for data in trainstream.get_epoch_iterator(): images, labels = data #standardize the input images m = images.mean(axis=(2,3), keepdims=True) s = images.std(axis=(2,3), keepdims=True) images = (images - m)/s #change from "channel_first" to "channel_last" images = np.transpose(images, (0,2,3,1)) labels = keras.utils.to_categorical(labels) #print images.shape model.train_on_batch(x=images, y=labels) trainstream.close()
def create_ivector_test_datastream(path, which_set, batch_size=1, delay=0): wsj_dataset = H5PYDataset(path, which_sets=(which_set, )) iterator_scheme = SequentialScheme(batch_size=batch_size, examples=wsj_dataset.num_examples) base_stream = DataStream(dataset=wsj_dataset, iteration_scheme=iterator_scheme) fs = FilterSources(data_stream=base_stream, sources=['features', 'ivectors', 'targets']) if delay: fs = DelayTransformer(fs, delay) fs = FilterSources(data_stream=fs, sources=['features', 'ivectors']) return Padding(fs)
def test_pickling(self): try: features = numpy.arange(360, dtype='uint16').reshape((10, 36)) h5file = h5py.File('file.hdf5', mode='w') h5file['features'] = features split_dict = {'train': {'features': (0, 10, None, '.')}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) dataset = cPickle.loads( cPickle.dumps(H5PYDataset(h5file, which_sets=('train',)))) # Make sure _out_of_memory_{open,close} accesses # external_file_handle rather than _external_file_handle dataset._out_of_memory_open() dataset._out_of_memory_close() assert dataset.data_sources is None finally: os.remove('file.hdf5')
def text_to_h5py_dataset(text_path, dst_path): # The simplest is to load everything to memory first. # If memory becomes an issue, this code can be optimized. words = [] with open(text_path, 'r') as src: for line in src: words.extend(line.strip().split()) with h5py.File(dst_path, 'w') as dst: dtype = h5py.special_dtype(vlen=bytes) table = dst.create_dataset('words', (len(words), ), dtype=dtype) table[:] = words dst.attrs['split'] = H5PYDataset.create_split_array( {'train': { 'words': (0, len(words)) }})
def test_no_aug(dataset_used, model=None, testset=('test', 'test_neg',)): #include neg samples if model is not None: #accuracies = [] #dataset_size = H5PYDataset('new.hdf5', which_sets=('test','test_neg')).num_examples #seq = np.arange(dataset_size) #np.random.seed(1234) #np.random.shuffle(seq) #test_idx=seq.tolist() batch_size = 500 #dataset_used = "new_more_neg.hdf5" testset = H5PYDataset(dataset_used, which_sets=testset, sources=('features', 'targets')) loss, accuracy = model.evaluate_generator(dataset_generator(testset, batch_size), steps=np.ceil(testset.num_examples/batch_size), max_queue_size=11, workers=1, use_multiprocessing=False) return loss, accuracy
def gen_vlen_dataset(): import h5py from fuel.datasets.hdf5 import H5PYDataset sizes = numpy.random.randint(3, 9, size=(100,)) train_image_features = [numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[:90]] test_image_features = [numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[90:]] f = h5py.File('dataset_vlen.h5', mode='w') f['vector_features'] = numpy.vstack([numpy.load('train_vector_features.npy'), numpy.load('test_vector_features.npy')]) f['targets'] = numpy.vstack([numpy.load('train_targets.npy'), numpy.load('test_targets.npy')]) f['vector_features'].dims[0].label = 'batch' f['vector_features'].dims[1].label = 'feature' f['targets'].dims[0].label = 'batch' f['targets'].dims[1].label = 'index' all_image_features = train_image_features + test_image_features dtype = h5py.special_dtype(vlen=numpy.dtype('uint8')) image_features = f.create_dataset('image_features', (100,), dtype=dtype) image_features[...] = [image.flatten() for image in all_image_features] image_features.dims[0].label = 'batch' image_features_shapes = f.create_dataset('image_features_shapes', (100, 3), dtype='int32') image_features_shapes[...] = numpy.array([image.shape for image in all_image_features]) image_features.dims.create_scale(image_features_shapes, 'shapes') image_features.dims[0].attach_scale(image_features_shapes) image_features_shape_labels = f.create_dataset('image_features_shape_labels', (3,), dtype='S7') image_features_shape_labels[...] = ['channel'.encode('utf8'), 'height'.encode('utf8'), 'width'.encode('utf8')] image_features.dims.create_scale(image_features_shape_labels, 'shape_labels') image_features.dims[0].attach_scale(image_features_shape_labels) split_dict = {'train': {'vector_features': (0, 90), 'image_features': (0, 90), 'targets': (0, 90)}, 'test': {'vector_features': (90, 100), 'image_features': (90, 100), 'targets': (90, 100)}} f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close() train_set = H5PYDataset('dataset_vlen.h5', which_sets=('train',), sources=('image_features',)) print(train_set.axis_labels['image_features']) handle = train_set.open() images, = train_set.get_data(handle, slice(0, 10)) train_set.close(handle) print(images[0].shape, images[1].shape, images[2].shape, images[3].shape)
def make_lsun_dataset(scene_path, fuel_hdf5_path, resize_shape): # get image list image_list = [] for root, dirs, files in os.walk(scene_path): for filename in fnmatch.filter(files, "*.jpg"): image_list.append(os.path.join(root, filename)) num_images = len(image_list) print "num of images :{}".format(num_images) # open image file fuel_file = h5py.File(name=fuel_hdf5_path, mode="w") # set new dataset for fuel file image_data = fuel_file.create_dataset(name="image_data", shape=(num_images, 3) + resize_shape, dtype="uint8") for idx, filepath in enumerate(image_list): original_image = Image.open(filepath).convert("RGB") resize_row = resize_shape[0] if original_image.size[0] < original_image.size[1] else original_image.size[0] resize_col = resize_shape[1] if original_image.size[0] > original_image.size[1] else original_image.size[1] original_image.thumbnail((resize_row, resize_col), Image.ANTIALIAS) if original_image.size[0] != resize_shape[0]: excess = (original_image.size[0] - resize_shape[0]) / 2 original_image = original_image.crop((excess, 0, resize_shape[0] + excess, resize_shape[0])) elif original_image.size[1] != resize_shape[1]: excess = (original_image.size[1] - resize_shape[1]) / 2 original_image = original_image.crop((0, excess, resize_shape[1], resize_shape[1] + excess)) original_image = numpy.asarray(original_image) image_data[idx] = numpy.transpose(original_image, (2, 0, 1)) image_data.dims[0].label = "batch" image_data.dims[1].label = "channel" image_data.dims[2].label = "height" image_data.dims[3].label = "width" split_dict = {"train": {"image_data": (0, num_images)}} fuel_file.attrs["split"] = H5PYDataset.create_split_array(split_dict) fuel_file.flush() fuel_file.close() print "DONE : {} (num of images :{})".format(fuel_hdf5_path, num_images)
def test_h5py_dataset_out_of_memory_unsorted_indices(): try: h5file = h5py.File(name='tmp.hdf5', mode="w") features = h5file.create_dataset('features', (10, 5), dtype='float32') features[...] = numpy.arange(50, dtype='float32').reshape((10, 5)) split_dict = {'train': {'features': (0, 10)}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close() dataset = H5PYDataset( path='tmp.hdf5', which_set='train', load_in_memory=False, sort_indices=False) handle = dataset.open() assert_raises(TypeError, dataset.get_data, handle, [7, 4, 6, 2, 5]) finally: dataset.close(handle) if os.path.exists('tmp.hdf5'): os.remove('tmp.hdf5')
def save_hd5py (out_path, data, folds = 0): images = np.concatenate([a[0] for a in data], axis = 0) labels = np.concatenate([a[1] for a in data], axis = 0) f = h5py.File(out_path, mode='w') ds = f.create_dataset('images', images.shape, dtype=str(images.dtype)) ds[...] = images ds = f.create_dataset('labels', labels.shape, dtype=str(labels.dtype)) ds[...] = labels #assert(folds > 1) #if folds > 1: fold = len(images) // folds idx = {'fold-{}'.format(i): (i*fold, (i+1)*fold) for i in range(folds)} print idx split_dict = {k: {'images': v, 'labels':v} for k, v in idx.iteritems()} from fuel.datasets.hdf5 import H5PYDataset f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close() pass
def test_h5py_dataset_out_of_memory(): try: h5file = h5py.File(name='tmp.hdf5', mode="w") features = h5file.create_dataset('features', (10, 5), dtype='float32') features[...] = numpy.arange(50, dtype='float32').reshape((10, 5)) split_dict = {'train': {'features': (0, 10)}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close() dataset = H5PYDataset( path='tmp.hdf5', which_set='train', load_in_memory=False) handle = dataset.open() assert_equal( dataset.get_data(state=handle, request=slice(0, 10))[0], numpy.arange(50).reshape((10, 5))) dataset.close(handle) finally: if os.path.exists('tmp.hdf5'): os.remove('tmp.hdf5')
def create_hdf5(np_enc_data, np_enc_y, splitpoint, N): hdf5name = 'mushrooms.hdf5' f = h5py.File(hdf5name, mode='w') fx = f.create_dataset('x', np_enc_data.shape, dtype='float32') fy = f.create_dataset('y', np_enc_y.shape, dtype='int64') fx[...] = np_enc_data fy[...] = np_enc_y split_dict = { 'train': {'x': (0,splitpoint), 'y': (0, splitpoint)}, 'test': {'x': (splitpoint, N), 'y': (splitpoint, N)}} f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
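# A usage sketch for create_hdf5 above: np_enc_data / np_enc_y would be the
# encoded mushroom features and labels. The shapes and split point below are
# illustrative only.
import numpy as np
from fuel.datasets.hdf5 import H5PYDataset

def example_mushrooms():
    X = np.random.rand(8124, 117).astype('float32')
    y = np.random.randint(0, 2, size=(8124, 1)).astype('int64')
    create_hdf5(X, y, splitpoint=6000, N=8124)
    train_set = H5PYDataset('mushrooms.hdf5', which_sets=('train',),
                            sources=('x', 'y'))
    assert train_set.num_examples == 6000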
def make_celeb_dataset(fuel_hdf5_path, resize_shape): # get image list image_list = glob.glob(CELEBA_FACE_FOLDER + '*.jpg') num_images = len(image_list) # open image file fuel_file = h5py.File(name=fuel_hdf5_path, mode='w') # set new dataset for fuel file image_data = fuel_file.create_dataset(name='image_data', shape=(num_images, 3) + resize_shape, dtype='uint8') for idx, filepath in enumerate(image_list): original_image = Image.open(filepath).convert('RGB') resize_row = resize_shape[0] if original_image.size[0]<original_image.size[1] else original_image.size[0] resize_col = resize_shape[1] if original_image.size[0]>original_image.size[1] else original_image.size[1] original_image.thumbnail((resize_row, resize_col), Image.ANTIALIAS) if original_image.size[0] != resize_shape[0]: excess = (original_image.size[0] - resize_shape[0]) / 2 original_image = original_image.crop((excess, 0, resize_shape[0]+excess, resize_shape[0])) elif original_image.size[1] != resize_shape[1]: excess = (original_image.size[1] - resize_shape[1]) / 2 original_image = original_image.crop((0, excess, resize_shape[1], resize_shape[1]+excess)) original_image = numpy.asarray(original_image) image_data[idx] = numpy.transpose(original_image, (2, 0, 1)) image_data.dims[0].label = 'batch' image_data.dims[1].label = 'channel' image_data.dims[2].label = 'height' image_data.dims[3].label = 'width' split_dict = { 'train' : {'image_data': (0, num_images)}} fuel_file .attrs['split'] = H5PYDataset.create_split_array(split_dict) fuel_file.flush() fuel_file.close() print 'DONE : {} (num of images :{})'.format(fuel_hdf5_path, num_images)
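# A reading sketch for the file produced by make_celeb_dataset above: stream
# shuffled uint8 image batches for training. The path and batch size are
# placeholders.
from fuel.datasets.hdf5 import H5PYDataset
from fuel.schemes import ShuffledScheme
from fuel.streams import DataStream

def example_celeb_stream(path='celeba_64.hdf5', batch_size=128):
    dataset = H5PYDataset(path, which_sets=('train',), sources=('image_data',))
    scheme = ShuffledScheme(examples=dataset.num_examples, batch_size=batch_size)
    return DataStream(dataset, iteration_scheme=scheme)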
def add_split_dict(hdf5file, names, total_examples, train_frac=0.83, valid_frac=0.10): # TODO: investiage the "reference" stuff so we can pluck validation # and testing events evenly from the sample final_train_index = int(total_examples * train_frac) final_valid_index = int(total_examples * (train_frac + valid_frac)) train_dict = {name: (0, final_train_index) for name in names} valid_dict = {name: (final_train_index, final_valid_index) for name in names} test_dict = {name: (final_valid_index, total_examples) for name in names} split_dict = { 'train': train_dict, 'valid': valid_dict, 'test': test_dict } hdf5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
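# A usage sketch for add_split_dict above: 'names' are the source datasets that
# already exist in the HDF5 file. The file and source names are hypothetical.
import h5py

def example_add_split(path='events.hdf5', total_examples=1000):
    with h5py.File(path, 'a') as f:
        add_split_dict(f, names=('features', 'targets'),
                       total_examples=total_examples)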
def emboot_converter(): train_vector_features, train_targets, test_vector_features, test_targets = load_emboot_np( ) f = h5py.File(emboot_dataset, mode='w') train_sz = train_vector_features.shape[0] test_sz = test_vector_features.shape[0] feat_sz = train_vector_features.shape[1] dataset_sz = train_sz + test_sz dataset_sz_new = 13000 vector_features = f.create_dataset('features', (dataset_sz, feat_sz), dtype='float64') ## train + test targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8') ## put the data loaded into these objects vector_features[...] = np.vstack( [train_vector_features, test_vector_features]) targets[...] = np.vstack([train_targets, test_targets]) ## label the dims with names vector_features.dims[0].label = 'batch' vector_features.dims[1].label = 'feature' targets.dims[0].label = 'batch' targets.dims[1].label = 'index' ## split attribute -- way to recover the splits # creating the split using an API split_dict = { 'train': { 'features': (0, 10400), 'targets': (0, 10400) }, 'test': { 'features': (10400, 13000), 'targets': (10400, 13000) } } f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
def main(): print('Loading categories') category2idx = json.load( open(os.path.join(DATA_PATH, 'categories.json'), 'rt')) print('Loading data in memory') dataset = H5PYDataset( DATA_TEMPLATE % (IMG_SIZE, IMG_SIZE), sources=['input_category', 'input_description', 'input_image'], which_sets=('all', ), load_in_memory=True) classes, texts, images = dataset.data_sources classes = np.array([cls[0] for cls in classes]) texts = np.array([txt[0] for txt in texts]) print("There are %i examples" % dataset.num_examples) print(texts.shape, images.shape, classes.shape) print("N. examples: %i, fst: %s" % (len(classes), classes[0])) # prepare filenames print("Creating filenames") filenames = create_captions(classes, texts, category2idx, False, False) print("N. files: %i, fst: %s" % (len(filenames), filenames[0])) train_idx, test_idx, _, _ = get_split(classes, classes.reshape(-1, 1), 0.1, seed=2) print('Loading embedding model') model = Model(os.path.join(MODEL_PATH, 'frozen_model.pb'), os.path.join(MODEL_PATH, 'tokenizer.pickle'), maxlen=LIMIT) print('Saving test data') dump_all(classes, filenames, images, texts, test_idx, model, os.path.join(DATA_PATH, 'test')) print('Saving train data') dump_all(classes, filenames, images, texts, train_idx, model, os.path.join(DATA_PATH, 'train'))
def test_text_to_h5py_dataset(): test_dir = tempfile.mkdtemp() text_path = os.path.join(test_dir, 'text.txt') h5_path = os.path.join(test_dir, 'words.h5') with open(os.path.join(test_dir, 'text.txt'), 'w') as dst: print('abc', file=dst) print('été', file=dst) print('abc Δίας', file=dst) text_to_h5py_dataset(text_path, h5_path) f = H5PYDataset(h5_path, ('train', )) it = f.get_example_stream().get_epoch_iterator() assert next(it)[0] == 'abc' assert next(it)[0] == 'été' assert next(it)[0] == 'abc' assert next(it)[0] == 'Δίας' os.remove(text_path) os.remove(h5_path) os.rmdir(test_dir)
def create_ivector_datastream(path, which_set, batch_size=1, delay=0, min_after_cache=1024, length_sort=False): wsj_dataset = H5PYDataset(path, which_sets=(which_set, )) iterator_scheme = ShuffledScheme(batch_size=batch_size, examples=wsj_dataset.num_examples) base_stream = DataStream(dataset=wsj_dataset, iteration_scheme=iterator_scheme) fs = FilterSources(data_stream=base_stream, sources=['features', 'ivectors', 'targets']) if length_sort: fs = LengthSortTransformer(fs, batch_size, min_after_cache) if delay: fs = DelayTransformer(fs, delay) return Padding(fs)
def load_stream(batch_size=64, source=None, img=None): if source is None: raise ValueError('No source provided') logger.info( 'Loading data from `{}` (using {}x{}) and quantizing to {} colors'.format( source, DIM_X, DIM_Y, N_COLORS)) f = h5py.File(source, 'r') arr = f['features'][:1000] arr = arr.transpose(0, 2, 3, 1) arr = arr.reshape((arr.shape[0] * arr.shape[1], arr.shape[2], arr.shape[3])) img = Image.fromarray(arr).convert( 'P', palette=Image.ADAPTIVE, colors=N_COLORS) train_data = H5PYDataset(source, which_sets=('train',)) num_train = train_data.num_examples train_scheme = ShuffledScheme(examples=num_train, batch_size=batch_size) train_stream = To8Bit(img=img, data_stream=DataStream( train_data, iteration_scheme=train_scheme)) return train_stream, num_train, img
def make_gen(Nchunks=True, classif=False, train=True): ''' Nchunks==True : 10 chunks in the generator Nchunks == False : 1 chunk in the generator Makes the distinction between classification/regression Makes the distinction between test/train ''' file_path_f = file_path_R shift_f = shift_R scale_f = scale_R if classif: file_path_f = file_path_C shift_f = shift_C scale_f = scale_C if Nchunks: batch_size = 13 else: batch_size = 130 t_scheme = SequentialScheme(examples=130, batch_size=batch_size) t_source = 'train' if not train: if Nchunks: batch_size = 2 else: batch_size = 20 t_source = 'test' t_scheme = SequentialScheme(examples=20, batch_size=batch_size) t_set = H5PYDataset(file_path_f, which_sets=[t_source]) data_stream_t = DataStream(dataset=t_set, iteration_scheme=t_scheme) stand_stream_t = ScaleAndShift(data_stream=data_stream_t, scale=scale_f, shift=shift_f, which_sources=t_source) return stand_stream_t, t_set, data_stream_t
def remove_files_from_dataset(hdf5filename1, subset, noms, noms_to_remove, output_): from fuel.datasets.hdf5 import H5PYDataset set1 = H5PYDataset(hdf5filename1, which_sets=(subset,)) print 'before:', set1.num_examples handle1 = set1.open() data1 = set1.get_data(handle1, slice(0, set1.num_examples)) set1.close(handle1) Xarray = [] yarray = [] for ind, nom in enumerate(noms): if nom in noms_to_remove: continue Xarray.append(data1[0][ind]) yarray.append(data1[1][ind]) Xarray = np.asarray(Xarray) yarray = np.asarray(yarray) print 'after:', Xarray.shape create_hdf5_from_arrays(Xarray, yarray, output_)