def setUp(self): self.features = numpy.arange(3600, dtype='uint16').reshape((100, 36)) self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1)) h5file = h5py.File( 'file.hdf5', mode='w', driver='core', backing_store=False) h5file['features'] = self.features h5file['features'].dims[0].label = 'batch' h5file['features'].dims[1].label = 'feature' h5file['targets'] = self.targets h5file['targets'].dims[0].label = 'batch' h5file['targets'].dims[1].label = 'index' split_dict = {'train': {'features': (0, 20, None), 'targets': (0, 20)}, 'test': {'features': (20, 30), 'targets': (20, 30)}, 'unlabeled': {'features': (30, 100, None, '.')}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) self.h5file = h5file vlen_h5file = h5py.File( 'test_vl.hdf5', mode='w', driver='core', backing_store=False) self.vlen_features = [ numpy.arange(12, dtype='uint8').reshape((3, 2, 2)), numpy.arange(48, dtype='uint8').reshape((3, 4, 4)), numpy.arange(60, dtype='uint8').reshape((3, 5, 4)), numpy.arange(18, dtype='uint8').reshape((3, 2, 3))] self.vlen_targets = numpy.arange(4, dtype='uint8').reshape((4, 1)) dtype = h5py.special_dtype(vlen=numpy.dtype('uint8')) features = vlen_h5file.create_dataset('features', (4,), dtype=dtype) features[...] = [d.flatten() for d in self.vlen_features] features.dims[0].label = 'batch' features_shapes = vlen_h5file.create_dataset( 'features_shapes', (4, 3), dtype='uint8') features_shapes[...] = numpy.array( [d.shape for d in self.vlen_features]) features.dims.create_scale(features_shapes, 'shapes') features.dims[0].attach_scale(features_shapes) features_shape_labels = vlen_h5file.create_dataset( 'features_shape_labels', (3,), dtype='S7') features_shape_labels[...] = [ 'channel'.encode('utf8'), 'height'.encode('utf8'), 'width'.encode('utf8')] features.dims.create_scale(features_shape_labels, 'shape_labels') features.dims[0].attach_scale(features_shape_labels) targets = vlen_h5file.create_dataset('targets', (4, 1), dtype='uint8') targets[...] = self.vlen_targets targets.dims[0].label = 'batch' targets.dims[1].label = 'index' split_dict = {'train': {'features': (0, 4), 'targets': (0, 4)}} vlen_h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) self.vlen_h5file = vlen_h5file
def setUp(self): self.features = numpy.arange(3600, dtype='uint8').reshape((100, 36)) self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1)) h5file = h5py.File( 'file.hdf5', mode='w', driver='core', backing_store=False) h5file['features'] = self.features h5file['features'].dims[0].label = 'batch' h5file['features'].dims[1].label = 'feature' h5file['targets'] = self.targets h5file['targets'].dims[0].label = 'batch' h5file['targets'].dims[1].label = 'index' split_dict = {'train': {'features': (0, 20, '.'), 'targets': (0, 20)}, 'test': {'features': (20, 30, ''), 'targets': (20, 30)}, 'unlabeled': {'features': (30, 100)}} h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) self.h5file = h5file vlen_h5file = h5py.File( 'test_vl.hdf5', mode='w', driver='core', backing_store=False) self.vlen_features = [ numpy.arange(12, dtype='uint8').reshape((3, 2, 2)), numpy.arange(48, dtype='uint8').reshape((3, 4, 4)), numpy.arange(60, dtype='uint8').reshape((3, 5, 4)), numpy.arange(18, dtype='uint8').reshape((3, 2, 3))] self.vlen_targets = numpy.arange(4, dtype='uint8').reshape((4, 1)) dtype = h5py.special_dtype(vlen=numpy.dtype('uint8')) features = vlen_h5file.create_dataset('features', (4,), dtype=dtype) features[...] = [d.flatten() for d in self.vlen_features] features.dims[0].label = 'batch' features_shapes = vlen_h5file.create_dataset( 'features_shapes', (4, 3), dtype='uint8') features_shapes[...] = numpy.array( [d.shape for d in self.vlen_features]) features.dims.create_scale(features_shapes, 'shapes') features.dims[0].attach_scale(features_shapes) features_shape_labels = vlen_h5file.create_dataset( 'features_shape_labels', (3,), dtype='S7') features_shape_labels[...] = [ 'channel'.encode('utf8'), 'height'.encode('utf8'), 'width'.encode('utf8')] features.dims.create_scale(features_shape_labels, 'shape_labels') features.dims[0].attach_scale(features_shape_labels) targets = vlen_h5file.create_dataset('targets', (4, 1), dtype='uint8') targets[...] = self.vlen_targets targets.dims[0].label = 'batch' targets.dims[1].label = 'index' split_dict = {'train': {'features': (0, 4), 'targets': (0, 4)}} vlen_h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) self.vlen_h5file = vlen_h5file
def createH5Dataset(hdf5_out, corpus_path, sequence_length):
    with open(corpus_path) as f:
        corpus = f.read().split(",")
    (indices, vocab) = pd.factorize(list(corpus))
    instances_num = len(corpus) // (sequence_length + 1)

    f = h5py.File(hdf5_out, mode='w')

    train_data_x = np.zeros((instances_num, sequence_length), dtype=np.uint8)
    train_data_y = np.zeros((instances_num, sequence_length), dtype=np.uint8)

    for j in range(instances_num):
        for i in range(sequence_length):
            train_data_x[j][i] = indices[i + j * (sequence_length + 1)]
            train_data_y[j][i] = indices[i + j * (sequence_length + 1) + 1]

    char_in = f.create_dataset('inchar', train_data_x.shape, dtype='uint8')
    char_out = f.create_dataset('outchar', train_data_y.shape, dtype='uint8')
    char_in[...] = train_data_x
    char_out[...] = train_data_y

    split_dict = {
        'train': {'inchar': (0, instances_num), 'outchar': (0, instances_num)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.attrs["vocab"] = json.dumps(list(vocab))
    f.flush()
    f.close()
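# Illustrative only: a minimal sketch of how a file written by createH5Dataset
# above might be read back, assuming H5PYDataset, DataStream, SequentialScheme
# and json are importable as in the surrounding snippets. The path
# 'corpus.hdf5' and the batch size are placeholder values, not from the original.
def load_char_dataset(hdf5_path='corpus.hdf5', batch_size=32):
    with h5py.File(hdf5_path, 'r') as f:
        vocab = json.loads(f.attrs['vocab'])  # recover the vocabulary attribute
    train_set = H5PYDataset(hdf5_path, which_sets=('train',))
    scheme = SequentialScheme(examples=train_set.num_examples,
                              batch_size=batch_size)
    stream = DataStream(train_set, iteration_scheme=scheme)
    return vocab, train_set, stream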
def emboot_converter_traintrain(emboot_dataset):
    train_vector_features, train_targets, test_vector_features, test_targets = load_emboot_np()
    f = h5py.File(emboot_dataset, mode='w')

    train_sz = train_vector_features.shape[0]
    test_sz = test_vector_features.shape[0]
    feat_sz = train_vector_features.shape[1]
    dataset_sz = (train_sz + test_sz) * 2  ## NOTE: 67000 * 2 (copy over the train data to the test dataset)

    vector_features = f.create_dataset(
        'features', (dataset_sz, feat_sz), dtype='float64')  ## train + test
    targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8')

    train_vector_features_aug = np.vstack([train_vector_features, test_vector_features])
    train_targets_aug = np.vstack([train_targets, test_targets])

    ## put the data loaded into these objects
    vector_features[...] = np.vstack([train_vector_features_aug, train_vector_features_aug])
    targets[...] = np.vstack([train_targets_aug, train_targets_aug])

    ## label the dims with names
    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    ## split attribute -- way to recover the splits
    ## creating the split using an API
    split_dict = {
        'train': {'features': (0, dataset_sz / 2), 'targets': (0, dataset_sz / 2)},
        'test': {'features': (dataset_sz / 2, dataset_sz), 'targets': (dataset_sz / 2, dataset_sz)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
def test_h5py_dataset_out_of_memory():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        targets = h5file.create_dataset('targets', (10, 1), dtype='float32')
        targets[...] = numpy.arange(10, dtype='float32').reshape((10, 1))
        split_dict = {'train': {'features': (0, 5), 'targets': (0, 5)},
                      'test': {'features': (5, 10), 'targets': (5, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(
            path='tmp.hdf5', which_set='test', load_in_memory=False)
        handle = dataset.open()
        assert_equal(
            dataset.get_data(state=handle, request=slice(3, 5))[1],
            numpy.arange(10).reshape((10, 1))[8:10])
    finally:
        dataset.close(handle)
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
def test_h5py_dataset_split():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 8)},
                      'test': {'features': (8, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        train_set = H5PYDataset(path='tmp.hdf5', which_set='train')
        test_set = H5PYDataset(path='tmp.hdf5', which_set='test')
        train_handle = train_set.open()
        test_handle = test_set.open()
        assert_equal(
            train_set.get_data(state=train_handle, request=slice(0, 8))[0],
            numpy.arange(50).reshape((10, 5))[:8])
        assert_equal(
            test_set.get_data(state=test_handle, request=slice(0, 2))[0],
            numpy.arange(50).reshape((10, 5))[8:])
        train_set.close(train_handle)
        test_set.close(test_handle)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
def add_sets(args):
    with h5py.File(args.h5file, "a") as h5file:
        sources = []
        for dataset in h5file:
            if (dataset.endswith("_indices") or dataset.endswith("_shapes")
                    or dataset.endswith("_shape_labels")):
                continue
            sources.append(dataset)
        uttid2idx = {uttid: idx
                     for (idx, uttid) in enumerate(h5file["uttids"])}
        split_dict = {}
        for subset in args.sets:
            name, uttids_fname = subset.split("=")
            idxs = []
            with open(uttids_fname) as uf:
                for l in uf:
                    uttid = l.strip().split()[0]
                    idxs.append(uttid2idx[uttid])
            indices_name = "{}_indices".format(name)
            if indices_name in h5file:
                del h5file[indices_name]
            #
            # Note: ideally, we would sort the indices and do:
            #     h5file[indices_name] = numpy.array(sorted(idxs))
            # but this would cause incompatibility with Kaldi, which keeps
            # utterances sorted by uttid!
            #
            h5file[indices_name] = numpy.array(idxs)
            indices_ref = h5file[indices_name].ref
            split_dict[name] = {source: (-1, -1, indices_ref)
                                for source in sources}
        h5file.attrs["split"] = H5PYDataset.create_split_array(split_dict)
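# Illustrative only: once add_sets above has stored per-set index arrays and a
# reference-based split, the named subsets can be read like any other
# H5PYDataset split. 'data.h5' and the set name 'dev' are placeholder values.
dev_set = H5PYDataset('data.h5', which_sets=('dev',), load_in_memory=False)
handle = dev_set.open()
batch = dev_set.get_data(handle, slice(0, 8))  # first 8 utterances of the subset
dev_set.close(handle)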
def setUp(self):
    self.features = numpy.arange(3600, dtype='uint8').reshape((100, 36))
    self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1))
    self.h5file = h5py.File(
        'file.hdf5', mode='w', driver='core', backing_store=False)
    self.h5file['features'] = self.features
    self.h5file['features'].dims[0].label = 'batch'
    self.h5file['features'].dims[1].label = 'feature'
    self.h5file['targets'] = self.targets
    self.h5file['targets'].dims[0].label = 'batch'
    self.h5file['targets'].dims[1].label = 'index'
    split_dict = {'train': {'features': (0, 20, '.'), 'targets': (0, 20)},
                  'test': {'features': (20, 30, ''), 'targets': (20, 30)},
                  'unlabeled': {'features': (30, 100)}}
    self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
def test_index_split_in_memory(self):
    features = numpy.arange(50, dtype='uint8').reshape((10, 5))
    h5file = h5py.File('index_split.hdf5', mode='w', driver='core',
                       backing_store=False)
    h5file['features'] = features
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['train_features_subset'] = numpy.arange(0, 10, 2)
    h5file['test_features_subset'] = numpy.arange(1, 10, 2)
    train_ref = h5file['train_features_subset'].ref
    test_ref = h5file['test_features_subset'].ref
    split_dict = {'train': {'features': (-1, -1, train_ref, '.')},
                  'test': {'features': (-1, -1, test_ref, '')}}
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    dataset = H5PYDataset(h5file, which_sets=('train',), load_in_memory=True)
    handle = dataset.open()
    request = slice(0, 5)
    assert_equal(dataset.get_data(handle, request)[0], features[0:10:2])
    assert_equal(dataset.num_examples, 5)
    dataset.close(handle)
def test_h5py_dataset_split_parsing():
    try:
        h5file = h5py.File('tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (100, 36), dtype='uint8')
        features[...] = numpy.zeros(shape=(100, 36)).astype('uint8')
        targets = h5file.create_dataset('targets', (30, 1), dtype='uint8')
        targets[...] = numpy.zeros(shape=(30, 1)).astype('uint8')
        split_dict = {'train': {'features': (0, 20), 'targets': (0, 20)},
                      'test': {'features': (20, 30), 'targets': (20, 30)},
                      'unlabeled': {'features': (30, 100)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        train_set = H5PYDataset(path='tmp.hdf5', which_set='train')
        assert train_set.provides_sources == ('features', 'targets')
        test_set = H5PYDataset(path='tmp.hdf5', which_set='test')
        assert test_set.provides_sources == ('features', 'targets')
        unlabeled_set = H5PYDataset(path='tmp.hdf5', which_set='unlabeled')
        assert unlabeled_set.provides_sources == ('features',)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
def data():
    try:
        hf["target"].shape
    except:
        hf = h5py.File('faces.hdf5', 'r+')
    num_samples = hf["input"].shape[0]
    print "number of samples in dataset : %i" % num_samples

    split_dict = {
        'train': {'input': (2000, num_samples), 'target': (2000, num_samples)},
        'test': {'input': (0, 1000), 'target': (0, 1000)},
        'val': {'input': (1000, 2000), 'target': (1000, 2000)}
    }
    hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    train_set = H5PYDataset('faces.hdf5', which_sets=('train',))
    test_set = H5PYDataset('faces.hdf5', which_sets=('test',))
    val_set = H5PYDataset('faces.hdf5', which_sets=('val',))

    batch_size = 128
    # TODO: use ShuffledScheme instead? Seems slower, might have screwed up
    # the chunksize in the HDF5 files?
    tr_scheme = SequentialScheme(examples=train_set.num_examples,
                                 batch_size=batch_size)
    tr_stream = DataStream(train_set, iteration_scheme=tr_scheme)
    val_scheme = SequentialScheme(examples=val_set.num_examples,
                                  batch_size=batch_size)
    val_stream = DataStream(val_set, iteration_scheme=val_scheme)
    test_scheme = SequentialScheme(examples=test_set.num_examples,
                                   batch_size=batch_size)
    test_stream = DataStream(test_set, iteration_scheme=test_scheme)
    hf.close()
    return (num_samples, train_set, test_set, val_set, tr_scheme, tr_stream,
            val_scheme, val_stream, test_scheme, test_stream)
def CreateHDF5(): sizes = numpy.random.randint(3,9, size=(100,)) train_image_features = [ numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[:90]] test_image_features = [ numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[90:]] train_vector_features = numpy.random.normal(size=(90,10)).astype('float32') test_vector_features = numpy.random.normal(size=(10,10)).astype('float32') train_targets = numpy.random.randint(10, size=(90,1)).astype('uint8') test_targets = numpy.random.randint(10, size=(10,1)).astype('uint8') f = h5py.File('dataset.hdf5', mode='w') vector_features = f.create_dataset( 'vector_features', (100, 10), dtype='float32') targets = f.create_dataset( 'targets', (100, 1), dtype='uint8') vector_features[...] = numpy.vstack( [train_vector_features, test_vector_features]) targets[...] = numpy.vstack([train_targets, test_targets]) vector_features.dims[0].label = 'batch' vector_features.dims[1].label = 'feature' targets.dims[0].label = 'batch' targets.dims[1].label = 'index' all_image_features = train_image_features + test_image_features dtype = h5py.special_dtype(vlen=numpy.dtype('uint8')) image_features = f.create_dataset('image_features', (100,), dtype=dtype) image_features[...] = [image.flatten() for image in all_image_features] image_features.dims[0].label='batch' image_features_shapes = f.create_dataset( 'image_features_shapes', (100, 3), dtype='int32') image_features_shapes[...] = numpy.array( [image.shape for image in all_image_features]) image_features.dims.create_scale(image_features_shapes, 'shapes') image_features.dims[0].attach_scale(image_features_shapes) image_features_shape_labels = f.create_dataset( 'image_features_shape_labels', (3,), dtype='S7') image_features_shape_labels[...] = [ 'channel'.encode('utf8'), 'height'.encode('utf8'), 'width'.encode('utf8')] image_features.dims.create_scale( image_features_shape_labels, 'shape_labels') image_features.dims[0].attach_scale(image_features_shape_labels) split_dict = { 'train': {'vector_features': (0, 90), 'image_features': (0, 90), 'targets': (0, 90)}, 'test': {'vector_features': (90, 100), 'image_features': (90, 100), 'targets': (90, 100)}} f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
def save_h5py(tn, start, stop):
    cf = train_features[start:stop]
    ct = train_targets[start:stop]
    np.save(pjoin(numpy_path, prefix + tn + '_features.npy'), cf)
    np.save(pjoin(numpy_path, prefix + tn + '_targets.npy'), ct)
    h5 = h5py.File(pjoin(fuel_path, prefix + tn + '.hdf5'), mode='w')
    h5_features = h5.create_dataset(
        'features', (cf.shape[0], cf.shape[1] * mult), dtype='float32')
    lenf = stop - start
    with ProgressBar(maxval=lenf) as progbar:
        for i in range(lenf):
            arr = []
            for j in range(-concat[0], concat[0] + 1, concat[1]):
                arr.extend(cf[(i - j) % lenf])
            h5_features[i] = np.asarray(arr)
            progbar.update(i)
    h5_targets = h5.create_dataset('targets', ct.shape, dtype='uint16')
    h5_targets[...] = ct
    h5_features.dims[0].label = 'batch'
    h5_features.dims[1].label = 'feature'
    h5_targets.dims[0].label = 'batch'
    h5_targets.dims[1].label = 'index'
    split_dict = {
        tn: {'features': (0, stop - start), 'targets': (0, stop - start)},
        # 'validate': {'features': (tr_n, len(fet)), 'targets': (tr_n, len(fet))},
    }
    h5.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    h5.flush()
    h5.close()
def create_hdf5(np_enc_data, np_enc_y, splitpoint, N):
    hdf5name = 'mushrooms.hdf5'
    f = h5py.File(hdf5name, mode='w')
    fx = f.create_dataset('x', np_enc_data.shape, dtype='float32')
    fy = f.create_dataset('y', np_enc_y.shape, dtype='int64')
    fx[...] = np_enc_data
    fy[...] = np_enc_y
    split_dict = {
        'train': {'x': (0, splitpoint), 'y': (0, splitpoint)},
        'test': {'x': (splitpoint, N), 'y': (splitpoint, N)}
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
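# Illustrative only: a minimal sketch of reading back 'mushrooms.hdf5' as
# written by create_hdf5 above. The batch size and iteration scheme are
# placeholder choices, not part of the original code.
train_set = H5PYDataset('mushrooms.hdf5', which_sets=('train',))
test_set = H5PYDataset('mushrooms.hdf5', which_sets=('test',))
scheme = SequentialScheme(examples=train_set.num_examples, batch_size=64)
stream = DataStream(train_set, iteration_scheme=scheme)
for batch in stream.get_epoch_iterator():
    x_batch, y_batch = batch  # sources come back in the order of train_set.sources
    break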
def build_raw_hdf5_dataset(wav_name, hdf5_name, window_size):
    [rate, signal] = wav.read(wav_name)
    num_steps = signal.shape[0]
    num_seqs = num_steps - window_size
    output_path = '{}.hdf5'.format(hdf5_name)
    output_path = os.path.join(output_path)
    signal = signal.reshape(num_steps, 1)
    with h5py.File(output_path, mode='w') as h5file:
        input_feature = h5file.create_dataset(
            name='input_feature', shape=(num_seqs, window_size, 1), dtype='int16')
        target_feature = h5file.create_dataset(
            name='target_feature', shape=(num_seqs, window_size, 1), dtype='int16')
        print ' num of sequences : {}'.format(num_seqs)
        for s in xrange(num_seqs):
            input_feature[s] = signal[s:s + window_size]
            target_feature[s] = signal[(s + 1):(s + 1) + window_size]
        # label each dataset axis
        input_feature.dims[0].label = 'batch'
        input_feature.dims[1].label = 'time'
        input_feature.dims[2].label = 'feature'
        target_feature.dims[0].label = 'batch'
        target_feature.dims[1].label = 'time'
        target_feature.dims[2].label = 'feature'
        split_dict = {'train': {'input_feature': (0, num_seqs),
                                'target_feature': (0, num_seqs)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
    return num_seqs
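# Illustrative only: how build_raw_hdf5_dataset above might be invoked and the
# result streamed. 'speech.wav', 'speech_raw', the window size and batch size
# are placeholder values.
num_seqs = build_raw_hdf5_dataset('speech.wav', 'speech_raw', window_size=256)
train_set = H5PYDataset('speech_raw.hdf5', which_sets=('train',),
                        sources=('input_feature', 'target_feature'))
stream = DataStream(
    train_set,
    iteration_scheme=SequentialScheme(train_set.num_examples, batch_size=16))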
def add_word_ids_to_snli(h5_file, vocab): with h5py.File(h5_file, 'a') as dst: N = len(dst['sentence1']) assert len(dst['sentence2']) == N dst.create_dataset('vocab_words', (vocab.size(), ), h5py.special_dtype(vlen=unicode)) dst.create_dataset('vocab_freqs', (vocab.size(), ), 'int64') dst['vocab_words'][:] = vocab.words dst['vocab_freqs'][:] = vocab.frequencies dtype = h5py.special_dtype(vlen=np.dtype('int32')) sentence1_ds = dst.create_dataset('sentence1_ids', (N, ), dtype=dtype) sentence2_ds = dst.create_dataset('sentence2_ids', (N, ), dtype=dtype) ### h5py nonsense ### sentence1_ds_shapes = dst.create_dataset('sentence1_ids_shapes', (N, 1), dtype=("int")) sentence2_ds_shapes = dst.create_dataset('sentence2_ids_shapes', (N, 1), dtype=("int")) ds_shape_labels = dst.create_dataset('ds_ids_shape_labels', (1, ), dtype=("S20")) ### h5py nonsense ### sentence1_ds[:] = np.array([[vocab.word_to_id(w) for w in s] for s in dst['sentence1'][:]]) sentence2_ds[:] = np.array([[vocab.word_to_id(w) for w in s] for s in dst['sentence2'][:]]) ### h5py nonsense ### sentence1_ds_shapes[:] = np.array( [np.array(x).shape for x in dst['sentence1'][:]]) sentence2_ds_shapes[:] = np.array( [np.array(x).shape for x in dst['sentence2'][:]]) ds_shape_labels[:] = np.array(['sentence_len']) sentence1_ds.dims.create_scale(sentence1_ds_shapes, 'shapes') sentence1_ds.dims[0].attach_scale(sentence1_ds_shapes) sentence1_ds.dims.create_scale(ds_shape_labels, 'shape_labels') sentence1_ds.dims[0].attach_scale(ds_shape_labels) sentence2_ds.dims.create_scale(sentence2_ds_shapes, 'shapes') sentence2_ds.dims[0].attach_scale(sentence2_ds_shapes) sentence2_ds.dims.create_scale(ds_shape_labels, 'shape_labels') sentence2_ds.dims[0].attach_scale(ds_shape_labels) ### h5py nonsense ### dst.attrs['split'] = H5PYDataset.create_split_array({ 'all': { 'sentence1': (0, N), 'sentence2': (0, N), 'sentence1_ids': (0, N), 'sentence2_ids': (0, N), 'label': (0, N), 'text': (0, len(dst['text'])) } })
def save_hd5py(dataset, destfile, indices_dict):
    f = h5py.File(destfile, mode='w')
    images = f.create_dataset('images', dataset.shape, dtype='uint8')
    images[...] = dataset
    split_dict = dict((k, {'images': v}) for k, v in indices_dict.iteritems())
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
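# Illustrative only: a minimal call sketch for save_hd5py above. The array
# shape, file name and split boundaries are made-up placeholder values.
images = numpy.random.randint(256, size=(1000, 3, 32, 32)).astype('uint8')
save_hd5py(images, 'images.hdf5', {'train': (0, 900), 'test': (900, 1000)})
train_set = H5PYDataset('images.hdf5', which_sets=('train',))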
def test_value_error_on_unequal_sources(self):
    def get_subsets():
        return H5PYDataset(self.h5file, which_set='train').subsets
    split_dict = {'train': {'features': (0, 20, '.'), 'targets': (0, 15)},
                  'test': {'features': (20, 30, ''), 'targets': (20, 30)},
                  'unlabeled': {'features': (30, 100)}}
    self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    assert_raises(ValueError, get_subsets)
def test_value_error_on_unequal_sources(self):
    def get_subsets():
        return H5PYDataset(self.h5file, which_sets=('train',)).subsets
    split_dict = {'train': {'features': (0, 20), 'targets': (0, 15)},
                  'test': {'features': (20, 30), 'targets': (20, 30)},
                  'unlabeled': {'features': (30, 100, None, '.')}}
    self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    assert_raises(ValueError, get_subsets)
def load_data(train, test, s, overlap=1): train_filelist = open(train, 'r').read().split('\n') print len(train_filelist) shuffle(train_filelist) test_filelist = open(test, 'r').read().split('\n') print len(test_filelist) shuffle(test_filelist) dataset_size = len(train_filelist)+len(test_filelist) f = h5py.File('/ssd2/hmdb/hmdb-tdd.hdf5', mode='w') dtype = h5py.special_dtype(vlen=np.dtype('float32')) features = f.create_dataset('features', (dataset_size,), dtype=dtype, compression='gzip', compression_opts=7) features_shapes = f.create_dataset('features_shapes', (dataset_size,2), dtype=int) features_shape_labels = f.create_dataset('features_shape_labels', (2,), dtype='S7') labels = f.create_dataset('labels', (dataset_size,1), dtype=int, compression='gzip') for i,fn in enumerate(train_filelist+test_filelist): if len(fn) == 0: continue fn, label = fn.split(' ') fn = fn.replace('/fast-data/hmdb/image-file/', '/ssd2/hmdb/feats/') print i, fn, label labels[i] = int(label) output = np.genfromtxt(fn, skip_header=1) features[i] = output.flatten() print output.shape features_shapes[i] = output.shape features.dims.create_scale(features_shapes, 'shapes') features.dims[0].attach_scale(features_shapes) features_shape_labels[...] = ['frames'.encode('utf8'), 'channels'.encode('utf8')] features.dims.create_scale(features_shape_labels, 'shape_labels') features.dims[0].attach_scale(features_shape_labels) features.dims[0].label = 'batch' labels.dims[0].label = 'batch' trn = len(train_filelist) tst = len(test_filelist) split_dict = {'train': {'features': (0, trn), 'labels': (0, trn)}, 'test': {'features': (trn, trn+tst), 'labels': (trn, trn+tst)}} f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
def build_hdf5_dataset(input_filename, output_filename, batch_size=64): """ Builds a hdf5 dataset given the input one. The output one will have training, valid, and test as sources. """ input_file = h5py.File(input_filename, "r") output_file = h5py.File(output_filename, "w") data = input_file["features"][:] data_length = data.shape[1] # #print "Sample from data: {}".format(data[70]) #if not data_length % batch_size == 0: # split 0.9 0.1 0.1 train_valid_length = 160000000 batch_index_train = int(0.9 * train_valid_length / float(batch_size)) batch_index_valid = int(train_valid_length / float(batch_size)) batch_index_test = int(data_length / float(batch_size)) print "batch indices in order : {}".format( (batch_index_train, batch_index_valid, batch_index_test)) assert (train_valid_length == batch_index_valid * batch_size) data = data.reshape(data_length)[:batch_index_test * batch_size] data = data.reshape(batch_index_test, batch_size, 1) print data.shape print("values lost: {}").format(data_length - data.size) test_length = data_length - train_valid_length features = output_file.create_dataset(name='features', shape=data.shape, dtype='int16', data=data) features.dims[0].label = 'batch' features.dims[1].label = 'time' features.dims[2].label = 'feature' split_dict = { 'train': { 'features': (0, batch_index_train) }, 'valid': { 'features': (batch_index_train + 1, batch_index_valid) }, 'test': { 'features': (batch_index_valid + 1, batch_index_test) } } output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict) input_file.close() output_file.flush() output_file.close()
def biblefile_to_hdf5(open_file): # TODO REMOVE LINES WITH THE BOOK OF BLABLA """Everything in one function because we have variable-length sequences, so no intermediate arrays...""" char_to_ind = {"<S>": 0, "</S>": 1} current_char_ind = 2 # starts at 2 because 0, 1 are reserved for "end/start-of-sequence" character all_verses = [] # TODO I still don't know what the readout initial_output really does; maybe we need to put <S> into every sequence current_verse = [] for line in open_file: # first we need to check if a new verse begins somewhere in the line (not just beginning...) verse_marker_pos = find_verse_marker(line) if len(line.split()) > 0 and verse_marker_pos > -1: # if so, save the verse up to the verse marker and start a new one from the rest of the line current_verse += list(line[:verse_marker_pos]) # also replace all characters by integers, creating more mappings if necessary for (ind, char) in enumerate(current_verse): if char not in char_to_ind: char_to_ind[char] = current_char_ind current_char_ind += 1 current_verse[ind] = char_to_ind[char] current_verse.append(1) # for sequence generator we need to explicitly append this end-of-sequence char all_verses.append(numpy.array(current_verse, dtype="int32")) current_verse = list(line[verse_marker_pos:]) # otherwise, just put everything into the current verse else: current_verse += list(line) all_verses = numpy.array(all_verses) # I think this conversion is necessary for the indexing below? # at this point we have all our verses =) now we build our .hdf5 dataset # make a little validation set val_indices = numpy.random.choice(a=len(all_verses), replace=False, size=1500) test_set = list(all_verses[val_indices]) train_set = list(numpy.delete(all_verses, val_indices, 0)) # if you don't get what's happening here, check the Fuel tutorial on variable-length data (only the 1D part) f = h5py.File(name="bible.hdf5", mode="w") dtype_varlen_int = h5py.special_dtype(vlen=numpy.dtype("int32")) character_seqs = f.create_dataset("character_seqs", (len(all_verses),), dtype=dtype_varlen_int) character_seqs[...] = train_set + test_set split_dict = {"train": {"character_seqs": (0, len(train_set))}, "valid": {"character_seqs": (len(train_set), len(all_verses))}} f.attrs["split"] = H5PYDataset.create_split_array(split_dict) f.flush() f.close() # we also save the current_char_ind (equal to dimensionality of our one-hot character vectors) to a file numpy.save("onehot_size.npy", current_char_ind) # also the word-to-index dict cPickle.dump(char_to_ind, open("char_to_ind.pkl", mode="w")) # make a quick dirty reverse dict (actually a list) to map from indices to characters, so we can get readable output # later ind_to_char = [""]*len(char_to_ind) ind_to_char[0] = "<S>" ind_to_char[1] = "</S>" for char in char_to_ind: ind_to_char[char_to_ind[char]] = char cPickle.dump(ind_to_char, open("ind_to_char.pkl", mode="w"))
def add_phonemes(): data_path = os.environ['FUEL_DATA_PATH'] data_path = os.path.join(data_path,'blizzard/') save_name = "sp_blizzard_80h_phon.hdf5" phon_file = "tbptt_blizzard_80h.hdf5" data_file = "sp_blizzard_80h.hdf5" save_path = os.path.join(data_path, save_name) phon_path = os.path.join(data_path, phon_file) data_path = os.path.join(data_path, data_file) resulth5 = h5py.File(save_path, mode='w') phonh5 = h5py.File(phon_path, mode = 'r') datah5 = h5py.File(data_path, mode = 'r') sp_h5 = resulth5.create_dataset( 'sp', (TOTAL_ROWS, 512, 257), dtype='float32') f0_h5 = resulth5.create_dataset( 'f0', (TOTAL_ROWS, 512), dtype='float32') phon_h5 = resulth5.create_dataset( 'phonemes', (TOTAL_ROWS, 512), dtype = 'int16') f0_h5[:] = datah5['f0'][:] phon_h5[:] = phonh5['phonemes'][:,::64] n_times = 100 idx = chunkIt(range(TOTAL_ROWS), n_times) for num_indx, indx in enumerate(idx): print num_indx, 100 sp_h5[indx] = datah5['sp'][indx] cont = TOTAL_ROWS end_train = int(.9*cont) end_valid = int(.95*cont) end_test = cont split_dict = { 'train': {'sp': (0, end_train), 'f0': (0, end_train), 'phonemes': (0, end_train)}, 'valid': {'sp': (end_train, end_valid), 'f0': (end_train, end_valid), 'phonemes': (end_train, end_valid)}, 'test': {'sp': (end_valid, end_test), 'f0': (end_valid, end_test), 'phonemes': (end_valid, end_test)} } resulth5.attrs['split'] = H5PYDataset.create_split_array(split_dict) resulth5.flush() resulth5.close() phonh5.close() datah5.close()
def save_hd5py(dataset_dict, destfile, indices_dict):
    f = h5py.File(destfile, mode='w')
    for name, dataset in dataset_dict.iteritems():
        dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype))
        dat[...] = dataset
    split_dict = dict((k, dict((name, v) for name in dataset_dict.iterkeys()))
                      for k, v in indices_dict.iteritems())
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
def emboot_converter_traintrain(emboot_dataset, numpy_feature_train_file, numpy_target_train_file): train_vector_features, train_targets = load_emboot_np( numpy_feature_train_file, numpy_target_train_file) f = h5py.File(emboot_dataset, mode='w') ## Add dummy feature for channel (to be used in convolutional operator) np.expand_dims(train_vector_features, axis=1) train_sz = train_vector_features.shape[0] channel = train_vector_features.shape[1] ctx_size = train_vector_features.shape[2] embed_sz = train_vector_features.shape[3] dataset_sz = ( train_sz - 16 ) * 2 ## NOTE: 13900 * 2 (copy over the train data to the test dataset) vector_features = f.create_dataset( 'features', (dataset_sz, channel, ctx_size, embed_sz), dtype='float64') ## train + test targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8') ## put the data loaded into these objects train_vector_features_rounded = train_vector_features[:13900] train_targets_rounded = train_targets[:13900] vector_features[...] = np.vstack( [train_vector_features_rounded, train_vector_features_rounded]) targets[...] = np.vstack([train_targets_rounded, train_targets_rounded]) ## label the dims with names vector_features.dims[0].label = 'batch' vector_features.dims[1].label = 'channel' vector_features.dims[2].label = 'word' vector_features.dims[3].label = 'embed' targets.dims[0].label = 'batch' targets.dims[1].label = 'index' ## split attribute -- way to recover the splits # creating the split using an API split_dict = { 'train': { 'features': (0, dataset_sz / 2), 'targets': (0, dataset_sz / 2) }, 'test': { 'features': (dataset_sz / 2, dataset_sz), 'targets': (dataset_sz / 2, dataset_sz) } } f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close()
def build_raw_interval_hdf5_dataset(youtube_id, hdf5_name, interval_size, window_size): data_stream = YouTubeAudio(youtube_id).get_example_stream() data_stream = Window(offset=interval_size, source_window=interval_size*window_size, target_window=interval_size*window_size, overlapping=True, data_stream=data_stream) data_iterator = data_stream.get_epoch_iterator() num_sequences = 0 for data in data_iterator: num_sequences = num_sequences + 1 output_path = '{}.hdf5'.format(hdf5_name) output_path = os.path.join(output_path) print 'total num sequences : ', num_sequences with h5py.File(output_path, mode='w') as h5file: input_feature = h5file.create_dataset(name='input_feature' , shape=(num_sequences, window_size, interval_size), dtype='int16') target_feature = h5file.create_dataset(name='target_feature', shape=(num_sequences, window_size, interval_size), dtype='int16') data_iterator = data_stream.get_epoch_iterator() # for each batch for s_idx, sequence_data in enumerate(data_iterator): # get data source_data = sequence_data[0] target_data = sequence_data[1] # save data input_feature[s_idx] = source_data.reshape(window_size, interval_size) target_feature[s_idx] = target_data.reshape(window_size, interval_size) # label each dataset axis input_feature.dims[0].label = 'batch' input_feature.dims[1].label = 'time' input_feature.dims[2].label = 'feature' target_feature.dims[0].label = 'batch' target_feature.dims[1].label = 'time' target_feature.dims[2].label = 'feature' num_trains = int(num_sequences*0.8) split_dict = {'train': {'input_feature' : ( 0, num_trains), 'target_feature': ( 0, num_trains)}, 'valid': {'input_feature' : ( num_trains, num_sequences), 'target_feature': ( num_trains, num_sequences)}, } h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close() return num_sequences
def build_hdf5_dataset_single_dim(input_filename, output_filename): """ Builds a hdf5 dataset given the input one. The output one will have training, valid, and test as sources. This function outputs a single dimension for the datasets. Adapted to monk_music """ input_file = h5py.File(input_filename, "r") output_file = h5py.File(output_filename, "w") data = input_file["features"][:] data_length = data.shape[1] # #if not data_length % batch_size == 0: # split 0.9 0.1 0.1 train_valid_length = 160000000 index_train = int(0.9 * train_valid_length) index_valid = int(train_valid_length) index_test = int(data_length) print "batch indices in order : {}".format( (index_train, index_valid, index_test)) data = data.reshape((data_length)) print "Train example: {}".format(data[index_train - 100:index_train]) print "Valid example: {}".format(data[index_valid - 100:index_valid]) print "Test example: {}".format(data[index_test - 100:index_test]) features = output_file.create_dataset(name='features', shape=data.shape, dtype='int16', data=data) #features.dims[0].label = 'batch' #features.dims[0].label = 'time' features.dims[0].label = 'feature' split_dict = { 'train': { 'features': (0, index_train) }, 'valid': { 'features': (index_train + 1, index_valid) }, 'test': { 'features': (index_valid + 1, index_test) } } output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict) input_file.close() output_file.flush() output_file.close()
def to_fuel_h5(inputs, outputs, slices, names, file_name, file_path=''): """Transforms list of numpy arrays to a structured hdf5 file Args: inputs(list): a list of inputs(numpy.arrays) outputs(list): a list of outputs(numpy.arrays) slices(list): a list of int representing the end of a slice and the begining of another slice. The last slice is automatically added if missing (maximum length of the inputs). names(list): a list of names for the datasets file_name(str): the name of the file to save. file_path(str): the path where the file is located Returns: The file full path """ import h5py import os from fuel.datasets.hdf5 import H5PYDataset suffix = 'hdf5' inp = 'input_' out = 'output_' full_path = os.path.join(file_path, file_name.lower() + '.' + suffix) f = h5py.File(full_path, mode='w') dict_data_set = dict() split_dict = dict() for name in names: split_dict[name] = dict() slices.append(max_v_len(inputs)) def insert_info_h5(iterable, suf): names_out = [] for k, v in norm_iterator(iterable): dict_data_set[suf + k] = f.create_dataset(suf + k, v.shape, v.dtype) dict_data_set[suf + k][...] = v for sl, name in zip(window(slices, 2), names): split_dict[name][suf + k] = sl names_out.append(suf + str(k)) return names_out inputs_names = insert_info_h5(inputs, inp) outputs_names = insert_info_h5(outputs, out) f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close() return full_path, inputs_names, outputs_names
def test_pickling(self):
    try:
        features = numpy.arange(360, dtype='uint8').reshape((10, 36))
        h5file = h5py.File('file.hdf5', mode='w')
        h5file['features'] = features
        split_dict = {'train': {'features': (0, 10, '.')}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        dataset = cPickle.loads(
            cPickle.dumps(H5PYDataset(h5file, which_set='train')))
        assert dataset.data_sources is None
    finally:
        os.remove('file.hdf5')
def convert2hdf5(self):
    imageLst = []
    index = self.lastIndex
    while True:
        if (index % 10000 == 0):
            print("Current Index: ", index)
        image = self.images[index]
        try:
            imgObj = Image.open(image).convert('RGB')
        except:
            # skip unreadable images; without advancing the index this branch
            # would retry the same file forever
            index += 1
            if index >= self.numExamples:
                break
            continue
        imgObj = imgObj.resize((self.image_width, self.image_width))
        img = np.asarray(imgObj)
        if img.shape == (self.image_width, self.image_width, 3):
            imageLst.append([img])
        index += 1
        if index >= self.numExamples:
            break
        imgObj.close()
    anime_npy = np.vstack(imageLst).astype('uint8')
    anime_npy = anime_npy.transpose(0, 3, 1, 2)
    # use the number of images actually collected, which can be smaller than
    # 'index' when unreadable or wrongly-shaped images were skipped
    num_images = anime_npy.shape[0]
    f = h5py.File('anime_faces.hdf5', mode='w')
    anime_faces = f.create_dataset('features', anime_npy.shape, dtype='uint8')
    split_dict = {'train': {'features': (0, num_images)},
                  'test': {'features': (0, num_images)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    anime_faces[...] = anime_npy
    f.flush()
    f.close()
def fuel_converter(fuel_dataset, embeddings_train, labels_train,
                   embeddings_test, labels_test):
    f = h5py.File(fuel_dataset, mode='w')
    labels_train = np.expand_dims(labels_train, axis=1)
    labels_test = np.expand_dims(labels_test, axis=1)

    train_sz = embeddings_train.shape[0] - embeddings_train.shape[0] % 100
    test_sz = embeddings_test.shape[0] - embeddings_test.shape[0] % 100
    feat_sz = embeddings_train.shape[1]
    dataset_sz = train_sz + test_sz

    print("Actual Train size : ", embeddings_train.shape[0])
    print("Train size in Fuel : ", train_sz)
    print("Actual Test size : ", embeddings_test.shape[0])
    print("Test size in Fuel : ", test_sz)

    vector_features = f.create_dataset(
        'features', (dataset_sz, feat_sz), dtype='float64')
    targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8')

    ## put the data loaded into these objects
    vector_features[...] = np.vstack(
        [embeddings_train[0:train_sz], embeddings_test[0:test_sz]])
    targets[...] = np.vstack(
        [labels_train[0:train_sz], labels_test[0:test_sz]])

    ## label the dims with names
    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    ## split attribute -- way to recover the splits
    ## creating the split using an API
    split_dict = {
        'train': {'features': (0, train_sz), 'targets': (0, train_sz)},
        'test': {'features': (train_sz, dataset_sz),
                 'targets': (train_sz, dataset_sz)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
def test_h5py_dataset_pickles():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5', which_set='train')
        pickle.loads(pickle.dumps(dataset))
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
def output_hdf5(path_list, output_root_dir):
    num_data = len(path_list)
    shapes = []
    dirs = output_root_dir.split('\\')
    file_name = dirs[-1] + '.hdf5'
    output_root_dir = os.path.join(output_root_dir, file_name)
    f = h5py.File(output_root_dir, mode='w')
    dtype = h5py.special_dtype(vlen=np.dtype('uint8'))
    image_features = f.create_dataset('image_features', (num_data,), dtype=dtype)
    image_features.dims[0].label = 'batch'
    try:
        for i in tqdm.tqdm(range(num_data)):
            image = io.imread(path_list[i])
            shapes.append(image.shape)
            image_features[i] = image.flatten()
        shapes = np.array(shapes).astype(np.int32)
        image_features_shapes = f.create_dataset(
            'image_features_shapes', (num_data, 3), dtype=np.int32)
        image_features_shapes[...] = shapes
        image_features.dims.create_scale(image_features_shapes, 'shapes')
        image_features.dims[0].attach_scale(image_features_shapes)
        image_features_shape_labels = f.create_dataset(
            'image_features_shape_labels', (3,), dtype='S7')
        image_features_shape_labels[...] = [
            'height'.encode('utf8'), 'width'.encode('utf8'),
            'channel'.encode('utf8')]
        image_features.dims.create_scale(
            image_features_shape_labels, 'shape_labels')
        image_features.dims[0].attach_scale(image_features_shape_labels)
        # specify the splits
        split_train = (0, num_data)
        split_dict = dict(train=dict(image_features=split_train))
        f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    except KeyboardInterrupt:
        print "Interrupted by user; stopping."
    f.flush()
    f.close()
def save_hd5py(dataset_dict, destfile, indices_dict_or_numfolds):
    indices_dict = indices_dict_or_numfolds
    if isinstance(indices_dict, int):
        folds = indices_dict
        n = max(len(it) for it in dataset_dict.values())
        fold_n = n // folds
        indices_dict = dict(("fold_{}".format(i), (i * fold_n, (i + 1) * fold_n))
                            for i in range(folds))
        print indices_dict
    f = h5py.File(destfile, mode="w")
    for name, dataset in dataset_dict.iteritems():
        dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype))
        dat[...] = dataset
    split_dict = dict((k, dict((name, v) for name in dataset_dict.iterkeys()))
                      for k, v in indices_dict.iteritems())
    f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
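# Illustrative only: a sketch of writing a dataset with the k-fold variant of
# save_hd5py above and then loading folds together. The array shape, file name
# and number of folds are placeholder values.
data = {'features': numpy.random.rand(1000, 20).astype('float32')}
save_hd5py(data, 'folds.hdf5', 5)  # creates splits fold_0 .. fold_4
train_set = H5PYDataset('folds.hdf5', which_sets=('fold_0', 'fold_1'))
valid_set = H5PYDataset('folds.hdf5', which_sets=('fold_4',))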
def test_h5py_dataset_axis_labels():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features.dims[0].label = 'batch'
        features.dims[1].label = 'feature'
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5', which_set='train')
        assert dataset.axis_labels == {'features': ('batch', 'feature')}
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
def test_pickling(self):
    try:
        features = numpy.arange(360, dtype='uint16').reshape((10, 36))
        h5file = h5py.File('file.hdf5', mode='w')
        h5file['features'] = features
        split_dict = {'train': {'features': (0, 10, None, '.')}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        dataset = cPickle.loads(
            cPickle.dumps(H5PYDataset(h5file, which_sets=('train',))))
        # Make sure _out_of_memory_{open,close} accesses
        # external_file_handle rather than _external_file_handle
        dataset._out_of_memory_open()
        dataset._out_of_memory_close()
        assert dataset.data_sources is None
    finally:
        os.remove('file.hdf5')
def infer(path, ae_encode): ''' :param path: path of infer data :param ae_encode: compiled theano function :return: image saved path in string ''' hf = h5py.File(path, 'r+') split_dict = { 'test': { 'input': (0, 1), 'target': (0, 1) }, } hf.attrs['split'] = H5PYDataset.create_split_array(split_dict) test_set = H5PYDataset(path, which_sets=('test', )) batch_size = 1 test_scheme = SequentialScheme(examples=test_set.num_examples, batch_size=batch_size) test_stream = DataStream(test_set, iteration_scheme=test_scheme) for te_train, te_target in test_stream.get_epoch_iterator(): break te_out, te_ta = ae_encode(input_transform(te_train), target_transform(te_target)) te_reshape = inverse(te_out) te_target_reshape = inverse(te_ta) new_size = (128 * 2, 160) new_im = Image.new('RGB', new_size) r = np.random.choice(1, 1, replace=False).reshape(1, 1) for i in range(1): for j in range(1): index = r[i][j] target_im = Image.fromarray(te_target_reshape[index]) train_im = Image.fromarray(te_train[index].astype(np.uint8)) im = Image.fromarray(te_reshape[index]) new_im.paste(train_im, (128 * (i * 2), 160 * j)) new_im.paste(im, (128 * (i * 2 + 1), 160 * j)) img_loc = "gen_images/%i.png" % int(time()) new_im.save(img_loc) return img_loc
def text_to_h5py_dataset(text_path, dst_path):
    # The simplest is to load everything to memory first.
    # If memory becomes an issue, this code can be optimized.
    words = []
    with open(text_path, 'r') as src:
        for line in src:
            words.extend(line.strip().split())
    with h5py.File(dst_path, 'w') as dst:
        dtype = h5py.special_dtype(vlen=bytes)
        table = dst.create_dataset('words', (len(words),), dtype=dtype)
        table[:] = words
        dst.attrs['split'] = H5PYDataset.create_split_array(
            {'train': {'words': (0, len(words))}})
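# Illustrative only: a minimal sketch of streaming the 'words' source written
# by text_to_h5py_dataset above. The file name and batch size are placeholders.
words_set = H5PYDataset('corpus_words.hdf5', which_sets=('train',))
stream = DataStream(
    words_set,
    iteration_scheme=SequentialScheme(words_set.num_examples, batch_size=1000))
for (word_batch,) in stream.get_epoch_iterator():
    break  # word_batch is an array of byte strings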
def gen_vlen_dataset(): import h5py from fuel.datasets.hdf5 import H5PYDataset sizes = numpy.random.randint(3, 9, size=(100,)) train_image_features = [numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[:90]] test_image_features = [numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[90:]] f = h5py.File('dataset_vlen.h5', mode='w') f['vector_features'] = numpy.vstack([numpy.load('train_vector_features.npy'), numpy.load('test_vector_features.npy')]) f['targets'] = numpy.vstack([numpy.load('train_targets.npy'), numpy.load('test_targets.npy')]) f['vector_features'].dims[0].label = 'batch' f['vector_features'].dims[1].label = 'feature' f['targets'].dims[0].label = 'batch' f['targets'].dims[1].label = 'index' all_image_features = train_image_features + test_image_features dtype = h5py.special_dtype(vlen=numpy.dtype('uint8')) image_features = f.create_dataset('image_features', (100,), dtype=dtype) image_features[...] = [image.flatten() for image in all_image_features] image_features.dims[0].label = 'batch' image_features_shapes = f.create_dataset('image_features_shapes', (100, 3), dtype='int32') image_features_shapes[...] = numpy.array([image.shape for image in all_image_features]) image_features.dims.create_scale(image_features_shapes, 'shapes') image_features.dims[0].attach_scale(image_features_shapes) image_features_shape_labels = f.create_dataset('image_features_shape_labels', (3,), dtype='S7') image_features_shape_labels[...] = ['channel'.encode('utf8'), 'height'.encode('utf8'), 'width'.encode('utf8')] image_features.dims.create_scale(image_features_shape_labels, 'shape_labels') image_features.dims[0].attach_scale(image_features_shape_labels) split_dict = {'train': {'vector_features': (0, 90), 'image_features': (0, 90), 'targets': (0, 90)}, 'test': {'vector_features': (90, 100), 'image_features': (90, 100), 'targets': (90, 100)}} f.attrs['split'] = H5PYDataset.create_split_array(split_dict) f.flush() f.close() train_set = H5PYDataset('dataset_vlen.h5', which_sets=('train',), sources=('image_features',)) print(train_set.axis_labels['image_features']) handle = train_set.open() images, = train_set.get_data(handle, slice(0, 10)) train_set.close(handle) print(images[0].shape, images[1].shape, images[2].shape, images[3].shape)
def make_lsun_dataset(scene_path, fuel_hdf5_path, resize_shape):
    # get image list
    image_list = []
    for root, dirs, files in os.walk(scene_path):
        for filename in fnmatch.filter(files, "*.jpg"):
            image_list.append(os.path.join(root, filename))
    num_images = len(image_list)
    print("num of images :{}".format(num_images))
    # open output hdf5 file
    fuel_file = h5py.File(name=fuel_hdf5_path, mode="w")
    # set new dataset for fuel file
    image_data = fuel_file.create_dataset(name="image_data",
                                          shape=(num_images, 3) + resize_shape,
                                          dtype="uint8")
    for idx, filepath in enumerate(image_list):
        original_image = Image.open(filepath).convert("RGB")
        # resize so the shorter side matches the target, then center-crop
        resize_row = resize_shape[0] if original_image.size[0] < original_image.size[1] else original_image.size[0]
        resize_col = resize_shape[1] if original_image.size[0] > original_image.size[1] else original_image.size[1]
        original_image.thumbnail((resize_row, resize_col), Image.ANTIALIAS)
        if original_image.size[0] != resize_shape[0]:
            excess = (original_image.size[0] - resize_shape[0]) // 2
            original_image = original_image.crop((excess, 0, resize_shape[0] + excess, resize_shape[0]))
        elif original_image.size[1] != resize_shape[1]:
            excess = (original_image.size[1] - resize_shape[1]) // 2
            original_image = original_image.crop((0, excess, resize_shape[1], resize_shape[1] + excess))
        original_image = numpy.asarray(original_image)
        # store as (channel, height, width)
        image_data[idx] = numpy.transpose(original_image, (2, 0, 1))
    image_data.dims[0].label = "batch"
    image_data.dims[1].label = "channel"
    image_data.dims[2].label = "height"
    image_data.dims[3].label = "width"
    split_dict = {"train": {"image_data": (0, num_images)}}
    fuel_file.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    fuel_file.flush()
    fuel_file.close()
    print("DONE : {} (num of images :{})".format(fuel_hdf5_path, num_images))
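# Hedged usage sketch (assumption, not from the original): reading the file
# written by make_lsun_dataset() back with H5PYDataset. The file name is only
# illustrative.
def load_lsun_example():
    from fuel.datasets.hdf5 import H5PYDataset
    lsun_set = H5PYDataset('lsun_bedroom_64.hdf5', which_sets=('train',))
    print(lsun_set.num_examples, lsun_set.axis_labels['image_data'])
    handle = lsun_set.open()
    images, = lsun_set.get_data(handle, slice(0, 8))
    lsun_set.close(handle)
    print(images.shape)  # expected: (8, 3, height, width) uint8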
def test_h5py_dataset_out_of_memory_unsorted_indices():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(
            path='tmp.hdf5', which_set='train', load_in_memory=False,
            sort_indices=False)
        handle = dataset.open()
        assert_raises(TypeError, dataset.get_data, handle, [7, 4, 6, 2, 5])
    finally:
        dataset.close(handle)
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
def add_split_dict(hdf5file, names, total_examples,
                   train_frac=0.83, valid_frac=0.10):
    # TODO: investigate the "reference" stuff so we can pluck validation
    # and testing events evenly from the sample
    final_train_index = int(total_examples * train_frac)
    final_valid_index = int(total_examples * (train_frac + valid_frac))
    train_dict = {name: (0, final_train_index) for name in names}
    valid_dict = {name: (final_train_index, final_valid_index)
                  for name in names}
    test_dict = {name: (final_valid_index, total_examples) for name in names}
    split_dict = {
        'train': train_dict,
        'valid': valid_dict,
        'test': test_dict
    }
    hdf5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
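# Hedged usage sketch (assumption, not from the original): applying
# add_split_dict() to a freshly written file. The file name, dataset names,
# and sizes are made up for illustration.
def add_split_dict_example():
    with h5py.File('events.hdf5', mode='w') as f:
        f.create_dataset('hits', (1000, 8), dtype='float32')
        f.create_dataset('targets', (1000, 1), dtype='uint8')
        add_split_dict(f, ['hits', 'targets'], 1000)
    # With the default fractions, 'valid' covers examples 830-930.
    print(H5PYDataset('events.hdf5', which_sets=('valid',)).num_examples)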
def make_celeb_dataset(fuel_hdf5_path, resize_shape):
    # get image list
    image_list = glob.glob(CELEBA_FACE_FOLDER + '*.jpg')
    num_images = len(image_list)
    # open output hdf5 file
    fuel_file = h5py.File(name=fuel_hdf5_path, mode='w')
    # set new dataset for fuel file
    image_data = fuel_file.create_dataset(name='image_data',
                                          shape=(num_images, 3) + resize_shape,
                                          dtype='uint8')
    for idx, filepath in enumerate(image_list):
        original_image = Image.open(filepath).convert('RGB')
        # resize so the shorter side matches the target, then center-crop
        resize_row = resize_shape[0] if original_image.size[0] < original_image.size[1] else original_image.size[0]
        resize_col = resize_shape[1] if original_image.size[0] > original_image.size[1] else original_image.size[1]
        original_image.thumbnail((resize_row, resize_col), Image.ANTIALIAS)
        if original_image.size[0] != resize_shape[0]:
            excess = (original_image.size[0] - resize_shape[0]) // 2
            original_image = original_image.crop((excess, 0, resize_shape[0] + excess, resize_shape[0]))
        elif original_image.size[1] != resize_shape[1]:
            excess = (original_image.size[1] - resize_shape[1]) // 2
            original_image = original_image.crop((0, excess, resize_shape[1], resize_shape[1] + excess))
        original_image = numpy.asarray(original_image)
        image_data[idx] = numpy.transpose(original_image, (2, 0, 1))
    image_data.dims[0].label = 'batch'
    image_data.dims[1].label = 'channel'
    image_data.dims[2].label = 'height'
    image_data.dims[3].label = 'width'
    split_dict = {'train': {'image_data': (0, num_images)}}
    fuel_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    fuel_file.flush()
    fuel_file.close()
    print('DONE : {} (num of images :{})'.format(fuel_hdf5_path, num_images))
def create_hdf5(np_enc_data, np_enc_y, splitpoint, N):
    hdf5name = 'mushrooms.hdf5'
    f = h5py.File(hdf5name, mode='w')
    fx = f.create_dataset('x', np_enc_data.shape, dtype='float32')
    fy = f.create_dataset('y', np_enc_y.shape, dtype='int64')
    fx[...] = np_enc_data
    fy[...] = np_enc_y
    split_dict = {
        'train': {'x': (0, splitpoint), 'y': (0, splitpoint)},
        'test': {'x': (splitpoint, N), 'y': (splitpoint, N)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
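# Hedged usage sketch (assumption, not from the original): loading the splits
# written by create_hdf5(). This assumes H5PYDataset exposes the in-memory
# arrays via data_sources when load_in_memory=True, and that sources come back
# in alphabetical order ('x', 'y').
def load_mushrooms_example():
    from fuel.datasets.hdf5 import H5PYDataset
    train_set = H5PYDataset('mushrooms.hdf5', which_sets=('train',),
                            load_in_memory=True)
    test_set = H5PYDataset('mushrooms.hdf5', which_sets=('test',),
                           load_in_memory=True)
    x_train, y_train = train_set.data_sources
    print(train_set.num_examples, test_set.num_examples,
          x_train.shape, y_train.shape)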
def test_h5py_dataset_out_of_memory():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(
            path='tmp.hdf5', which_set='train', load_in_memory=False)
        handle = dataset.open()
        assert_equal(
            dataset.get_data(state=handle, request=slice(0, 10))[0],
            numpy.arange(50).reshape((10, 5)))
        dataset.close(handle)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
def save_hd5py(out_path, data, folds=0):
    images = np.concatenate([a[0] for a in data], axis=0)
    labels = np.concatenate([a[1] for a in data], axis=0)
    f = h5py.File(out_path, mode='w')
    ds = f.create_dataset('images', images.shape, dtype=str(images.dtype))
    ds[...] = images
    ds = f.create_dataset('labels', labels.shape, dtype=str(labels.dtype))
    ds[...] = labels
    # assert(folds > 1)
    # if folds > 1:
    fold = len(images) // folds
    idx = {'fold-{}'.format(i): (i * fold, (i + 1) * fold)
           for i in range(folds)}
    print(idx)
    split_dict = {k: {'images': v, 'labels': v} for k, v in idx.items()}
    from fuel.datasets.hdf5 import H5PYDataset
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
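# Hedged usage sketch (assumption, not from the original): reading one fold back
# after save_hd5py(). The file name and fold index are only illustrative.
def load_fold_example():
    from fuel.datasets.hdf5 import H5PYDataset
    fold0 = H5PYDataset('cifar_folds.hdf5', which_sets=('fold-0',))
    handle = fold0.open()
    images, labels = fold0.get_data(handle, slice(0, 16))
    fold0.close(handle)
    print(images.shape, labels.shape)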