Code Example #1
File: test_hdf5.py, Project: Commonlibs/fuel
    def setUp(self):
        self.features = numpy.arange(3600, dtype='uint16').reshape((100, 36))
        self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1))
        h5file = h5py.File(
            'file.hdf5', mode='w', driver='core', backing_store=False)
        h5file['features'] = self.features
        h5file['features'].dims[0].label = 'batch'
        h5file['features'].dims[1].label = 'feature'
        h5file['targets'] = self.targets
        h5file['targets'].dims[0].label = 'batch'
        h5file['targets'].dims[1].label = 'index'
        split_dict = {'train': {'features': (0, 20, None), 'targets': (0, 20)},
                      'test': {'features': (20, 30), 'targets': (20, 30)},
                      'unlabeled': {'features': (30, 100, None, '.')}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        self.h5file = h5file

        vlen_h5file = h5py.File(
            'test_vl.hdf5', mode='w', driver='core', backing_store=False)
        self.vlen_features = [
            numpy.arange(12, dtype='uint8').reshape((3, 2, 2)),
            numpy.arange(48, dtype='uint8').reshape((3, 4, 4)),
            numpy.arange(60, dtype='uint8').reshape((3, 5, 4)),
            numpy.arange(18, dtype='uint8').reshape((3, 2, 3))]
        self.vlen_targets = numpy.arange(4, dtype='uint8').reshape((4, 1))
        dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
        features = vlen_h5file.create_dataset('features', (4,), dtype=dtype)
        features[...] = [d.flatten() for d in self.vlen_features]
        features.dims[0].label = 'batch'
        features_shapes = vlen_h5file.create_dataset(
            'features_shapes', (4, 3), dtype='uint8')
        features_shapes[...] = numpy.array(
            [d.shape for d in self.vlen_features])
        features.dims.create_scale(features_shapes, 'shapes')
        features.dims[0].attach_scale(features_shapes)
        features_shape_labels = vlen_h5file.create_dataset(
            'features_shape_labels', (3,), dtype='S7')
        features_shape_labels[...] = [
            'channel'.encode('utf8'), 'height'.encode('utf8'),
            'width'.encode('utf8')]
        features.dims.create_scale(features_shape_labels, 'shape_labels')
        features.dims[0].attach_scale(features_shape_labels)
        targets = vlen_h5file.create_dataset('targets', (4, 1), dtype='uint8')
        targets[...] = self.vlen_targets
        targets.dims[0].label = 'batch'
        targets.dims[1].label = 'index'
        split_dict = {'train': {'features': (0, 4), 'targets': (0, 4)}}
        vlen_h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        self.vlen_h5file = vlen_h5file
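
The split written to h5file.attrs['split'] above is exactly what H5PYDataset consults when a subset is requested. As a minimal, hedged sketch of reading it back (assuming the in-memory h5file handle created in setUp() and a Fuel version whose constructor accepts which_sets; older releases use the singular which_set keyword instead):

from fuel.datasets.hdf5 import H5PYDataset

# Load the 'train' subset defined by the split attribute.
train_set = H5PYDataset(h5file, which_sets=('train',))
handle = train_set.open()
features, targets = train_set.get_data(handle, slice(0, train_set.num_examples))
train_set.close(handle)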
Code Example #2
File: test_hdf5.py, Project: julianser/fuel
    def setUp(self):
        self.features = numpy.arange(3600, dtype='uint8').reshape((100, 36))
        self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1))
        h5file = h5py.File(
            'file.hdf5', mode='w', driver='core', backing_store=False)
        h5file['features'] = self.features
        h5file['features'].dims[0].label = 'batch'
        h5file['features'].dims[1].label = 'feature'
        h5file['targets'] = self.targets
        h5file['targets'].dims[0].label = 'batch'
        h5file['targets'].dims[1].label = 'index'
        split_dict = {'train': {'features': (0, 20, '.'), 'targets': (0, 20)},
                      'test': {'features': (20, 30, ''), 'targets': (20, 30)},
                      'unlabeled': {'features': (30, 100)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        self.h5file = h5file

        vlen_h5file = h5py.File(
            'test_vl.hdf5', mode='w', driver='core', backing_store=False)
        self.vlen_features = [
            numpy.arange(12, dtype='uint8').reshape((3, 2, 2)),
            numpy.arange(48, dtype='uint8').reshape((3, 4, 4)),
            numpy.arange(60, dtype='uint8').reshape((3, 5, 4)),
            numpy.arange(18, dtype='uint8').reshape((3, 2, 3))]
        self.vlen_targets = numpy.arange(4, dtype='uint8').reshape((4, 1))
        dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
        features = vlen_h5file.create_dataset('features', (4,), dtype=dtype)
        features[...] = [d.flatten() for d in self.vlen_features]
        features.dims[0].label = 'batch'
        features_shapes = vlen_h5file.create_dataset(
            'features_shapes', (4, 3), dtype='uint8')
        features_shapes[...] = numpy.array(
            [d.shape for d in self.vlen_features])
        features.dims.create_scale(features_shapes, 'shapes')
        features.dims[0].attach_scale(features_shapes)
        features_shape_labels = vlen_h5file.create_dataset(
            'features_shape_labels', (3,), dtype='S7')
        features_shape_labels[...] = [
            'channel'.encode('utf8'), 'height'.encode('utf8'),
            'width'.encode('utf8')]
        features.dims.create_scale(features_shape_labels, 'shape_labels')
        features.dims[0].attach_scale(features_shape_labels)
        targets = vlen_h5file.create_dataset('targets', (4, 1), dtype='uint8')
        targets[...] = self.vlen_targets
        targets.dims[0].label = 'batch'
        targets.dims[1].label = 'index'
        split_dict = {'train': {'features': (0, 4), 'targets': (0, 4)}}
        vlen_h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        self.vlen_h5file = vlen_h5file
Code Example #3
File: dataset.py, Project: sharpfun/NeverEndingMusic
def createH5Dataset(hdf5_out, corpus_path, sequence_length):
    with open(corpus_path) as f:
        corpus = f.read().split(",")

    (indices, vocab) = pd.factorize(list(corpus))

    instances_num = len(corpus) // (sequence_length + 1)

    f = h5py.File(hdf5_out, mode='w')

    train_data_x = np.zeros((instances_num, sequence_length), dtype=np.uint8)
    train_data_y = np.zeros((instances_num, sequence_length), dtype=np.uint8)

    for j in range(instances_num):
        for i in range(sequence_length):
            train_data_x[j][i] = indices[i + j * (sequence_length + 1)]
            train_data_y[j][i] = indices[i + j * (sequence_length + 1) + 1]

    char_in = f.create_dataset('inchar', train_data_x.shape, dtype='uint8')
    char_out = f.create_dataset('outchar', train_data_y.shape, dtype='uint8')

    char_in[...] = train_data_x
    char_out[...] = train_data_y

    split_dict = {
        'train': {'inchar': (0, instances_num), 'outchar': (0, instances_num)}}

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.attrs["vocab"] = json.dumps(list(vocab))

    f.flush()
    f.close()
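
Since the vocabulary is stored as a JSON string attribute, it can be recovered when the file is read back. A minimal, hedged sketch (assuming the same hdf5_out path used above):

import json
import h5py

with h5py.File(hdf5_out, mode='r') as f:
    vocab = json.loads(f.attrs['vocab'])  # list of symbols written by createH5Dataset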
Code Example #4
def emboot_converter_traintrain(emboot_dataset):
    train_vector_features, train_targets, test_vector_features, test_targets = load_emboot_np()
    f = h5py.File(emboot_dataset, mode='w')

    train_sz = train_vector_features.shape[0]
    test_sz = test_vector_features.shape[0]
    feat_sz = train_vector_features.shape[1]
    dataset_sz = (train_sz + test_sz) * 2 ## NOTE: 67000 * 2 (copy over the train data to the test dataset)

    vector_features = f.create_dataset('features', (dataset_sz, feat_sz), dtype='float64')  ## train + test
    targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8')

    train_vector_features_aug = np.vstack([train_vector_features, test_vector_features])
    train_targets_aug = np.vstack([train_targets, test_targets])

    ## put the data loaded into these objects
    vector_features[...] = np.vstack([train_vector_features_aug, train_vector_features_aug])
    targets[...] = np.vstack([train_targets_aug, train_targets_aug])

    ## label the dims with names
    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    ## split attribute -- way to recover the splits
    # creating the split using an API
    split_dict = {
        'train': {'features': (0, dataset_sz // 2), 'targets': (0, dataset_sz // 2)},
        'test': {'features': (dataset_sz // 2, dataset_sz), 'targets': (dataset_sz // 2, dataset_sz)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.flush()
    f.close()
Code Example #5
File: test_hdf5.py, Project: jeanmarcosdarosa/fuel
def test_h5py_dataset_out_of_memory():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        targets = h5file.create_dataset('targets', (10, 1), dtype='float32')
        targets[...] = numpy.arange(10, dtype='float32').reshape((10, 1))
        split_dict = {
            'train': {
                'features': (0, 5),
                'targets': (0, 5)
            },
            'test': {
                'features': (5, 10),
                'targets': (5, 10)
            }
        }
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5',
                              which_set='test',
                              load_in_memory=False)
        handle = dataset.open()
        assert_equal(
            dataset.get_data(state=handle, request=slice(3, 5))[1],
            numpy.arange(10).reshape((10, 1))[8:10])
    finally:
        dataset.close(handle)
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code Example #6
File: test_hdf5.py, Project: kastnerkyle/fuel
def test_h5py_dataset_split():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 8)},
                      'test': {'features': (8, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        train_set = H5PYDataset(path='tmp.hdf5', which_set='train')
        test_set = H5PYDataset(path='tmp.hdf5', which_set='test')
        train_handle = train_set.open()
        test_handle = test_set.open()
        assert_equal(
            train_set.get_data(state=train_handle, request=slice(0, 8))[0],
            numpy.arange(50).reshape((10, 5))[:8])
        assert_equal(
            test_set.get_data(state=test_handle, request=slice(0, 2))[0],
            numpy.arange(50).reshape((10, 5))[8:])
        train_set.close(train_handle)
        test_set.close(test_handle)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code Example #7
def add_sets(args):
    with h5py.File(args.h5file, "a") as h5file:
        sources = []
        for dataset in h5file:
            if dataset.endswith("_indices") or dataset.endswith("_shapes") or dataset.endswith("_shape_labels"):
                continue
            sources.append(dataset)

        uttid2idx = {uttid: idx for (idx, uttid) in enumerate(h5file["uttids"])}

        split_dict = {}
        for subset in args.sets:
            name, uttids_fname = subset.split("=")
            idxs = []
            with open(uttids_fname) as uf:
                for l in uf:
                    uttid = l.strip().split()[0]
                    idxs.append(uttid2idx[uttid])

            indices_name = "{}_indices".format(name)

            if indices_name in h5file:
                del h5file[indices_name]

            #
            # Note: ideally, we would sort the indices and do:
            # h5file[indices_name] = numpy.array(sorted(idxs))
            # but this would cause incompatibility with Kaldi, which keeps utterances sorted by uttid!
            #
            h5file[indices_name] = numpy.array(idxs)
            indices_ref = h5file[indices_name].ref
            split_dict[name] = {source: (-1, -1, indices_ref) for source in sources}

        h5file.attrs["split"] = H5PYDataset.create_split_array(split_dict)
Code Example #8
File: test_hdf5.py, Project: xiaoyexixi/fuel
 def setUp(self):
     self.features = numpy.arange(3600, dtype='uint8').reshape((100, 36))
     self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1))
     self.h5file = h5py.File('file.hdf5',
                             mode='w',
                             driver='core',
                             backing_store=False)
     self.h5file['features'] = self.features
     self.h5file['features'].dims[0].label = 'batch'
     self.h5file['features'].dims[1].label = 'feature'
     self.h5file['targets'] = self.targets
     self.h5file['targets'].dims[0].label = 'batch'
     self.h5file['targets'].dims[1].label = 'index'
     split_dict = {
         'train': {
             'features': (0, 20, '.'),
             'targets': (0, 20)
         },
         'test': {
             'features': (20, 30, ''),
             'targets': (20, 30)
         },
         'unlabeled': {
             'features': (30, 100)
         }
     }
     self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
Code Example #9
 def test_index_split_in_memory(self):
     features = numpy.arange(50, dtype='uint8').reshape((10, 5))
     h5file = h5py.File('index_split.hdf5',
                        mode='w',
                        driver='core',
                        backing_store=False)
     h5file['features'] = features
     h5file['features'].dims[0].label = 'batch'
     h5file['features'].dims[1].label = 'feature'
     h5file['train_features_subset'] = numpy.arange(0, 10, 2)
     h5file['test_features_subset'] = numpy.arange(1, 10, 2)
     train_ref = h5file['train_features_subset'].ref
     test_ref = h5file['test_features_subset'].ref
     split_dict = {
         'train': {
             'features': (-1, -1, train_ref, '.')
         },
         'test': {
             'features': (-1, -1, test_ref, '')
         }
     }
     h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
     dataset = H5PYDataset(h5file,
                           which_sets=('train', ),
                           load_in_memory=True)
     handle = dataset.open()
     request = slice(0, 5)
     assert_equal(dataset.get_data(handle, request)[0], features[0:10:2])
     assert_equal(dataset.num_examples, 5)
     dataset.close(handle)
Code Example #10
File: test_hdf5.py, Project: nagyist/fuel
def test_h5py_dataset_split():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {
            'train': {
                'features': (0, 8)
            },
            'test': {
                'features': (8, 10)
            }
        }
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        train_set = H5PYDataset(path='tmp.hdf5', which_set='train')
        test_set = H5PYDataset(path='tmp.hdf5', which_set='test')
        train_handle = train_set.open()
        test_handle = test_set.open()
        assert_equal(
            train_set.get_data(state=train_handle, request=slice(0, 8))[0],
            numpy.arange(50).reshape((10, 5))[:8])
        assert_equal(
            test_set.get_data(state=test_handle, request=slice(0, 2))[0],
            numpy.arange(50).reshape((10, 5))[8:])
        train_set.close(train_handle)
        test_set.close(test_handle)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code Example #11
File: test_hdf5.py, Project: nagyist/fuel
def test_h5py_dataset_split_parsing():
    try:
        h5file = h5py.File('tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (100, 36), dtype='uint8')
        features[...] = numpy.zeros(shape=(100, 36)).astype('uint8')
        targets = h5file.create_dataset('targets', (30, 1), dtype='uint8')
        targets[...] = numpy.zeros(shape=(30, 1)).astype('uint8')
        split_dict = {
            'train': {
                'features': (0, 20),
                'targets': (0, 20)
            },
            'test': {
                'features': (20, 30),
                'targets': (20, 30)
            },
            'unlabeled': {
                'features': (30, 100)
            }
        }
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        train_set = H5PYDataset(path='tmp.hdf5', which_set='train')
        assert train_set.provides_sources == ('features', 'targets')
        test_set = H5PYDataset(path='tmp.hdf5', which_set='test')
        assert test_set.provides_sources == ('features', 'targets')
        unlabeled_set = H5PYDataset(path='tmp.hdf5', which_set='unlabeled')
        assert unlabeled_set.provides_sources == ('features', )
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code Example #12
File: data.py, Project: ccienfall/Face-Verification
def data():

    try:
        hf["target"].shape
    except:
        hf = h5py.File('faces.hdf5','r+')
    num_samples = hf["input"].shape[0]

    print "number of samples in dataset : %i" %num_samples

    split_dict = {
         'train': {'input': (2000, num_samples), 'target': (2000, num_samples)},
         'test': {'input': (0, 1000), 'target': (0, 1000)},
         'val': {'input': (1000, 2000), 'target': (1000, 2000)}
    }
    hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    train_set = H5PYDataset('faces.hdf5', which_sets=('train',))
    test_set = H5PYDataset('faces.hdf5', which_sets=('test',))
    val_set = H5PYDataset('faces.hdf5', which_sets=('val',))

    batch_size = 128

#TODO : use shuffledscheme instead?  Seems slower, might have screwed up the chunksize in the HDF5 files?

    tr_scheme = SequentialScheme(examples=train_set.num_examples, batch_size=batch_size)
    tr_stream = DataStream(train_set, iteration_scheme=tr_scheme)

    val_scheme = SequentialScheme(examples=val_set.num_examples, batch_size=batch_size)
    val_stream = DataStream(val_set, iteration_scheme=val_scheme)

    test_scheme = SequentialScheme(examples=test_set.num_examples, batch_size=batch_size)
    test_stream = DataStream(test_set, iteration_scheme=test_scheme)
    hf.close()
    return num_samples, train_set, test_set, val_set, tr_scheme, tr_stream, val_scheme, val_stream, test_scheme, test_stream
Code Example #13
def CreateHDF5():
    sizes = numpy.random.randint(3,9, size=(100,))
    train_image_features = [
            numpy.random.randint(256, size=(3, size, size)).astype('uint8')
            for size in sizes[:90]]
    test_image_features = [
            numpy.random.randint(256, size=(3, size, size)).astype('uint8')
            for size in sizes[90:]]

    train_vector_features = numpy.random.normal(size=(90,10)).astype('float32')
    test_vector_features = numpy.random.normal(size=(10,10)).astype('float32')
    train_targets = numpy.random.randint(10, size=(90,1)).astype('uint8')
    test_targets = numpy.random.randint(10, size=(10,1)).astype('uint8')

    f = h5py.File('dataset.hdf5', mode='w')
    vector_features = f.create_dataset(
         'vector_features', (100, 10), dtype='float32')
    targets = f.create_dataset(
         'targets', (100, 1), dtype='uint8')

    vector_features[...] = numpy.vstack(
         [train_vector_features, test_vector_features])
    targets[...] = numpy.vstack([train_targets, test_targets])


    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    all_image_features = train_image_features + test_image_features
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    image_features = f.create_dataset('image_features', (100,), dtype=dtype)
    image_features[...] = [image.flatten() for image in all_image_features]
    image_features.dims[0].label='batch'

    image_features_shapes = f.create_dataset(
         'image_features_shapes', (100, 3), dtype='int32')
    image_features_shapes[...] = numpy.array(
         [image.shape for image in all_image_features])
    image_features.dims.create_scale(image_features_shapes, 'shapes')
    image_features.dims[0].attach_scale(image_features_shapes)

    image_features_shape_labels = f.create_dataset(
         'image_features_shape_labels', (3,), dtype='S7')
    image_features_shape_labels[...] = [
         'channel'.encode('utf8'), 'height'.encode('utf8'),
         'width'.encode('utf8')]
    image_features.dims.create_scale(
         image_features_shape_labels, 'shape_labels')
    image_features.dims[0].attach_scale(image_features_shape_labels)

    split_dict = {
         'train': {'vector_features': (0, 90), 'image_features': (0, 90),
                   'targets': (0, 90)},
         'test': {'vector_features': (90, 100), 'image_features': (90, 100),
                  'targets': (90, 100)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
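
Because the variable-length images are stored flattened, the attached 'shapes' scale is what allows each one to be restored to its original dimensions on the way out. Fuel's H5PYDataset performs this reshaping automatically when the scale is attached; the direct h5py access below is only an illustrative, hedged sketch:

import h5py

with h5py.File('dataset.hdf5', mode='r') as f:
    flat = f['image_features'][0]
    shape = f['image_features_shapes'][0]
    image = flat.reshape(shape)  # back to (channel, height, width)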
Code Example #14
    def save_h5py(tn, start, stop):
        cf = train_features[start:stop]
        ct = train_targets[start:stop]
        np.save(pjoin(numpy_path, prefix + tn + '_features.npy'), cf)
        np.save(pjoin(numpy_path, prefix + tn + '_targets.npy'), ct)
        h5 = h5py.File(pjoin(fuel_path, prefix + tn + '.hdf5'), mode='w')
        h5_features = h5.create_dataset('features',
                                        (cf.shape[0], cf.shape[1] * mult),
                                        dtype='float32')
        lenf = stop - start
        with ProgressBar(maxval=lenf) as progbar:
            for i in range(lenf):
                arr = []
                for j in range(-concat[0], concat[0] + 1, concat[1]):
                    arr.extend(cf[(i - j) % lenf])
                h5_features[i] = np.asarray(arr)
                progbar.update(i)

        h5_targets = h5.create_dataset('targets', ct.shape, dtype='uint16')
        h5_targets[...] = ct
        h5_features.dims[0].label = 'batch'
        h5_features.dims[1].label = 'feature'
        h5_targets.dims[0].label = 'batch'
        h5_targets.dims[1].label = 'index'

        split_dict = {
            tn: {
                'features': (0, stop - start),
                'targets': (0, stop - start)
            },
            #'validate': {'features': (tr_n, len(fet)), 'targets': (tr_n, len(fet))},
        }
        h5.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5.flush()
        h5.close()
Code Example #15
File: kaldi2fuel.py, Project: Aditay/attention-lvcsr
def add_sets(args):
    with h5py.File(args.h5file, 'a') as h5file:
        sources = []
        for dataset in h5file:
            if (dataset.endswith('_indices') or dataset.endswith('_shapes') or
                dataset.endswith('_shape_labels')):
                continue
            sources.append(dataset)

        uttid2idx = {uttid:idx for (idx,uttid) in enumerate(h5file['uttids']) }

        split_dict = {}
        for subset in args.sets:
            name, uttids_fname = subset.split('=')
            idxs = []
            with open(uttids_fname) as uf:
                for l in uf:
                    uttid = l.strip().split()[0]
                    idxs.append(uttid2idx[uttid])

            indices_name = '{}_indices'.format(name)

            if indices_name in h5file:
                del h5file[indices_name]

            #
            # Note: ideally, we would sort the indices and do:
            # h5file[indices_name] = numpy.array(sorted(idxs))
            # but this would cause incompatibility with Kaldi, which keeps utterances sorted by uttid!
            #
            h5file[indices_name] = numpy.array(idxs)
            indices_ref =  h5file[indices_name].ref
            split_dict[name] = {source : (-1, -1, indices_ref) for source in sources}

        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
Code Example #16
def create_hdf5(np_enc_data, np_enc_y, splitpoint, N):

    hdf5name = 'mushrooms.hdf5'
    f = h5py.File(hdf5name, mode='w')

    fx = f.create_dataset('x', np_enc_data.shape, dtype='float32')
    fy = f.create_dataset('y', np_enc_y.shape, dtype='int64')

    fx[...] = np_enc_data
    fy[...] = np_enc_y

    split_dict = {
        'train': {
            'x': (0, splitpoint),
            'y': (0, splitpoint)
        },
        'test': {
            'x': (splitpoint, N),
            'y': (splitpoint, N)
        }
    }

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.flush()
    f.close()
Code Example #17
File: init_fuel.py, Project: AlphaLambdaMuPi/DLAlpha
    def save_h5py(tn, start, stop):
        cf = train_features[start:stop]
        ct = train_targets[start:stop]
        np.save(pjoin(numpy_path, prefix+tn+'_features.npy'), cf)
        np.save(pjoin(numpy_path, prefix+tn+'_targets.npy'), ct)
        h5 = h5py.File(pjoin(fuel_path, prefix+tn+'.hdf5'), mode='w')
        h5_features = h5.create_dataset(
            'features', (cf.shape[0], cf.shape[1]*mult)
            , dtype='float32')
        lenf = stop - start
        with ProgressBar(maxval=lenf) as progbar:
            for i in range(lenf):
                arr = []
                for j in range(-concat[0], concat[0]+1, concat[1]):
                    arr.extend(cf[(i-j)%lenf])
                h5_features[i] = np.asarray(arr)
                progbar.update(i)

        h5_targets = h5.create_dataset(
            'targets', ct.shape, dtype='uint16')
        h5_targets[...] = ct
        h5_features.dims[0].label = 'batch'
        h5_features.dims[1].label = 'feature'
        h5_targets.dims[0].label = 'batch'
        h5_targets.dims[1].label = 'index'

        split_dict = {
            tn: {'features': (0, stop-start), 'targets': (0, stop-start)},
            #'validate': {'features': (tr_n, len(fet)), 'targets': (tr_n, len(fet))},
        }
        h5.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5.flush()
        h5.close()
Code Example #18
def build_raw_hdf5_dataset(wav_name, hdf5_name, window_size):
    [rate, signal] = wav.read(wav_name)
    num_steps   = signal.shape[0]
    num_seqs    = num_steps-window_size
    output_path = '{}.hdf5'.format(hdf5_name)
    output_path = os.path.join(output_path)

    signal = signal.reshape(num_steps,1)
    with h5py.File(output_path, mode='w') as h5file:
        input_feature  = h5file.create_dataset(name='input_feature' , shape=(num_seqs, window_size, 1), dtype='int16')
        target_feature = h5file.create_dataset(name='target_feature', shape=(num_seqs, window_size, 1), dtype='int16')
        print ' num of sequences : {}'.format(num_seqs)
        for s in xrange(num_seqs):
            input_feature[s]  = signal[s:s+window_size]
            target_feature[s] = signal[(s+1):(s+1)+window_size]

        # label each dataset axis
        input_feature.dims[0].label = 'batch'
        input_feature.dims[1].label = 'time'
        input_feature.dims[2].label = 'feature'

        target_feature.dims[0].label = 'batch'
        target_feature.dims[1].label = 'time'
        target_feature.dims[2].label = 'feature'

        split_dict = {'train': {'input_feature' : ( 0,  num_seqs),
                                'target_feature': ( 0,  num_seqs)}}

        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
        h5file.close()

    return num_seqs
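
A hedged usage sketch for the converter above ('speech.wav' is a hypothetical input file and the window size is purely illustrative); it writes speech_raw.hdf5 in the current working directory and returns the number of sequences:

num_seqs = build_raw_hdf5_dataset('speech.wav', 'speech_raw', window_size=256)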
Code Example #19
def add_word_ids_to_snli(h5_file, vocab):
    with h5py.File(h5_file, 'a') as dst:
        N = len(dst['sentence1'])
        assert len(dst['sentence2']) == N

        dst.create_dataset('vocab_words', (vocab.size(), ),
                           h5py.special_dtype(vlen=unicode))
        dst.create_dataset('vocab_freqs', (vocab.size(), ), 'int64')
        dst['vocab_words'][:] = vocab.words
        dst['vocab_freqs'][:] = vocab.frequencies

        dtype = h5py.special_dtype(vlen=np.dtype('int32'))
        sentence1_ds = dst.create_dataset('sentence1_ids', (N, ), dtype=dtype)
        sentence2_ds = dst.create_dataset('sentence2_ids', (N, ), dtype=dtype)

        ### h5py nonsense ###
        sentence1_ds_shapes = dst.create_dataset('sentence1_ids_shapes',
                                                 (N, 1),
                                                 dtype=("int"))
        sentence2_ds_shapes = dst.create_dataset('sentence2_ids_shapes',
                                                 (N, 1),
                                                 dtype=("int"))
        ds_shape_labels = dst.create_dataset('ds_ids_shape_labels', (1, ),
                                             dtype=("S20"))
        ### h5py nonsense ###

        sentence1_ds[:] = np.array([[vocab.word_to_id(w) for w in s]
                                    for s in dst['sentence1'][:]])
        sentence2_ds[:] = np.array([[vocab.word_to_id(w) for w in s]
                                    for s in dst['sentence2'][:]])

        ### h5py nonsense ###
        sentence1_ds_shapes[:] = np.array(
            [np.array(x).shape for x in dst['sentence1'][:]])
        sentence2_ds_shapes[:] = np.array(
            [np.array(x).shape for x in dst['sentence2'][:]])
        ds_shape_labels[:] = np.array(['sentence_len'])

        sentence1_ds.dims.create_scale(sentence1_ds_shapes, 'shapes')
        sentence1_ds.dims[0].attach_scale(sentence1_ds_shapes)
        sentence1_ds.dims.create_scale(ds_shape_labels, 'shape_labels')
        sentence1_ds.dims[0].attach_scale(ds_shape_labels)

        sentence2_ds.dims.create_scale(sentence2_ds_shapes, 'shapes')
        sentence2_ds.dims[0].attach_scale(sentence2_ds_shapes)
        sentence2_ds.dims.create_scale(ds_shape_labels, 'shape_labels')
        sentence2_ds.dims[0].attach_scale(ds_shape_labels)
        ### h5py nonsense ###

        dst.attrs['split'] = H5PYDataset.create_split_array({
            'all': {
                'sentence1': (0, N),
                'sentence2': (0, N),
                'sentence1_ids': (0, N),
                'sentence2_ids': (0, N),
                'label': (0, N),
                'text': (0, len(dst['text']))
            }
        })
Code Example #20
 def save_hd5py(dataset, destfile, indices_dict):
     f = h5py.File(destfile, mode='w')
     images = f.create_dataset('images', dataset.shape, dtype='uint8')
     images[...] = dataset
     split_dict = dict((k, {'images':v}) for k,v in indices_dict.iteritems())
     f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
     f.flush()
     f.close()
Code Example #21
File: test_hdf5.py, Project: julianser/fuel
 def test_value_error_on_unequal_sources(self):
     def get_subsets():
         return H5PYDataset(self.h5file, which_set='train').subsets
     split_dict = {'train': {'features': (0, 20, '.'), 'targets': (0, 15)},
                   'test': {'features': (20, 30, ''), 'targets': (20, 30)},
                   'unlabeled': {'features': (30, 100)}}
     self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
     assert_raises(ValueError, get_subsets)
Code Example #22
File: test_hdf5.py, Project: Commonlibs/fuel
 def test_value_error_on_unequal_sources(self):
     def get_subsets():
         return H5PYDataset(self.h5file, which_sets=('train',)).subsets
     split_dict = {'train': {'features': (0, 20), 'targets': (0, 15)},
                   'test': {'features': (20, 30), 'targets': (20, 30)},
                   'unlabeled': {'features': (30, 100, None, '.')}}
     self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
     assert_raises(ValueError, get_subsets)
Code Example #23
 def save_hd5py(dataset, destfile, indices_dict):
     f = h5py.File(destfile, mode='w')
     images = f.create_dataset('images', dataset.shape, dtype='uint8')
     images[...] = dataset
     split_dict = dict((k, {'images':v}) for k,v in indices_dict.iteritems())
     f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
     f.flush()
     f.close()
Code Example #24
def load_data(train, test, s, overlap=1):
    train_filelist = open(train, 'r').read().split('\n')
    print len(train_filelist)
    shuffle(train_filelist)

    test_filelist = open(test, 'r').read().split('\n')
    print len(test_filelist)
    shuffle(test_filelist)

    dataset_size = len(train_filelist)+len(test_filelist)


    f = h5py.File('/ssd2/hmdb/hmdb-tdd.hdf5', mode='w')
    dtype = h5py.special_dtype(vlen=np.dtype('float32'))  
    features = f.create_dataset('features', (dataset_size,), dtype=dtype, compression='gzip', compression_opts=7)
    features_shapes = f.create_dataset('features_shapes', (dataset_size,2), dtype=int)
    features_shape_labels = f.create_dataset('features_shape_labels', (2,), dtype='S7')

    labels = f.create_dataset('labels', (dataset_size,1), dtype=int, compression='gzip')

    for i,fn in enumerate(train_filelist+test_filelist):
        if len(fn) == 0:
            continue
        fn, label = fn.split(' ')
        fn = fn.replace('/fast-data/hmdb/image-file/', '/ssd2/hmdb/feats/')
        print i, fn, label

        labels[i] = int(label)

        output = np.genfromtxt(fn, skip_header=1)

        features[i] = output.flatten()
        print output.shape
        features_shapes[i] = output.shape
        
    features.dims.create_scale(features_shapes, 'shapes')
    features.dims[0].attach_scale(features_shapes)
    
    features_shape_labels[...] = ['frames'.encode('utf8'), 'channels'.encode('utf8')]

    features.dims.create_scale(features_shape_labels, 'shape_labels')
    features.dims[0].attach_scale(features_shape_labels)

    features.dims[0].label = 'batch'
    labels.dims[0].label = 'batch'


    trn = len(train_filelist)
    tst = len(test_filelist)
    split_dict = {'train': {'features': (0, trn),
                            'labels': (0, trn)},
                  'test': {'features': (trn, trn+tst),
                           'labels': (trn, trn+tst)}}

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.flush()
    f.close()
Code Example #25
def build_hdf5_dataset(input_filename, output_filename, batch_size=64):
    """
    Builds a hdf5 dataset given the input one. The output one will have
    training, valid, and test as sources.
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")

    data = input_file["features"][:]
    data_length = data.shape[1]  #

    #print "Sample from data: {}".format(data[70])
    #if not data_length % batch_size == 0:

    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    batch_index_train = int(0.9 * train_valid_length / float(batch_size))
    batch_index_valid = int(train_valid_length / float(batch_size))
    batch_index_test = int(data_length / float(batch_size))

    print "batch indices in order : {}".format(
        (batch_index_train, batch_index_valid, batch_index_test))

    assert (train_valid_length == batch_index_valid * batch_size)

    data = data.reshape(data_length)[:batch_index_test * batch_size]
    data = data.reshape(batch_index_test, batch_size, 1)
    print data.shape

    print("values lost: {}").format(data_length - data.size)
    test_length = data_length - train_valid_length

    features = output_file.create_dataset(name='features',
                                          shape=data.shape,
                                          dtype='int16',
                                          data=data)

    features.dims[0].label = 'batch'
    features.dims[1].label = 'time'
    features.dims[2].label = 'feature'

    split_dict = {
        'train': {
            'features': (0, batch_index_train)
        },
        'valid': {
            'features': (batch_index_train + 1, batch_index_valid)
        },
        'test': {
            'features': (batch_index_valid + 1, batch_index_test)
        }
    }

    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
Code Example #26
def biblefile_to_hdf5(open_file):  # TODO REMOVE LINES WITH THE BOOK OF BLABLA
    """Everything in one function because we have variable-length sequences, so no intermediate arrays..."""
    char_to_ind = {"<S>": 0, "</S>": 1}
    current_char_ind = 2  # starts at 2 because 0, 1 are reserved for "end/start-of-sequence" character
    all_verses = []
    # TODO I still don't know what the readout initial_output really does; maybe we need to put <S> into every sequence
    current_verse = []
    for line in open_file:
        # first we need to check if a new verse begins somewhere in the line (not just beginning...)
        verse_marker_pos = find_verse_marker(line)
        if len(line.split()) > 0 and verse_marker_pos > -1:
            # if so, save the verse up to the verse marker and start a new one from the rest of the line
            current_verse += list(line[:verse_marker_pos])
            # also replace all characters by integers, creating more mappings if necessary
            for (ind, char) in enumerate(current_verse):
                if char not in char_to_ind:
                    char_to_ind[char] = current_char_ind
                    current_char_ind += 1
                current_verse[ind] = char_to_ind[char]
            current_verse.append(1)  # for sequence generator we need to explicitly append this end-of-sequence char
            all_verses.append(numpy.array(current_verse, dtype="int32"))
            current_verse = list(line[verse_marker_pos:])
        # otherwise, just put everything into the current verse
        else:
            current_verse += list(line)
    all_verses = numpy.array(all_verses)  # I think this conversion is necessary for the indexing below?

    # at this point we have all our verses =) now we build our .hdf5 dataset
    # make a little validation set
    val_indices = numpy.random.choice(a=len(all_verses), replace=False, size=1500)
    test_set = list(all_verses[val_indices])
    train_set = list(numpy.delete(all_verses, val_indices, 0))

    # if you don't get what's happening here, check the Fuel tutorial on variable-length data (only the 1D part)
    f = h5py.File(name="bible.hdf5", mode="w")
    dtype_varlen_int = h5py.special_dtype(vlen=numpy.dtype("int32"))
    character_seqs = f.create_dataset("character_seqs", (len(all_verses),), dtype=dtype_varlen_int)
    character_seqs[...] = train_set + test_set

    split_dict = {"train": {"character_seqs": (0, len(train_set))},
                  "valid": {"character_seqs": (len(train_set), len(all_verses))}}
    f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()

    # we also save the current_char_ind (equal to dimensionality of our one-hot character vectors) to a file
    numpy.save("onehot_size.npy", current_char_ind)
    # also the word-to-index dict
    cPickle.dump(char_to_ind, open("char_to_ind.pkl", mode="w"))
    # make a quick dirty reverse dict (actually a list) to map from indices to characters, so we can get readable output
    # later
    ind_to_char = [""]*len(char_to_ind)
    ind_to_char[0] = "<S>"
    ind_to_char[1] = "</S>"
    for char in char_to_ind:
        ind_to_char[char_to_ind[char]] = char
    cPickle.dump(ind_to_char, open("ind_to_char.pkl", mode="w"))
Code Example #27
File: blizzard_mgc_80h.py, Project: anirudh9119/play
def add_phonemes():
    data_path = os.environ['FUEL_DATA_PATH']
    data_path = os.path.join(data_path,'blizzard/')
    save_name = "sp_blizzard_80h_phon.hdf5"
    phon_file = "tbptt_blizzard_80h.hdf5"
    data_file = "sp_blizzard_80h.hdf5"

    save_path = os.path.join(data_path, save_name)
    phon_path = os.path.join(data_path, phon_file)
    data_path = os.path.join(data_path, data_file)

    resulth5 = h5py.File(save_path, mode='w')
    phonh5 = h5py.File(phon_path, mode = 'r')
    datah5 = h5py.File(data_path, mode = 'r')

    sp_h5 = resulth5.create_dataset(
                'sp', (TOTAL_ROWS, 512, 257), dtype='float32')
    f0_h5 = resulth5.create_dataset(
                'f0', (TOTAL_ROWS, 512), dtype='float32')

    phon_h5 = resulth5.create_dataset(
                'phonemes', (TOTAL_ROWS, 512), dtype = 'int16')

    f0_h5[:] = datah5['f0'][:]
    phon_h5[:] = phonh5['phonemes'][:,::64]

    n_times = 100
    idx = chunkIt(range(TOTAL_ROWS), n_times)

    for num_indx, indx in enumerate(idx):
        print num_indx, 100
        sp_h5[indx] = datah5['sp'][indx]

    cont = TOTAL_ROWS
    end_train = int(.9*cont)
    end_valid = int(.95*cont)
    end_test = cont

    split_dict = {
        'train': {'sp': (0, end_train),
                  'f0': (0, end_train),
                  'phonemes': (0, end_train)},
        'valid': {'sp': (end_train, end_valid),
                  'f0': (end_train, end_valid),
                  'phonemes': (end_train, end_valid)},
        'test': {'sp': (end_valid, end_test),
                 'f0': (end_valid, end_test),
                 'phonemes': (end_valid, end_test)}
        }

    resulth5.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    resulth5.flush()
    resulth5.close()

    phonh5.close()
    datah5.close()
Code Example #28
def build_hdf5_dataset(input_filename, output_filename,batch_size=64):
    """
    Builds a hdf5 dataset given the input one. The output one will have
    training, valid, and test as sources.
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")

    data = input_file["features"][:]
    data_length = data.shape[1] #

    #print "Sample from data: {}".format(data[70])
    #if not data_length % batch_size == 0:

    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    batch_index_train = int(0.9 * train_valid_length / float(batch_size))
    batch_index_valid = int(train_valid_length / float(batch_size))
    batch_index_test = int(data_length / float(batch_size))

    print "batch indices in order : {}".format((batch_index_train,
                                                batch_index_valid,
                                                batch_index_test))

    assert(train_valid_length == batch_index_valid * batch_size)

    data = data.reshape(data_length)[:batch_index_test*batch_size]
    data = data.reshape(batch_index_test,batch_size,1)
    print data.shape

    print ("values lost: {}").format(data_length - data.size)
    test_length = data_length - train_valid_length

    features = output_file.create_dataset(
        name='features' ,
        shape=data.shape,
        dtype='int16',
        data=data)

    features.dims[0].label = 'batch'
    features.dims[1].label = 'time'
    features.dims[2].label = 'feature'

    split_dict = {
        'train': {
            'features' : (0, batch_index_train)},
        'valid': {
            'features' : (batch_index_train + 1, batch_index_valid)},
        'test': {
            'features' : (batch_index_valid + 1,batch_index_test)}
    }

    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
Code Example #29
File: utils.py, Project: StevenLOL/video_predict
def save_hd5py(dataset_dict, destfile, indices_dict):
    f = h5py.File(destfile, mode='w')
    for name, dataset in dataset_dict.iteritems():
        dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype))
        dat[...] = dataset
    split_dict = dict((k, dict((name, v) for name in dataset_dict.iterkeys()))
            for k,v in indices_dict.iteritems())
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
Code Example #30
File: utils.py, Project: tencia/experiments
def save_hd5py(dataset_dict, destfile, indices_dict):
    f = h5py.File(destfile, mode='w')
    for name, dataset in dataset_dict.iteritems():
        dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype))
        dat[...] = dataset
    split_dict = dict((k, dict((name, v) for name in dataset_dict.iterkeys()))
                      for k, v in indices_dict.iteritems())
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
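
A hedged usage sketch for the dict-based helper above (toy array; the index ranges are purely illustrative, and the helper's iteritems/iterkeys calls assume Python 2):

import numpy as np

frames = np.zeros((100, 3, 32, 32), dtype='uint8')
save_hd5py({'images': frames}, 'videos.hdf5',
           {'train': (0, 90), 'test': (90, 100)})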
Code Example #31
def emboot_converter_traintrain(emboot_dataset, numpy_feature_train_file,
                                numpy_target_train_file):
    train_vector_features, train_targets = load_emboot_np(
        numpy_feature_train_file, numpy_target_train_file)
    f = h5py.File(emboot_dataset, mode='w')

    ## Add dummy feature for channel (to be used in convolutional operator)
    train_vector_features = np.expand_dims(train_vector_features, axis=1)

    train_sz = train_vector_features.shape[0]
    channel = train_vector_features.shape[1]
    ctx_size = train_vector_features.shape[2]
    embed_sz = train_vector_features.shape[3]
    dataset_sz = (
        train_sz - 16
    ) * 2  ## NOTE: 13900 * 2 (copy over the train data to the test dataset)

    vector_features = f.create_dataset(
        'features', (dataset_sz, channel, ctx_size, embed_sz),
        dtype='float64')  ## train + test
    targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8')

    ## put the data loaded into these objects

    train_vector_features_rounded = train_vector_features[:13900]
    train_targets_rounded = train_targets[:13900]

    vector_features[...] = np.vstack(
        [train_vector_features_rounded, train_vector_features_rounded])
    targets[...] = np.vstack([train_targets_rounded, train_targets_rounded])

    ## label the dims with names
    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'channel'
    vector_features.dims[2].label = 'word'
    vector_features.dims[3].label = 'embed'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    ## split attribute -- way to recover the splits
    # creating the split using an API
    split_dict = {
        'train': {
            'features': (0, dataset_sz // 2),
            'targets': (0, dataset_sz // 2)
        },
        'test': {
            'features': (dataset_sz // 2, dataset_sz),
            'targets': (dataset_sz // 2, dataset_sz)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.flush()
    f.close()
Code Example #32
def build_raw_interval_hdf5_dataset(youtube_id, hdf5_name, interval_size, window_size):
    data_stream = YouTubeAudio(youtube_id).get_example_stream()

    data_stream = Window(offset=interval_size,
                         source_window=interval_size*window_size,
                         target_window=interval_size*window_size,
                         overlapping=True,
                         data_stream=data_stream)

    data_iterator = data_stream.get_epoch_iterator()

    num_sequences = 0
    for data in data_iterator:
        num_sequences = num_sequences + 1

    output_path = '{}.hdf5'.format(hdf5_name)
    output_path = os.path.join(output_path)
    print 'total num sequences : ', num_sequences
    with h5py.File(output_path, mode='w') as h5file:
        input_feature  = h5file.create_dataset(name='input_feature' , shape=(num_sequences, window_size, interval_size), dtype='int16')
        target_feature = h5file.create_dataset(name='target_feature', shape=(num_sequences, window_size, interval_size), dtype='int16')

        data_iterator = data_stream.get_epoch_iterator()
        # for each batch
        for s_idx, sequence_data in enumerate(data_iterator):
            # get data
            source_data = sequence_data[0]
            target_data = sequence_data[1]

            # save data
            input_feature[s_idx]  = source_data.reshape(window_size, interval_size)
            target_feature[s_idx]  = target_data.reshape(window_size, interval_size)

        # label each dataset axis
        input_feature.dims[0].label = 'batch'
        input_feature.dims[1].label = 'time'
        input_feature.dims[2].label = 'feature'

        target_feature.dims[0].label = 'batch'
        target_feature.dims[1].label = 'time'
        target_feature.dims[2].label = 'feature'

        num_trains = int(num_sequences*0.8)

        split_dict = {'train': {'input_feature' : ( 0,  num_trains),
                                'target_feature': ( 0,  num_trains)},
                      'valid': {'input_feature' : ( num_trains,  num_sequences),
                                'target_feature': ( num_trains,  num_sequences)},
                      }
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
        h5file.close()

    return num_sequences
Code Example #33
def build_hdf5_dataset_single_dim(input_filename, output_filename):
    """
    Builds a hdf5 dataset given the input one. The output one will have
    training, valid, and test as sources.
    This function outputs a single dimension for the datasets.
    Adapted to monk_music
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")

    data = input_file["features"][:]
    data_length = data.shape[1]  #
    #if not data_length % batch_size == 0:

    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    index_train = int(0.9 * train_valid_length)
    index_valid = int(train_valid_length)
    index_test = int(data_length)

    print "batch indices in order : {}".format(
        (index_train, index_valid, index_test))

    data = data.reshape((data_length))

    print "Train example: {}".format(data[index_train - 100:index_train])
    print "Valid example: {}".format(data[index_valid - 100:index_valid])
    print "Test example: {}".format(data[index_test - 100:index_test])

    features = output_file.create_dataset(name='features',
                                          shape=data.shape,
                                          dtype='int16',
                                          data=data)

    #features.dims[0].label = 'batch'
    #features.dims[0].label = 'time'
    features.dims[0].label = 'feature'

    split_dict = {
        'train': {
            'features': (0, index_train)
        },
        'valid': {
            'features': (index_train + 1, index_valid)
        },
        'test': {
            'features': (index_valid + 1, index_test)
        }
    }

    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
Code Example #34
def to_fuel_h5(inputs, outputs, slices, names, file_name, file_path=''):
    """Transforms list of numpy arrays to a structured hdf5 file


    Args:
        inputs(list): a list of inputs(numpy.arrays)
        outputs(list): a list of outputs(numpy.arrays)
        slices(list): a list of int representing the end of a slice and the
            beginning of another slice. The last slice is automatically added
            if missing (maximum length of the inputs).
        names(list): a list of names for the datasets
        file_name(str): the name of the file to save.
        file_path(str): the path where the file is located

    Returns:
        The file full path
    """
    import h5py
    import os
    from fuel.datasets.hdf5 import H5PYDataset

    suffix = 'hdf5'

    inp = 'input_'
    out = 'output_'

    full_path = os.path.join(file_path, file_name.lower() + '.' + suffix)
    f = h5py.File(full_path, mode='w')

    dict_data_set = dict()
    split_dict = dict()
    for name in names:
        split_dict[name] = dict()

    slices.append(max_v_len(inputs))

    def insert_info_h5(iterable, suf):
        names_out = []
        for k, v in norm_iterator(iterable):
            dict_data_set[suf + k] = f.create_dataset(suf + k, v.shape,
                                                      v.dtype)
            dict_data_set[suf + k][...] = v
            for sl, name in zip(window(slices, 2), names):
                split_dict[name][suf + k] = sl
            names_out.append(suf + str(k))
        return names_out

    inputs_names = insert_info_h5(inputs, inp)
    outputs_names = insert_info_h5(outputs, out)

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    return full_path, inputs_names, outputs_names
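
A hedged usage sketch for to_fuel_h5 (toy arrays; the slice boundary 8 and the file name are purely illustrative, and the helpers norm_iterator, window and max_v_len are assumed to be defined elsewhere in the same module):

import numpy as np

x = np.random.rand(10, 3).astype('float32')
y = np.random.randint(0, 2, size=(10, 1)).astype('uint8')
path, input_names, output_names = to_fuel_h5([x], [y], [0, 8],
                                             ['train', 'test'],
                                             'toy_dataset', '/tmp')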
Code Example #35
def build_hdf5_dataset_single_dim(input_filename, output_filename):
    """
    Builds a hdf5 dataset given the input one. The output one will have
    training, valid, and test as sources.
    This function outputs a single dimension for the datasets.
    Adapted to monk_music
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")

    data = input_file["features"][:]
    data_length = data.shape[1] #
    #if not data_length % batch_size == 0:

    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    index_train = int(0.9 * train_valid_length)
    index_valid = int(train_valid_length)
    index_test = int(data_length)

    print "batch indices in order : {}".format((index_train,
                                                index_valid,
                                                index_test))

    data = data.reshape((data_length))

    print "Train example: {}".format(data[index_train-100:index_train])
    print "Valid example: {}".format(data[index_valid-100:index_valid])
    print "Test example: {}".format(data[index_test-100:index_test])


    features = output_file.create_dataset(
        name='features' ,
        shape=data.shape,
        dtype='int16',
        data=data)

    #features.dims[0].label = 'batch'
    #features.dims[0].label = 'time'
    features.dims[0].label = 'feature'

    split_dict = {
        'train': {
            'features' : (0,index_train)},
        'valid': {
            'features' : (index_train + 1,index_valid)},
        'test': {
            'features' : (index_valid + 1,index_test)}
    }

    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
Code Example #36
File: test_hdf5.py, Project: julianser/fuel
 def test_pickling(self):
     try:
         features = numpy.arange(360, dtype='uint8').reshape((10, 36))
         h5file = h5py.File('file.hdf5', mode='w')
         h5file['features'] = features
         split_dict = {'train': {'features': (0, 10, '.')}}
         h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
         dataset = cPickle.loads(
             cPickle.dumps(H5PYDataset(h5file, which_set='train')))
         assert dataset.data_sources is None
     finally:
         os.remove('file.hdf5')
Code Example #37
File: hdf5_converter.py, Project: ParsonsZeng/BGAN-1
    def convert2hdf5(self):

        imageLst = []

        index = self.lastIndex

        while True:

            if (index % 10000 == 0):
                print("Current Index: ", index)

            image = self.images[index]
            try:
                imgObj = Image.open(image).convert('RGB')
            except:
                # Skip unreadable images; advance the index to avoid an infinite retry loop.
                index += 1
                if index >= self.numExamples:
                    break
                continue

            imgObj = imgObj.resize((self.image_width, self.image_width))
            img = np.asarray(imgObj)
            if img.shape == (self.image_width, self.image_width, 3):
                imageLst.append([img])

            index += 1
            if index >= self.numExamples:
                break

            imgObj.close()

        anime_npy = np.vstack(imageLst).astype('uint8')

        anime_npy = anime_npy.transpose(0, 3, 1, 2)

        f = h5py.File('anime_faces.hdf5', mode='w')

        anime_faces = f.create_dataset('features',
                                       anime_npy.shape,
                                       dtype='uint8')

        split_dict = {
            'train': {
                'features': (0, index)
            },
            'test': {
                'features': (0, index)
            }
        }

        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        anime_faces[...] = anime_npy

        f.flush()
        f.close()
Code Example #38
File: readWikiData.py, Project: mithunpaul08/ladder
def fuel_converter(fuel_dataset, embeddings_train, labels_train,
                   embeddings_test, labels_test):
    f = h5py.File(fuel_dataset, mode='w')

    labels_train = np.expand_dims(labels_train, axis=1)
    labels_test = np.expand_dims(labels_test, axis=1)

    train_sz = embeddings_train.shape[0] - embeddings_train.shape[0] % 100
    test_sz = embeddings_test.shape[0] - embeddings_test.shape[0] % 100
    feat_sz = embeddings_train.shape[1]
    dataset_sz = train_sz + test_sz

    print("Actual Train size : ", embeddings_train.shape[0])
    print("Train size in Fuel : ", train_sz)

    print("Actual Test size : ", embeddings_test.shape[0])
    print("Test size in Fuel : ", test_sz)

    vector_features = f.create_dataset('features', (dataset_sz, feat_sz),
                                       dtype='float64')
    targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8')

    ## put the data loaded into these objects
    vector_features[...] = np.vstack(
        [embeddings_train[0:train_sz], embeddings_test[0:test_sz]])
    targets[...] = np.vstack(
        [labels_train[0:train_sz], labels_test[0:test_sz]])

    ## label the dims with names
    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    ## split attribute -- way to recover the splits
    # creating the split using an API
    split_dict = {
        'train': {
            'features': (0, train_sz),
            'targets': (0, train_sz)
        },
        'test': {
            'features': (train_sz, dataset_sz),
            'targets': (train_sz, dataset_sz)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.flush()
    f.close()
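
Because the dims were labelled above, the axis labels can be recovered when the file is loaded again. A small check, using 'wiki.hdf5' as a hypothetical value of fuel_dataset:

from fuel.datasets.hdf5 import H5PYDataset

train_set = H5PYDataset('wiki.hdf5', which_sets=('train',))
test_set = H5PYDataset('wiki.hdf5', which_sets=('test',))
print(train_set.num_examples, test_set.num_examples)   # train_sz and test_sz from above
print(train_set.axis_labels['features'])                # ('batch', 'feature')
print(train_set.axis_labels['targets'])                 # ('batch', 'index')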
Code example #39
File: test_hdf5.py Project: kastnerkyle/fuel
def test_h5py_dataset_pickles():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5', which_set='train')
        pickle.loads(pickle.dumps(dataset))
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code example #40
File: test_hdf5.py Project: nagyist/fuel
def test_h5py_dataset_pickles():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5', which_set='train')
        pickle.loads(pickle.dumps(dataset))
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code example #41
def output_hdf5(path_list, output_root_dir):
    num_data = len(path_list)
    shapes = []

    dirs = output_root_dir.split('\\')
    file_name = dirs[-1] + '.hdf5'
    output_root_dir = os.path.join(output_root_dir, file_name)

    f = h5py.File(output_root_dir, mode='w')
    dtype = h5py.special_dtype(vlen=np.dtype('uint8'))
    image_features = f.create_dataset('image_features',
                                      (num_data,),
                                      dtype=dtype)

    image_features.dims[0].label = 'batch'

    try:
        for i in tqdm.tqdm(range(num_data)):
            image = io.imread(path_list[i])
            shapes.append(image.shape)
            image_features[i] = image.flatten()

        shapes = np.array(shapes).astype(np.int32)
        image_features_shapes = f.create_dataset('image_features_shapes',
                                                 (num_data, 3),
                                                 dtype=np.int32)
        image_features_shapes[...] = shapes

        image_features.dims.create_scale(image_features_shapes, 'shapes')
        image_features.dims[0].attach_scale(image_features_shapes)

        image_features_shape_labels = f.create_dataset(
            'image_features_shape_labels', (3,), dtype='S7')
        image_features_shape_labels[...] = [
             'height'.encode('utf8'), 'width'.encode('utf8'),
             'channel'.encode('utf8')]
        image_features.dims.create_scale(
            image_features_shape_labels, 'shape_labels')
        image_features.dims[0].attach_scale(image_features_shape_labels)

        # specify the splits
        split_train = (0, num_data)
        split_dict = dict(train=dict(image_features=split_train))
        f.attrs["split"] = H5PYDataset.create_split_array(split_dict)

    except KeyboardInterrupt:
        print "割り込み停止が実行されました"

    f.flush()
    f.close()
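
Because a 'shapes' dimension scale was attached, fuel can restore each flattened row to its original (height, width, channel) shape on load. A readback sketch, with 'images.hdf5' standing in for the file written by output_hdf5:

from fuel.datasets.hdf5 import H5PYDataset

dataset = H5PYDataset('images.hdf5', which_sets=('train',),
                      sources=('image_features',))
handle = dataset.open()
images, = dataset.get_data(handle, slice(0, 4))
dataset.close(handle)
print([img.shape for img in images])   # per-image shapes from 'image_features_shapes'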
Code example #42
File: test_hdf5.py Project: cosmoharrigan/fuel
 def setUp(self):
     self.features = numpy.arange(3600, dtype='uint8').reshape((100, 36))
     self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1))
     self.h5file = h5py.File(
         'file.hdf5', mode='w', driver='core', backing_store=False)
     self.h5file['features'] = self.features
     self.h5file['features'].dims[0].label = 'batch'
     self.h5file['features'].dims[1].label = 'feature'
     self.h5file['targets'] = self.targets
     self.h5file['targets'].dims[0].label = 'batch'
     self.h5file['targets'].dims[1].label = 'index'
     split_dict = {'train': {'features': (0, 20, '.'), 'targets': (0, 20)},
                   'test': {'features': (20, 30, ''), 'targets': (20, 30)},
                   'unlabeled': {'features': (30, 100)}}
     self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
Code example #43
File: blizzard_mgc_80h.py Project: anirudh9119/play
def add_phonemes():
    data_path = os.environ["FUEL_DATA_PATH"]
    data_path = os.path.join(data_path, "blizzard/")
    save_name = "sp_blizzard_80h_phon.hdf5"
    phon_file = "tbptt_blizzard_80h.hdf5"
    data_file = "sp_blizzard_80h.hdf5"

    save_path = os.path.join(data_path, save_name)
    phon_path = os.path.join(data_path, phon_file)
    data_path = os.path.join(data_path, data_file)

    resulth5 = h5py.File(save_path, mode="w")
    phonh5 = h5py.File(phon_path, mode="r")
    datah5 = h5py.File(data_path, mode="r")

    sp_h5 = resulth5.create_dataset("sp", (TOTAL_ROWS, 512, 257), dtype="float32")
    f0_h5 = resulth5.create_dataset("f0", (TOTAL_ROWS, 512), dtype="float32")

    phon_h5 = resulth5.create_dataset("phonemes", (TOTAL_ROWS, 512), dtype="int16")

    f0_h5[:] = datah5["f0"][:]
    phon_h5[:] = phonh5["phonemes"][:, ::64]

    n_times = 100
    idx = chunkIt(range(TOTAL_ROWS), n_times)

    for num_indx, indx in enumerate(idx):
        print num_indx, n_times
        sp_h5[indx] = datah5["sp"][indx]

    cont = TOTAL_ROWS
    end_train = int(0.9 * cont)
    end_valid = int(0.95 * cont)
    end_test = cont

    split_dict = {
        "train": {"sp": (0, end_train), "f0": (0, end_train), "phonemes": (0, end_train)},
        "valid": {"sp": (end_train, end_valid), "f0": (end_train, end_valid), "phonemes": (end_train, end_valid)},
        "test": {"sp": (end_valid, end_test), "f0": (end_valid, end_test), "phonemes": (end_valid, end_test)},
    }

    resulth5.attrs["split"] = H5PYDataset.create_split_array(split_dict)

    resulth5.flush()
    resulth5.close()

    phonh5.close()
    datah5.close()
Code example #44
def save_hd5py(dataset_dict, destfile, indices_dict_or_numfolds):
    indices_dict = indices_dict_or_numfolds
    if isinstance(indices_dict, int):
        folds = indices_dict
        n = max(len(it) for it in dataset_dict.values())
        fold_n = n // folds
        indices_dict = dict(("fold_{}".format(i), (i * fold_n, (i + 1) * fold_n)) for i in range(folds))
        print indices_dict
    f = h5py.File(destfile, mode="w")
    for name, dataset in dataset_dict.iteritems():
        dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype))
        dat[...] = dataset
    split_dict = dict((k, dict((name, v) for name in dataset_dict.iterkeys())) for k, v in indices_dict.iteritems())
    f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
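
An illustrative call of the helper above, assuming two equally sized numpy arrays; passing an integer as the third argument builds contiguous 'fold_0' ... 'fold_{k-1}' splits covering every source (the array contents and the file name 'folds.hdf5' are assumptions):

import numpy as np

features = np.random.rand(100, 5).astype('float32')
targets = np.random.randint(0, 2, size=(100, 1)).astype('uint8')
# 5 folds of 20 consecutive examples each, applied to both sources.
save_hd5py({'features': features, 'targets': targets}, 'folds.hdf5', 5)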
Code example #45
File: test_hdf5.py Project: kastnerkyle/fuel
def test_h5py_dataset_axis_labels():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features.dims[0].label = 'batch'
        features.dims[1].label = 'feature'
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5', which_set='train')
        assert dataset.axis_labels == {'features': ('batch', 'feature')}
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code example #46
File: test_hdf5.py Project: nagyist/fuel
def test_h5py_dataset_axis_labels():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features.dims[0].label = 'batch'
        features.dims[1].label = 'feature'
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5', which_set='train')
        assert dataset.axis_labels == {'features': ('batch', 'feature')}
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code example #47
File: test_hdf5.py Project: Commonlibs/fuel
 def test_pickling(self):
     try:
         features = numpy.arange(360, dtype='uint16').reshape((10, 36))
         h5file = h5py.File('file.hdf5', mode='w')
         h5file['features'] = features
         split_dict = {'train': {'features': (0, 10, None, '.')}}
         h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
         dataset = cPickle.loads(
             cPickle.dumps(H5PYDataset(h5file, which_sets=('train',))))
         # Make sure _out_of_memory_{open,close} accesses
         # external_file_handle rather than _external_file_handle
         dataset._out_of_memory_open()
         dataset._out_of_memory_close()
         assert dataset.data_sources is None
     finally:
         os.remove('file.hdf5')
Code example #48
 def test_pickling(self):
     try:
         features = numpy.arange(360, dtype='uint16').reshape((10, 36))
         h5file = h5py.File('file.hdf5', mode='w')
         h5file['features'] = features
         split_dict = {'train': {'features': (0, 10, None, '.')}}
         h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
         dataset = cPickle.loads(
             cPickle.dumps(H5PYDataset(h5file, which_sets=('train', ))))
         # Make sure _out_of_memory_{open,close} accesses
         # external_file_handle rather than _external_file_handle
         dataset._out_of_memory_open()
         dataset._out_of_memory_close()
         assert dataset.data_sources is None
     finally:
         os.remove('file.hdf5')
Code example #49
def infer(path, ae_encode):
    '''
    :param path: path of infer data
    :param ae_encode: compiled theano function
    :return: image saved path in string
    '''

    hf = h5py.File(path, 'r+')

    split_dict = {
        'test': {
            'input': (0, 1),
            'target': (0, 1)
        },
    }
    hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    test_set = H5PYDataset(path, which_sets=('test', ))

    batch_size = 1

    test_scheme = SequentialScheme(examples=test_set.num_examples,
                                   batch_size=batch_size)
    test_stream = DataStream(test_set, iteration_scheme=test_scheme)

    for te_train, te_target in test_stream.get_epoch_iterator():
        break
    te_out, te_ta = ae_encode(input_transform(te_train),
                              target_transform(te_target))
    te_reshape = inverse(te_out)
    te_target_reshape = inverse(te_ta)

    new_size = (128 * 2, 160)
    new_im = Image.new('RGB', new_size)
    r = np.random.choice(1, 1, replace=False).reshape(1, 1)
    for i in range(1):
        for j in range(1):
            index = r[i][j]

            target_im = Image.fromarray(te_target_reshape[index])
            train_im = Image.fromarray(te_train[index].astype(np.uint8))
            im = Image.fromarray(te_reshape[index])

            new_im.paste(train_im, (128 * (i * 2), 160 * j))
            new_im.paste(im, (128 * (i * 2 + 1), 160 * j))
    img_loc = "gen_images/%i.png" % int(time())
    new_im.save(img_loc)
    return img_loc
Code example #50
def text_to_h5py_dataset(text_path, dst_path):
    # The simplest is to load everything to memory first.
    # If memory becomes an issue, this code can be optimized.
    words = []
    with open(text_path, 'r') as src:
        for line in src:
            words.extend(line.strip().split())

    with h5py.File(dst_path, 'w') as dst:
        dtype = h5py.special_dtype(vlen=bytes)
        table = dst.create_dataset('words', (len(words), ), dtype=dtype)
        table[:] = words

        dst.attrs['split'] = H5PYDataset.create_split_array(
            {'train': {
                'words': (0, len(words))
            }})
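
A readback sketch for the word table created above (here 'corpus.hdf5' is a hypothetical destination path); each row comes back as a byte string:

from fuel.datasets.hdf5 import H5PYDataset

dataset = H5PYDataset('corpus.hdf5', which_sets=('train',))
handle = dataset.open()
words, = dataset.get_data(handle, slice(0, 5))   # first five tokens
dataset.close(handle)
print(words)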
Code example #51
File: data.py Project: ccienfall/Face-Verification
def data():

    try:
        hf["target"].shape
    except Exception:
        # `hf` is not defined (or not usable) at this point, so open the file.
        hf = h5py.File('faces.hdf5', 'r+')
    num_samples = hf["input"].shape[0]

    print "number of samples in dataset : %i" % num_samples

    split_dict = {
        'train': {
            'input': (2000, num_samples),
            'target': (2000, num_samples)
        },
        'test': {
            'input': (0, 1000),
            'target': (0, 1000)
        },
        'val': {
            'input': (1000, 2000),
            'target': (1000, 2000)
        }
    }
    hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    train_set = H5PYDataset('faces.hdf5', which_sets=('train', ))
    test_set = H5PYDataset('faces.hdf5', which_sets=('test', ))
    val_set = H5PYDataset('faces.hdf5', which_sets=('val', ))

    batch_size = 128

    #TODO : use shuffledscheme instead?  Seems slower, might have screwed up the chunksize in the HDF5 files?

    tr_scheme = SequentialScheme(examples=train_set.num_examples,
                                 batch_size=batch_size)
    tr_stream = DataStream(train_set, iteration_scheme=tr_scheme)

    val_scheme = SequentialScheme(examples=val_set.num_examples,
                                  batch_size=batch_size)
    val_stream = DataStream(val_set, iteration_scheme=val_scheme)

    test_scheme = SequentialScheme(examples=test_set.num_examples,
                                   batch_size=batch_size)
    test_stream = DataStream(test_set, iteration_scheme=test_scheme)
    hf.close()
    return num_samples, train_set, test_set, val_set, tr_scheme, tr_stream, val_scheme, val_stream, test_scheme, test_stream
Code example #52
File: test_fuel.py Project: zhaobozb/test-fuel
def gen_vlen_dataset():
    import h5py
    from fuel.datasets.hdf5 import H5PYDataset

    sizes = numpy.random.randint(3, 9, size=(100,))
    train_image_features = [numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[:90]]
    test_image_features = [numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[90:]]

    f = h5py.File('dataset_vlen.h5', mode='w')
    f['vector_features'] = numpy.vstack([numpy.load('train_vector_features.npy'), numpy.load('test_vector_features.npy')])
    f['targets'] = numpy.vstack([numpy.load('train_targets.npy'), numpy.load('test_targets.npy')])

    f['vector_features'].dims[0].label = 'batch'
    f['vector_features'].dims[1].label = 'feature'
    f['targets'].dims[0].label = 'batch'
    f['targets'].dims[1].label = 'index'

    all_image_features = train_image_features + test_image_features
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    image_features = f.create_dataset('image_features', (100,), dtype=dtype)
    image_features[...] = [image.flatten() for image in all_image_features]
    image_features.dims[0].label = 'batch'

    image_features_shapes = f.create_dataset('image_features_shapes', (100, 3), dtype='int32')
    image_features_shapes[...] = numpy.array([image.shape for image in all_image_features])

    image_features.dims.create_scale(image_features_shapes, 'shapes')
    image_features.dims[0].attach_scale(image_features_shapes)

    image_features_shape_labels = f.create_dataset('image_features_shape_labels', (3,), dtype='S7')
    image_features_shape_labels[...] = ['channel'.encode('utf8'), 'height'.encode('utf8'), 'width'.encode('utf8')]
    image_features.dims.create_scale(image_features_shape_labels, 'shape_labels')
    image_features.dims[0].attach_scale(image_features_shape_labels)

    split_dict = {'train': {'vector_features': (0, 90), 'image_features': (0, 90), 'targets': (0, 90)},
                  'test': {'vector_features': (90, 100), 'image_features': (90, 100), 'targets': (90, 100)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()

    train_set = H5PYDataset('dataset_vlen.h5', which_sets=('train',), sources=('image_features',))
    print(train_set.axis_labels['image_features'])
    handle = train_set.open()
    images, = train_set.get_data(handle, slice(0, 10))
    train_set.close(handle)
    print(images[0].shape, images[1].shape, images[2].shape, images[3].shape)
Code example #53
def make_lsun_dataset(scene_path, fuel_hdf5_path, resize_shape):

    # get image list
    image_list = []
    for root, dirs, files in os.walk(scene_path):
        for filename in fnmatch.filter(files, "*.jpg"):
            image_list.append(os.path.join(root, filename))
    num_images = len(image_list)

    print "num of images :{}".format(num_images)

    # open image file
    fuel_file = h5py.File(name=fuel_hdf5_path, mode="w")

    # set new dataset for fuel file
    image_data = fuel_file.create_dataset(name="image_data", shape=(num_images, 3) + resize_shape, dtype="uint8")

    for idx, filepath in enumerate(image_list):
        original_image = Image.open(filepath).convert("RGB")
        resize_row = resize_shape[0] if original_image.size[0] < original_image.size[1] else original_image.size[0]
        resize_col = resize_shape[1] if original_image.size[0] > original_image.size[1] else original_image.size[1]
        original_image.thumbnail((resize_row, resize_col), Image.ANTIALIAS)

        if original_image.size[0] != resize_shape[0]:
            excess = (original_image.size[0] - resize_shape[0]) / 2
            original_image = original_image.crop((excess, 0, resize_shape[0] + excess, resize_shape[0]))
        elif original_image.size[1] != resize_shape[1]:
            excess = (original_image.size[1] - resize_shape[1]) / 2
            original_image = original_image.crop((0, excess, resize_shape[1], resize_shape[1] + excess))

        original_image = numpy.asarray(original_image)
        image_data[idx] = numpy.transpose(original_image, (2, 0, 1))

    image_data.dims[0].label = "batch"
    image_data.dims[1].label = "channel"
    image_data.dims[2].label = "height"
    image_data.dims[3].label = "width"

    split_dict = {"train": {"image_data": (0, num_images)}}
    fuel_file.attrs["split"] = H5PYDataset.create_split_array(split_dict)

    fuel_file.flush()
    fuel_file.close()

    print "DONE : {} (num of images :{})".format(fuel_hdf5_path, num_images)
Code example #54
File: test_hdf5.py Project: jeanmarcosdarosa/fuel
def test_h5py_dataset_out_of_memory_unsorted_indices():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(
            path='tmp.hdf5', which_set='train', load_in_memory=False,
            sort_indices=False)
        handle = dataset.open()
        assert_raises(TypeError, dataset.get_data, handle, [7, 4, 6, 2, 5])
    finally:
        dataset.close(handle)
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code example #55
def add_split_dict(hdf5file, names, total_examples,
                   train_frac=0.83, valid_frac=0.10):
    # TODO: investigate the "reference" stuff so we can pluck validation
    # and testing events evenly from the sample
    final_train_index = int(total_examples * train_frac)
    final_valid_index = int(total_examples * (train_frac + valid_frac))

    train_dict = {name: (0, final_train_index)
                  for name in names}
    valid_dict = {name: (final_train_index, final_valid_index)
                  for name in names}
    test_dict = {name: (final_valid_index, total_examples)
                 for name in names}
    split_dict = {
        'train': train_dict,
        'valid': valid_dict,
        'test': test_dict
    }
    hdf5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
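
For illustration, the helper above could be applied to a freshly created file with two sources; the dataset names, sizes, and the path 'events.hdf5' are assumptions here:

import h5py
import numpy as np

with h5py.File('events.hdf5', mode='w') as f:
    hits = f.create_dataset('hits', (1000, 64), dtype='float32')
    labels = f.create_dataset('labels', (1000, 1), dtype='uint8')
    hits[...] = np.random.rand(1000, 64).astype('float32')
    labels[...] = np.random.randint(0, 2, size=(1000, 1)).astype('uint8')
    # ~83% train, ~10% valid, remainder test, per the default fractions above.
    add_split_dict(f, ['hits', 'labels'], 1000)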
Code example #56
File: make_dataset.py Project: taesupkim/dcgan_code
def make_celeb_dataset(fuel_hdf5_path,
                       resize_shape):
    # get image list
    image_list = glob.glob(CELEBA_FACE_FOLDER + '*.jpg')
    num_images = len(image_list)

    # open image file
    fuel_file = h5py.File(name=fuel_hdf5_path, mode='w')

    # set new dataset for fuel file
    image_data = fuel_file.create_dataset(name='image_data',
                                          shape=(num_images, 3) + resize_shape,
                                          dtype='uint8')

    for idx, filepath in enumerate(image_list):
        original_image = Image.open(filepath).convert('RGB')
        resize_row = resize_shape[0] if original_image.size[0] < original_image.size[1] else original_image.size[0]
        resize_col = resize_shape[1] if original_image.size[0] > original_image.size[1] else original_image.size[1]
        original_image.thumbnail((resize_row, resize_col), Image.ANTIALIAS)

        if original_image.size[0] != resize_shape[0]:
            excess = (original_image.size[0] - resize_shape[0]) / 2
            original_image = original_image.crop((excess, 0, resize_shape[0]+excess, resize_shape[0]))
        elif original_image.size[1] != resize_shape[1]:
            excess = (original_image.size[1] - resize_shape[1]) / 2
            original_image = original_image.crop((0, excess, resize_shape[1], resize_shape[1]+excess))

        original_image = numpy.asarray(original_image)
        image_data[idx] = numpy.transpose(original_image, (2, 0, 1))

    image_data.dims[0].label = 'batch'
    image_data.dims[1].label = 'channel'
    image_data.dims[2].label = 'height'
    image_data.dims[3].label = 'width'

    split_dict = {'train': {'image_data': (0, num_images)}}
    fuel_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    fuel_file.flush()
    fuel_file.close()

    print 'DONE : {} (num of images :{})'.format(fuel_hdf5_path, num_images)
Code example #57
File: hdf5.py Project: grappli/pm1
def create_hdf5(np_enc_data, np_enc_y, splitpoint, N):

    hdf5name = 'mushrooms.hdf5'
    f = h5py.File(hdf5name, mode='w')

    fx = f.create_dataset('x', np_enc_data.shape, dtype='float32')
    fy = f.create_dataset('y', np_enc_y.shape, dtype='int64')

    fx[...] = np_enc_data
    fy[...] = np_enc_y

    split_dict = {
        'train': {'x': (0, splitpoint), 'y': (0, splitpoint)},
        'test': {'x': (splitpoint, N), 'y': (splitpoint, N)}}

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.flush()
    f.close()
Code example #58
File: test_hdf5.py Project: kastnerkyle/fuel
def test_h5py_dataset_out_of_memory():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(
            path='tmp.hdf5', which_set='train', load_in_memory=False)
        handle = dataset.open()
        assert_equal(
            dataset.get_data(state=handle, request=slice(0, 10))[0],
            numpy.arange(50).reshape((10, 5)))
        dataset.close(handle)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code example #59
File: scan2hdf5.py Project: aaalgo/nnexp
def save_hd5py(out_path, data, folds=0):
    images = np.concatenate([a[0] for a in data], axis = 0)
    labels = np.concatenate([a[1] for a in data], axis = 0)
    f = h5py.File(out_path, mode='w')
    ds = f.create_dataset('images', images.shape, dtype=str(images.dtype))
    ds[...] = images
    ds = f.create_dataset('labels', labels.shape, dtype=str(labels.dtype))
    ds[...] = labels
    #assert(folds > 1)
    #if folds > 1:
    fold = len(images) // folds
    idx = {'fold-{}'.format(i): (i*fold, (i+1)*fold) for i in range(folds)}
    print idx
    split_dict = {k: {'images': v, 'labels':v} for k, v in idx.iteritems()}
    from fuel.datasets.hdf5 import H5PYDataset
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()