Code Example #1
 def test_multiple_split_out_of_memory_list_request(self):
     dataset = H5PYDataset(self.h5file, which_sets=('train', 'test'),
                           load_in_memory=False)
     handle = dataset.open()
     assert_equal(dataset.get_data(handle, list(range(30))),
                  (self.features[:30], self.targets[:30]))
     dataset.close(handle)
Code Example #2
def load_ucf101_stream(batch_size,
                       train_size=16500,
                       validation_size=500,
                       test_size=100,
                       shuffle=False):
    fuel_root = fuel.config.data_path[0]
    # Location of the dataset file; build the path with os.path.join
    # instead of a hard-coded backslash-separated Windows path.
    hdf5_filepath = os.path.join(fuel_root, 'UCF101', 'hdf5_dataset',
                                 'hdf5_dataset.hdf5')
    # End indices of the three contiguous splits
    train_end = train_size
    valid_end = train_end + validation_size
    test_end = valid_end + test_size
    indices_train = range(0, train_end)
    indices_valid = range(train_end, valid_end)
    indices_test = range(valid_end, test_end)

    h5py_file = h5py.File(hdf5_filepath, mode='r')
    dataset = H5PYDataset(h5py_file, which_sets=('train',))

    scheme_class = ShuffledScheme if shuffle else SequentialScheme
    scheme_train = scheme_class(indices_train, batch_size=batch_size)
    scheme_valid = scheme_class(indices_valid, batch_size=batch_size)
    scheme_test = scheme_class(indices_test, batch_size=batch_size)

    stream_train = DataStream(dataset, iteration_scheme=scheme_train)
    stream_valid = DataStream(dataset, iteration_scheme=scheme_valid)
    stream_test = DataStream(dataset, iteration_scheme=scheme_test)
    # Pull one batch from each stream so problems with the file surface here
    next(stream_train.get_epoch_iterator())
    next(stream_valid.get_epoch_iterator())
    next(stream_test.get_epoch_iterator())

    return stream_train, stream_valid, stream_test
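
A minimal consumption sketch for the streams returned above (the source names 'features' and 'targets' are assumptions about the HDF5 file, not guaranteed by this code):

stream_train, stream_valid, stream_test = load_ucf101_stream(batch_size=32)
for batch in stream_train.get_epoch_iterator(as_dict=True):
    # batch maps source names to numpy arrays, e.g. batch['features']
    train_on(batch)  # hypothetical training step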
Code Example #3
File: test_hdf5.py Project: zhoujian1210/fuel
 def test_out_of_memory_example_scheme(self):
     dataset = H5PYDataset(self.h5file,
                           which_sets=('train', ),
                           load_in_memory=False)
     iter_ = dataset.get_example_stream().get_epoch_iterator()
     assert_equal(next(iter_), (self.features[0], self.targets[0]))
     assert_equal(next(iter_), (self.features[1], self.targets[1]))
Code Example #4
File: bw_rnn.py Project: ParsonsZeng/BGAN-1
def load_stream(batch_size=64, source=None):
    if source is None:
        raise ValueError('Source not set.')
    train_data = H5PYDataset(source, which_sets=('train',))
    train_scheme = ShuffledScheme(examples=train_data.num_examples, batch_size=batch_size)
    train_stream = OneHotEncoding(DataStream(train_data, iteration_scheme=train_scheme), N_WORDS)
    return train_stream, train_data.num_examples
Code Example #5
def generate():
    generator = get_model(get_config("model-file"))
    data_path = get_config("data-file")
    shape = get_config("img-shape")
    feature_dim = get_config("feature-dim") or 40
    attribute = read_feature(get_config("feature-file"))
    r, c = 5, 5

    train_set = H5PYDataset(data_path, which_sets=('train', ))
    handle = train_set.open()
    _, real_features = train_set.get_data(handle, slice(0, 50000))

    with open("feature.txt", "r") as fin:
        feature = fin.readline().strip().split(",")
    feature = choose_feature(real_features, feature)
    feature = np.array(np.repeat([feature], r * c, axis=0), dtype='float64')
    # key = feature_map["Young"]
    # for i in range(10):
    #     feature[i][key-1] = 0.1 * i

    show_feature(feature[0])
    noise = get_noise(r * c)
    imgs = generator.predict([noise, feature])
    imgs = [convert_to_img(img) for img in imgs]

    figure = fill_figure(r, c, shape, imgs)
    save_img("result", figure)
Code Example #6
File: test_hdf5.py Project: zhoujian1210/fuel
 def test_index_split_in_memory(self):
     features = numpy.arange(50, dtype='uint8').reshape((10, 5))
     h5file = h5py.File('index_split.hdf5',
                        mode='w',
                        driver='core',
                        backing_store=False)
     h5file['features'] = features
     h5file['features'].dims[0].label = 'batch'
     h5file['features'].dims[1].label = 'feature'
     h5file['train_features_subset'] = numpy.arange(0, 10, 2)
     h5file['test_features_subset'] = numpy.arange(1, 10, 2)
     train_ref = h5file['train_features_subset'].ref
     test_ref = h5file['test_features_subset'].ref
     split_dict = {
         'train': {
             'features': (-1, -1, train_ref, '.')
         },
         'test': {
             'features': (-1, -1, test_ref, '')
         }
     }
     h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
     dataset = H5PYDataset(h5file,
                           which_sets=('train', ),
                           load_in_memory=True)
     handle = dataset.open()
     request = slice(0, 5)
     assert_equal(dataset.get_data(handle, request)[0], features[0:10:2])
     assert_equal(dataset.num_examples, 5)
     dataset.close(handle)
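
For comparison with the reference-based index split above, contiguous splits are declared with plain (start, stop) tuples (as in Code Example #8 below); a minimal sketch for the same 10-example 'features' array:

split_dict = {
    'train': {'features': (0, 5)},
    'test': {'features': (5, 10)},
}
h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)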
Code Example #7
File: load.py Project: zhangqianhui/iGAN
def load_imgs_raw(ntrain=None, ntest=None, data_dir=None):
    t = time()
    print("LOADING DATASET...")
    path = os.path.join(data_dir)
    tr_data = H5PYDataset(path, which_sets=('train', ))
    te_data = H5PYDataset(path, which_sets=('test', ))

    if ntrain is None:
        ntrain = tr_data.num_examples
    if ntest is None:
        ntest = te_data.num_examples

    print('name = %s, ntrain = %d, ntest = %d\n' % (data_dir, ntrain, ntest))
    print('%.2f seconds to load data' % (time() - t))

    return tr_data, te_data, ntrain, ntest
Code Example #8
File: test_hdf5.py Project: jeanmarcosdarosa/fuel
def test_h5py_dataset_out_of_memory():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        targets = h5file.create_dataset('targets', (10, 1), dtype='float32')
        targets[...] = numpy.arange(10, dtype='float32').reshape((10, 1))
        split_dict = {
            'train': {
                'features': (0, 5),
                'targets': (0, 5)
            },
            'test': {
                'features': (5, 10),
                'targets': (5, 10)
            }
        }
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5',
                              which_set='test',
                              load_in_memory=False)
        handle = dataset.open()
        assert_equal(
            dataset.get_data(state=handle, request=slice(3, 5))[1],
            numpy.arange(10).reshape((10, 1))[8:10])
    finally:
        dataset.close(handle)
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Code Example #9
File: test_hdf5.py Project: julianser/fuel
 def test_out_of_memory(self):
     dataset = H5PYDataset(
         self.h5file, which_set='test', load_in_memory=False)
     handle = dataset.open()
     assert_equal(dataset.get_data(handle, slice(3, 5)),
                  (self.features[23:25], self.targets[23:25]))
     dataset.close(handle)
Code Example #10
File: test_hdf5.py Project: julianser/fuel
 def test_out_of_memory_unsorted_indices(self):
     dataset = H5PYDataset(
         self.h5file, which_set='train', load_in_memory=False,
         sort_indices=False)
     handle = dataset.open()
     assert_raises(TypeError, dataset.get_data, handle, [7, 4, 6, 2, 5])
     dataset.close(handle)
Code Example #11
File: fuel_helper.py Project: chiragmajithia/plat
def get_dataset_iterator(dataset,
                         split,
                         include_features=True,
                         include_targets=False,
                         unit_scale=True):
    """Get iterator for dataset, split, targets (labels) and scaling (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split, )

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    datastream = H5PYDataset(dataset_fname, which_sets=splits, sources=sources)
    if unit_scale:
        datastream.default_transformers = uint8_pixels_to_floatX(
            ('features', ))

    train_stream = DataStream.default_stream(
        dataset=datastream,
        iteration_scheme=SequentialExampleScheme(datastream.num_examples))

    it = train_stream.get_epoch_iterator()
    return it
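
Hypothetical usage of the iterator returned above (the dataset name 'mnist' is an assumption; any Fuel-formatted HDF5 file found by find_in_data_path would work):

it = get_dataset_iterator('mnist', 'train', include_targets=True)
features, targets = next(it)  # one example at a time (SequentialExampleScheme)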
Code Example #12
File: fuel_utils.py Project: gunkisu/asr
def get_datastream(path,
                   which_set,
                   batch_size=1,
                   norm_path=None,
                   use_ivectors=False,
                   truncate_ivectors=False,
                   ivector_dim=100,
                   shuffled=True):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    if shuffled:
        iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                         examples=wsj_dataset.num_examples)
    else:
        iterator_scheme = SequentialScheme(batch_size=batch_size,
                                           examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)

    if norm_path:
        data_mean_std = numpy.load(norm_path)
        base_stream = Normalize(data_stream=base_stream,
                                means=data_mean_std['mean'],
                                stds=data_mean_std['std'])

    if use_ivectors:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'ivectors', 'targets'])
        if truncate_ivectors:
            fs = TruncateTransformer(fs, 'ivectors', ivector_dim)
        # fs = ConcatenateTransformer(fs, ['features', 'ivectors'], 'features')
    else:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'targets'])
    return Padding(fs)
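
Fuel's Padding transformer inserts a '<source>_mask' entry right after each padded source, so batches from the stream above unpack roughly as follows (a sketch; the path and set name are placeholders, and the source list changes when use_ivectors=True):

stream = get_datastream('wsj.hdf5', 'train_si84', batch_size=16)
for features, features_mask, targets, targets_mask in stream.get_epoch_iterator():
    # features: (batch, max_length, feature_dim); masks: (batch, max_length)
    pass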
Code Example #13
 def test_multiple_split_in_memory(self):
     dataset = H5PYDataset(self.h5file, which_sets=('train', 'test'),
                           load_in_memory=True)
     handle = dataset.open()
     assert_equal(dataset.get_data(handle, slice(0, 30)),
                  (self.features[:30], self.targets[:30]))
     dataset.close(handle)
Code Example #14
def get_uttid_stream(path, which_set='test_eval92', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    print(path, which_set)
    iterator_scheme = SequentialScheme(examples=wsj_dataset.num_examples, batch_size=batch_size)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['uttids'])
    return fs
Code Example #15
File: load_data.py Project: mnabihali/kalpy
def load_swbd_dataset(setname, ratio=1.0, min_count=3):
    logger.info('loading data from %s' % setname)
    logger.info('Ratio of labeled / unlabeled: %f' % ratio)
    assert 0 < ratio <= 1
    filename = '/home/tawara/work/ttic/data/icassp15.0/' + setname + '.hdf5'
    swbd = H5PYDataset(filename, which_set='train', load_in_memory=True)
    x_train = swbd.data_sources[0].astype(numpy.float32)
    y_train = swbd.data_sources[1][:, 0].astype(numpy.int32)
    ndata = x_train.shape[0]
    if ratio == 1.0:
        logger.info("Using all data: %d" % ndata)
        i_labeled = numpy.array(range(0, ndata))
    else:
        n_classes = y_train.max() + 1
        min_samples = sys.maxsize
        max_samples = 0
        avg_samples = 0
        indices = numpy.array(range(0, ndata))
        i_labeled = []
        n_labeled_all = 0
        for c in range(n_classes):
            n_samples = max(int(round(numpy.sum(y_train == c) * ratio)),
                            min_count)
            i = (indices[y_train == c])[:n_samples]
            i_labeled += list(i)
            avg_samples += n_samples
            min_samples = min(min_samples, n_samples)
            max_samples = max(max_samples, n_samples)
            n_labeled_all += n_samples
        logger.info('Minimum number of labeled samples: %d' % min_samples)
        logger.info('Maximum number of labeled samples: %d' % max_samples)
        logger.info('Average number of labeled samples: %d' %
                    int(avg_samples / n_classes))
        logger.info('Total number of labeled samples:   %d' % n_labeled_all)
        logger.info('Total number of unlabeled samples: %d' %
                    int(ndata - n_labeled_all))

    swbd = H5PYDataset(filename, which_set='dev', load_in_memory=True)
    x_valid = swbd.data_sources[0].astype(numpy.float32)
    y_valid = swbd.data_sources[1][:, 0].astype(numpy.int32)
    swbd = H5PYDataset(filename, which_set='test', load_in_memory=True)
    x_test = swbd.data_sources[0].astype(numpy.float32)
    y_test = swbd.data_sources[1][:, 0].astype(numpy.int32)

    return [(x_train, y_train), (x_valid, y_valid), (x_test, y_test),
            i_labeled]
Code Example #16
 def test_index_subset_sorted(self):
     dataset = H5PYDataset(
         self.h5file, which_sets=('train',), subset=[0, 2, 4])
     handle = dataset.open()
     request = slice(0, 3)
     assert_equal(dataset.get_data(handle, request),
                  (self.features[[0, 2, 4]], self.targets[[0, 2, 4]]))
     dataset.close(handle)
Code Example #17
def load_stream(batch_size=None, source=None):
    logger.info('Loading data from `{}`'.format(source))

    train_data = H5PYDataset(source, which_sets=('train', ))
    test_data = H5PYDataset(source, which_sets=('test', ))

    num_train = train_data.num_examples
    num_test = test_data.num_examples

    logger.debug('Number of test examples: {}'.format(num_test))
    logger.debug('Number of training examples: {}'.format(num_train))

    train_scheme = ShuffledScheme(examples=num_train, batch_size=batch_size)
    train_stream = DataStream(train_data, iteration_scheme=train_scheme)
    test_scheme = ShuffledScheme(examples=num_test, batch_size=batch_size)
    test_stream = DataStream(test_data, iteration_scheme=test_scheme)
    return train_stream, num_train
Code Example #18
File: data.py Project: ccienfall/Face-Verification
def data():

    hf = h5py.File('faces.hdf5', 'r+')
    num_samples = hf["input"].shape[0]

    print("number of samples in dataset : %i" % num_samples)

    split_dict = {
        'train': {
            'input': (2000, num_samples),
            'target': (2000, num_samples)
        },
        'test': {
            'input': (0, 1000),
            'target': (0, 1000)
        },
        'val': {
            'input': (1000, 2000),
            'target': (1000, 2000)
        }
    }
    hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    train_set = H5PYDataset('faces.hdf5', which_sets=('train', ))
    test_set = H5PYDataset('faces.hdf5', which_sets=('test', ))
    val_set = H5PYDataset('faces.hdf5', which_sets=('val', ))

    batch_size = 128

    #TODO : use shuffledscheme instead?  Seems slower, might have screwed up the chunksize in the HDF5 files?

    tr_scheme = SequentialScheme(examples=train_set.num_examples,
                                 batch_size=batch_size)
    tr_stream = DataStream(train_set, iteration_scheme=tr_scheme)

    val_scheme = SequentialScheme(examples=val_set.num_examples,
                                  batch_size=batch_size)
    val_stream = DataStream(val_set, iteration_scheme=val_scheme)

    test_scheme = SequentialScheme(examples=test_set.num_examples,
                                   batch_size=batch_size)
    test_stream = DataStream(test_set, iteration_scheme=test_scheme)
    hf.close()
    return num_samples, train_set, test_set, val_set, tr_scheme, tr_stream, val_scheme, val_stream, test_scheme, test_stream
Code Example #19
File: fuel_utils.py Project: gunkisu/asr
def get_spkid_stream(path, which_set, batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = SequentialScheme(examples=wsj_dataset.num_examples,
                                       batch_size=batch_size)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['spks'])
    return fs
Code Example #20
File: test_hdf5.py Project: julianser/fuel
 def test_in_memory(self):
     dataset = H5PYDataset(
         self.h5file, which_set='train', load_in_memory=True)
     handle = dataset.open()
     request = slice(0, 10)
     assert_equal(dataset.get_data(handle, request),
                  (self.features[request], self.targets[request]))
     dataset.close(handle)
Code Example #21
 def test_subset_step_gt_1(self):
     dataset = H5PYDataset(self.h5file,
                           which_sets=('train', ),
                           subset=slice(0, 10, 2))
     handle = dataset.open()
     assert_equal(
         dataset.get_data(handle, [0, 1, 2, 3, 4]),
         (self.features[slice(0, 10, 2)], self.targets[slice(0, 10, 2)]))
     dataset.close(handle)
Code Example #22
 def test_index_subset_unsorted(self):
     # A subset should have the same ordering no matter how you specify it.
     dataset = H5PYDataset(
         self.h5file, which_sets=('train',), subset=[0, 4, 2])
     handle = dataset.open()
     request = slice(0, 3)
     assert_equal(dataset.get_data(handle, request),
                  (self.features[[0, 2, 4]], self.targets[[0, 2, 4]]))
     dataset.close(handle)
Code Example #23
 def test_vlen_in_memory_example_scheme(self):
     dataset = H5PYDataset(
         self.vlen_h5file, which_sets=('train',), load_in_memory=True,
         sort_indices=False)
     iter_ = dataset.get_example_stream().get_epoch_iterator()
     assert_equal(next(iter_),
                  (self.vlen_features[0], self.vlen_targets[0]))
     assert_equal(next(iter_),
                  (self.vlen_features[1], self.vlen_targets[1]))
Code Example #24
File: test_hdf5.py Project: julianser/fuel
 def test_out_of_memory_sorted_indices(self):
     dataset = H5PYDataset(
         self.h5file, which_set='train', load_in_memory=False,
         sort_indices=True)
     handle = dataset.open()
     request = [7, 4, 6, 2, 5]
     assert_equal(dataset.get_data(handle, request),
                  (self.features[request], self.targets[request]))
     dataset.close(handle)
Code Example #25
File: test_hdf5.py Project: zhoujian1210/fuel
 def test_dataset_get_data_without_open(self):
     dataset = H5PYDataset(self.h5file,
                           which_sets=('train', ),
                           load_in_memory=False)
     try:
         dataset.get_data(request=(slice(0, 2)))
     except IOError:
         assert False
     dataset.close(None)
Code Example #26
def get_datastream(path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    print(path, which_set)
    iterator_scheme = ShuffledScheme(batch_size=batch_size, examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
Code Example #27
 def __init__(self,
              path,
              which_set='train',
              load_size=None,
              crop_size=None,
              dtype=numpy.float32):
     self._dtype = dtype
     self._load_size = load_size
     self._crop_size = crop_size
     self._data_set = H5PYDataset(path, which_sets=(which_set, ))
Code Example #28
def load_dataset(data_file, load_in_memory=False):
    """
    See ANNMINERvA/fuel_up_convdata.py for an HDF5 builder that sets up an
    appropriate data file.
    """
    if os.path.exists(data_file):
        train_set = H5PYDataset(data_file,
                                which_sets=('train', ),
                                load_in_memory=load_in_memory)
        valid_set = H5PYDataset(data_file,
                                which_sets=('valid', ),
                                load_in_memory=load_in_memory)
        test_set = H5PYDataset(data_file,
                               which_sets=('test', ),
                               load_in_memory=load_in_memory)
    else:
        raise Exception('Data file {} not found!'.format(data_file))

    return train_set, valid_set, test_set
Code Example #29
File: model.py Project: rchen19/StreetNumber
def train1(model=None):
    if model is not None:
        trainset = H5PYDataset('svhn_format_2.hdf5', which_sets=('train',), sources=('features', 'targets'))
        testset = H5PYDataset('svhn_format_2.hdf5', which_sets=('test',), sources=('features', 'targets'))
        batch_size = 500
        epochs_to_wait_for_improve = 1
        csv_logger = keras.callbacks.CSVLogger('traininglog.csv')
        check_point = keras.callbacks.ModelCheckpoint("model3epochweights.h5", monitor='val_loss', 
                                                    verbose=0, save_best_only=False, 
                                                    save_weights_only=True, mode='auto', period=1)
        early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=epochs_to_wait_for_improve)
        history = model.fit_generator(dataset_generator(trainset, batch_size),
                                      steps_per_epoch=int(np.ceil(trainset.num_examples / batch_size)),
                                      epochs=15, verbose=2,
                                      callbacks=[csv_logger, check_point, early_stopping],
                                      validation_data=dataset_generator(testset, batch_size),
                                      validation_steps=int(np.ceil(testset.num_examples / batch_size)))
        #print accuracy
        return history
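
The training call above depends on a dataset_generator helper that is not shown in this example; a minimal sketch of what such a generator might look like (the source order ('features', 'targets') is an assumption):

from fuel.schemes import ShuffledScheme
from fuel.streams import DataStream

def dataset_generator(dataset, batch_size):
    # Build a shuffled batch stream over the H5PYDataset and loop forever,
    # as Keras expects generators to yield indefinitely.
    scheme = ShuffledScheme(examples=dataset.num_examples, batch_size=batch_size)
    stream = DataStream(dataset, iteration_scheme=scheme)
    while True:
        for features, targets in stream.get_epoch_iterator():
            yield features, targets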
Code Example #30
 def __init__(self, bs, sources, filename, which_sets, **kwargs):
     self.bs = bs
     self.provides_sources = sources
     self.filename = filename
     super(UniformDataset, self).__init__(**kwargs)
     self.train_set = H5PYDataset(self.filename,
                                  which_sets=which_sets,
                                  load_in_memory=True)
     self.handle = self.train_set.open()
     self.num_examples = self.train_set.num_examples