Example #1
    def setUp(self):
        self.features = numpy.arange(3600, dtype='uint16').reshape((100, 36))
        self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1))
        h5file = h5py.File(
            'file.hdf5', mode='w', driver='core', backing_store=False)
        h5file['features'] = self.features
        h5file['features'].dims[0].label = 'batch'
        h5file['features'].dims[1].label = 'feature'
        h5file['targets'] = self.targets
        h5file['targets'].dims[0].label = 'batch'
        h5file['targets'].dims[1].label = 'index'
        split_dict = {'train': {'features': (0, 20, None), 'targets': (0, 20)},
                      'test': {'features': (20, 30), 'targets': (20, 30)},
                      'unlabeled': {'features': (30, 100, None, '.')}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        self.h5file = h5file

        vlen_h5file = h5py.File(
            'test_vl.hdf5', mode='w', driver='core', backing_store=False)
        self.vlen_features = [
            numpy.arange(12, dtype='uint8').reshape((3, 2, 2)),
            numpy.arange(48, dtype='uint8').reshape((3, 4, 4)),
            numpy.arange(60, dtype='uint8').reshape((3, 5, 4)),
            numpy.arange(18, dtype='uint8').reshape((3, 2, 3))]
        self.vlen_targets = numpy.arange(4, dtype='uint8').reshape((4, 1))
        dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
        features = vlen_h5file.create_dataset('features', (4,), dtype=dtype)
        features[...] = [d.flatten() for d in self.vlen_features]
        features.dims[0].label = 'batch'
        features_shapes = vlen_h5file.create_dataset(
            'features_shapes', (4, 3), dtype='uint8')
        features_shapes[...] = numpy.array(
            [d.shape for d in self.vlen_features])
        features.dims.create_scale(features_shapes, 'shapes')
        features.dims[0].attach_scale(features_shapes)
        features_shape_labels = vlen_h5file.create_dataset(
            'features_shape_labels', (3,), dtype='S7')
        features_shape_labels[...] = [
            'channel'.encode('utf8'), 'height'.encode('utf8'),
            'width'.encode('utf8')]
        features.dims.create_scale(features_shape_labels, 'shape_labels')
        features.dims[0].attach_scale(features_shape_labels)
        targets = vlen_h5file.create_dataset('targets', (4, 1), dtype='uint8')
        targets[...] = self.vlen_targets
        targets.dims[0].label = 'batch'
        targets.dims[1].label = 'index'
        split_dict = {'train': {'features': (0, 4), 'targets': (0, 4)}}
        vlen_h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        self.vlen_h5file = vlen_h5file
Example #2
def createH5Dataset(hdf5_out, corpus_path, sequence_length):
    with open(corpus_path) as f:
        corpus = f.read().split(",")

    (indices, vocab) = pd.factorize(list(corpus))

    instances_num = len(corpus) // (sequence_length + 1)

    f = h5py.File(hdf5_out, mode='w')

    train_data_x = np.zeros((instances_num, sequence_length), dtype=np.uint8)
    train_data_y = np.zeros((instances_num, sequence_length), dtype=np.uint8)

    for j in range(instances_num):
        for i in range(sequence_length):
            train_data_x[j][i] = indices[i + j * (sequence_length + 1)]
            train_data_y[j][i] = indices[i + j * (sequence_length + 1) + 1]

    char_in = f.create_dataset('inchar', train_data_x.shape, dtype='uint8')
    char_out = f.create_dataset('outchar', train_data_y.shape, dtype='uint8')

    char_in[...] = train_data_x
    char_out[...] = train_data_y

    split_dict = {
        'train': {'inchar': (0, instances_num), 'outchar': (0, instances_num)}}

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.attrs["vocab"] = json.dumps(list(vocab))

    f.flush()
    f.close()
def build_raw_hdf5_dataset(wav_name, hdf5_name, window_size):
    [rate, signal] = wav.read(wav_name)
    num_steps   = signal.shape[0]
    num_seqs    = num_steps-window_size
    output_path = '{}.hdf5'.format(hdf5_name)
    output_path = os.path.join(output_path)

    signal = signal.reshape(num_steps,1)
    with h5py.File(output_path, mode='w') as h5file:
        input_feature  = h5file.create_dataset(name='input_feature' , shape=(num_seqs, window_size, 1), dtype='int16')
        target_feature = h5file.create_dataset(name='target_feature', shape=(num_seqs, window_size, 1), dtype='int16')
        print ' num of sequences : {}'.format(num_seqs)
        for s in xrange(num_seqs):
            input_feature[s]  = signal[s:s+window_size]
            target_feature[s] = signal[(s+1):(s+1)+window_size]

        # label each dataset axis
        input_feature.dims[0].label = 'batch'
        input_feature.dims[1].label = 'time'
        input_feature.dims[2].label = 'feature'

        target_feature.dims[0].label = 'batch'
        target_feature.dims[1].label = 'time'
        target_feature.dims[2].label = 'feature'

        split_dict = {'train': {'input_feature' : ( 0,  num_seqs),
                                'target_feature': ( 0,  num_seqs)}}

        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
        h5file.close()

    return num_seqs
Example #4
    def save_h5py(tn, start, stop):
        cf = train_features[start:stop]
        ct = train_targets[start:stop]
        np.save(pjoin(numpy_path, prefix+tn+'_features.npy'), cf)
        np.save(pjoin(numpy_path, prefix+tn+'_targets.npy'), ct)
        h5 = h5py.File(pjoin(fuel_path, prefix+tn+'.hdf5'), mode='w')
        h5_features = h5.create_dataset(
            'features', (cf.shape[0], cf.shape[1]*mult)
            , dtype='float32')
        lenf = stop - start
        with ProgressBar(maxval=lenf) as progbar:
            for i in range(lenf):
                arr = []
                for j in range(-concat[0], concat[0]+1, concat[1]):
                    arr.extend(cf[(i-j)%lenf])
                h5_features[i] = np.asarray(arr)
                progbar.update(i)

        h5_targets = h5.create_dataset(
            'targets', ct.shape, dtype='uint16')
        h5_targets[...] = ct
        h5_features.dims[0].label = 'batch'
        h5_features.dims[1].label = 'feature'
        h5_targets.dims[0].label = 'batch'
        h5_targets.dims[1].label = 'index'

        split_dict = {
            tn: {'features': (0, stop-start), 'targets': (0, stop-start)},
            #'validate': {'features': (tr_n, len(fet)), 'targets': (tr_n, len(fet))},
        }
        h5.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5.flush()
        h5.close()
def CreateHDF5():
    sizes = numpy.random.randint(3,9, size=(100,))
    train_image_features = [
            numpy.random.randint(256, size=(3, size, size)).astype('uint8')
            for size in sizes[:90]]
    test_image_features = [
            numpy.random.randint(256, size=(3, size, size)).astype('uint8')
            for size in sizes[90:]]

    train_vector_features = numpy.random.normal(size=(90,10)).astype('float32')
    test_vector_features = numpy.random.normal(size=(10,10)).astype('float32')
    train_targets = numpy.random.randint(10, size=(90,1)).astype('uint8')
    test_targets = numpy.random.randint(10, size=(10,1)).astype('uint8')

    f = h5py.File('dataset.hdf5', mode='w')
    vector_features = f.create_dataset(
         'vector_features', (100, 10), dtype='float32')
    targets = f.create_dataset(
         'targets', (100, 1), dtype='uint8')

    vector_features[...] = numpy.vstack(
         [train_vector_features, test_vector_features])
    targets[...] = numpy.vstack([train_targets, test_targets])


    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    all_image_features = train_image_features + test_image_features
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    image_features = f.create_dataset('image_features', (100,), dtype=dtype)
    image_features[...] = [image.flatten() for image in all_image_features]
    image_features.dims[0].label='batch'

    image_features_shapes = f.create_dataset(
         'image_features_shapes', (100, 3), dtype='int32')
    image_features_shapes[...] = numpy.array(
         [image.shape for image in all_image_features])
    image_features.dims.create_scale(image_features_shapes, 'shapes')
    image_features.dims[0].attach_scale(image_features_shapes)

    image_features_shape_labels = f.create_dataset(
         'image_features_shape_labels', (3,), dtype='S7')
    image_features_shape_labels[...] = [
         'channel'.encode('utf8'), 'height'.encode('utf8'),
         'width'.encode('utf8')]
    image_features.dims.create_scale(
         image_features_shape_labels, 'shape_labels')
    image_features.dims[0].attach_scale(image_features_shape_labels)

    split_dict = {
         'train': {'vector_features': (0, 90), 'image_features': (0, 90),
                   'targets': (0, 90)},
         'test': {'vector_features': (90, 100), 'image_features': (90, 100),
                  'targets': (90, 100)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
Example #6
def test_h5py_dataset_split():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 8)},
                      'test': {'features': (8, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        train_set = H5PYDataset(path='tmp.hdf5', which_set='train')
        test_set = H5PYDataset(path='tmp.hdf5', which_set='test')
        train_handle = train_set.open()
        test_handle = test_set.open()
        assert_equal(
            train_set.get_data(state=train_handle, request=slice(0, 8))[0],
            numpy.arange(50).reshape((10, 5))[:8])
        assert_equal(
            test_set.get_data(state=test_handle, request=slice(0, 2))[0],
            numpy.arange(50).reshape((10, 5))[8:])
        train_set.close(train_handle)
        test_set.close(test_handle)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
def add_sets(args):
    with h5py.File(args.h5file, "a") as h5file:
        sources = []
        for dataset in h5file:
            if dataset.endswith("_indices") or dataset.endswith("_shapes") or dataset.endswith("_shape_labels"):
                continue
            sources.append(dataset)

        uttid2idx = {uttid: idx for (idx, uttid) in enumerate(h5file["uttids"])}

        split_dict = {}
        for subset in args.sets:
            name, uttids_fname = subset.split("=")
            idxs = []
            with open(uttids_fname) as uf:
                for l in uf:
                    uttid = l.strip().split()[0]
                    idxs.append(uttid2idx[uttid])

            indices_name = "{}_indices".format(name)

            if indices_name in h5file:
                del h5file[indices_name]

            #
        # Note: ideally, we would sort the indices and do:
            # h5file[indices_name] = numpy.array(sorted(idxs))
            # but this would cause incompatibility with Kaldi, which keeps utterances sorted by uttid!
            #
            h5file[indices_name] = numpy.array(idxs)
            indices_ref = h5file[indices_name].ref
            split_dict[name] = {source: (-1, -1, indices_ref) for source in sources}

        h5file.attrs["split"] = H5PYDataset.create_split_array(split_dict)
Example #8
def data():

    # open the prebuilt HDF5 file of face images and read the sample count
    hf = h5py.File('faces.hdf5', 'r+')
    num_samples = hf["input"].shape[0]

    print "number of samples in dataset : %i" % num_samples

    split_dict = {
         'train': {'input': (2000, num_samples), 'target': (2000, num_samples)},
         'test': {'input': (0, 1000), 'target': (0, 1000)},
         'val': {'input': (1000, 2000), 'target': (1000, 2000)}
    }
    hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    train_set = H5PYDataset('faces.hdf5', which_sets=('train',))
    test_set = H5PYDataset('faces.hdf5', which_sets=('test',))
    val_set = H5PYDataset('faces.hdf5', which_sets=('val',))

    batch_size = 128

#TODO : use shuffledscheme instead?  Seems slower, might have screwed up the chunksize in the HDF5 files?

    tr_scheme = SequentialScheme(examples=train_set.num_examples, batch_size=batch_size)
    tr_stream = DataStream(train_set, iteration_scheme=tr_scheme)

    val_scheme = SequentialScheme(examples=val_set.num_examples, batch_size=batch_size)
    val_stream = DataStream(val_set, iteration_scheme=val_scheme)

    test_scheme = SequentialScheme(examples=test_set.num_examples, batch_size=batch_size)
    test_stream = DataStream(test_set, iteration_scheme=test_scheme)
    hf.close()
    return num_samples, train_set, test_set, val_set, tr_scheme, tr_stream, val_scheme, val_stream, test_scheme, test_stream
Example #9
    def save_hd5py(dataset, destfile, indices_dict):
        f = h5py.File(destfile, mode='w')
        images = f.create_dataset('images', dataset.shape, dtype='uint8')
        images[...] = dataset
        split_dict = dict((k, {'images': v}) for k, v in indices_dict.iteritems())
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
        f.close()
Example #10
    def test_value_error_on_unequal_sources(self):
        def get_subsets():
            return H5PYDataset(self.h5file, which_sets=('train',)).subsets
        split_dict = {'train': {'features': (0, 20), 'targets': (0, 15)},
                      'test': {'features': (20, 30), 'targets': (20, 30)},
                      'unlabeled': {'features': (30, 100, None, '.')}}
        self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        assert_raises(ValueError, get_subsets)
def biblefile_to_hdf5(open_file):  # TODO REMOVE LINES WITH THE BOOK OF BLABLA
    """Everything in one function because we have variable-length sequences, so no intermediate arrays..."""
    char_to_ind = {"<S>": 0, "</S>": 1}
    current_char_ind = 2  # starts at 2 because 0, 1 are reserved for "end/start-of-sequence" character
    all_verses = []
    # TODO I still don't know what the readout initial_output really does; maybe we need to put <S> into every sequence
    current_verse = []
    for line in open_file:
        # first we need to check if a new verse begins somewhere in the line (not just beginning...)
        verse_marker_pos = find_verse_marker(line)
        if len(line.split()) > 0 and verse_marker_pos > -1:
            # if so, save the verse up to the verse marker and start a new one from the rest of the line
            current_verse += list(line[:verse_marker_pos])
            # also replace all characters by integers, creating more mappings if necessary
            for (ind, char) in enumerate(current_verse):
                if char not in char_to_ind:
                    char_to_ind[char] = current_char_ind
                    current_char_ind += 1
                current_verse[ind] = char_to_ind[char]
            current_verse.append(1)  # for sequence generator we need to explicitly append this end-of-sequence char
            all_verses.append(numpy.array(current_verse, dtype="int32"))
            current_verse = list(line[verse_marker_pos:])
        # otherwise, just put everything into the current verse
        else:
            current_verse += list(line)
    all_verses = numpy.array(all_verses)  # I think this conversion is necessary for the indexing below?

    # at this point we have all our verses =) now we build our .hdf5 dataset
    # make a little validation set
    val_indices = numpy.random.choice(a=len(all_verses), replace=False, size=1500)
    test_set = list(all_verses[val_indices])
    train_set = list(numpy.delete(all_verses, val_indices, 0))

    # if you don't get what's happening here, check the Fuel tutorial on variable-length data (only the 1D part)
    f = h5py.File(name="bible.hdf5", mode="w")
    dtype_varlen_int = h5py.special_dtype(vlen=numpy.dtype("int32"))
    character_seqs = f.create_dataset("character_seqs", (len(all_verses),), dtype=dtype_varlen_int)
    character_seqs[...] = train_set + test_set

    split_dict = {"train": {"character_seqs": (0, len(train_set))},
                  "valid": {"character_seqs": (len(train_set), len(all_verses))}}
    f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()

    # we also save the current_char_ind (equal to dimensionality of our one-hot character vectors) to a file
    numpy.save("onehot_size.npy", current_char_ind)
    # also the word-to-index dict
    cPickle.dump(char_to_ind, open("char_to_ind.pkl", mode="w"))
    # make a quick dirty reverse dict (actually a list) to map from indices to characters, so we can get readable output
    # later
    ind_to_char = [""]*len(char_to_ind)
    ind_to_char[0] = "<S>"
    ind_to_char[1] = "</S>"
    for char in char_to_ind:
        ind_to_char[char_to_ind[char]] = char
    cPickle.dump(ind_to_char, open("ind_to_char.pkl", mode="w"))
Example #12
def build_hdf5_dataset(input_filename, output_filename,batch_size=64):
    """
    Builds a hdf5 dataset given the input one. The output one will have
    training, valid, and test as sources.
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")

    data = input_file["features"][:]
    data_length = data.shape[1] #

    #print "Sample from data: {}".format(data[70])
    #if not data_length % batch_size == 0:

    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    batch_index_train = int(0.9 * train_valid_length / float(batch_size))
    batch_index_valid = int(train_valid_length / float(batch_size))
    batch_index_test = int(data_length / float(batch_size))

    print "batch indices in order : {}".format((batch_index_train,
                                                batch_index_valid,
                                                batch_index_test))

    assert(train_valid_length == batch_index_valid * batch_size)

    data = data.reshape(data_length)[:batch_index_test*batch_size]
    data = data.reshape(batch_index_test,batch_size,1)
    print data.shape

    print ("values lost: {}").format(data_length - data.size)
    test_length = data_length - train_valid_length

    features = output_file.create_dataset(
        name='features' ,
        shape=data.shape,
        dtype='int16',
        data=data)

    features.dims[0].label = 'batch'
    features.dims[1].label = 'time'
    features.dims[2].label = 'feature'

    # split bounds are half-open [start, stop) intervals, so each split
    # starts exactly where the previous one stops
    split_dict = {
        'train': {
            'features': (0, batch_index_train)},
        'valid': {
            'features': (batch_index_train, batch_index_valid)},
        'test': {
            'features': (batch_index_valid, batch_index_test)}
    }

    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
Example #13
def save_hd5py(dataset_dict, destfile, indices_dict):
    f = h5py.File(destfile, mode='w')
    for name, dataset in dataset_dict.iteritems():
        dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype))
        dat[...] = dataset
    split_dict = dict((k, dict((name, v) for name in dataset_dict.iterkeys()))
            for k,v in indices_dict.iteritems())
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
def build_raw_interval_hdf5_dataset(youtube_id, hdf5_name, interval_size, window_size):
    data_stream = YouTubeAudio(youtube_id).get_example_stream()

    data_stream = Window(offset=interval_size,
                         source_window=interval_size*window_size,
                         target_window=interval_size*window_size,
                         overlapping=True,
                         data_stream=data_stream)

    data_iterator = data_stream.get_epoch_iterator()

    num_sequences = 0
    for data in data_iterator:
        num_sequences = num_sequences + 1

    output_path = '{}.hdf5'.format(hdf5_name)
    output_path = os.path.join(output_path)
    print 'total num sequences : ', num_sequences
    with h5py.File(output_path, mode='w') as h5file:
        input_feature  = h5file.create_dataset(name='input_feature' , shape=(num_sequences, window_size, interval_size), dtype='int16')
        target_feature = h5file.create_dataset(name='target_feature', shape=(num_sequences, window_size, interval_size), dtype='int16')

        data_iterator = data_stream.get_epoch_iterator()
        # for each batch
        for s_idx, sequence_data in enumerate(data_iterator):
            # get data
            source_data = sequence_data[0]
            target_data = sequence_data[1]

            # save data
            input_feature[s_idx]  = source_data.reshape(window_size, interval_size)
            target_feature[s_idx]  = target_data.reshape(window_size, interval_size)

        # label each dataset axis
        input_feature.dims[0].label = 'batch'
        input_feature.dims[1].label = 'time'
        input_feature.dims[2].label = 'feature'

        target_feature.dims[0].label = 'batch'
        target_feature.dims[1].label = 'time'
        target_feature.dims[2].label = 'feature'

        num_trains = int(num_sequences*0.8)

        split_dict = {'train': {'input_feature' : ( 0,  num_trains),
                                'target_feature': ( 0,  num_trains)},
                      'valid': {'input_feature' : ( num_trains,  num_sequences),
                                'target_feature': ( num_trains,  num_sequences)},
                      }
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
        h5file.close()

    return num_sequences
Example #15
def build_hdf5_dataset_single_dim(input_filename, output_filename):
    """
    Builds a hdf5 dataset given the input one. The output one will have
    training, valid, and test as sources.
    This function outputs a single dimension for the datasets.
    Adapted to monk_music
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")

    data = input_file["features"][:]
    data_length = data.shape[1] #
    #if not data_length % batch_size == 0:

    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    index_train = int(0.9 * train_valid_length)
    index_valid = int(train_valid_length)
    index_test = int(data_length)

    print "batch indices in order : {}".format((index_train,
                                                index_valid,
                                                index_test))

    data = data.reshape((data_length))

    print "Train example: {}".format(data[index_train-100:index_train])
    print "Valid example: {}".format(data[index_valid-100:index_valid])
    print "Test example: {}".format(data[index_test-100:index_test])


    features = output_file.create_dataset(
        name='features' ,
        shape=data.shape,
        dtype='int16',
        data=data)

    #features.dims[0].label = 'batch'
    #features.dims[0].label = 'time'
    features.dims[0].label = 'feature'

    # split bounds are half-open [start, stop) intervals, so each split
    # starts exactly where the previous one stops
    split_dict = {
        'train': {
            'features': (0, index_train)},
        'valid': {
            'features': (index_train, index_valid)},
        'test': {
            'features': (index_valid, index_test)}
    }

    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
Example #16
    def test_pickling(self):
        try:
            features = numpy.arange(360, dtype='uint8').reshape((10, 36))
            h5file = h5py.File('file.hdf5', mode='w')
            h5file['features'] = features
            split_dict = {'train': {'features': (0, 10, '.')}}
            h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
            dataset = cPickle.loads(
                cPickle.dumps(H5PYDataset(h5file, which_set='train')))
            assert dataset.data_sources is None
        finally:
            os.remove('file.hdf5')
Example #17
def train_model():
    data_path = get_config("data-file")
    batchs = get_config("batchs")
    half_batch = batchs // 2
    quarter_batch = half_batch // 2
    # open the dataset first so its num_examples can serve as the fallback
    train_set = H5PYDataset(data_path, which_sets=('train', ))
    handle = train_set.open()

    n_batchs = (get_config("train-datasets")
                or train_set.num_examples) // batchs
    feature_dim = get_config("feature-dim") or 40
    noise_dim = get_config("noise-dim") or 10
    input_dim = feature_dim + noise_dim

    generator, discriminator, gan = build_net(input_dim)
    save_result(0, generator, np.zeros(shape=(1, feature_dim)))

    for i in range(get_config("epochs")):
        for j in range(n_batchs):
            imgs, features = get_batch(train_set, handle, j * batchs)

            idx = np.random.randint(0, imgs.shape[0], half_batch)
            real_imgs = imgs[idx]
            real_features = features[idx]

            gen_features = get_features(features)[np.random.randint(
                0, imgs.shape[0], half_batch)]
            noise = np.random.normal(0, 1, (half_batch, noise_dim))
            gen_imgs = generator.predict([noise, gen_features])

            # real feature and real img
            d_loss_real = discriminator.train_on_batch(
                [real_imgs], [np.ones((half_batch, 1)), real_features])
            # fake feature and fake img
            d_loss_fake = discriminator.train_on_batch(
                [gen_imgs], [np.zeros((half_batch, 1)), gen_features])
            d_loss = np.add(d_loss_real, d_loss_fake) * 0.5

            # train Generator
            noise = np.random.normal(0, 1, (batchs, noise_dim))
            gen_features = get_features(features)
            g_loss = gan.train_on_batch([noise, gen_features],
                                        [np.ones((batchs, 1)), gen_features])

            print(
                "%d [D loss: %f, acc.: %.2f%%] [G loss: %f] %f%%" %
                (i, d_loss[0], 100 * d_loss[1], g_loss[0], j * 100 / n_batchs))

            if i % 10 == 0 and get_config("env") == "GPU":
                save_result(i, generator, gen_features[0:1, :feature_dim])
Example #18
def load_all_datasubsets(data_file, slice_to_load):
    """
    Always load data in memory - get all of 'train', 'valid', and 'test'
    subsets
    """
    if os.path.exists(data_file):
        dset = H5PYDataset(data_file,
                           which_sets=('train', 'valid', 'test'),
                           subset=slice(slice_to_load[0], slice_to_load[1]),
                           load_in_memory=True)
    else:
        raise Exception('Data file', data_file, 'not found!')

    return dset
Example #19
def test_h5py_dataset_pickles():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5', which_set='train')
        pickle.loads(pickle.dumps(dataset))
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Example #20
    def test_vlen_reshape_out_of_memory_unordered(self):
        dataset = H5PYDataset(self.vlen_h5file,
                              which_sets=('train', ),
                              load_in_memory=False)
        expected_features = numpy.empty((4, ), dtype=numpy.object)
        for i, j in enumerate([0, 3, 1, 2]):
            expected_features[i] = self.vlen_features[j]
        expected_targets = self.vlen_targets[[0, 3, 1, 2]]
        handle = dataset.open()
        rval = dataset.get_data(handle, [0, 3, 1, 2])
        for val, truth in zip(rval[0], expected_features):
            assert_equal(val, truth)
        assert_equal(rval[1], expected_targets)
        dataset.close(handle)
def output_hdf5(path_list, output_root_dir):
    num_data = len(path_list)
    shapes = []

    dirs = output_root_dir.split('\\')
    file_name = dirs[-1] + '.hdf5'
    output_root_dir = os.path.join(output_root_dir, file_name)

    f = h5py.File(output_root_dir, mode='w')
    dtype = h5py.special_dtype(vlen=np.dtype('uint8'))
    image_features = f.create_dataset('image_features',
                                      (num_data,),
                                      dtype=dtype)

    image_features.dims[0].label = 'batch'

    try:
        for i in tqdm.tqdm(range(num_data)):
            image = io.imread(path_list[i])
            shapes.append(image.shape)
            image_features[i] = image.flatten()

        shapes = np.array(shapes).astype(np.int32)
        image_features_shapes = f.create_dataset('image_features_shapes',
                                                 (num_data, 3),
                                                 dtype=np.int32)
        image_features_shapes[...] = shapes

        image_features.dims.create_scale(image_features_shapes, 'shapes')
        image_features.dims[0].attach_scale(image_features_shapes)

        image_features_shape_labels = f.create_dataset(
            'image_features_shape_labels', (3,), dtype='S7')
        image_features_shape_labels[...] = [
             'height'.encode('utf8'), 'width'.encode('utf8'),
             'channel'.encode('utf8')]
        image_features.dims.create_scale(
            image_features_shape_labels, 'shape_labels')
        image_features.dims[0].attach_scale(image_features_shape_labels)

        # specify the splits
        split_train = (0, num_data)
        split_dict = dict(train=dict(image_features=split_train))
        f.attrs["split"] = H5PYDataset.create_split_array(split_dict)

    except KeyboardInterrupt:
        print "割り込み停止が実行されました"

    f.flush()
    f.close()
Example #22
    def _binarized_mnist_loader(self):
        examples, dataset_splits = [], []
        for split in ["train", "test", "valid"]:
            dataset = H5PYDataset(self.loader_path, which_sets=(split,))
            data_stream = dataset.get_example_stream()
            data = list(data_stream.get_epoch_iterator())
            examples += data
            dataset_splits.append(len(data))

        random.seed(self.ann_seed)
        random.shuffle(examples)  # guarantees consistency for ttv splits
        it = iter(examples)
        return (self._reshape_samples(list(islice(it, 0, i)))
                for i in dataset_splits)
Example #23
    def test_vlen_reshape_out_of_memory(self):
        dataset = H5PYDataset(
            self.vlen_h5file, which_sets=('train',), subset=slice(1, 3),
            load_in_memory=False)
        expected_features = numpy.empty((2,), dtype=numpy.object)
        for i, f in enumerate(self.vlen_features[1:3]):
            expected_features[i] = f
        expected_targets = self.vlen_targets[1:3]
        handle = dataset.open()
        rval = dataset.get_data(handle, slice(0, 2))
        for val, truth in zip(rval[0], expected_features):
            assert_equal(val, truth)
        assert_equal(rval[1], expected_targets)
        dataset.close(handle)
Example #24
def make_gen(batch_size, examples=4):
    file_path_f = file_path
    names_select = i_names
    train_set = H5PYDataset(file_path_f,
                            which_sets=('train', 'test'))

    scheme = SequentialScheme(examples=examples, batch_size=batch_size)

    data_stream_train = DataStream(dataset=train_set, iteration_scheme=scheme)

    stand_stream_train = ScaleAndShift(data_stream=data_stream_train,
                                       scale=scale, shift=shift,
                                       which_sources=(names_select[-1],))
    return stand_stream_train, train_set, data_stream_train
Example #25
def fuel_converter(fuel_dataset, embeddings_train, labels_train,
                   embeddings_test, labels_test):
    f = h5py.File(fuel_dataset, mode='w')

    labels_train = np.expand_dims(labels_train, axis=1)
    labels_test = np.expand_dims(labels_test, axis=1)

    train_sz = embeddings_train.shape[0] - embeddings_train.shape[0] % 100
    test_sz = embeddings_test.shape[0] - embeddings_test.shape[0] % 100
    feat_sz = embeddings_train.shape[1]
    dataset_sz = train_sz + test_sz

    print("Actual Train size : ", embeddings_train.shape[0])
    print("Train size in Fuel : ", train_sz)

    print("Actual Test size : ", embeddings_test.shape[0])
    print("Test size in Fuel : ", test_sz)

    vector_features = f.create_dataset('features', (dataset_sz, feat_sz),
                                       dtype='float64')
    targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8')

    ## put the data loaded into these objects
    vector_features[...] = np.vstack(
        [embeddings_train[0:train_sz], embeddings_test[0:test_sz]])
    targets[...] = np.vstack(
        [labels_train[0:train_sz], labels_test[0:test_sz]])

    ## label the dims with names
    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    ## split attribute -- way to recover the splits
    # creating the split using an API
    split_dict = {
        'train': {
            'features': (0, train_sz),
            'targets': (0, train_sz)
        },
        'test': {
            'features': (train_sz, dataset_sz),
            'targets': (train_sz, dataset_sz)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.flush()
    f.close()
Example #26
def prepare_data(conf):
    """
    Extract strided crops from a set of images and assemble into a 2D matrix.
    Save into an HDF5 file.

    Args:
      conf: dictionary containing data parameters
    Returns:
      tr_stream: DataStream for training set
      te_stream: DataStream for testing set
    """
    preproc.store_hdf5(conf)  #, compression='lzf')

    path_h5 = conf['path_h5']

    tr_set = H5PYDataset(path_h5, ('train', ),
                         sources=('LR', 'HR'),
                         load_in_memory=conf['load_in_memory'])
    tr_scheme = ShuffledScheme(examples=tr_set.num_examples,
                               batch_size=FLAGS.num_gpus * conf['mb_size'])
    tr_stream = DataStream(dataset=tr_set, iteration_scheme=tr_scheme)

    te_set = H5PYDataset(path_h5, ('test', ),
                         sources=('LR', 'HR'),
                         load_in_memory=conf['load_in_memory'])
    te_scheme = SequentialScheme(examples=te_set.num_examples,
                                 batch_size=FLAGS.num_gpus * conf['mb_size'])
    te_stream = DataStream(dataset=te_set, iteration_scheme=te_scheme)

    if conf['load_in_memory']:
        print("training set: %d mb" % ((tr_set.data_sources[0].nbytes + \
            tr_set.data_sources[1].nbytes) / 1e6))
        print("testing set: %d mb" % ((te_set.data_sources[0].nbytes + \
            te_set.data_sources[1].nbytes) / 1e6))
        time.sleep(2)

    return tr_stream, te_stream
Example #27
def get_datastream(path, norm_path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    data_mean_std = numpy.load(norm_path)

    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    base_stream = Normalize(data_stream=base_stream,
                            means=data_mean_std['mean'],
                            stds=data_mean_std['std'])
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
Example #28
def test_h5py_flatten_out_of_memory():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 2, 3),
                                         dtype='float32')
        features[...] = numpy.arange(60, dtype='float32').reshape((10, 2, 3))
        targets = h5file.create_dataset('targets', (10, ), dtype='uint8')
        targets[...] = numpy.arange(10, dtype='uint8')
        split_dict = {'train': {'features': (0, 10), 'targets': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5',
                              load_in_memory=False,
                              which_set='train',
                              flatten=['features'])
        handle = dataset.open()
        assert_equal(
            dataset.get_data(state=handle, request=slice(0, 10))[0],
            numpy.arange(60).reshape((10, 6)))
        dataset.close(handle)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Example #29
def train2(model=None, num_epochs=1, epoch_weights="modelepochweights.h5", \
            weights="modelweights.h5", model_save="model.json",\
            log_save="modeltraininglog.csv"):
    if model is not None:
        dataset_size = 73257# + 531131 #this includes train (73257) and extra (531131)
        #use 20% as validation
        validation_size = int(0.2*dataset_size)
        train_size = dataset_size - validation_size
        #sequence of 1s and 0s for splitting dataset
        seq = np.hstack((np.zeros(validation_size),np.ones(train_size)))
        #randomize
        np.random.seed(1234)
        np.random.shuffle(seq)
        train_idx = np.where(seq==1)[0].tolist()
        validation_idx = np.where(seq==0)[0].tolist()

        trainset = H5PYDataset('svhn_format_2.hdf5', which_sets=('train',), 
                                sources=('features', 'targets'), subset=train_idx)
        validationset = H5PYDataset('svhn_format_2.hdf5', which_sets=('train',), 
                                sources=('features', 'targets'), subset=validation_idx)
        batch_size = 500
        epochs_to_wait_for_improve = 15
        csv_logger = keras.callbacks.CSVLogger(log_save)
        check_point = keras.callbacks.ModelCheckpoint(epoch_weights, monitor='val_loss', 
                                                        verbose=0, save_best_only=True, 
                                                        save_weights_only=True, mode='auto', period=1)
        early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=epochs_to_wait_for_improve)
        history = model.fit_generator(dataset_generator(trainset, batch_size),
                                        steps_per_epoch=np.ceil(trainset.num_examples/batch_size), 
                                        epochs=num_epochs, verbose=2,
                                        callbacks=[csv_logger, check_point, early_stopping],
                                        validation_data=dataset_generator(validationset, batch_size),
                                        validation_steps=np.ceil(validationset.num_examples/batch_size))
        save_model(model, weights, model_save)
        #print accuracy
        return history
Example #30
    def setUp(self):
        self.features = numpy.arange(3600, dtype='uint8').reshape((100, 36))
        self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1))
        self.h5file = h5py.File(
            'file.hdf5', mode='w', driver='core', backing_store=False)
        self.h5file['features'] = self.features
        self.h5file['features'].dims[0].label = 'batch'
        self.h5file['features'].dims[1].label = 'feature'
        self.h5file['targets'] = self.targets
        self.h5file['targets'].dims[0].label = 'batch'
        self.h5file['targets'].dims[1].label = 'index'
        split_dict = {'train': {'features': (0, 20, '.'), 'targets': (0, 20)},
                      'test': {'features': (20, 30, ''), 'targets': (20, 30)},
                      'unlabeled': {'features': (30, 100)}}
        self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
Example #31
def load_datasubset(data_file, subset, slice_to_load):
    """
    Always load data in memory
    subset = 'train', 'valid', or 'test'
    slice_to_load = a tuple (not a slice object) with start, stop event #'s
    """
    if os.path.exists(data_file):
        dset = H5PYDataset(data_file,
                           which_sets=(subset, ),
                           subset=slice(slice_to_load[0], slice_to_load[1]),
                           load_in_memory=True)
    else:
        raise Exception('Data file', data_file, 'not found!')

    return dset
Example #32
def emboot_converter_traintrain(emboot_dataset):
    train_vector_features, train_targets, test_vector_features, test_targets = load_emboot_np(
    )
    f = h5py.File(emboot_dataset, mode='w')

    train_sz = train_vector_features.shape[0]
    test_sz = test_vector_features.shape[0]
    feat_sz = train_vector_features.shape[1]
    dataset_sz = (
        train_sz + test_sz - 16
    ) * 2  ## NOTE: 13900 * 2 (copy over the train data to the test dataset)

    vector_features = f.create_dataset('features', (dataset_sz, feat_sz),
                                       dtype='float64')  ## train + test
    targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8')

    ## put the data loaded into these objects

    train_vector_features_aug = np.vstack(
        [train_vector_features, test_vector_features])[:13900]
    train_targets_aug = np.vstack([train_targets, test_targets])[:13900]

    vector_features[...] = np.vstack(
        [train_vector_features_aug, train_vector_features_aug])
    targets[...] = np.vstack([train_targets_aug, train_targets_aug])

    ## label the dims with names
    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    ## split attribute -- way to recover the splits
    # creating the split using an API
    split_dict = {
        'train': {
            'features': (0, dataset_sz / 2),
            'targets': (0, dataset_sz / 2)
        },
        'test': {
            'features': (dataset_sz / 2, dataset_sz),
            'targets': (dataset_sz / 2, dataset_sz)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.flush()
    f.close()
Example #33
def add_phonemes():
    data_path = os.environ["FUEL_DATA_PATH"]
    data_path = os.path.join(data_path, "blizzard/")
    save_name = "sp_blizzard_80h_phon.hdf5"
    phon_file = "tbptt_blizzard_80h.hdf5"
    data_file = "sp_blizzard_80h.hdf5"

    save_path = os.path.join(data_path, save_name)
    phon_path = os.path.join(data_path, phon_file)
    data_path = os.path.join(data_path, data_file)

    resulth5 = h5py.File(save_path, mode="w")
    phonh5 = h5py.File(phon_path, mode="r")
    datah5 = h5py.File(data_path, mode="r")

    sp_h5 = resulth5.create_dataset("sp", (TOTAL_ROWS, 512, 257), dtype="float32")
    f0_h5 = resulth5.create_dataset("f0", (TOTAL_ROWS, 512), dtype="float32")

    phon_h5 = resulth5.create_dataset("phonemes", (TOTAL_ROWS, 512), dtype="int16")

    f0_h5[:] = datah5["f0"][:]
    phon_h5[:] = phonh5["phonemes"][:, ::64]

    n_times = 100
    idx = chunkIt(range(TOTAL_ROWS), n_times)

    for num_indx, indx in enumerate(idx):
        print num_indx, 100
        sp_h5[indx] = datah5["sp"][indx]

    cont = TOTAL_ROWS
    end_train = int(0.9 * cont)
    end_valid = int(0.95 * cont)
    end_test = cont

    split_dict = {
        "train": {"sp": (0, end_train), "f0": (0, end_train), "phonemes": (0, end_train)},
        "valid": {"sp": (end_train, end_valid), "f0": (end_train, end_valid), "phonemes": (end_train, end_valid)},
        "test": {"sp": (end_valid, end_test), "f0": (end_valid, end_test), "phonemes": (end_valid, end_test)},
    }

    resulth5.attrs["split"] = H5PYDataset.create_split_array(split_dict)

    resulth5.flush()
    resulth5.close()

    phonh5.close()
    datah5.close()
Example #34
def get_dataset_iterator(dataset,
                         split,
                         include_features=True,
                         include_targets=False,
                         unit_scale=True,
                         label_transforms=False,
                         return_length=False):
    """Get iterator for dataset, split, targets (labels) and scaling (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split, )

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    h5_dataset = H5PYDataset(dataset_fname, which_sets=splits, sources=sources)
    if unit_scale:
        h5_dataset.default_transformers = uint8_pixels_to_floatX(
            ('features', ))

    datastream = DataStream.default_stream(
        dataset=h5_dataset,
        iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples))

    if label_transforms:
        # TODO: maybe refactor this common bit with get_custom_streams below
        datastream = AddLabelUncertainty(datastream,
                                         chance=0,
                                         which_sources=('targets', ))

        datastream = RandomLabelStrip(datastream,
                                      chance=0,
                                      which_sources=('targets', ))

        # HACK: allow variable stretch
        datastream = StretchLabels(datastream,
                                   length=128,
                                   which_sources=('targets', ))

    it = datastream.get_epoch_iterator()
    if return_length:
        return it, h5_dataset.num_examples
    else:
        return it
Example #35
def save_hd5py(dataset_dict, destfile, indices_dict_or_numfolds):
    indices_dict = indices_dict_or_numfolds
    if isinstance(indices_dict, int):
        folds = indices_dict
        n = max(len(it) for it in dataset_dict.values())
        fold_n = n // folds
        indices_dict = dict(("fold_{}".format(i), (i * fold_n, (i + 1) * fold_n)) for i in range(folds))
        print indices_dict
    f = h5py.File(destfile, mode="w")
    for name, dataset in dataset_dict.iteritems():
        dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype))
        dat[...] = dataset
    split_dict = dict((k, dict((name, v) for name in dataset_dict.iterkeys())) for k, v in indices_dict.iteritems())
    f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
Example #36
def test_h5py_dataset_axis_labels():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features.dims[0].label = 'batch'
        features.dims[1].label = 'feature'
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5', which_set='train')
        assert dataset.axis_labels == {'features': ('batch', 'feature')}
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Example #37
def train(model=None):
    if model is not None:
        trainset = H5PYDataset('svhn_format_2.hdf5', which_sets=('train',), sources=('features', 'targets'))
        trainstream = DataStream(trainset, iteration_scheme=SequentialScheme(examples=trainset.num_examples, batch_size=500))
        for data in trainstream.get_epoch_iterator():
            images, labels = data
            #standardize the input images
            m = images.mean(axis=(2,3), keepdims=True)
            s = images.std(axis=(2,3), keepdims=True)
            images = (images - m)/s
            #change from "channel_first" to "channel_last"
            images = np.transpose(images, (0,2,3,1))
            labels = keras.utils.to_categorical(labels)
            #print images.shape
            model.train_on_batch(x=images, y=labels)
        trainstream.close()
Example #38
def create_ivector_test_datastream(path, which_set, batch_size=1, delay=0):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = SequentialScheme(batch_size=batch_size,
                                       examples=wsj_dataset.num_examples)

    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)

    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors', 'targets'])

    if delay:
        fs = DelayTransformer(fs, delay)

    fs = FilterSources(data_stream=fs, sources=['features', 'ivectors'])
    return Padding(fs)
Example #39
    def test_pickling(self):
        try:
            features = numpy.arange(360, dtype='uint16').reshape((10, 36))
            h5file = h5py.File('file.hdf5', mode='w')
            h5file['features'] = features
            split_dict = {'train': {'features': (0, 10, None, '.')}}
            h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
            dataset = cPickle.loads(
                cPickle.dumps(H5PYDataset(h5file, which_sets=('train',))))
            # Make sure _out_of_memory_{open,close} accesses
            # external_file_handle rather than _external_file_handle
            dataset._out_of_memory_open()
            dataset._out_of_memory_close()
            assert dataset.data_sources is None
        finally:
            os.remove('file.hdf5')
Example #40
def text_to_h5py_dataset(text_path, dst_path):
    # The simplest is to load everything to memory first.
    # If memory becomes an issue, this code can be optimized.
    words = []
    with open(text_path, 'r') as src:
        for line in src:
            words.extend(line.strip().split())

    with h5py.File(dst_path, 'w') as dst:
        dtype = h5py.special_dtype(vlen=bytes)
        table = dst.create_dataset('words', (len(words), ), dtype=dtype)
        table[:] = words

        dst.attrs['split'] = H5PYDataset.create_split_array(
            {'train': {
                'words': (0, len(words))
            }})
Example #41
def test_no_aug(dataset_used, model=None, testset=('test', 'test_neg',)): #include neg samples
    if model is not None:
        #accuracies = []
        #dataset_size = H5PYDataset('new.hdf5', which_sets=('test','test_neg')).num_examples
        #seq = np.arange(dataset_size)
        #np.random.seed(1234)
        #np.random.shuffle(seq)
        #test_idx=seq.tolist()
        batch_size = 500
        #dataset_used = "new_more_neg.hdf5"
        testset = H5PYDataset(dataset_used, which_sets=testset, sources=('features', 'targets')) 
        loss, accuracy = model.evaluate_generator(dataset_generator(testset, batch_size), 
                                                    steps=np.ceil(testset.num_examples/batch_size), 
                                                    max_queue_size=11, workers=1, 
                                                    use_multiprocessing=False)

        return loss, accuracy
Example #42
def gen_vlen_dataset():
    import h5py
    from fuel.datasets.hdf5 import H5PYDataset

    sizes = numpy.random.randint(3, 9, size=(100,))
    train_image_features = [numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[:90]]
    test_image_features = [numpy.random.randint(256, size=(3, size, size)).astype('uint8') for size in sizes[90:]]

    f = h5py.File('dataset_vlen.h5', mode='w')
    f['vector_features'] = numpy.vstack([numpy.load('train_vector_features.npy'), numpy.load('test_vector_features.npy')])
    f['targets'] = numpy.vstack([numpy.load('train_targets.npy'), numpy.load('test_targets.npy')])

    f['vector_features'].dims[0].label = 'batch'
    f['vector_features'].dims[1].label = 'feature'
    f['targets'].dims[0].label = 'batch'
    f['targets'].dims[1].label = 'index'

    all_image_features = train_image_features + test_image_features
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    image_features = f.create_dataset('image_features', (100,), dtype=dtype)
    image_features[...] = [image.flatten() for image in all_image_features]
    image_features.dims[0].label = 'batch'

    image_features_shapes = f.create_dataset('image_features_shapes', (100, 3), dtype='int32')
    image_features_shapes[...] = numpy.array([image.shape for image in all_image_features])

    image_features.dims.create_scale(image_features_shapes, 'shapes')
    image_features.dims[0].attach_scale(image_features_shapes)

    image_features_shape_labels = f.create_dataset('image_features_shape_labels', (3,), dtype='S7')
    image_features_shape_labels[...] = ['channel'.encode('utf8'), 'height'.encode('utf8'), 'width'.encode('utf8')]
    image_features.dims.create_scale(image_features_shape_labels, 'shape_labels')
    image_features.dims[0].attach_scale(image_features_shape_labels)

    split_dict = {'train': {'vector_features': (0, 90), 'image_features': (0, 90), 'targets': (0, 90)},
                  'test': {'vector_features': (90, 100), 'image_features': (90, 100), 'targets': (90, 100)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()

    train_set = H5PYDataset('dataset_vlen.h5', which_sets=('train',), sources=('image_features',))
    print(train_set.axis_labels['image_features'])
    handle = train_set.open()
    images, = train_set.get_data(handle, slice(0, 10))
    train_set.close(handle)
    print(images[0].shape, images[1].shape, images[2].shape, images[3].shape)
Example #43
def make_lsun_dataset(scene_path, fuel_hdf5_path, resize_shape):

    # get image list
    image_list = []
    for root, dirs, files in os.walk(scene_path):
        for filename in fnmatch.filter(files, "*.jpg"):
            image_list.append(os.path.join(root, filename))
    num_images = len(image_list)

    print "num of images :{}".format(num_images)

    # open image file
    fuel_file = h5py.File(name=fuel_hdf5_path, mode="w")

    # set new dataset for fuel file
    image_data = fuel_file.create_dataset(name="image_data", shape=(num_images, 3) + resize_shape, dtype="uint8")

    for idx, filepath in enumerate(image_list):
        original_image = Image.open(filepath).convert("RGB")
        resize_row = resize_shape[0] if original_image.size[0] < original_image.size[1] else original_image.size[0]
        resize_col = resize_shape[1] if original_image.size[0] > original_image.size[1] else original_image.size[1]
        original_image.thumbnail((resize_row, resize_col), Image.ANTIALIAS)

        if original_image.size[0] != resize_shape[0]:
            excess = (original_image.size[0] - resize_shape[0]) / 2
            original_image = original_image.crop((excess, 0, resize_shape[0] + excess, resize_shape[0]))
        elif original_image.size[1] != resize_shape[1]:
            excess = (original_image.size[1] - resize_shape[1]) / 2
            original_image = original_image.crop((0, excess, resize_shape[1], resize_shape[1] + excess))

        original_image = numpy.asarray(original_image)
        image_data[idx] = numpy.transpose(original_image, (2, 0, 1))

    image_data.dims[0].label = "batch"
    image_data.dims[1].label = "channel"
    image_data.dims[2].label = "height"
    image_data.dims[3].label = "width"

    split_dict = {"train": {"image_data": (0, num_images)}}
    fuel_file.attrs["split"] = H5PYDataset.create_split_array(split_dict)

    fuel_file.flush()
    fuel_file.close()

    print "DONE : {} (num of images :{})".format(fuel_hdf5_path, num_images)
Example #44
0
def save_hd5py(dataset_dict, destfile, indices_dict_or_numfolds):
    indices_dict = indices_dict_or_numfolds
    if isinstance(indices_dict, int):
        folds = indices_dict
        n = max(len(it) for it in dataset_dict.values())
        fold_n = n // folds
        indices_dict = dict(('fold_{}'.format(i), (i*fold_n, (i+1)*fold_n))
                            for i in range(folds))
        print(indices_dict)
    f = h5py.File(destfile, mode='w')
    for name, dataset in dataset_dict.items():
        dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype))
        dat[...] = dataset
    split_dict = dict((k, dict((name, v) for name in dataset_dict.keys()))
                      for k, v in indices_dict.items())
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
Example #45
0
def test_h5py_dataset_out_of_memory_unsorted_indices():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(
            path='tmp.hdf5', which_set='train', load_in_memory=False,
            sort_indices=False)
        handle = dataset.open()
        assert_raises(TypeError, dataset.get_data, handle, [7, 4, 6, 2, 5])
    finally:
        dataset.close(handle)
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Example #46
0
def save_hd5py(out_path, data, folds=0):
    images = np.concatenate([a[0] for a in data], axis=0)
    labels = np.concatenate([a[1] for a in data], axis=0)
    f = h5py.File(out_path, mode='w')
    ds = f.create_dataset('images', images.shape, dtype=str(images.dtype))
    ds[...] = images
    ds = f.create_dataset('labels', labels.shape, dtype=str(labels.dtype))
    ds[...] = labels
    #assert(folds > 1)
    #if folds > 1:
    fold = len(images) // folds
    idx = {'fold-{}'.format(i): (i*fold, (i+1)*fold) for i in range(folds)}
    print(idx)
    split_dict = {k: {'images': v, 'labels': v} for k, v in idx.items()}
    from fuel.datasets.hdf5 import H5PYDataset
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    pass
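For illustration, a hypothetical call with two (images, labels) pairs and two folds; each fold boundary becomes an entry in the split attribute.
import numpy as np

# toy arrays standing in for real data
x0 = np.zeros((50, 3, 32, 32), dtype='uint8')
y0 = np.zeros((50,), dtype='uint8')
x1 = np.ones((50, 3, 32, 32), dtype='uint8')
y1 = np.ones((50,), dtype='uint8')

# writes 'images' and 'labels' plus the splits 'fold-0' and 'fold-1'
save_hd5py('folds.hdf5', [(x0, y0), (x1, y1)], folds=2)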
Example #47
0
def test_h5py_dataset_out_of_memory():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(
            path='tmp.hdf5', which_set='train', load_in_memory=False)
        handle = dataset.open()
        assert_equal(
            dataset.get_data(state=handle, request=slice(0, 10))[0],
            numpy.arange(50).reshape((10, 5)))
        dataset.close(handle)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Example #48
0
File: hdf5.py Project: grappli/pm1
def create_hdf5(np_enc_data, np_enc_y, splitpoint, N):

    hdf5name = 'mushrooms.hdf5'
    f = h5py.File(hdf5name, mode='w')

    fx = f.create_dataset('x', np_enc_data.shape, dtype='float32')
    fy = f.create_dataset('y', np_enc_y.shape, dtype='int64')

    fx[...] = np_enc_data
    fy[...] = np_enc_y

    split_dict = {
        'train': {'x': (0,splitpoint), 'y': (0, splitpoint)},
        'test': {'x': (splitpoint, N), 'y': (splitpoint, N)}}

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.flush()
    f.close()
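Assuming create_hdf5 has already been run, a minimal sketch of loading both splits fully into memory:
from fuel.datasets.hdf5 import H5PYDataset

train_set = H5PYDataset('mushrooms.hdf5', which_sets=('train',),
                        load_in_memory=True)
test_set = H5PYDataset('mushrooms.hdf5', which_sets=('test',),
                       load_in_memory=True)
x_train, y_train = train_set.data_sources
print(train_set.num_examples, test_set.num_examples)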
Example #49
0
def make_celeb_dataset(fuel_hdf5_path,
                       resize_shape):
    # get image list
    image_list = glob.glob(CELEBA_FACE_FOLDER + '*.jpg')
    num_images = len(image_list)

    # open image file
    fuel_file = h5py.File(name=fuel_hdf5_path, mode='w')

    # set new dataset for fuel file
    image_data = fuel_file.create_dataset(name='image_data',
                                          shape=(num_images, 3) + resize_shape,
                                          dtype='uint8')

    for idx, filepath in enumerate(image_list):
        original_image = Image.open(filepath).convert('RGB')
        resize_row = resize_shape[0] if original_image.size[0]<original_image.size[1] else original_image.size[0]
        resize_col = resize_shape[1] if original_image.size[0]>original_image.size[1] else original_image.size[1]
        original_image.thumbnail((resize_row, resize_col), Image.ANTIALIAS)

        if original_image.size[0] != resize_shape[0]:
            excess = (original_image.size[0] - resize_shape[0]) // 2
            original_image = original_image.crop((excess, 0, resize_shape[0]+excess, resize_shape[0]))
        elif original_image.size[1] != resize_shape[1]:
            excess = (original_image.size[1] - resize_shape[1]) // 2
            original_image = original_image.crop((0, excess, resize_shape[1], resize_shape[1]+excess))

        original_image = numpy.asarray(original_image)
        image_data[idx] = numpy.transpose(original_image, (2, 0, 1))

    image_data.dims[0].label = 'batch'
    image_data.dims[1].label = 'channel'
    image_data.dims[2].label = 'height'
    image_data.dims[3].label = 'width'

    split_dict = {'train': {'image_data': (0, num_images)}}
    fuel_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    fuel_file.flush()
    fuel_file.close()

    print('DONE: {} (num of images: {})'.format(fuel_hdf5_path, num_images))
Example #50
0
    def test_value_error_on_unequal_sources(self):
        def get_subsets():
            return H5PYDataset(self.h5file, which_sets=('train', )).subsets

        split_dict = {
            'train': {
                'features': (0, 20),
                'targets': (0, 15)
            },
            'test': {
                'features': (20, 30),
                'targets': (20, 30)
            },
            'unlabeled': {
                'features': (30, 100, None, '.')
            }
        }
        self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        assert_raises(ValueError, get_subsets)
Example #51
0
def add_split_dict(hdf5file, names, total_examples,
                   train_frac=0.83, valid_frac=0.10):
    # TODO: investigate the "reference" stuff so we can pluck validation
    # and testing events evenly from the sample
    final_train_index = int(total_examples * train_frac)
    final_valid_index = int(total_examples * (train_frac + valid_frac))

    train_dict = {name: (0, final_train_index)
                  for name in names}
    valid_dict = {name: (final_train_index, final_valid_index)
                  for name in names}
    test_dict = {name: (final_valid_index, total_examples)
                 for name in names}
    split_dict = {
        'train': train_dict,
        'valid': valid_dict,
        'test': test_dict
    }
    hdf5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
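A minimal sketch of how this helper might be used on a freshly written file; the dataset names and sizes below are made up.
import h5py
import numpy

with h5py.File('events.hdf5', mode='w') as h5file:
    h5file.create_dataset(
        'hits', data=numpy.zeros((1000, 8), dtype='float32'))
    h5file.create_dataset(
        'labels', data=numpy.zeros((1000, 1), dtype='uint8'))
    # 83% train, 10% valid, remainder test
    add_split_dict(h5file, ['hits', 'labels'], 1000)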
Example #52
0
def emboot_converter():
    (train_vector_features, train_targets,
     test_vector_features, test_targets) = load_emboot_np()
    f = h5py.File(emboot_dataset, mode='w')

    train_sz = train_vector_features.shape[0]
    test_sz = test_vector_features.shape[0]
    feat_sz = train_vector_features.shape[1]
    dataset_sz = train_sz + test_sz
    dataset_sz_new = 13000

    vector_features = f.create_dataset('features', (dataset_sz, feat_sz),
                                       dtype='float64')  ## train + test
    targets = f.create_dataset('targets', (dataset_sz, 1), dtype='uint8')

    ## put the data loaded into these objects
    vector_features[...] = np.vstack(
        [train_vector_features, test_vector_features])
    targets[...] = np.vstack([train_targets, test_targets])

    ## label the dims with names
    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    ## split attribute -- way to recover the splits
    # creating the split using an API
    split_dict = {
        'train': {
            'features': (0, 10400),
            'targets': (0, 10400)
        },
        'test': {
            'features': (10400, 13000),
            'targets': (10400, 13000)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.flush()
    f.close()
Example #53
0
def main():
    print('Loading categories')
    category2idx = json.load(
        open(os.path.join(DATA_PATH, 'categories.json'), 'rt'))

    print('Loading data in memory')
    dataset = H5PYDataset(
        DATA_TEMPLATE % (IMG_SIZE, IMG_SIZE),
        sources=['input_category', 'input_description', 'input_image'],
        which_sets=('all', ),
        load_in_memory=True)

    classes, texts, images = dataset.data_sources
    classes = np.array([cls[0] for cls in classes])
    texts = np.array([txt[0] for txt in texts])

    print("There are %i examples" % dataset.num_examples)
    print(texts.shape, images.shape, classes.shape)
    print("N. examples: %i, fst: %s" % (len(classes), classes[0]))

    # prepare filenames
    print("Creating filenames")
    filenames = create_captions(classes, texts, category2idx, False, False)
    print("N. files: %i, fst: %s" % (len(filenames), filenames[0]))

    train_idx, test_idx, _, _ = get_split(classes,
                                          classes.reshape(-1, 1),
                                          0.1,
                                          seed=2)

    print('Loading embedding model')
    model = Model(os.path.join(MODEL_PATH, 'frozen_model.pb'),
                  os.path.join(MODEL_PATH, 'tokenizer.pickle'),
                  maxlen=LIMIT)

    print('Saving test data')
    dump_all(classes, filenames, images, texts, test_idx, model,
             os.path.join(DATA_PATH, 'test'))

    print('Saving train data')
    dump_all(classes, filenames, images, texts, train_idx, model,
             os.path.join(DATA_PATH, 'train'))
Example #54
0
def test_text_to_h5py_dataset():
    test_dir = tempfile.mkdtemp()
    text_path = os.path.join(test_dir, 'text.txt')
    h5_path = os.path.join(test_dir, 'words.h5')
    with open(os.path.join(test_dir, 'text.txt'), 'w') as dst:
        print('abc', file=dst)
        print('été', file=dst)
        print('abc Δίας', file=dst)
    text_to_h5py_dataset(text_path, h5_path)

    f = H5PYDataset(h5_path, ('train', ))
    it = f.get_example_stream().get_epoch_iterator()
    assert next(it)[0] == 'abc'
    assert next(it)[0] == 'été'
    assert next(it)[0] == 'abc'
    assert next(it)[0] == 'Δίας'

    os.remove(text_path)
    os.remove(h5_path)
    os.rmdir(test_dir)
Example #55
0
def add_sets(args):
    with h5py.File(args.h5file, 'a') as h5file:
        sources = []
        for dataset in h5file:
            if (dataset.endswith('_indices') or dataset.endswith('_shapes')
                    or dataset.endswith('_shape_labels')):
                continue
            sources.append(dataset)

        uttid2idx = {
            uttid: idx
            for (idx, uttid) in enumerate(h5file['uttids'])
        }

        split_dict = {}
        for subset in args.sets:
            name, uttids_fname = subset.split('=')
            idxs = []
            with open(uttids_fname) as uf:
                for l in uf:
                    uttid = l.strip().split()[0]
                    idxs.append(uttid2idx[uttid])

            indices_name = '{}_indices'.format(name)

            if indices_name in h5file:
                del h5file[indices_name]

            #
            # Note: ideally, we would sort the indices and do:
            # h5file[indices_name] = numpy.array(sorted(idxs))
            # but this would cause incompatibility with Kaldi, which keeps utterances sorted by uttid!
            #
            h5file[indices_name] = numpy.array(idxs)
            indices_ref = h5file[indices_name].ref
            split_dict[name] = {
                source: (-1, -1, indices_ref)
                for source in sources
            }

        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
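Reading a reference-based split back requires nothing special; H5PYDataset resolves the stored index references on its own. A sketch, assuming a subset named 'dev' was registered in the file:
from fuel.datasets.hdf5 import H5PYDataset
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream

dev_set = H5PYDataset('data.h5', which_sets=('dev',))
stream = DataStream(
    dev_set,
    iteration_scheme=SequentialScheme(examples=dev_set.num_examples,
                                      batch_size=16))
first_batch = next(stream.get_epoch_iterator())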
Example #56
0
def create_ivector_datastream(path,
                              which_set,
                              batch_size=1,
                              delay=0,
                              min_after_cache=1024,
                              length_sort=False):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)

    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)

    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors', 'targets'])

    if length_sort:
        fs = LengthSortTransformer(fs, batch_size, min_after_cache)

    if delay:
        fs = DelayTransformer(fs, delay)
    return Padding(fs)
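A hypothetical call and one pass over the resulting padded stream; the file path and set name are placeholders. Padding appends a mask source after each original source, so the iterator yields six arrays per batch.
stream = create_ivector_datastream('wsj_fbank_ivector.h5', 'train_si84',
                                   batch_size=16)
iterator = stream.get_epoch_iterator()
(features, features_mask, ivectors, ivectors_mask,
 targets, targets_mask) = next(iterator)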
Example #57
0
def load_stream(batch_size=64, source=None, img=None):
    if source is None:
        raise ValueError('No source provided')
    
    logger.info(
        'Loading data from `{}` (using {}x{}) and quantizing to {} colors'.format(
        source, DIM_X, DIM_Y, N_COLORS))
    
    f = h5py.File(source, 'r')
    arr = f['features'][:1000]
    arr = arr.transpose(0, 2, 3, 1)
    arr = arr.reshape((arr.shape[0] * arr.shape[1], arr.shape[2], arr.shape[3]))
    img = Image.fromarray(arr).convert(
        'P', palette=Image.ADAPTIVE, colors=N_COLORS)

    train_data = H5PYDataset(source, which_sets=('train',))
    num_train = train_data.num_examples

    train_scheme = ShuffledScheme(examples=num_train, batch_size=batch_size)
    train_stream = To8Bit(img=img, data_stream=DataStream(
        train_data, iteration_scheme=train_scheme))
    return train_stream, num_train, img
Example #58
0
def make_gen(Nchunks=True, classif=False, train=True):
    '''
        Nchunks==True : 10 chunks in the generator
        Nchunks == False : 1 chunk in the generator
        Makes the distinction between classification/regression
        Makes the distinction between test/train
    '''

    file_path_f = file_path_R
    shift_f = shift_R
    scale_f = scale_R
    if classif:
        file_path_f = file_path_C
        shift_f = shift_C
        scale_f = scale_C

    if Nchunks:
        batch_size = 13
    else:
        batch_size = 130
    t_scheme = SequentialScheme(examples=130, batch_size=batch_size)
    t_source = 'train'
    if not train:
        if Nchunks:
            batch_size = 2
        else:
            batch_size = 20
        t_source = 'test'
        t_scheme = SequentialScheme(examples=20, batch_size=batch_size)

    t_set = H5PYDataset(file_path_f, which_sets=[t_source])
    data_stream_t = DataStream(dataset=t_set, iteration_scheme=t_scheme)

    stand_stream_t = ScaleAndShift(data_stream=data_stream_t,
                                   scale=scale_f, shift=shift_f,
                                   which_sources=t_source)

    return stand_stream_t, t_set, data_stream_t
Example #59
0
def remove_files_from_dataset(hdf5filename1, subset, noms, noms_to_remove, output_):
    from fuel.datasets.hdf5 import H5PYDataset

    set1 = H5PYDataset(hdf5filename1, which_sets=(subset,))
    print('before:', set1.num_examples)

    handle1 = set1.open()
    data1 = set1.get_data(handle1, slice(0, set1.num_examples))
    set1.close(handle1)

    Xarray = []
    yarray = []

    for ind, nom in enumerate(noms):
        if nom in noms_to_remove: continue
        Xarray.append(data1[0][ind])
        yarray.append(data1[1][ind])
    Xarray = np.asarray(Xarray)
    yarray = np.asarray(yarray)

    print('after:', Xarray.shape)

    create_hdf5_from_arrays(Xarray, yarray, output_)
Example #60
0
def createH5Dataset(hdf5_out, corpus_path, sequence_length):
    with open(corpus_path) as f:
        corpus = f.read().split(",")

    (indices, vocab) = pd.factorize(list(corpus))

    instances_num = len(corpus) // (sequence_length + 1)

    f = h5py.File(hdf5_out, mode='w')

    train_data_x = np.zeros((instances_num, sequence_length), dtype=np.uint8)
    train_data_y = np.zeros((instances_num, sequence_length), dtype=np.uint8)

    for j in range(instances_num):
        for i in range(sequence_length):
            train_data_x[j][i] = indices[i + j * (sequence_length + 1)]
            train_data_y[j][i] = indices[i + j * (sequence_length + 1) + 1]

    char_in = f.create_dataset('inchar', train_data_x.shape, dtype='uint8')
    char_out = f.create_dataset('outchar', train_data_y.shape, dtype='uint8')

    char_in[...] = train_data_x
    char_out[...] = train_data_y

    split_dict = {
        'train': {
            'inchar': (0, instances_num),
            'outchar': (0, instances_num)
        }
    }

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.attrs["vocab"] = json.dumps(list(vocab))

    f.flush()
    f.close()
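A short sketch of reading the result back and recovering the vocabulary from the file attribute; the file names are placeholders and the corpus file is expected to be comma-separated, as the function above assumes.
import json

import h5py
from fuel.datasets.hdf5 import H5PYDataset

createH5Dataset('chars.hdf5', 'corpus.txt', sequence_length=50)

with h5py.File('chars.hdf5', 'r') as f:
    vocab = json.loads(f.attrs['vocab'])

train_set = H5PYDataset('chars.hdf5', which_sets=('train',))
print(train_set.num_examples, len(vocab))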