def valid(self, req_vars):
    valid_dataset = TaxiDataset(self.config.valid_set, 'valid.hdf5')
    train_dataset = TaxiDataset('train')
    valid_trips_ids = valid_dataset.get_data(
        None, slice(0, valid_dataset.num_examples))[
            valid_dataset.sources.index('trip_id')]

    prefix_stream = DataStream(valid_dataset,
                               iteration_scheme=SequentialExampleScheme(
                                   valid_dataset.num_examples))
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.taxi_add_first_last_len(
        prefix_stream, self.config.n_begin_end_pts)
    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(
                              self.config.batch_size))

    candidate_stream = DataStream(train_dataset,
                                  iteration_scheme=ShuffledExampleScheme(
                                      train_dataset.num_examples))
    candidate_stream = transformers.TaxiExcludeTrips(candidate_stream,
                                                     valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    candidate_stream = transformers.taxi_add_first_last_len(
        candidate_stream, self.config.n_begin_end_pts)
    candidate_stream = Batch(candidate_stream,
                             iteration_scheme=ConstantScheme(
                                 self.config.valid_candidate_size))

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = MultiProcessing(stream)
    return stream
def get_datastream(dataset, batch_size=160):
    dataset = DataStream(
        dataset,
        iteration_scheme=ShuffledScheme(dataset.num_examples, batch_size),
    )
    dataset = Padding(dataset)
    # if flatten:
    #     dataset = Flatten(dataset, which_sources=('features',))

    def _transpose(data):
        return tuple(np.rollaxis(array, 1, 0) for array in data)

    dataset = Mapping(dataset, _transpose)
    return dataset
def _get_sgnmt_dev_stream(val_set=None, src_vocab=None,
                          src_vocab_size=30000, **kwargs):
    """Setup development set stream if necessary.

    The arguments to this method are given by the configuration dict.
    """
    dev_stream = None
    if val_set is not None:
        src_vocab = _add_special_ids({str(i): i
                                      for i in xrange(src_vocab_size)})
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream
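# A minimal usage sketch for _get_sgnmt_dev_stream. The file name below is a
# placeholder and _add_special_ids is assumed to be defined alongside the
# function; SGNMT input is already integer-encoded, which is why the vocabulary
# above is just the identity mapping over ID strings.
def _example_sgnmt_dev_stream():
    dev_stream = _get_sgnmt_dev_stream(val_set='dev.ids.txt')
    if dev_stream is not None:
        for (sentence,) in dev_stream.get_epoch_iterator():
            pass  # `sentence` is a list of word IDs terminated by EOS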
def get_datastream(path, norm_path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    data_mean_std = numpy.load(norm_path)

    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    base_stream = Normalize(data_stream=base_stream,
                            means=data_mean_std['mean'],
                            stds=data_mean_std['std'])
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
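# Normalize is not a stock Fuel transformer. Below is a minimal sketch of one
# way it could be written with fuel.transformers.AgnosticSourcewiseTransformer,
# assuming batches are lists of (time, feature_dim) arrays and `means`/`stds`
# are per-dimension vectors; the project's real implementation may differ.
from fuel.transformers import AgnosticSourcewiseTransformer

class NormalizeSketch(AgnosticSourcewiseTransformer):
    def __init__(self, data_stream, means, stds, **kwargs):
        kwargs.setdefault('which_sources', ('features',))
        super(NormalizeSketch, self).__init__(
            data_stream,
            produces_examples=data_stream.produces_examples,
            **kwargs)
        self.means = means
        self.stds = stds

    def transform_any_source(self, source_data, _):
        # standardize each example in the batch, dimension-wise
        return [(example - self.means) / self.stds
                for example in source_data]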
def setUp(self):
    rng = numpy.random.RandomState(123)
    self.stream = DataStream(
        IndexableDataset(
            OrderedDict([('features', rng.rand(4, 2, 2)),
                         ('targets', numpy.array([0, 1, 0, 1]))]),
            axis_labels={'features': ('batch', 'width', 'height'),
                         'targets': ('batch',)}),
        iteration_scheme=SequentialScheme(4, 2))
    self.duplicate = Duplicate(self.stream, 'features')
def setUp(self):
    self.data = OrderedDict([
        ('features', numpy.ones((4, 2, 2))),
        ('targets', numpy.array([[0, 1, 2], [1, 0, 1],
                                 [1, 1, 1], [2, 0, 0]]))])
    self.neg_data = OrderedDict([
        ('features', numpy.ones((4, 2, 2))),
        ('targets', numpy.array([[0, -1, 2], [1, 0, -3],
                                 [1, 1, 1], [2, 0, 0]]))])
    self.num_classes = (3, 2, 3)
    self.stream_example = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=self.num_classes,
        which_sources=('targets',))
def create_gaussian_mixture_data_streams(batch_size, monitoring_batch_size,
                                         means=None, variances=None,
                                         priors=None, rng=None,
                                         num_examples=100000,
                                         sources=('features', )):
    train_set = GaussianMixture(num_examples=num_examples, means=means,
                                variances=variances, priors=priors,
                                rng=rng, sources=sources)
    valid_set = GaussianMixture(num_examples=num_examples, means=means,
                                variances=variances, priors=priors,
                                rng=rng, sources=sources)

    main_loop_stream = DataStream(
        train_set,
        iteration_scheme=ShuffledScheme(
            train_set.num_examples, batch_size=batch_size, rng=rng))
    # Monitoring streams iterate over 5000 shuffled examples in
    # monitoring-sized minibatches.
    train_monitor_stream = DataStream(
        train_set,
        iteration_scheme=ShuffledScheme(5000, monitoring_batch_size,
                                        rng=rng))
    valid_monitor_stream = DataStream(
        valid_set,
        iteration_scheme=ShuffledScheme(5000, monitoring_batch_size,
                                        rng=rng))

    return main_loop_stream, train_monitor_stream, valid_monitor_stream
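# A hedged usage sketch; the means below are illustrative placeholders, and
# GaussianMixture is assumed to be supplied by the surrounding project (it is
# not part of Fuel core).
def _example_gaussian_mixture_streams():
    means = [numpy.array([-1.0, -1.0]), numpy.array([1.0, 1.0])]
    main, train_mon, valid_mon = create_gaussian_mixture_data_streams(
        batch_size=64, monitoring_batch_size=250, means=means)
    for (features,) in main.get_epoch_iterator():
        pass  # `features` has shape (64, 2); feed it to the training step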
def test_padding_data_stream():
    # 1-D sequences
    stream = Batch(
        DataStream(IterableDataset([[1], [2, 3], [], [4, 5, 6], [7]])),
        ConstantScheme(2))
    mask_stream = Padding(stream)
    assert mask_stream.sources == ("data", "data_mask")
    it = mask_stream.get_epoch_iterator()
    data, mask = next(it)
    assert (data == numpy.array([[1, 0], [2, 3]])).all()
    assert (mask == numpy.array([[1, 0], [1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[0, 0, 0], [4, 5, 6]])).all()
    assert (mask == numpy.array([[0, 0, 0], [1, 1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[7]])).all()
    assert (mask == numpy.array([[1]])).all()

    # 2-D sequences
    stream2 = Batch(
        DataStream(IterableDataset([numpy.ones((3, 4)),
                                    2 * numpy.ones((2, 4))])),
        ConstantScheme(2))
    it = Padding(stream2).get_epoch_iterator()
    data, mask = next(it)
    assert data.shape == (2, 3, 4)
    assert (data[0, :, :] == 1).all()
    assert (data[1, :2, :] == 2).all()
    assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()

    # two sources
    stream3 = Padding(Batch(
        DataStream(IterableDataset(dict(features=[[1], [2, 3]],
                                        targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    assert len(next(stream3.get_epoch_iterator())) == 4
def create_ivector_test_datastream(path, which_set, batch_size=1, delay=0):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = SequentialScheme(batch_size=batch_size,
                                       examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors', 'targets'])
    if delay:
        fs = DelayTransformer(fs, delay)
    fs = FilterSources(data_stream=fs, sources=['features', 'ivectors'])
    return Padding(fs)
def train(model=None):
    if model is not None:
        trainset = H5PYDataset('svhn_format_2.hdf5', which_sets=('train',),
                               sources=('features', 'targets'))
        trainstream = DataStream(trainset,
                                 iteration_scheme=SequentialScheme(
                                     examples=trainset.num_examples,
                                     batch_size=500))
        for data in trainstream.get_epoch_iterator():
            images, labels = data
            # standardize the input images
            m = images.mean(axis=(2, 3), keepdims=True)
            s = images.std(axis=(2, 3), keepdims=True)
            images = (images - m) / s
            # change from "channel_first" to "channel_last"
            images = np.transpose(images, (0, 2, 3, 1))
            labels = keras.utils.to_categorical(labels)
            # print images.shape
            model.train_on_batch(x=images, y=labels)
        trainstream.close()
def test_h5py_dataset_datastream_pickles():
    stream = None
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5),
                                         dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset('tmp.hdf5', which_sets=('train',))
        stream = DataStream(dataset)
        pickle.loads(pickle.dumps(stream))
    finally:
        # guard against an exception raised before `stream` was created
        if stream is not None:
            stream.close()
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
def get_stream(input_file, vocab_file, **kwargs):
    unk_token = kwargs.pop('unk_token')
    eos_token = kwargs.pop('eos_token')

    dataset = TextFile(files=[input_file],
                       encoding='UTF-8',
                       preprocess=to_lower_case,
                       dictionary=pkl.load(open(vocab_file, 'rb')),
                       level='word',
                       unk_token=unk_token,
                       bos_token=None,
                       eos_token=eos_token)
    stream = DataStream(dataset)
    return stream
def get_dev_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream
def candidate_stream(self, n_candidates):
    candidate_stream = DataStream(self.train_dataset,
                                  iteration_scheme=ShuffledExampleScheme(
                                      self.train_dataset.num_examples))
    if not data.tvt:
        candidate_stream = transformers.TaxiExcludeTrips(
            candidate_stream, self.valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    candidate_stream = transformers.taxi_add_first_last_len(
        candidate_stream, self.config.n_begin_end_pts)
    if not data.tvt:
        candidate_stream = transformers.add_destination(candidate_stream)
    return Batch(candidate_stream,
                 iteration_scheme=ConstantScheme(n_candidates))
def prep_dataset(dataset):
    dataset = dataset[:(len(dataset) -
                        (len(dataset) % (seq_len * batch_size)))]
    dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

    stream = DataStream(
        IndexableDataset(indexables=OrderedDict([('data', dataset)])),
        iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
    stream = Transpose(stream, [(1, 0)])
    stream = SampleDropsNPWord(
        stream, z_prob_states, z_prob_cells, drop_prob_igates,
        layer_size, num_layers, False, stoch_depth, share_mask,
        gaussian_drop, alphabetsize)
    stream.sources = ('data', ) * 3 + stream.sources + (
        'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
    return (stream, )
def infer(path, ae_encode):
    '''
    :param path: path of infer data
    :param ae_encode: compiled theano function
    :return: image saved path in string
    '''
    hf = h5py.File(path, 'r+')
    split_dict = {
        'test': {'input': (0, 1), 'target': (0, 1)},
    }
    hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    test_set = H5PYDataset(path, which_sets=('test', ))
    batch_size = 1
    test_scheme = SequentialScheme(examples=test_set.num_examples,
                                   batch_size=batch_size)
    test_stream = DataStream(test_set, iteration_scheme=test_scheme)

    for te_train, te_target in test_stream.get_epoch_iterator():
        break
    te_out, te_ta = ae_encode(input_transform(te_train),
                              target_transform(te_target))
    te_reshape = inverse(te_out)
    te_target_reshape = inverse(te_ta)

    new_size = (128 * 2, 160)
    new_im = Image.new('RGB', new_size)
    r = np.random.choice(1, 1, replace=False).reshape(1, 1)
    for i in range(1):
        for j in range(1):
            index = r[i][j]
            target_im = Image.fromarray(te_target_reshape[index])
            train_im = Image.fromarray(te_train[index].astype(np.uint8))
            im = Image.fromarray(te_reshape[index])
            new_im.paste(train_im, (128 * (i * 2), 160 * j))
            new_im.paste(im, (128 * (i * 2 + 1), 160 * j))
    img_loc = "gen_images/%i.png" % int(time())
    new_im.save(img_loc)
    return img_loc
def test_data_stream_mapping_sort_multisource_ndarrays():
    data = OrderedDict()
    data['x'] = [numpy.array([1, 2, 3]),
                 numpy.array([2, 3, 1]),
                 numpy.array([3, 2, 1])]
    data['y'] = [numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4])]
    data_sorted = [(numpy.array([1, 2, 3]), numpy.array([6, 5, 4])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 6, 5])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 5, 6]))]
    stream = DataStream(IterableDataset(data))
    wrapper = Mapping(stream, mapping=SortMapping(operator.itemgetter(0)))
    for output, ground_truth in zip(wrapper.get_epoch_iterator(),
                                    data_sorted):
        assert len(output) == len(ground_truth)
        assert (output[0] == ground_truth[0]).all()
        assert (output[1] == ground_truth[1]).all()
def create_data(data):
    stream = DataStream(data,
                        iteration_scheme=ShuffledScheme(data.num_examples,
                                                        batch_size))

    # Data augmentation
    stream = MinimumImageDimensions(stream, image_size,
                                    which_sources=('image_features',))
    stream = MaximumImageDimensions(stream, image_size,
                                    which_sources=('image_features',))
    stream = RandomHorizontalSwap(stream, which_sources=('image_features',))
    stream = Random2DRotation(stream, which_sources=('image_features',))
    # stream = ScikitResize(stream, image_size,
    #                       which_sources=('image_features',))

    # Data transformation
    stream = ScaleAndShift(stream, 1. / 255, 0,
                           which_sources=('image_features',))
    stream = Cast(stream, dtype='float32',
                  which_sources=('image_features',))
    return stream
def get_tst_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, **kwargs):
    tst_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionaries and ensure special tokens exist
        src_vocab = ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        tst_dataset = TextFile([val_set], src_vocab, None)
        tst_stream = DataStream(tst_dataset)
    return tst_stream
def get_stream(self, part, batch_size=None, max_length=None, seed=None):
    dataset = self.get_dataset(part, max_length)
    if self._layout == 'lambada' and part == 'train':
        stream = DataStream(
            dataset,
            iteration_scheme=RandomSpanScheme(
                dataset.num_examples, max_length, seed))
        stream = Mapping(stream, listify)
    else:
        stream = dataset.get_example_stream()

    stream = SourcewiseMapping(stream, functools.partial(add_bos,
                                                         Vocabulary.BOS))
    stream = SourcewiseMapping(stream, vectorize)
    if not batch_size:
        return stream
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream)
    return stream
def start_server(port, which_set):
    fuel.server.logger.setLevel('WARN')
    dataset = IMDBText(which_set, sorted=True)

    n_train = dataset.num_examples
    # scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
    scheme = BatchwiseShuffledScheme(examples=n_train,
                                     batch_size=batch_size)

    stream = DataStream(dataset=dataset, iteration_scheme=scheme)
    print "loading glove"
    glove = GloveTransformer(glove_version, data_stream=stream)
    padded = Padding(data_stream=glove, mask_sources=('features',))

    fuel.server.start_server(padded, port=port, hwm=20)
def make_scheme_and_stream(dset, batchsize, msg_string, shuffle=True):
    """
    dset is a Fuel `DataSet` and batchsize is an int representing the
    number of examples requested per minibatch
    """
    if shuffle:
        print(msg_string + " Preparing shuffled datastream for "
              "{} examples.".format(dset.num_examples))
        scheme = ShuffledScheme(examples=dset.num_examples,
                                batch_size=batchsize)
    else:
        print(msg_string + " Preparing sequential datastream for "
              "{} examples.".format(dset.num_examples))
        scheme = SequentialScheme(examples=dset.num_examples,
                                  batch_size=batchsize)
    data_stream = DataStream(dataset=dset, iteration_scheme=scheme)
    return scheme, data_stream
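# A hedged usage sketch for make_scheme_and_stream with a small in-memory
# dataset; the source name 'features' and the msg_string are illustrative.
def _example_make_scheme_and_stream():
    from collections import OrderedDict
    from fuel.datasets import IndexableDataset
    dset = IndexableDataset(OrderedDict([
        ('features', numpy.arange(20, dtype='float32').reshape(10, 2))]))
    scheme, stream = make_scheme_and_stream(dset, batchsize=5,
                                            msg_string='[train]')
    for (batch,) in stream.get_epoch_iterator():
        assert batch.shape == (5, 2)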
def get_comb_stream(fea2obj, which_set, batch_size=None, shuffle=True):
    streams = []
    for fea in fea2obj:
        obj = fea2obj[fea]
        dataset = H5PYDataset(obj.fuelfile, which_sets=(which_set,),
                              load_in_memory=True)
        if batch_size is None:
            batch_size = dataset.num_examples
        if shuffle:
            iterschema = ShuffledScheme(examples=dataset.num_examples,
                                        batch_size=batch_size)
        else:
            iterschema = SequentialScheme(examples=dataset.num_examples,
                                          batch_size=batch_size)
        stream = DataStream(dataset=dataset, iteration_scheme=iterschema)
        if fea in seq_features:
            stream = CutInput(stream, obj.max_len)
            if obj.rec:
                logger.info('transforming data for recursive input')
                # Required because recurrent bricks receive input as
                # [sequence, batch, features]
                stream = LettersTransposer(stream, which_sources=fea)
        streams.append(stream)
    stream = Merge(streams, tuple(fea2obj.keys()))
    return stream, dataset.num_examples
def get_dev_streams(config):
    """Setup development set streams if necessary."""
    dev_streams = {}
    for cg in config['cgs']:
        if 'val_sets' in config and cg in config['val_sets']:
            logger.info('Building development stream for cg:[{}]'.format(cg))
            eid = p_(cg)[0]
            dev_file = config['val_sets'][cg]

            # Get dictionary and fix EOS
            dictionary = cPickle.load(open(config['src_vocabs'][eid]))
            dictionary['<S>'] = 0
            dictionary['<UNK>'] = config['unk_id']
            dictionary['</S>'] = config['src_eos_idxs'][eid]

            # Get as a text file and convert it into a stream
            dev_dataset = TextFile([dev_file], dictionary, None)
            dev_streams[cg] = DataStream(dev_dataset)
    return dev_streams
def valid(self, req_vars):
    prefix_stream = DataStream(self.valid_dataset,
                               iteration_scheme=SequentialExampleScheme(
                                   self.valid_dataset.num_examples))
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.taxi_add_first_last_len(
        prefix_stream, self.config.n_begin_end_pts)
    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(
                              self.config.batch_size))

    candidate_stream = self.candidate_stream(
        self.config.valid_candidate_size)

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = MultiProcessing(stream)
    return stream
def setup_cnnsquad_datastream(sq_path, cnn_path, vocab_file, config):
    ds = CNNSQDataset(sq_path, cnn_path, vocab_file)
    it = CNNSQIterator(sq_path, cnn_path, cnn_ratio=config.add_cnn_data)

    stream = DataStream(ds, iteration_scheme=it)
    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       config.batch_size * config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'answer'],
                     mask_dtype='int32')
    return ds, stream
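# The sort-and-rebatch trick above is generic: read a window of several
# batches' worth of examples, sort the window by sequence length, then
# re-batch so each minibatch holds similarly sized sequences (less padding
# waste). A self-contained sketch on toy data, with all names illustrative:
def _example_sorted_batch_stream():
    from fuel.datasets import IterableDataset
    from fuel.schemes import ConstantScheme
    from fuel.streams import DataStream
    from fuel.transformers import (Batch, Mapping, Padding, SortMapping,
                                   Unpack)

    seqs = [[1], [2, 3, 4], [5, 6], [7, 8, 9, 10], [11], [12, 13]]
    stream = DataStream(IterableDataset({'context': seqs}))
    stream = Batch(stream, iteration_scheme=ConstantScheme(6))  # sort window
    stream = Mapping(stream, SortMapping(lambda example: len(example[0])))
    stream = Unpack(stream)
    stream = Batch(stream, iteration_scheme=ConstantScheme(2))
    return Padding(stream, mask_dtype='int32')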
def test_auc_monitor():
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)

    test_probs = shared_floatx(
        numpy.array([[0.0, 0.0, 1.0],
                     [0.75, 0.25, 0.0],
                     [0.0, 0.75, 0.25],
                     [0.25, 0.75, 0.0]], dtype=floatX))
    targets = shared_floatx(
        numpy.array([[0.0, 0.0, 1.0],
                     [1.0, 0.0, 0.0],
                     [0.0, 1.0, 0.0],
                     [0.0, 0.0, 1.0]], dtype=floatX))
    auc_monitor = AUCMonitor(datastream, test_probs, targets)
    auc_monitor.main_loop = setup_mainloop([])
    auc_monitor.do('after_batch')
    assert_allclose(auc_monitor.main_loop.log[0]['auc'],
                    0.81944444444444453)
def setup_toy_datastream(config):
    ds = ToyDataset()
    it = ToyIterator()

    stream = DataStream(ds, iteration_scheme=it)
    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       config.batch_size * config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'answer',
                                   'ans_indices'],
                     mask_dtype='int32')
    return ds, stream
def create_data(data):
    stream = DataStream(data,
                        iteration_scheme=ShuffledScheme(data.num_examples,
                                                        batch_size))
    stream_downscale = MinimumImageDimensions(
        stream, image_size, which_sources=('image_features',))
    stream_rotate = Random2DRotation(stream_downscale,
                                     which_sources=('image_features',))
    stream_max = ScikitResize(stream_rotate, image_size,
                              which_sources=('image_features',))
    stream_scale = ScaleAndShift(stream_max, 1. / 255, 0,
                                 which_sources=('image_features',))
    stream_cast = Cast(stream_scale, dtype='float32',
                       which_sources=('image_features',))
    # stream_flat = Flatten(stream_scale, which_sources=('image_features',))
    return stream_cast
def _construct_sequential_stream(self, dataset, for_type='train'):
    '''Construct a sequential stream from an IndexableDataset object.

    Subclasses should add transformations on the stream, e.g.,
    1. Sort samples by size
    2. Batch the dataset
    3. Add masks on samples

    :param dataset: fuel.IndexableDataset
        This is constructed by the self._construct_dataset method.
    :return: fuel.stream.DataStream
        An object of fuel.stream.DataStream with a SequentialExampleScheme;
        a fuel sequential stream with basic transformations.
    '''
    it = SequentialExampleScheme(dataset.num_examples)
    stream = DataStream(dataset, iteration_scheme=it)
    # # Batch examples
    # stream = Batch(stream,
    #                iteration_scheme=ConstantScheme(self.batch_size))
    # # Add masks on inputs
    # for source in self.need_mask_sources.iteritems():
    #     stream = Padding(stream, mask_sources=[source[0]],
    #                      mask_dtype=source[1])
    return stream
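# A sketch of what the docstring above asks subclasses to add, with the
# commented-out transformations switched on. self.batch_size and
# self.need_mask_sources (a {source_name: mask_dtype} dict) are assumed
# attributes mirroring the commented code; Python 2 iteritems() is kept for
# consistency with the snippet.
def _construct_batched_stream_sketch(self, dataset):
    stream = DataStream(dataset,
                        iteration_scheme=SequentialExampleScheme(
                            dataset.num_examples))
    # Batch examples
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(self.batch_size))
    # Add a mask for every source that needs one
    for source, dtype in self.need_mask_sources.iteritems():
        stream = Padding(stream, mask_sources=[source], mask_dtype=dtype)
    return stream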