Example No. 1
    def test_transform_source_example(self):
        assert_equal(list(self.stream_example.get_epoch_iterator()),
                     [(numpy.ones(
                         (2, 2)), numpy.array([[1, 0, 0, 0, 1, 0, 0, 1]])),
                      (numpy.ones(
                          (2, 2)), numpy.array([[0, 1, 0, 1, 0, 0, 1, 0]])),
                      (numpy.ones(
                          (2, 2)), numpy.array([[0, 1, 0, 0, 1, 0, 1, 0]])),
                      (numpy.ones(
                          (2, 2)), numpy.array([[0, 0, 1, 1, 0, 1, 0, 0]]))])

        stream_example_invalid = StructuredOneHotEncoding(
            DataStream(IndexableDataset(self.data),
                       iteration_scheme=SequentialExampleScheme(4)),
            num_classes=[2, 3, 3],
            which_sources=('targets', ))

        assert_raises(ValueError, list,
                      stream_example_invalid.get_epoch_iterator())

        source_example_negative = StructuredOneHotEncoding(
            DataStream(IndexableDataset(self.neg_data),
                       iteration_scheme=SequentialExampleScheme(4)),
            num_classes=self.num_classes,
            which_sources=('targets', ))

        assert_raises(ValueError, list,
                      source_example_negative.get_epoch_iterator())
Example No. 2
def test_concatenated_scheme_infers_request_type():
    assert not ConcatenatedScheme(schemes=[
        ConstantScheme(batch_size=10, times=5),
        ConstantScheme(batch_size=10, times=5)
    ]).requests_examples
    assert ConcatenatedScheme(schemes=[
        SequentialExampleScheme(examples=10),
        SequentialExampleScheme(examples=10)
    ]).requests_examples
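
What the test above relies on can be seen by inspecting the schemes directly; the following is a minimal sketch assuming Fuel's public scheme API, not taken from any of the repositories above.

from fuel.schemes import ConstantScheme, SequentialExampleScheme

# Batch schemes request whole batches; example schemes request single indices.
batch_scheme = ConstantScheme(batch_size=10, times=5)
example_scheme = SequentialExampleScheme(examples=10)

print(batch_scheme.requests_examples)    # False
print(example_scheme.requests_examples)  # True
print(list(batch_scheme.get_request_iterator()))    # [10, 10, 10, 10, 10]
print(list(example_scheme.get_request_iterator()))  # [0, 1, 2, ..., 9]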
Example No. 3
    def test(self, req_vars):
        prefix_stream = DataStream(self.test_dataset,
                                   iteration_scheme=SequentialExampleScheme(
                                       self.test_dataset.num_examples))
        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        prefix_stream = transformers.taxi_add_first_last_len(
            prefix_stream, self.config.n_begin_end_pts)

        if not data.tvt:
            prefix_stream = transformers.taxi_remove_test_only_clients(
                prefix_stream)

        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))

        candidate_stream = self.candidate_stream(
            self.config.test_candidate_size)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)
        stream = transformers.Select(stream, tuple(req_vars))
        stream = MultiProcessing(stream)
        return stream
Example No. 4
    def __init__(self,
                 bs=64,
                 num=82611,
                 dataset='train',
                 img_size=32,
                 lang_N=57,
                 **kwargs):
        self.provides_sources = ('features', 'captions', 'mask')
        super(MSCoco, self).__init__(**kwargs)
        self.num_examples = num + (num % bs)
        self.bs = bs
        self.num = num
        self.lang_N = lang_N

        self.example_iteration_scheme = SequentialExampleScheme(
            self.num_examples)
        self.index = 0
        self.done = False

        self.imgs = np.load('coco/' + dataset + '-images-' + str(img_size) +
                            'x' + str(img_size) + '.npy')
        self.imgs = self.imgs.reshape((num, 3, img_size * img_size))
        self.caps = np.load('coco/' + dataset + '-captions.npy')
        self.caps = self.caps.reshape(
            (num, self.caps.shape[0] / num, self.caps.shape[-1]))
        if self.caps.shape[-1] != self.lang_N:
            c = np.zeros((num, self.caps.shape[1], self.lang_N))
            c[:, :, :self.caps.shape[-1]] = self.caps
            self.caps = c

        self.images = np.zeros((bs, 3, img_size * img_size)).astype('float32')
        self.captions = np.zeros((bs, self.caps.shape[-1])).astype(int)
        self.mask = np.ones((bs, self.caps.shape[-1])).astype(int)
Example No. 5
def get_dataset_iterator(dataset,
                         split,
                         include_features=True,
                         include_targets=False,
                         unit_scale=True):
    """Get iterator for dataset, split, targets (labels) and scaling (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split, )

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    datastream = H5PYDataset(dataset_fname, which_sets=splits, sources=sources)
    if unit_scale:
        datastream.default_transformers = uint8_pixels_to_floatX(
            ('features', ))

    train_stream = DataStream.default_stream(
        dataset=datastream,
        iteration_scheme=SequentialExampleScheme(datastream.num_examples))

    it = train_stream.get_epoch_iterator()
    return it
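
Hypothetical usage of the helper above; the 'mnist' name is an assumption and only works if a matching mnist.hdf5 file is on Fuel's data path.

# Hypothetical call; each request from SequentialExampleScheme yields one example.
it = get_dataset_iterator('mnist', 'train', include_targets=True)
features, targets = next(it)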
Example No. 6
    def __init__(self, file_or_path, which_sets, subset=None,
                 load_in_memory=False, driver=None, sort_indices=True,
                 **kwargs):
        if isinstance(file_or_path, h5py.File):
            self.path = file_or_path.filename
            self.external_file_handle = file_or_path
        else:
            self.path = file_or_path
            self.path = cache_file(self.path)
            self.external_file_handle = None
        which_sets_invalid_value = (
            isinstance(which_sets, six.string_types) or
            not all(isinstance(s, six.string_types) for s in which_sets))
        if which_sets_invalid_value:
            raise ValueError('`which_sets` should be an iterable of strings')
        self.which_sets = which_sets
        self.user_given_subset = subset if subset else slice(None)
        self.load_in_memory = load_in_memory
        self.driver = driver
        self.sort_indices = sort_indices

        self._parse_dataset_info()

        kwargs.setdefault('axis_labels', self.default_axis_labels)
        super(H5PYDataset, self).__init__(**kwargs)

        # It is really important to do it here, because self.num_examples
        # call will cause a crash if done before calling
        # super(...).__init__
        self.example_iteration_scheme = SequentialExampleScheme(
            self.num_examples)
Example No. 7
    def valid(self, req_vars):
        prefix_stream = DataStream(self.valid_dataset,
                                   iteration_scheme=SequentialExampleScheme(
                                       self.valid_dataset.num_examples))

        #prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)

        prefix_stream = transformers.taxi_add_datetime(prefix_stream)

        prefix_stream = transformers.balanced_batch(
            prefix_stream,
            key='latitude',
            batch_size=self.config.batch_size,
            batch_sort_size=self.config.batch_sort_size)

        prefix_stream = Padding(prefix_stream,
                                mask_sources=['latitude', 'longitude'])

        candidate_stream = self.candidate_stream(
            self.config.valid_candidate_size)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)

        stream = transformers.Select(stream, tuple(req_vars))
        # stream = MultiProcessing(stream)

        return stream
Example No. 8
    def test_flatten_examples(self):
        wrapper = Flatten(DataStream(
            IndexableDataset(self.data),
            iteration_scheme=SequentialExampleScheme(4)),
                          which_sources=('features', ))
        assert_equal(list(wrapper.get_epoch_iterator()),
                     [(numpy.ones(4), 0), (numpy.ones(4), 1)] * 2)
Example No. 9
    def __init__(self, banned, bs=64, num=10000, dataset='train', **kwargs):
        self.provides_sources = ('features', 'captions', 'mask')
        super(CaptionedMNIST, self).__init__(**kwargs)
        self.num_examples = num + (num % bs)
        self.bs = bs

        self.example_iteration_scheme = SequentialExampleScheme(
            self.num_examples)
        self.index = -1
        self.done = False
        self.banned = banned
        f = gzip.open('mnist.pkl.gz', 'rb')
        train_set, valid_set, test_set = cPickle.load(f)
        f.close()

        if dataset == 'train':
            self.labels = train_set[1]
            self.data = train_set[0]
        elif dataset == 'valid':
            self.labels = valid_set[1]
            self.data = valid_set[0]
        elif dataset == 'test':
            self.labels = test_set[1]
            self.data = test_set[0]
        print self.labels.shape

        self.images = np.zeros((bs, 60 * 60)).astype('float32')
        self.captions = np.zeros((bs, 12)).astype(int)
        self.mask = np.ones((bs, 12)).astype(int)
Example No. 10
def get_stream_raw(dataset, which_set, mini_batch_size):
    data = get_data(dataset)

    # dataset is a 3D array of shape: Time X Batch X Features
    dataset = data[which_set]
    time, batch, features = dataset.shape
    nb_mini_batches = batch / mini_batch_size
    dataset = dataset[:, :nb_mini_batches * mini_batch_size, :]

    # Create the target_dataset
    targets_dataset = dataset[1:, :, :]

    # Cut the dataset into several minibatches
    # dataset is now 4D (nb_mini_batches X Time X mini_batch_size X Features)
    dataset = numpy.swapaxes(dataset, 0, 1)
    targets_dataset = numpy.swapaxes(targets_dataset, 0, 1)
    dataset = numpy.reshape(dataset,
                            (nb_mini_batches, mini_batch_size, time, features))
    targets_dataset = numpy.reshape(
        targets_dataset,
        (nb_mini_batches, mini_batch_size, time - 1, features))
    dataset = numpy.swapaxes(dataset, 1, 2)
    targets_dataset = numpy.swapaxes(targets_dataset, 1, 2)

    # Create fuel dataset
    dataset = IndexableDataset({
        'features': dataset,
        'targets': targets_dataset
    })
    stream = DataStream(
        dataset, iteration_scheme=SequentialExampleScheme(nb_mini_batches))
    return stream
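
Because the minibatches are pre-built along the first axis, each request issued by SequentialExampleScheme(nb_mini_batches) returns one whole minibatch. A sketch of the resulting shapes, where the 'ptb' dataset name and the batch size are assumptions:

stream = get_stream_raw('ptb', 'train', mini_batch_size=32)
features, targets = next(stream.get_epoch_iterator())
# features: (time, 32, n_features); targets: (time - 1, 32, n_features)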
Example No. 11
    def test_one_hot_examples_invalid_inputs(self):
        wrapper = OneHotEncoding(DataStream(
            IndexableDataset(self.data),
            iteration_scheme=SequentialExampleScheme(4)),
                                 num_classes=2,
                                 which_sources=('targets', ))
        assert_raises(ValueError, list, wrapper.get_epoch_iterator())
Example No. 12
    def test(self, req_vars):
        prefix_stream = DataStream(self.test_dataset,
                                   iteration_scheme=SequentialExampleScheme(
                                       self.test_dataset.num_examples))

        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        if not data.tvt:
            prefix_stream = transformers.taxi_remove_test_only_clients(
                prefix_stream)

        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))
        prefix_stream = Padding(prefix_stream,
                                mask_sources=['latitude', 'longitude'])

        candidate_stream = self.candidate_stream(
            self.config.test_candidate_size, False)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)

        stream = transformers.Select(stream, tuple(req_vars))
        # stream = MultiProcessing(stream)

        return stream
Example No. 13
def test_progressbar_iter_per_epoch_indices():
    iter_per_epoch = 100
    progress_bar = ProgressBar()
    main_loop = setup_mainloop(
        None, iteration_scheme=SequentialExampleScheme(iter_per_epoch))
    progress_bar.main_loop = main_loop

    assert progress_bar.get_iter_per_epoch() == iter_per_epoch
Example No. 14
def get_dev_stream(valid_file, **kwargs):
    valid_data = cPickle.load(open(valid_file))
    images = [example[0] for example in valid_data]
    targets = [example[1] for example in valid_data]
    dataset = IndexableDataset(
        OrderedDict([('input', images), ('output', targets)]))
    return DataStream(dataset,
                      iteration_scheme=SequentialExampleScheme(len(images)))
Example No. 15
    def common_setup(self):
        ex_scheme = SequentialExampleScheme(self.dataset.num_examples)
        self.example_stream = DataStream(self.dataset,
                                         iteration_scheme=ex_scheme)
        self.batch_size = 2
        scheme = ShuffledScheme(self.dataset.num_examples,
                                batch_size=self.batch_size)
        self.batch_stream = DataStream(self.dataset, iteration_scheme=scheme)
Example No. 16
    def test_axis_labels_on_flatten_examples(self):
        wrapper = Flatten(
            DataStream(IndexableDataset(self.data),
                       iteration_scheme=SequentialExampleScheme(4),
                       axis_labels={'features': ('batch', 'width', 'height'),
                                    'targets': ('batch', 'index')}),
            which_sources=('features',))
        assert_equal(wrapper.axis_labels, {'features': ('feature',),
                                           'targets': ('index',)})
Example No. 17
    def __init__(self,
                 which_set,
                 filename='data.hdf5',
                 iteration_scheme=None,
                 **kwargs):
        dataset = TaxiDataset(which_set, filename, **kwargs)
        if iteration_scheme is None:
            iteration_scheme = SequentialExampleScheme(dataset.num_examples)
        super(TaxiStream, self).__init__(dataset,
                                         iteration_scheme=iteration_scheme)
Example No. 18
def setup_datastream(batch_size, **kwargs):
    ds = ToyDataset(**kwargs)
    stream = DataStream(ds,
                        iteration_scheme=SequentialExampleScheme(
                            kwargs['nb_examples']))

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream
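
Hypothetical usage of setup_datastream above, assuming ToyDataset accepts nb_examples and exposes 'input' and 'output' sources; Padding appends the corresponding mask sources.

ds, stream = setup_datastream(batch_size=16, nb_examples=100)
print(stream.sources)  # e.g. ('input', 'input_mask', 'output', 'output_mask')
batch = next(stream.get_epoch_iterator())  # each source padded to its longest item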
Example No. 19
    def test_one_hot_examples(self):
        wrapper = OneHotEncoding(DataStream(
            IndexableDataset(self.data),
            iteration_scheme=SequentialExampleScheme(4)),
                                 num_classes=4,
                                 which_sources=('targets', ))
        assert_equal(list(wrapper.get_epoch_iterator()),
                     [(numpy.ones((2, 2)), numpy.array([[1, 0, 0, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 1, 0, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 0, 1, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 0, 0, 1]]))])
Example No. 20
    def test_ignore_groups(self):
        stream_example = StructuredOneHotEncoding(DataStream(
            IndexableDataset(self.data),
            iteration_scheme=SequentialExampleScheme(4)),
                                                  num_classes=self.num_classes,
                                                  ignore_groups=[0, 2],
                                                  which_sources=('targets', ))

        assert_equal(list(stream_example.get_epoch_iterator()),
                     [(numpy.ones((2, 2)), numpy.array([[0, 1]])),
                      (numpy.ones((2, 2)), numpy.array([[1, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 1]])),
                      (numpy.ones((2, 2)), numpy.array([[1, 0]]))])

        stream_example2 = StructuredOneHotEncoding(
            DataStream(IndexableDataset(self.data),
                       iteration_scheme=SequentialExampleScheme(4)),
            num_classes=self.num_classes,
            ignore_groups=[1, 2],
            which_sources=('targets', ))

        assert_equal(list(stream_example2.get_epoch_iterator()),
                     [(numpy.ones((2, 2)), numpy.array([[1, 0, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 1, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 1, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 0, 1]]))])

        stream_batch = StructuredOneHotEncoding(DataStream(
            IndexableDataset(self.data),
            iteration_scheme=SequentialScheme(4, 2)),
                                                num_classes=self.num_classes,
                                                ignore_groups=[0, 2],
                                                which_sources=('targets', ))

        assert_equal(list(stream_batch.get_epoch_iterator()), [
            (numpy.ones((2, 2, 2)), numpy.array([[0, 1], [1, 0]])),
            (numpy.ones((2, 2, 2)), numpy.array([[0, 1], [1, 0]])),
        ])
Example No. 21
    def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
                   num_examples=None, rng=None, seed=None):

        dataset = self.get_dataset(part, add_sources=add_sources)
        if num_examples is None:
            num_examples = dataset.num_examples

        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

        stream = DataStream(
            dataset, iteration_scheme=iteration_scheme)

        stream = FilterSources(stream, (self.recordings_source,
                                        self.labels_source)+tuple(add_sources))
        if self.add_eos:
            stream = Mapping(stream, _AddLabel(self.eos_label))
        if self.add_bos:
            stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                               times=self.add_bos))
        if self.preprocess_text:
            stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
        stream = Filter(stream, self.length_filter)
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_length))
            stream = Unpack(stream)

        if self.preprocess_features == 'log_spectrogram':
            stream = Mapping(
                stream, functools.partial(apply_preprocessing,
                                          log_spectrogram))
        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Example No. 22
    def setUp(self):
        self.data = OrderedDict([('features', numpy.ones((4, 2, 2))),
                                 ('targets',
                                  numpy.array([[0, 1, 2], [1, 0, 1], [1, 1, 1],
                                               [2, 0, 0]]))])
        self.neg_data = OrderedDict([('features', numpy.ones((4, 2, 2))),
                                     ('targets',
                                      numpy.array([[0, -1, 2], [1, 0, -3],
                                                   [1, 1, 1], [2, 0, 0]]))])
        self.num_classes = (3, 2, 3)
        self.stream_example = StructuredOneHotEncoding(
            DataStream(IndexableDataset(self.data),
                       iteration_scheme=SequentialExampleScheme(4)),
            num_classes=self.num_classes,
            which_sources=('targets', ))
Example No. 23
def get_dataset_iterator(dataset,
                         split,
                         include_features=True,
                         include_targets=False,
                         unit_scale=True,
                         label_transforms=False,
                         return_length=False):
    """Get iterator for dataset, split, targets (labels) and scaling (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split, )

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    h5_dataset = H5PYDataset(dataset_fname, which_sets=splits, sources=sources)
    if unit_scale:
        h5_dataset.default_transformers = uint8_pixels_to_floatX(
            ('features', ))

    datastream = DataStream.default_stream(
        dataset=h5_dataset,
        iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples))

    if label_transforms:
        # TODO: maybe refactor this common bit with get_custom_streams below
        datastream = AddLabelUncertainty(datastream,
                                         chance=0,
                                         which_sources=('targets', ))

        datastream = RandomLabelStrip(datastream,
                                      chance=0,
                                      which_sources=('targets', ))

        # HACK: allow variable stretch
        datastream = StretchLabels(datastream,
                                   length=128,
                                   which_sources=('targets', ))

    it = datastream.get_epoch_iterator()
    if return_length:
        return it, h5_dataset.num_examples
    else:
        return it
Example No. 24
    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset) - (len(dataset) %
                                            (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                                   drop_prob_igates, layer_size, num_layers,
                                   False, stoch_depth, share_mask,
                                   gaussian_drop, alphabetsize)
        stream.sources = ('data', ) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream, )
Example No. 25
def stream_handwriting(
        which_sets,
        batch_size,
        seq_size,
        num_letters,
        sorting_mult=20):

    assert sorting_mult > 0

    dataset = Handwriting(which_sets)
    sorting_size = batch_size * sorting_mult
    num_examples = sorting_size * (dataset.num_examples / sorting_size)

    if which_sets == ('train',):
        print "Random order."
        scheme = ShuffledExampleScheme(num_examples)
    else:
        print "Sequential order."
        scheme = SequentialExampleScheme(num_examples)

    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)

    # Sort by length of the data sequence.
    data_stream = Batch(
        data_stream, iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(
        data_stream, iteration_scheme=ConstantScheme(batch_size))

    data_stream = Padding(data_stream)
    data_stream = SourceMapping(
        data_stream, _transpose, which_sources=('features', 'features_mask'))
    data_stream = SegmentSequence(
        data_stream,
        seq_size=seq_size + 1,
        share_value=True,
        return_last=True,
        which_sources=('features', 'features_mask'),
        add_flag=True)
    return data_stream
Example No. 26
    def _construct_sequential_stream(self, dataset, for_type='train'):
        '''Construct a sequential stream from an IndexableDataset object.

        Subclasses should add transformations to the stream, e.g.:
                1. Sort samples by size
                2. Batch the dataset
                3. Add masks to the samples
        :param dataset: fuel.datasets.IndexableDataset
                Constructed by the self._construct_dataset method.
        :return: fuel.streams.DataStream
                A sequential stream (SequentialExampleScheme) with basic
                transformations applied.
        '''
        it = SequentialExampleScheme(dataset.num_examples)
        stream = DataStream(dataset, iteration_scheme=it)
        # # Batch examples
        # stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
        # Add mask on inputs
        # for source in self.need_mask_sources.iteritems():
        #     stream = Padding(stream, mask_sources=[source[0]], mask_dtype=source[1])
        return stream
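
A minimal sketch of the transformations the docstring suggests a subclass could layer on top of the sequential stream; the batch size and mask sources here are assumptions, not part of the class above.

from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Padding

def add_batching_and_masks(stream, batch_size=32, mask_sources=('features',)):
    # Group single examples into fixed-size batches, then pad variable-length
    # sources and emit matching '<source>_mask' arrays.
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=list(mask_sources))
    return stream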
Example No. 27
    def __init__(self, indexables, start=None, stop=None, **kwargs):
        if isinstance(indexables, dict):
            self.provides_sources = tuple(indexables.keys())
        else:
            self.provides_sources = ('data',)
        super(IndexableDataset, self).__init__(**kwargs)
        if isinstance(indexables, dict):
            self.indexables = [indexables[source][start:stop]
                               for source in self.sources]
            if not all(len(indexable) == len(self.indexables[0])
                       for indexable in self.indexables):
                raise ValueError("sources have different lengths")
        else:
            self.indexables = [indexables]

        self.example_iteration_scheme = SequentialExampleScheme(
            self.num_examples)

        self.start = start
        self.stop = stop
        self.subset = Subset(slice(start, stop), self.num_examples)
Example No. 28
def get_stream_char(dataset,
                    which_set,
                    time_length,
                    mini_batch_size,
                    total_train_chars=None):
    data = get_data(dataset)

    # dataset is one long string containing the whole sequence of indexes
    dataset = data[which_set]
    if total_train_chars is None:
        total_train_chars = dataset.shape[0]

    nb_mini_batches = total_train_chars / (mini_batch_size * time_length)
    total_train_chars = nb_mini_batches * mini_batch_size * time_length

    dataset = dataset[:total_train_chars]

    dataset = dataset.reshape(mini_batch_size,
                              total_train_chars / mini_batch_size)
    dataset = dataset.T

    targets_dataset = dataset[1:, :]
    targets_dataset = numpy.concatenate(
        (targets_dataset, numpy.zeros(
            (1, mini_batch_size)).astype(numpy.int64)),
        axis=0)

    dataset = dataset.reshape(nb_mini_batches, time_length, mini_batch_size)
    targets_dataset = targets_dataset.reshape(nb_mini_batches, time_length,
                                              mini_batch_size)

    dataset = IndexableDataset({
        'features': dataset,
        'targets': targets_dataset
    })
    stream = DataStream(
        dataset, iteration_scheme=SequentialExampleScheme(nb_mini_batches))
    return stream
Example No. 29
def get_stream_char(dataset, which_set, time_length, mini_batch_size,
                    total_train_chars=None):
    data = get_data(dataset)
    dataset = data[which_set]
    if total_train_chars is None:
        total_train_chars = dataset.shape[0]

    nb_mini_batches = total_train_chars / (mini_batch_size * time_length)
    total_train_chars = nb_mini_batches * mini_batch_size * time_length

    dataset = dataset[:total_train_chars]

    dataset = dataset.reshape(
        mini_batch_size, total_train_chars / mini_batch_size)
    dataset = dataset.T

    targets_dataset = dataset[1:, :]
    targets_dataset = numpy.concatenate(
        (targets_dataset,
         numpy.zeros((1, mini_batch_size)).astype(numpy.int64)), axis=0)

    dataset = dataset.reshape(
        total_train_chars / (mini_batch_size * time_length),
        time_length, mini_batch_size)
    targets_dataset = targets_dataset.reshape(
        total_train_chars / (mini_batch_size * time_length),
        time_length, mini_batch_size)
    # print dataset.shape
    # print targets_dataset.shape
    dataset = IndexableDataset({'features': dataset,
                                'targets': targets_dataset})
    stream = DataStream(dataset,
                        iteration_scheme=SequentialExampleScheme(
                            nb_mini_batches))
    # stream = MakeRecurrent(time_length, stream)
    return stream, total_train_chars
Example No. 30
    def valid(self, req_vars):
        valid_dataset = TaxiDataset(self.config.valid_set, 'valid.hdf5')
        train_dataset = TaxiDataset('train')
        valid_trips_ids = valid_dataset.get_data(
            None, slice(0, valid_dataset.num_examples))[
                valid_dataset.sources.index('trip_id')]

        prefix_stream = DataStream(valid_dataset,
                                   iteration_scheme=SequentialExampleScheme(
                                       valid_dataset.num_examples))
        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        prefix_stream = transformers.taxi_add_first_last_len(
            prefix_stream, self.config.n_begin_end_pts)
        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))

        candidate_stream = DataStream(train_dataset,
                                      iteration_scheme=ShuffledExampleScheme(
                                          train_dataset.num_examples))
        candidate_stream = transformers.TaxiExcludeTrips(
            candidate_stream, valid_trips_ids)
        candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
        candidate_stream = transformers.taxi_add_datetime(candidate_stream)
        candidate_stream = transformers.taxi_add_first_last_len(
            candidate_stream, self.config.n_begin_end_pts)
        candidate_stream = Batch(candidate_stream,
                                 iteration_scheme=ConstantScheme(
                                     self.config.valid_candidate_size))

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)
        stream = transformers.Select(stream, tuple(req_vars))
        stream = MultiProcessing(stream)
        return stream