Example #1
 def test_two_sources(self):
     transformer = Padding(Batch(
         DataStream(
             IterableDataset(
                 dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
         ConstantScheme(2)))
     assert len(next(transformer.get_epoch_iterator())) == 4
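A minimal sketch of what the assertion above checks (assuming Fuel and NumPy are installed; the snippet is illustrative and not taken from any of the projects listed here): Padding appends a <source>_mask entry for every source it pads, so the two-source stream below ends up with four sources.

from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch, Padding

transformer = Padding(Batch(
    DataStream(IterableDataset(
        dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
    ConstantScheme(2)))
# With both sources padded this prints, in the dict's key order, something like:
# ('features', 'features_mask', 'targets', 'targets_mask')
print(transformer.sources)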
Example #2
 def test_two_sources(self):
     transformer = Padding(Batch(
         DataStream(
             IterableDataset(
                 dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
         ConstantScheme(2)))
     assert len(next(transformer.get_epoch_iterator())) == 4
Example #3
 def test_mask_dtype(self):
     transformer = Padding(
         Batch(
             DataStream(IterableDataset(dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))), ConstantScheme(2)
         ),
         mask_dtype="uint8",
     )
     assert_equal(str(next(transformer.get_epoch_iterator())[1].dtype), "uint8")
Example #4
    def indexData(self):
        labCounts = graph_helper.getLabelCounts(
            self.G, self.trainNodes + self.validationNodes)
        trainXY, trainIDs = encode_data_VarLen(
            self.G,
            self.trainNodes,
            self.attrKey,
            self.maxNeighbors,
            usePrevWeights=self.usePrevWeights,
            useActualLabs=self.useActualLabs,
            onlyLabs=self.onlyLabs,
            useInputX2=self.useInputX2,
            labCounts=labCounts,
            dataAug=self.dataAug,
            pageRankOrder=self.pageRankOrder,
            usePro=self.usePro,
            lastH=self.lastHH,
            nodeIDs=True)
        validationXY, testIDs = encode_data_VarLen(
            self.G,
            self.validationNodes,
            self.attrKey,
            self.maxNeighbors,
            labCounts=labCounts,
            usePrevWeights=self.usePrevWeights,
            useActualLabs=self.useActualLabs,
            onlyLabs=self.onlyLabs,
            useInputX2=self.useInputX2,
            pageRankOrder=self.pageRankOrder,
            usePro=self.usePro,
            lastH=self.lastHH,
            nodeIDs=True)
        self.input_dimx1 = trainXY['x'][0].shape[1]
        if 'x2' in trainXY:
            self.input_dimx2 = trainXY['x2'].shape[1]

        dataset_train = IndexableDataset(trainXY)
        dataset_valid = IndexableDataset(validationXY)
        self.num_examples_train = dataset_train.num_examples
        self.num_examples_valid = dataset_valid.num_examples
        if self.usePro:
            transpose_stream = self.transpose_streamPro
        else:
            transpose_stream = self.transpose_stream

        self.stream_train = DataStream(dataset=dataset_train,
                                       iteration_scheme=ShuffledScheme(
                                           examples=dataset_train.num_examples,
                                           batch_size=self.batch_size))
        self.stream_train = Padding(self.stream_train, mask_sources=['x'])
        self.stream_train = Mapping(self.stream_train, transpose_stream)

        self.stream_valid = DataStream(dataset=dataset_valid,
                                       iteration_scheme=ShuffledScheme(
                                           examples=dataset_valid.num_examples,
                                           batch_size=self.batch_size))
        self.stream_valid = Padding(self.stream_valid, mask_sources=['x'])
        self.stream_valid = Mapping(self.stream_valid, transpose_stream)
Example #5
 def test_mask_sources(self):
     transformer = Padding(Batch(
         DataStream(
             IterableDataset(
                 OrderedDict([('features', [[1], [2, 3]]),
                              ('targets', [[4, 5, 6], [7]])]))),
         ConstantScheme(2)),
         mask_sources=('features',))
     assert_equal(len(next(transformer.get_epoch_iterator())), 3)
Example #6
 def test_mask_dtype(self):
     transformer = Padding(Batch(
         DataStream(
             IterableDataset(
                 dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
         ConstantScheme(2)),
         mask_dtype='uint8')
     assert_equal(
         str(next(transformer.get_epoch_iterator())[1].dtype), 'uint8')
Example #7
 def test_mask_sources(self):
     transformer = Padding(Batch(
         DataStream(
             IterableDataset(
                 OrderedDict([('features', [[1], [2, 3]]),
                              ('targets', [[4, 5, 6], [7]])]))),
         ConstantScheme(2)),
         mask_sources=('features',))
     assert_equal(len(next(transformer.get_epoch_iterator())), 3)
Example #8
 def test_1d_sequences(self):
     stream = Batch(
         DataStream(IterableDataset([[1], [2, 3], [], [4, 5, 6], [7]])),
         ConstantScheme(2))
     transformer = Padding(stream)
     assert_equal(transformer.sources, ("data", "data_mask"))
     assert_equal(list(transformer.get_epoch_iterator()),
                  [(numpy.array([[1, 0], [2, 3]]),
                    numpy.array([[1, 0], [1, 1]])),
                   (numpy.array([[0, 0, 0], [4, 5, 6]]),
                    numpy.array([[0, 0, 0], [1, 1, 1]])),
                   (numpy.array([[7]]), numpy.array([[1]]))])
Example #9
 def test_1d_sequences(self):
     stream = Batch(
         DataStream(IterableDataset([[1], [2, 3], [], [4, 5, 6], [7]])),
         ConstantScheme(2))
     transformer = Padding(stream)
     assert_equal(transformer.sources, ("data", "data_mask"))
     assert_equal(list(transformer.get_epoch_iterator()),
                  [(numpy.array([[1, 0], [2, 3]]),
                    numpy.array([[1, 0], [1, 1]])),
                   (numpy.array([[0, 0, 0], [4, 5, 6]]),
                    numpy.array([[0, 0, 0], [1, 1, 1]])),
                   (numpy.array([[7]]), numpy.array([[1]]))])
Example #10
def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    # A: raw input features, B: frame-level phoneme data,
    # C: per-utterance [start, end) ranges into B.
    A = numpy.load(
        os.path.join(path,
                     ('valid_x_raw.npy' if valid else 'train_x_raw.npy')))
    B = numpy.load(
        os.path.join(path, ('valid_phn.npy' if valid else 'train_phn.npy')))
    C = numpy.load(
        os.path.join(
            path,
            ('valid_seq_to_phn.npy' if valid else 'train_seq_to_phn.npy')))

    # One output sequence per utterance: column 2 of the phoneme array.
    D = [B[x[0]:x[1], 2] for x in C]

    ds = IndexableDataset({'input': A, 'output': D})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(A)))

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size,
                                                   num_examples=len(A)))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream
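The Batch / SortMapping / Unpack / Batch / Padding chain above is a common way to reduce padding: read several batches' worth of examples, sort them by length, then re-batch so each padded batch holds sequences of similar length. Below is a minimal, self-contained sketch of that pattern; Fuel is assumed to be installed, and the toy sequences and batch sizes are made up for illustration.

from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch, Mapping, Padding, SortMapping, Unpack

sequences = [[1], [2, 3, 4, 5], [6, 7], [8, 9, 10], [11], [12, 13]]
stream = DataStream(IterableDataset(sequences))

# Collect three batches' worth of examples at once ...
stream = Batch(stream, iteration_scheme=ConstantScheme(2 * 3))
# ... sort them by sequence length ...
stream = Mapping(stream, SortMapping(key=lambda example: len(example[0])))
stream = Unpack(stream)
# ... then re-batch and pad, so each batch needs little padding.
stream = Batch(stream, iteration_scheme=ConstantScheme(2))
stream = Padding(stream)

for data, mask in stream.get_epoch_iterator():
    print(data.shape, mask.sum(axis=1))  # padded shape and true lengths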
Example #11
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):

	s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')
	t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')

	# Merge 
	stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))
	# Filter -- TODO 
	stream = Filter(stream, predicate=_too_long(seq_len=configuration['seq_len']))

	# Map - no need 

	# Batch - Sort 
	stream = Batch(stream, 
		iteration_scheme=ConstantScheme(
			configuration['batch_size']*configuration['sort_k_batches']))
	stream = Mapping(stream, SortMapping(_length))
	stream = Unpack(stream)
	stream = Batch(
        stream, iteration_scheme=ConstantScheme(configuration['batch_size']))

	# Pad 
	# Note that </s>=0. Fuel only allows padding 0 by default 
	masked_stream = Padding(stream)

	return masked_stream
Example #12
    def candidate_stream(self, n_candidates, sortmap=True):
        candidate_stream = DataStream(self.train_dataset,
                                      iteration_scheme=ShuffledExampleScheme(
                                          self.train_dataset.num_examples))
        if not data.tvt:
            candidate_stream = transformers.TaxiExcludeTrips(
                candidate_stream, self.valid_trips_ids)
        candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
        candidate_stream = transformers.taxi_add_datetime(candidate_stream)

        if not data.tvt:
            candidate_stream = transformers.add_destination(candidate_stream)

        if sortmap:
            candidate_stream = transformers.balanced_batch(
                candidate_stream,
                key='latitude',
                batch_size=n_candidates,
                batch_sort_size=self.config.batch_sort_size)
        else:
            candidate_stream = Batch(
                candidate_stream,
                iteration_scheme=ConstantScheme(n_candidates))

        candidate_stream = Padding(candidate_stream,
                                   mask_sources=['latitude', 'longitude'])

        return candidate_stream
Example #13
def setup_datastream(path, vocab_file, config):
    ds = QADataset(path,
                   vocab_file,
                   config.n_entities,
                   need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)

    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index(
            'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    print('sources')
    print(stream.sources)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')

    print('sources2')
    print(stream.sources)

    return ds, stream
Example #14
def framewise_timit_datastream(path, which_set, batch_size, local_copy=False):
    # load frame-wise dataset
    timit_dataset = FramewiseTimit(which_set=which_set,
                                   path=path,
                                   local_copy=local_copy)

    # set shuffle range
    shuffle_rng = numpy.random.RandomState(123)

    # set iterator scheme
    iterator_scheme = SequentialShuffledScheme(
        num_examples=timit_dataset.num_examples,
        batch_size=batch_size,
        rng=shuffle_rng)

    # base data stream
    base_stream = DataStream(dataset=timit_dataset,
                             iteration_scheme=iterator_scheme)

    # reshape data stream data_source, shape_source
    reshape_stream = Reshape(data_source='features',
                             shape_source='features_shapes',
                             data_stream=base_stream,
                             iteration_scheme=iterator_scheme)

    # sort data stream
    sort_stream = Mapping(data_stream=reshape_stream,
                          mapping=SortMapping(key=lambda x: x[0].shape[0]))

    # padding data stream
    padded_stream = Padding(data_stream=sort_stream)

    return padded_stream
Example #15
 def test_value_error_on_request(self):
     transformer = Padding(Batch(
         DataStream(
             IterableDataset(
                 dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
         ConstantScheme(2)))
     assert_raises(ValueError, transformer.get_data, [0, 1])
Example #16
    def valid(self, req_vars):
        prefix_stream = DataStream(self.valid_dataset,
                                   iteration_scheme=SequentialExampleScheme(
                                       self.valid_dataset.num_examples))

        #prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)

        prefix_stream = transformers.taxi_add_datetime(prefix_stream)

        prefix_stream = transformers.balanced_batch(
            prefix_stream,
            key='latitude',
            batch_size=self.config.batch_size,
            batch_sort_size=self.config.batch_sort_size)

        prefix_stream = Padding(prefix_stream,
                                mask_sources=['latitude', 'longitude'])

        candidate_stream = self.candidate_stream(
            self.config.valid_candidate_size)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)

        stream = transformers.Select(stream, tuple(req_vars))
        # stream = MultiProcessing(stream)

        return stream
Example #17
    def train(self, req_vars):
        valid = TaxiDataset(self.config.valid_set,
                            'valid.hdf5',
                            sources=('trip_id', ))
        valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]

        stream = TaxiDataset('train')
        stream = DataStream(stream,
                            iteration_scheme=ShuffledExampleScheme(
                                stream.num_examples))
        stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
        stream = transformers.TaxiExcludeEmptyTrips(stream)
        stream = transformers.taxi_add_datetime(stream)
        stream = transformers.add_destination(stream)
        stream = transformers.Select(
            stream, tuple(v for v in req_vars if not v.endswith('_mask')))

        stream = transformers.balanced_batch(
            stream,
            key='latitude',
            batch_size=self.config.batch_size,
            batch_sort_size=self.config.batch_sort_size)
        stream = Padding(stream, mask_sources=['latitude', 'longitude'])
        stream = transformers.Select(stream, req_vars)
        return stream
Example #18
    def test(self, req_vars):
        prefix_stream = DataStream(self.test_dataset,
                                   iteration_scheme=SequentialExampleScheme(
                                       self.test_dataset.num_examples))

        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        if not data.tvt:
            prefix_stream = transformers.taxi_remove_test_only_clients(
                prefix_stream)

        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))
        prefix_stream = Padding(prefix_stream,
                                mask_sources=['latitude', 'longitude'])

        candidate_stream = self.candidate_stream(
            self.config.test_candidate_size, False)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)

        stream = transformers.Select(stream, tuple(req_vars))
        # stream = MultiProcessing(stream)

        return stream
Example #19
def setup_squad_ranker_datastream(path,
                                  vocab_file,
                                  config,
                                  example_count=1836975):
    ds = SQuADRankerDataset(path, vocab_file)
    it = ShuffledExampleScheme(examples=example_count)
    stream = DataStream(ds, iteration_scheme=it)

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('question'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=[
                         'question', 'answer', 'better', 'worse', 'b_left',
                         'b_right', 'w_left', 'w_right'
                     ],
                     mask_dtype='int32')

    return ds, stream
Example #20
    def train(self, req_vars):
        prefix_stream = DataStream(self.train_dataset,
                                   iteration_scheme=ShuffledExampleScheme(
                                       self.train_dataset.num_examples))

        if not data.tvt:
            prefix_stream = transformers.TaxiExcludeTrips(
                prefix_stream, self.valid_trips_ids)
        prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)
        prefix_stream = transformers.TaxiGenerateSplits(
            prefix_stream, max_splits=self.config.max_splits)

        prefix_stream = transformers.taxi_add_datetime(prefix_stream)

        prefix_stream = transformers.balanced_batch(
            prefix_stream,
            key='latitude',
            batch_size=self.config.batch_size,
            batch_sort_size=self.config.batch_sort_size)

        prefix_stream = Padding(prefix_stream,
                                mask_sources=['latitude', 'longitude'])

        candidate_stream = self.candidate_stream(
            self.config.train_candidate_size)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)

        stream = transformers.Select(stream, tuple(req_vars))
        # stream = MultiProcessing(stream)
        return stream
Example #21
    def get_stream(self, part, batch_size, seed=None, raw_text=False):
        d = self.get_dataset(part)
        print("Dataset with {} examples".format(d.num_examples))
        it = ShuffledExampleScheme(d.num_examples,
                                   rng=numpy.random.RandomState(seed))
        stream = DataStream(d, iteration_scheme=it)
        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

        if self._retrieval:
            stream = FixedMapping(
                stream,
                functools.partial(retrieve_and_pad_snli, self._retrieval),
                add_sources=("defs", "def_mask", "sentence1_def_map",
                             "sentence2_def_map")
            )  # This is because there is a bug in Fuel :( Cannot concatenate tuple and list

        if not raw_text:
            stream = SourcewiseMapping(stream,
                                       functools.partial(digitize, self.vocab),
                                       which_sources=('sentence1',
                                                      'sentence2'))

        stream = Padding(
            stream,
            mask_sources=('sentence1',
                          'sentence2'))  # adds a *_mask source for each padded source

        return stream
Example #22
def get_datastream(path,
                   which_set,
                   batch_size=1,
                   norm_path=None,
                   use_ivectors=False,
                   truncate_ivectors=False,
                   ivector_dim=100,
                   shuffled=True):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    if shuffled:
        iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                         examples=wsj_dataset.num_examples)
    else:
        iterator_scheme = SequentialScheme(batch_size=batch_size,
                                           examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)

    if norm_path:
        data_mean_std = numpy.load(norm_path)
        base_stream = Normalize(data_stream=base_stream,
                                means=data_mean_std['mean'],
                                stds=data_mean_std['std'])

    if use_ivectors:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'ivectors', 'targets'])
        if truncate_ivectors:
            fs = TruncateTransformer(fs, 'ivectors', ivector_dim)
        # fs = ConcatenateTransformer(fs, ['features', 'ivectors'], 'features')
    else:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'targets'])
    return Padding(fs)
Example #23
def setup_squad_datastream(path, vocab_file, config):
    ds = SQuADDataset(path, vocab_file)
    it = SQuADIterator(path)
    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<DUMMY>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=[
                         'context', 'question', 'answer', 'ans_indices',
                         'ans_boundaries'
                     ],
                     mask_dtype='int32')

    return ds, stream
Example #24
File: data.py Project: tombosc/cpae
    def get_stream(self,
                   part,
                   batch_size=None,
                   max_length=None,
                   seed=None,
                   remove_keys=False,
                   add_bos_=True,
                   remove_n_identical_keys=True):
        dataset = self.get_dataset(part, max_length)
        if self._layout == 'lambada' and part == 'train':
            stream = DataStream(dataset,
                                iteration_scheme=RandomSpanScheme(
                                    dataset.num_examples, max_length, seed))
            stream = Mapping(stream, listify)
        else:
            stream = dataset.get_example_stream()

        if add_bos_:
            stream = SourcewiseMapping(stream,
                                       functools.partial(
                                           add_bos, Vocabulary.BOS),
                                       which_sources=('words',))
        if max_length is not None:
            stream = SourcewiseMapping(stream,
                                       functools.partial(
                                           cut_if_too_long, max_length),
                                       which_sources=('words',))
        stream = SourcewiseMapping(stream, vectorize, which_sources=('words',))
        stream = SourcewiseMapping(stream,
                                   word_to_singleton_list,
                                   which_sources=('keys',))
        stream = SourcewiseMapping(stream, vectorize, which_sources=('keys',))
        stream = Flatten(stream, which_sources=('keys',))

        if self._layout == 'dict':
            if remove_keys:
                stream = FilterSources(
                    stream,
                    [source for source in stream.sources if source != 'keys'])
            if remove_n_identical_keys:
                print "remove identical keys"
                stream = FilterSources(stream, [
                    source for source in stream.sources
                    if source != 'n_identical_keys'
                ])
        if not batch_size:
            return stream

        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

        stream = Padding(stream, mask_sources=('words',))
        #stream = Flatten(stream, which_sources=('n_identical_keys'))

        #if self._layout == 'dict':
        #    stream = FilterSources(stream, [source for source in stream.sources
        #                                    if source != 'keys_mask'])
        #    stream = FilterSources(stream, [source for source in stream.sources
        #                                    if source != 'n_identical_keys_mask'])
        return stream
Example #25
def get_test_stream(sfiles, svocab_dict): 
	dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')
	stream = Merge([dataset.get_example_stream(),], ('source', ))
	stream = Batch(
        stream, iteration_scheme=ConstantScheme(10))
	stream = Padding(stream)
	return stream
Example #26
def get_feat_stream(path, which_set='test_eval92', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    print(path, which_set)
    iterator_scheme = SequentialScheme(examples=wsj_dataset.num_examples, batch_size=batch_size)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['features'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
Example #27
def get_datastream(path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    print(path, which_set)
    iterator_scheme = ShuffledScheme(batch_size=batch_size, examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
Example #28
    def replaceTestData(self, testNodes, maxNeighbors=1000, maskNames=['x']):
        if self.batchesInferences:
            batch_size = self.batch_size
        else:
            batch_size = 1

        testing, testIDs = encode_data_VarLen(self.G,
                                              testNodes,
                                              self.attrKey,
                                              maxNeighbors,
                                              useActualLabs=self.useActualLabs,
                                              useInputX2=self.useInputX2,
                                              onlyLabs=self.onlyLabs,
                                              lastH=self.lastHH,
                                              nodeIDs=True)
        dataset_test = IndexableDataset(testing)
        self.stream_test = DataStream(dataset=dataset_test,
                                      iteration_scheme=SequentialScheme(
                                          examples=dataset_test.num_examples,
                                          batch_size=batch_size))
        # add masks; have to do them individually to avoid an "all dimensions must be equal" error
        # write our own padding transformer; theirs sucks ...
        self.stream_test = Padding(self.stream_test, mask_sources=maskNames)
        #transpose them for rnn input
        self.stream_test = Mapping(self.stream_test, self.transpose_streamTest)
        self.num_examples_test = dataset_test.num_examples

        #replace shareddata with test_all data
        self.test_all, names = self.iterateShared(self.stream_test,
                                                  makeShared=False,
                                                  name="test")

        #if we are doing test in batches
        if self.batchesInferences:
            for key in self.test_all:
                totalTestBatches = len(self.test_all[key])
                if key != 'nodeID':
                    for i in range(0, totalTestBatches):
                        #if test data has more batches, we add more to shared data list
                        #else we just reset
                        if i >= self.totalBatches:
                            newKey = key + '_myinput'
                            self.sharedData[key].append(
                                shared(self.test_all[key][i],
                                       name=self.sharedName + '_' + newKey +
                                       '_test_' + str(i)))
                        else:
                            self.sharedData[key][i].set_value(
                                self.test_all[key][i], borrow=True)

                    self.sharedBatch[key].set_value(
                        self.sharedData[key][0].get_value(borrow=True),
                        borrow=True)

            self.stream_test_int = IntStream(0, totalTestBatches, 1,
                                             'int_stream')
Example #29
def setup_datastream(batch_size, **kwargs):
    ds = ToyDataset(**kwargs)
    stream = DataStream(ds,
                        iteration_scheme=SequentialExampleScheme(
                            kwargs['nb_examples']))

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream
Example #30
    def valid(self, req_vars):
        stream = TaxiStream(self.config.valid_set, 'valid.hdf5')
        stream = transformers.taxi_add_datetime(stream)
        stream = transformers.add_destination(stream)
        stream = transformers.Select(
            stream, tuple(v for v in req_vars if not v.endswith('_mask')))

        stream = Batch(stream, iteration_scheme=ConstantScheme(1))
        stream = Padding(stream, mask_sources=['latitude', 'longitude'])
        stream = transformers.Select(stream, req_vars)
        return stream
Example #31
def _liacl_data_stream(dataset,
                       rel2index,
                       batch_size,
                       word2index,
                       target='negative_sampling',
                       name="",
                       k=3,
                       shuffle=False,
                       neg_sample_kwargs={}):
    batches_per_epoch = int(np.ceil(dataset.num_examples / float(batch_size)))
    if shuffle:
        iteration_scheme = ShuffledScheme(dataset.num_examples, batch_size)
    else:
        iteration_scheme = SequentialScheme(dataset.num_examples, batch_size)
    data_stream = DataStream(dataset, iteration_scheme=iteration_scheme)
    data_stream = NumberizeWords(data_stream,
                                 word2index,
                                 default=word2index[UNKNOWN_TOKEN],
                                 which_sources=('head', 'tail'))
    data_stream = NumberizeWords(data_stream, rel2index, which_sources=('rel',))

    if target == "score":
        data_stream = Rename(data_stream, {'score': 'target'})
    else:
        data_stream = FilterSources(data_stream,
                                    sources=('head', 'tail', 'rel'))

    data_stream = Padding(data_stream,
                          mask_sources=('head', 'tail'),
                          mask_dtype=np.float32)

    if target == 'negative_sampling':
        logger.info('target for data stream ' + str(name) +
                    ' is negative sampling')
        data_stream = NegativeSampling(data_stream, k=k)
    elif target == 'filtered_negative_sampling':
        logger.info('target for data stream ' + str(name) +
                    ' is filtered negative sampling')
        data_stream = FilteredNegativeSampling(data_stream,
                                               k=k,
                                               **neg_sample_kwargs)
    elif target == 'score':
        logger.info('target for data stream ' + str(name) + ' is score')
    else:
        raise NotImplementedError(
            'target ', target,
            ' must be one of "score" or "negative_sampling"')

    data_stream = MergeSource(data_stream,
                              merge_sources=('head', 'tail', 'head_mask',
                                             'tail_mask', 'rel'),
                              merge_name='input')

    return data_stream, batches_per_epoch
Example #32
    def test(self, req_vars):
        stream = TaxiStream('test')
        stream = transformers.taxi_add_datetime(stream)
        stream = transformers.taxi_remove_test_only_clients(stream)
        stream = transformers.Select(
            stream, tuple(v for v in req_vars if not v.endswith('_mask')))

        stream = Batch(stream, iteration_scheme=ConstantScheme(1))
        stream = Padding(stream, mask_sources=['latitude', 'longitude'])
        stream = transformers.Select(stream, req_vars)
        return stream
Example #33
def construct_stream(dataset, rng, pool_size, maximum_frames, window_features,
                     **kwargs):
    """Construct data stream.

    Parameters
    ----------
    dataset : Dataset
        Dataset to use.
    rng : numpy.random.RandomState
        Random number generator.
    pool_size : int
        Pool size for TIMIT dataset.
    maximum_frames : int
        Maximum frames for TIMIT dataset.
    subsample : bool, optional
        Subsample features.
    pretrain_alignment : bool, optional
        Use phoneme alignment for pretraining.
    uniform_alignment : bool, optional
        Use uniform alignment for pretraining.

    """
    kwargs.setdefault('subsample', False)
    kwargs.setdefault('pretrain_alignment', False)
    kwargs.setdefault('uniform_alignment', False)
    stream = DataStream(dataset,
                        iteration_scheme=SequentialShuffledScheme(
                            dataset.num_examples, pool_size, rng))
    if kwargs['pretrain_alignment'] and kwargs['uniform_alignment']:
        stream = AddUniformAlignmentMask(stream)
    stream = Reshape('features', 'features_shapes', data_stream=stream)
    means, stds = dataset.get_normalization_factors()
    stream = Normalize(stream, means, stds)
    if not window_features == 1:
        stream = WindowFeatures(stream, 'features', window_features)
    if kwargs['pretrain_alignment']:
        stream = Reshape('alignments', 'alignments_shapes', data_stream=stream)
    stream = Mapping(stream, SortMapping(key=key))
    stream = MaximumFrameCache(max_frames=maximum_frames,
                               data_stream=stream,
                               rng=rng)
    stream = Padding(data_stream=stream, mask_sources=['features', 'phonemes'])
    if kwargs['pretrain_alignment']:
        stream = AlignmentPadding(stream, 'alignments')
        stream = Transpose(stream, [(1, 0, 2), (1, 0), (1, 0), (1, 0),
                                    (2, 1, 0)])
    else:
        stream = Transpose(stream, [(1, 0, 2), (1, 0), (1, 0), (1, 0)])

    stream = ForceFloatX(stream)
    if kwargs['subsample']:
        stream = Subsample(stream, 'features', 5)
        stream = Subsample(stream, 'features_mask', 5)
    return stream
Example #34
 def test_2d_sequences(self):
     stream = Batch(
         DataStream(
             IterableDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 4))])),
         ConstantScheme(2))
     it = Padding(stream).get_epoch_iterator()
     data, mask = next(it)
     assert data.shape == (2, 3, 4)
     assert (data[0, :, :] == 1).all()
     assert (data[1, :2, :] == 2).all()
     assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()
Example #35
def test_padding():
    # 1-D sequences
    stream = Batch(
        DataStream(
            IterableDataset([[1], [2, 3], [], [4, 5, 6], [7]])),
        ConstantScheme(2))
    mask_stream = Padding(stream)
    assert mask_stream.sources == ("data", "data_mask")
    it = mask_stream.get_epoch_iterator()
    data, mask = next(it)
    assert (data == numpy.array([[1, 0], [2, 3]])).all()
    assert (mask == numpy.array([[1, 0], [1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[0, 0, 0], [4, 5, 6]])).all()
    assert (mask == numpy.array([[0, 0, 0], [1, 1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[7]])).all()
    assert (mask == numpy.array([[1]])).all()

    # 2D sequences
    stream2 = Batch(
        DataStream(
            IterableDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 4))])),
        ConstantScheme(2))
    it = Padding(stream2).get_epoch_iterator()
    data, mask = next(it)
    assert data.shape == (2, 3, 4)
    assert (data[0, :, :] == 1).all()
    assert (data[1, :2, :] == 2).all()
    assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()

    # 2 sources
    stream3 = Padding(Batch(
        DataStream(
            IterableDataset(
                dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    assert len(next(stream3.get_epoch_iterator())) == 4
Example #36
batch_size = 20
frame_size = 3
k = 20
target_size = frame_size * k

hidden_size_recurrent = 400
readout_size = 6 * k + 1

lr = 3e-4

dataset = Handwriting(("train",))
data_stream = DataStream.default_stream(dataset, iteration_scheme=SequentialScheme(dataset.num_examples, batch_size))

# data_stream = FilterSources(data_stream,
#                          sources = ('features',))
data_stream = Padding(data_stream)
data_stream = Mapping(data_stream, _transpose)
# data_stream = ForceFloatX(data_stream)

dataset = Handwriting(("valid",))
valid_stream = DataStream.default_stream(
    dataset, iteration_scheme=SequentialScheme(dataset.num_examples, 10 * batch_size)
)
valid_stream = Padding(valid_stream)
valid_stream = Mapping(valid_stream, _transpose)

x_tr = next(data_stream.get_epoch_iterator())

x = tensor.tensor3("features")
x_mask = tensor.matrix("features_mask")
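The script above stops at defining the symbolic inputs. A typical next step, sketched here rather than taken from the original project, is to use features_mask to ignore padded time steps when averaging a per-step cost; step_cost below is a placeholder assumed to have shape (time, batch), matching x_mask after _transpose.

# Placeholder per-time-step cost; only the masking pattern matters here.
step_cost = ((x - x.mean(axis=0)) ** 2).sum(axis=2)
# Zero out padded steps, then average over the real ones only.
masked_cost = (step_cost * x_mask).sum() / x_mask.sum()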