Example #2
def open_stream(which_sets=('train', ), port=5557, num_examples=None):

    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = dataset.num_examples

    data_stream = DataStream.default_stream(dataset,
                                            iteration_scheme=SequentialScheme(
                                                num_examples, batch_size))

    data_stream = ScaleAndShift(data_stream,
                                scale=1 / data_std,
                                shift=-data_mean / data_std)
    data_stream = Mapping(data_stream,
                          _downsample_and_upsample,
                          add_sources=('upsampled', ))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream,
                          _get_residual,
                          add_sources=('residual', ))
    data_stream = FilterSources(data_stream,
                                sources=(
                                    'upsampled',
                                    'residual',
                                ))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    data_stream = ForceFloatX(data_stream)

    start_server(data_stream, port=port)
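
Note: ``start_server`` only publishes the stream; a separate training process consumes it. A minimal client sketch (hedged, since the exact ``ServerDataStream`` constructor arguments vary across Fuel versions) might look like this:

from fuel.streams import ServerDataStream

# Sources match the FilterSources call above; produces_examples=False because
# the served stream yields batches rather than single examples.
data_stream = ServerDataStream(('upsampled', 'residual'),
                               produces_examples=False,
                               port=5557)
for batch in data_stream.get_epoch_iterator():
    pass  # feed the batch to the training loop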
Example #3
    def indexData(self):
        labCounts = graph_helper.getLabelCounts(
            self.G, self.trainNodes + self.validationNodes)
        trainXY, trainIDs = encode_data_VarLen(
            self.G,
            self.trainNodes,
            self.attrKey,
            self.maxNeighbors,
            usePrevWeights=self.usePrevWeights,
            useActualLabs=self.useActualLabs,
            onlyLabs=self.onlyLabs,
            useInputX2=self.useInputX2,
            labCounts=labCounts,
            dataAug=self.dataAug,
            pageRankOrder=self.pageRankOrder,
            usePro=self.usePro,
            lastH=self.lastHH,
            nodeIDs=True)
        validationXY, testIDs = encode_data_VarLen(
            self.G,
            self.validationNodes,
            self.attrKey,
            self.maxNeighbors,
            labCounts=labCounts,
            usePrevWeights=self.usePrevWeights,
            useActualLabs=self.useActualLabs,
            onlyLabs=self.onlyLabs,
            useInputX2=self.useInputX2,
            pageRankOrder=self.pageRankOrder,
            usePro=self.usePro,
            lastH=self.lastHH,
            nodeIDs=True)
        self.input_dimx1 = trainXY['x'][0].shape[1]
        if 'x2' in trainXY:
            self.input_dimx2 = trainXY['x2'].shape[1]

        dataset_train = IndexableDataset(trainXY)
        dataset_valid = IndexableDataset(validationXY)
        self.num_examples_train = dataset_train.num_examples
        self.num_examples_valid = dataset_valid.num_examples
        if self.usePro:
            transpose_stream = self.transpose_streamPro
        else:
            transpose_stream = self.transpose_stream

        self.stream_train = DataStream(dataset=dataset_train,
                                       iteration_scheme=ShuffledScheme(
                                           examples=dataset_train.num_examples,
                                           batch_size=self.batch_size))
        self.stream_train = Padding(self.stream_train, mask_sources=['x'])
        self.stream_train = Mapping(self.stream_train, transpose_stream)

        self.stream_valid = DataStream(dataset=dataset_valid,
                                       iteration_scheme=ShuffledScheme(
                                           examples=dataset_valid.num_examples,
                                           batch_size=self.batch_size))
        self.stream_valid = Padding(self.stream_valid, mask_sources=['x'])
        self.stream_valid = Mapping(self.stream_valid, transpose_stream)
Example #4
def get_data_stream(iterable):
    """Returns a 'fuel.Batch' datastream of
    [x~input~numbers, y~targets~roots], with each iteration returning a
    batch of 20 training examples
    """
    dataset = IterableDataset({'numbers': iterable})
    data_stream = Mapping(dataset.get_example_stream(),
                          _data_sqrt,
                          add_sources=('roots', ))
    data_stream = Mapping(data_stream, _array_tuple)
    return Batch(data_stream, ConstantScheme(20))
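
The two mapping helpers are not shown above. A plausible sketch, assuming ``_data_sqrt`` produces the square-root targets and ``_array_tuple`` casts every source to a numpy array (both names are taken from the example; the bodies are hypothetical):

import numpy

def _data_sqrt(data):
    # Hypothetical: the new 'roots' source is the square root of the input.
    return (numpy.sqrt(data[0]),)

def _array_tuple(data):
    # Hypothetical: cast every source to a float32 numpy array.
    return tuple(numpy.asarray(d, dtype='float32') for d in data)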
Example #5
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
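
The ``_too_long``, ``_oov_to_unk`` and ``_length`` helpers referenced above come from the blocks-examples machine translation stream module; roughly (a sketch, not verified against any particular version):

class _too_long(object):
    """Predicate: keep only pairs whose sentences all fit in seq_len."""
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all(len(sentence) <= self.seq_len
                   for sentence in sentence_pair)


class _oov_to_unk(object):
    """Replace out-of-vocabulary token ids with unk_id."""
    def __init__(self, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1):
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_pair):
        return ([x if x < self.src_vocab_size else self.unk_id
                 for x in sentence_pair[0]],
                [x if x < self.trg_vocab_size else self.unk_id
                 for x in sentence_pair[1]])


def _length(sentence_pair):
    # Sort key used by SortMapping: length of the target sentence.
    return len(sentence_pair[1])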
def test_mapping():
    data = [1, 2, 3]
    data_doubled = [2, 4, 6]
    stream = DataStream(IterableDataset(data))
    wrapper1 = Mapping(stream, lambda d: (2 * d[0], ))
    assert list(wrapper1.get_epoch_iterator()) == list(zip(data_doubled))
    wrapper2 = Mapping(stream,
                       lambda d: (2 * d[0], ),
                       add_sources=("doubled", ))
    assert wrapper2.sources == ("data", "doubled")
    assert list(wrapper2.get_epoch_iterator()) == list(zip(data, data_doubled))
def test_mapping_sort():
    data = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]
    data_sorted = [[1, 2, 3]] * 3
    data_sorted_rev = [[3, 2, 1]] * 3
    stream = DataStream(IterableDataset(data))
    wrapper1 = Mapping(stream, SortMapping(operator.itemgetter(0)))
    assert list(wrapper1.get_epoch_iterator()) == list(zip(data_sorted))
    wrapper2 = Mapping(stream, SortMapping(lambda x: -x[0]))
    assert list(wrapper2.get_epoch_iterator()) == list(zip(data_sorted_rev))
    wrapper3 = Mapping(stream, SortMapping(operator.itemgetter(0),
                                           reverse=True))
    assert list(wrapper3.get_epoch_iterator()) == list(zip(data_sorted_rev))
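
As the tests above illustrate, ``SortMapping(key)`` zips the sources of each example (or batch), sorts the resulting rows by ``key``, and unzips them again. A rough plain-Python equivalent (a sketch, not the library source):

def sort_mapping_equivalent(data, key, reverse=False):
    # 'data' is the tuple of parallel sources for one example or batch;
    # rows are sorted jointly by 'key' and the sources are rebuilt.
    rows = sorted(zip(*data), key=key, reverse=reverse)
    return tuple(list(column) for column in zip(*rows))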
Example #8
def get_sgnmt_tr_stream(src_data,
                        trg_data,
                        src_vocab_size=30000,
                        trg_vocab_size=30000,
                        unk_id=1,
                        seq_len=50,
                        batch_size=80,
                        sort_k_batches=12,
                        **kwargs):
    """Prepares the unshuffled training data stream. This corresponds 
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Example #9
def get_sgnmt_shuffled_tr_stream(src_data,
                                 trg_data,
                                 src_vocab_size=30000,
                                 trg_vocab_size=30000,
                                 unk_id=1,
                                 seq_len=50,
                                 batch_size=80,
                                 sort_k_batches=12,
                                 **kwargs):
    """Prepares the shuffled training data stream. This is similar to 
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data, src_vocab,
                                        trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def setup_squad_datastream(path, vocab_file, config):
    ds = SQuADDataset(path, vocab_file)
    it = SQuADIterator(path)
    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<DUMMY>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=[
                         'context', 'question', 'answer', 'ans_indices',
                         'ans_boundaries'
                     ],
                     mask_dtype='int32')

    return ds, stream
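
``_balanced_batch_helper`` (also used in several later examples) is a sort key that groups sequences of similar length into the same batch. A minimal sketch, assuming the stored key indexes one source of the batch (the name is from the example; the body is hypothetical):

class _balanced_batch_helper(object):
    def __init__(self, key):
        self.key = key

    def __call__(self, data):
        # Sort examples by the length of the chosen source.
        return len(data[self.key])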
Example #11
    def test_mapping_accepts_list_or_dict(self):
        def mapping(d):
            return [2 * i for i in d[0]],

        stream = DataStream(IterableDataset(self.data))
        assert_raises(ValueError,
                      lambda: Mapping(stream, mapping, mapping_accepts=int))
Example #12
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):

    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')

    # Merge source and target streams into (source, target) pairs
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=configuration['seq_len']))

    # No extra mapping step is needed here

    # Batch k batches at a time, sort them by length, then re-batch
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       configuration['batch_size'] *
                       configuration['sort_k_batches']))
    stream = Mapping(stream, SortMapping(_length))
    stream = Unpack(stream)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(configuration['batch_size']))

    # Pad. Note that </s> = 0 and Fuel's Padding pads with 0 by default.
    masked_stream = Padding(stream)

    return masked_stream
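
A hypothetical call, assuming ``svocab_dict``/``tvocab_dict`` map words to ids and the file paths exist; after ``Padding`` the stream yields (source, source_mask, target, target_mask) batches:

# svocab_dict / tvocab_dict: word -> id dictionaries (assumed to exist)
configuration = {'seq_len': 50, 'batch_size': 80, 'sort_k_batches': 12}
train_stream = get_train_stream(configuration,
                                ['data/train.src'], ['data/train.trg'],
                                svocab_dict, tvocab_dict)
for source, source_mask, target, target_mask in \
        train_stream.get_epoch_iterator():
    pass  # one padded, length-sorted batch per iteration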
Example #13
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000,
                  unk_id=0, eos_id=1, bos_id=2, train_noise=0,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    src_stream = get_stream(src_vocab, src_data, src_vocab_size,
                            unk_id, eos_id, bos_id, train_noise)
    trg_stream = get_stream(trg_vocab, trg_data, trg_vocab_size,
                            unk_id, eos_id, bos_id, 0)

    # Merge them to get a source, target pair
    stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_not_too_long(seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    return PaddingWithEOS(stream, [eos_id, eos_id])
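
``_not_too_long`` is not shown; a plausible sketch of the predicate (hypothetical, mirroring ``_too_long`` in Example #5):

class _not_too_long(object):
    def __init__(self, seq_len):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        # Keep the pair only if both sentences fit within seq_len.
        return all(len(sentence) <= self.seq_len
                   for sentence in sentence_pair)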
Example #14
    def train(self):
        print "Loading data"
        datafile = self.get_datafile()
        nbexamples = datafile.num_examples
        nbexamples -= nbexamples % (self.sequence_dim * self.time_dim)

        train_stream = ReshapeTransformer(
            DataStream(dataset=datafile,
                       iteration_scheme=ShuffledBatchChunkScheme(
                           nbexamples, self.sequence_dim * self.time_dim)),
            self.sequence_dim, self.time_dim)

        if self.image_size is not None:
            train_stream = Mapping(train_stream,
                                   spec_mapping,
                                   add_sources=['spectrogram'])

        print "Building Theano Graph"
        algorithm, self.fprop = self.build_theano_functions()

        main_loop = MainLoop(algorithm=algorithm,
                             data_stream=train_stream,
                             model=self.model,
                             extensions=[
                                 FinishAfter(after_n_epochs=EPOCHS),
                                 TrainingDataMonitoring(
                                     [aggregation.mean(self.model.outputs[0])],
                                     prefix="train",
                                     after_epoch=True),
                                 Printing(),
                                 SaveParams(EXP_PATH + NAME, after_epoch=True)
                             ])

        main_loop.run()
Example #15
def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    A = numpy.load(
        os.path.join(path,
                     ('valid_x_raw.npy' if valid else 'train_x_raw.npy')))
    B = numpy.load(
        os.path.join(path, ('valid_phn.npy' if valid else 'train_phn.npy')))
    C = numpy.load(
        os.path.join(
            path,
            ('valid_seq_to_phn.npy' if valid else 'train_seq_to_phn.npy')))

    D = [B[x[0]:x[1], 2] for x in C]

    ds = IndexableDataset({'input': A, 'output': D})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(A)))

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size,
                                                   num_examples=len(A)))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream
def setup_squad_ranker_datastream(path,
                                  vocab_file,
                                  config,
                                  example_count=1836975):
    ds = SQuADRankerDataset(path, vocab_file)
    it = ShuffledExampleScheme(examples=example_count)
    stream = DataStream(ds, iteration_scheme=it)

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('question'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=[
                         'question', 'answer', 'better', 'worse', 'b_left',
                         'b_right', 'w_left', 'w_right'
                     ],
                     mask_dtype='int32')

    return ds, stream
Example #17
def setup_datastream(path, vocab_file, config):
    ds = QADataset(path,
                   vocab_file,
                   config.n_entities,
                   need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)

    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index(
            'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    print('sources')
    print(stream.sources)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')

    print('sources2')
    print(stream.sources)

    return ds, stream
Example #18
def framewise_timit_datastream(path, which_set, batch_size, local_copy=False):
    # load frame-wise dataset
    timit_dataset = FramewiseTimit(which_set=which_set,
                                   path=path,
                                   local_copy=local_copy)

    # set shuffle range
    shuffle_rng = numpy.random.RandomState(123)

    # set iterator scheme
    iterator_scheme = SequentialShuffledScheme(
        num_examples=timit_dataset.num_examples,
        batch_size=batch_size,
        rng=shuffle_rng)

    # base data stream
    base_stream = DataStream(dataset=timit_dataset,
                             iteration_scheme=iterator_scheme)

    # reshape data stream data_source, shape_source
    reshape_stream = Reshape(data_source='features',
                             shape_source='features_shapes',
                             data_stream=base_stream,
                             iteration_scheme=iterator_scheme)

    # sort data stream
    sort_stream = Mapping(data_stream=reshape_stream,
                          mapping=SortMapping(key=lambda x: x[0].shape[0]))

    # padding data stream
    padded_stream = Padding(data_stream=sort_stream)

    return padded_stream
Example #19
    def train(self, req_vars):
        stream = TaxiDataset('train', data.traintest_ds)

        if hasattr(self.config, 'use_cuts_for_training') and self.config.use_cuts_for_training:
            stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
        else:
            stream = DataStream(stream, iteration_scheme=ShuffledExampleScheme(stream.num_examples))

        if not data.tvt:
            valid = TaxiDataset(data.valid_set, data.valid_ds, sources=('trip_id',))
            valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]
            stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)

        stream = transformers.TaxiGenerateSplits(stream, max_splits=self.config.max_splits)

        if hasattr(self.config, 'shuffle_batch_size'):
            stream = transformers.Batch(stream, iteration_scheme=ConstantScheme(self.config.shuffle_batch_size))
            stream = Mapping(stream, SortMapping(key=UniformGenerator()))
            stream = Unpack(stream)

        stream = transformers.taxi_add_datetime(stream)
        stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
        stream = transformers.Select(stream, tuple(req_vars))
        
        stream = Batch(stream, iteration_scheme=ConstantScheme(self.config.batch_size))

        stream = MultiProcessing(stream)

        return stream
def add_destination(stream):
    fun = _add_destination_helper(stream.sources.index('latitude'),
                                  stream.sources.index('longitude'))
    return Mapping(stream,
                   fun,
                   add_sources=('destination_latitude',
                                'destination_longitude'))
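
``_add_destination_helper`` is defined elsewhere in the taxi code; a plausible sketch, assuming the destination is simply the last point of the recorded GPS trace (the name is from the example; the body is hypothetical):

class _add_destination_helper(object):
    def __init__(self, lat_idx, lon_idx):
        self.lat_idx = lat_idx
        self.lon_idx = lon_idx

    def __call__(self, data):
        # New sources: latitude/longitude of the trip's final point.
        return (data[self.lat_idx][-1], data[self.lon_idx][-1])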
Example #21
    def test_add_sources(self):
        stream = DataStream(IterableDataset(self.data))
        transformer = Mapping(stream, lambda d: ([2 * i for i in d[0]],),
                              add_sources=('doubled',))
        assert_equal(transformer.sources, ('data', 'doubled'))
        assert_equal(list(transformer.get_epoch_iterator()),
                     list(zip(self.data, [[2, 4, 6], [4, 6, 2], [6, 4, 2]])))
Example #22
def get_log_prob_stream(cg, config):
    eid, did = p_(cg)
    dataset = config['log_prob_sets'][cg]

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocab = cPickle.load(open(config['src_vocabs'][eid]))
    src_vocab['<S>'] = 0
    src_vocab['</S>'] = config['src_eos_idxs'][eid]
    src_vocab['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocab = cPickle.load(open(config['trg_vocabs'][did]))
    trg_vocab['<S>'] = 0
    trg_vocab['</S>'] = config['trg_eos_idxs'][did]
    trg_vocab['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    logger.info('Building logprob stream for cg:[{}]'.format(cg))
    src_dataset = TextFile([dataset[0]], src_vocab, None)
    trg_dataset = TextFile([dataset[1]], trg_vocab, None)
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    stream = Mapping(
        stream,
        _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                    trg_vocab_size=config['trg_vocab_sizes'][did],
                    unk_id=config['unk_id']))
    bs = 100
    if 'log_prob_bs' in config:
        if isinstance(config['log_prob_bs'], dict):
            bs = config['log_prob_bs'][cg]
        else:
            bs = config['log_prob_bs']
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(bs,
                                                   num_examples=get_num_lines(
                                                       dataset[0])))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))

    return masked_stream
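
``_remapWordIdx`` is not shown above; a hedged sketch, assuming each (source index, old id, new id) triple rewrites token ids in one source of the padded batch (here, replacing padding zeros with the EOS index):

def _remapWordIdx(mappings):
    def remap(data):
        data = list(data)
        for source_idx, old_id, new_id in mappings:
            arr = data[source_idx]
            arr[arr == old_id] = new_id
        return tuple(data)
    return remap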
Example #23
File: data.py Project: tombosc/cpae
    def get_stream(self,
                   part,
                   batch_size=None,
                   max_length=None,
                   seed=None,
                   remove_keys=False,
                   add_bos_=True,
                   remove_n_identical_keys=True):
        dataset = self.get_dataset(part, max_length)
        if self._layout == 'lambada' and part == 'train':
            stream = DataStream(dataset,
                                iteration_scheme=RandomSpanScheme(
                                    dataset.num_examples, max_length, seed))
            stream = Mapping(stream, listify)
        else:
            stream = dataset.get_example_stream()

        if add_bos_:
            stream = SourcewiseMapping(stream,
                                       functools.partial(
                                           add_bos, Vocabulary.BOS),
                                       which_sources=('words',))
        if max_length is not None:
            stream = SourcewiseMapping(stream,
                                       functools.partial(
                                           cut_if_too_long, max_length),
                                       which_sources=('words',))
        stream = SourcewiseMapping(stream, vectorize,
                                   which_sources=('words',))
        stream = SourcewiseMapping(stream,
                                   word_to_singleton_list,
                                   which_sources=('keys',))
        stream = SourcewiseMapping(stream, vectorize,
                                   which_sources=('keys',))
        stream = Flatten(stream, which_sources=('keys',))

        if self._layout == 'dict':
            if remove_keys:
                stream = FilterSources(
                    stream,
                    [source for source in stream.sources if source != 'keys'])
            if remove_n_identical_keys:
                print "remove identical keys"
                stream = FilterSources(stream, [
                    source for source in stream.sources
                    if source != 'n_identical_keys'
                ])
        if not batch_size:
            return stream

        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

        stream = Padding(stream, mask_sources=('words',))
        #stream = Flatten(stream, which_sources=('n_identical_keys'))

        #if self._layout == 'dict':
        #    stream = FilterSources(stream, [source for source in stream.sources
        #                                    if source != 'keys_mask'])
        #    stream = FilterSources(stream, [source for source in stream.sources
        #                                    if source != 'n_identical_keys_mask'])
        return stream
def balanced_batch(stream, key, batch_size, batch_sort_size):
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   batch_sort_size))
    comparison = _balanced_batch_helper(stream.sources.index(key))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    return Batch(stream, iteration_scheme=ConstantScheme(batch_size))
def taxi_add_first_last_len(stream, k):
    fun = _taxi_add_first_last_len_helper(k, stream.sources.index('latitude'),
                                          stream.sources.index('longitude'))
    return Mapping(stream,
                   fun,
                   add_sources=('first_k_latitude', 'first_k_longitude',
                                'last_k_latitude', 'last_k_longitude',
                                'input_time'))
Example #26
def define_stream(which_sets=('train',),
                  initial_scale=1,
                  scale=0.5,
                  batch_size=64,
                  seq_length=64,
                  frame_size=128,
                  tbptt_flag=True,
                  num_examples=None):

    def _segment_axis(data):
        # Defined inside so that frame_size is available
        x = tuple([numpy.array([segment_axis(seq, frame_size, 0)
                                for seq in var])
                   for var in data])
        return x

    scale = float(scale)

    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = batch_size * (dataset.num_examples // batch_size)

    data_stream = DataStream.default_stream(
            dataset,
            iteration_scheme=SequentialScheme(num_examples, batch_size))

    data_stream = ScaleAndShift(data_stream,
                                scale=1/data_std,
                                shift=-data_mean/float(data_std))

    # Original sampling rate
    data_stream = Resample(data_stream, scale=initial_scale)
    data_stream = Mapping(data_stream, _copy, add_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=scale, which_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=1/scale, which_sources=('upsampled',))

    # data_stream = Mapping(data_stream, _downsample_and_upsample,
    #                       add_sources=('upsampled',))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream, _get_residual,
                          add_sources=('residual',))
    data_stream = FilterSources(data_stream,
                                sources=('upsampled', 'residual',))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    return data_stream
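
The ``_copy``, ``_get_residual`` and ``_transpose`` mappings used here (and in Example #2) are not shown; a hypothetical sketch of what they might do, consistent with the ``add_sources`` arguments above:

import numpy

def _copy(data):
    # Hypothetical: add an identical copy of the waveform batch, which the
    # Resample steps then turn into the 'upsampled' source.
    return (numpy.copy(data[0]),)

def _get_residual(data):
    # Hypothetical: the residual is the original signal minus its
    # down/up-sampled approximation.
    return (data[0] - data[1],)

def _transpose(data):
    # Hypothetical: swap batch and time axes so recurrent models see
    # (time, batch, features).
    return tuple(numpy.swapaxes(var, 0, 1) for var in data)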
Example #27
    def test_mapping_sort_multisource(self):
        data = OrderedDict([('x', self.data_x), ('y', self.data_y)])
        data_sorted = [([1, 2, 3], [6, 5, 4]), ([1, 2, 3], [4, 6, 5]),
                       ([1, 2, 3], [4, 5, 6])]
        stream = DataStream(IterableDataset(data))
        transformer = Mapping(stream,
                              mapping=SortMapping(operator.itemgetter(0)))
        assert_equal(list(transformer.get_epoch_iterator()), data_sorted)
Example #28
    def test_mapping_dict(self):
        def mapping(d):
            return {'data': [2 * i for i in d['data']]}

        stream = DataStream(IterableDataset(self.data))
        transformer = Mapping(stream, mapping, mapping_accepts=dict)
        assert_equal(list(transformer.get_epoch_iterator()),
                     list(zip([[2, 4, 6], [4, 6, 2], [6, 4, 2]])))
def test_mapping_sort_multisource():
    data = OrderedDict()
    data['x'] = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]
    data['y'] = [[6, 5, 4], [6, 5, 4], [6, 5, 4]]
    data_sorted = [([1, 2, 3], [6, 5, 4]), ([1, 2, 3], [4, 6, 5]),
                   ([1, 2, 3], [4, 5, 6])]
    stream = DataStream(IterableDataset(data))
    wrapper = Mapping(stream, mapping=SortMapping(operator.itemgetter(0)))
    assert list(wrapper.get_epoch_iterator()) == data_sorted
Example #30
def get_dev_stream_with_prefixes(val_set=None, val_set_grndtruth=None,
                                 src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000,
                                 unk_id=1, return_vocab=False, **kwargs):
    """Setup development set stream if necessary."""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict) else
            cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream()],
                           ('source', 'target'))

        # now add prefix and suffixes to this stream
        dev_stream = Mapping(
            dev_stream,
            PrefixSuffixStreamTransformer(
                sample_ratio=kwargs.get('dev_sample_ratio', 1.)),
            add_sources=('target_prefix', 'target_suffix'))

        dev_stream = Mapping(dev_stream,
                             CopySourceAndTargetToMatchPrefixes(dev_stream))

        # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
        dev_stream.produces_examples = False
        # flatten the stream back out into (source, target, target_prefix, target_suffix)
        dev_stream = Unpack(dev_stream)

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream