Code example #1
 def test_strictness_2(self):
     stream = DataStream(IterableDataset([1, 2, 3, 4, 5, 6]))
     transformer = Batch(stream, ConstantScheme(2), strictness=2)
     assert_equal(
         list(transformer.get_epoch_iterator()),
         [(numpy.array([1, 2]),), (numpy.array([3, 4]),), (numpy.array([5, 6]),)],
     )
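For reference, a minimal sketch (not taken from any of the projects listed here) contrasting the three `strictness` levels of `Batch` on a five-example stream: 0 returns the smaller final batch, 1 silently drops it, and 2 raises `ValueError` unless the examples divide evenly into batches.

from fuel.datasets import IterableDataset
from fuel.streams import DataStream
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch

def batches(strictness):
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    return list(Batch(stream, ConstantScheme(2),
                      strictness=strictness).get_epoch_iterator())

print(batches(0))  # three batches; the last one only holds [5]
print(batches(1))  # two batches; the incomplete one is dropped
try:
    batches(2)
except ValueError:
    print('strictness=2 refuses to emit an incomplete batch')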
Code example #2
 def setUp(self):
     data = range(10)
     self.stream = Batch(
         DataStream(IterableDataset(data)),
         iteration_scheme=ConstantScheme(2))
     data_np = numpy.arange(10)
     self.stream_np = Batch(
         DataStream(IterableDataset(data_np)),
         iteration_scheme=ConstantScheme(2))
Code example #3
File: stream.py Project: yylong711/TA-Seq2Seq
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
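The Batch / SortMapping / Unpack / Batch chain above is the usual length-bucketing trick: read `sort_k_batches` batches ahead, sort that chunk by length so sentences of similar size land in the same minibatch, then re-batch at the real batch size and pad. Below is a self-contained sketch of the same chain on toy integer sequences (no `TextFile`, vocabularies or EOS handling; plain `Padding` stands in for `PaddingWithEOS`, and, like the examples here, it assumes the older NumPy behaviour Fuel targets, where ragged batches become object arrays).

from fuel.datasets import IterableDataset
from fuel.streams import DataStream
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Mapping, Padding, SortMapping, Unpack

# Toy variable-length sequences stand in for tokenized sentences.
sequences = [[1] * n for n in (5, 2, 9, 3, 7, 1, 4, 8)]
stream = DataStream(IterableDataset(sequences))

batch_size = 2
sort_k_batches = 2

# Read k batches ahead ...
stream = Batch(stream,
               iteration_scheme=ConstantScheme(batch_size * sort_k_batches))
# ... sort each read-ahead chunk by sequence length ...
stream = Mapping(stream, SortMapping(lambda example: len(example[0])))
# ... go back to single examples and re-batch at the real batch size ...
stream = Unpack(stream)
stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
# ... and pad, which adds a mask source for each padded source.
stream = Padding(stream)

for data, mask in stream.get_epoch_iterator():
    # Sequences of similar length end up in the same padded batch.
    print(data.shape, mask.sum(axis=1))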
Code example #4
File: train.py Project: chagge/sgnmt
def get_sgnmt_tr_stream(src_data,
                        trg_data,
                        src_vocab_size=30000,
                        trg_vocab_size=30000,
                        unk_id=1,
                        seq_len=50,
                        batch_size=80,
                        sort_k_batches=12,
                        **kwargs):
    """Prepares the unshuffled training data stream. This corresponds 
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Code example #5
 def setUp(self):
     self.streams = (
         DataStream(IterableDataset(['Hello world!'])),
         DataStream(IterableDataset(['Bonjour le monde!'])))
     self.batch_streams = (
         Batch(DataStream(IterableDataset(['Hello world!', 'Hi!'])),
               iteration_scheme=ConstantScheme(2)),
         Batch(DataStream(IterableDataset(['Bonjour le monde!', 'Salut!'])),
               iteration_scheme=ConstantScheme(2)))
     self.transformer = Merge(
         self.streams, ('english', 'french'))
     self.batch_transformer = Merge(
         self.batch_streams, ('english', 'french'))
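For reference, iterating over such a `Merge` yields one tuple per step with one element per child stream, in the order the source names were given; a minimal standalone sketch:

from fuel.datasets import IterableDataset
from fuel.streams import DataStream
from fuel.transformers import Merge

merged = Merge(
    (DataStream(IterableDataset(['Hello world!'])),
     DataStream(IterableDataset(['Bonjour le monde!']))),
    ('english', 'french'))

assert merged.sources == ('english', 'french')
assert next(merged.get_epoch_iterator()) == ('Hello world!', 'Bonjour le monde!')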
Code example #6
    def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
                   num_examples=None, rng=None, seed=None):

        dataset = self.get_dataset(part, add_sources=add_sources)
        if num_examples is None:
            num_examples = dataset.num_examples

        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

        stream = DataStream(
            dataset, iteration_scheme=iteration_scheme)

        stream = FilterSources(stream, (self.recordings_source,
                                        self.labels_source)+tuple(add_sources))
        if self.add_eos:
            stream = Mapping(stream, _AddLabel(self.eos_label))
        if self.add_bos:
            stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                               times=self.add_bos))
        if self.preprocess_text:
            stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
        stream = Filter(stream, self.length_filter)
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_length))
            stream = Unpack(stream)

        if self.preprocess_features == 'log_spectrogram':
            stream = Mapping(
                stream, functools.partial(apply_preprocessing,
                                          log_spectrogram))
        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Code example #7
File: train.py Project: chagge/sgnmt
def get_sgnmt_shuffled_tr_stream(src_data,
                                 trg_data,
                                 src_vocab_size=30000,
                                 trg_vocab_size=30000,
                                 unk_id=1,
                                 seq_len=50,
                                 batch_size=80,
                                 sort_k_batches=12,
                                 **kwargs):
    """Prepares the shuffled training data stream. This is similar to 
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data, src_vocab,
                                        trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Code example #8
File: data.py Project: vyraun/rnn_reader
def create_data_generator(path, vocab_file, config):
    ds = QADataset(path,
                   vocab_file,
                   config.n_entities,
                   need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)

    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index(
            'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')

    def gen():

        if not config.concat_ctx_and_question:
            for (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask, tg,
                 candidates, candidates_mask) in stream.get_epoch_iterator():
                seq_cont_mask = seq_cont_mask.astype('float32')
                seq_quest_mask = seq_quest_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')

                yield (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask, tg,
                       candidates, candidates_mask)
        else:

            for (seq, seq_mask, tg, candidates, candidates_mask) \
                    in stream.get_epoch_iterator():
                seq_mask = seq_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')

                yield (seq, seq_mask, tg, candidates, candidates_mask)

    return gen
Code example #9
File: data.py Project: arianhosseini/point-and-sort
def setup_sorter_datastream(path, config):
    ds = SorterDataset(path)
    it = ShuffledExampleScheme(examples=config.example_count)
    stream = DataStream(ds, iteration_scheme=it)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('unsorted'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['answer', 'unsorted'],
                     mask_dtype='int32')
    return ds, stream
Code example #10
    def train(self, req_vars):
        valid = TaxiDataset(self.config.valid_set,
                            'valid.hdf5',
                            sources=('trip_id', ))
        valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]

        stream = TaxiDataset('train')

        if hasattr(
                self.config,
                'use_cuts_for_training') and self.config.use_cuts_for_training:
            stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
        else:
            stream = DataStream(stream,
                                iteration_scheme=ShuffledExampleScheme(
                                    stream.num_examples))

        stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
        stream = transformers.TaxiGenerateSplits(
            stream, max_splits=self.config.max_splits)

        stream = transformers.taxi_add_datetime(stream)
        # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
        stream = transformers.Select(stream, tuple(req_vars))

        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(self.config.batch_size))

        stream = MultiProcessing(stream)

        return stream
Code example #11
    def valid(self, req_vars):
        stream = TaxiStream(self.config.valid_set, 'valid.hdf5')

        stream = transformers.taxi_add_datetime(stream)
        # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
        stream = transformers.Select(stream, tuple(req_vars))
        return Batch(stream, iteration_scheme=ConstantScheme(1000))
Code example #12
    def get_stream(self, part, batch_size, seed=None, raw_text=False):
        d = self.get_dataset(part)
        print("Dataset with {} examples".format(d.num_examples))
        it = ShuffledExampleScheme(d.num_examples,
                                   rng=numpy.random.RandomState(seed))
        stream = DataStream(d, iteration_scheme=it)
        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

        if self._retrieval:
            stream = FixedMapping(
                stream,
                functools.partial(retrieve_and_pad_snli, self._retrieval),
                add_sources=("defs", "def_mask", "sentence1_def_map",
                             "sentence2_def_map")
        )  # This is because there is a bug in Fuel: it cannot concatenate a tuple and a list

        if not raw_text:
            stream = SourcewiseMapping(stream,
                                       functools.partial(digitize, self.vocab),
                                       which_sources=('sentence1',
                                                      'sentence2'))

        stream = Padding(
            stream,
            mask_sources=('sentence1',
                          'sentence2'))  # Increases amount of outputs by x2

        return stream
Code example #13
    def test(self, req_vars):
        prefix_stream = DataStream(self.test_dataset,
                                   iteration_scheme=SequentialExampleScheme(
                                       self.test_dataset.num_examples))

        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        if not data.tvt:
            prefix_stream = transformers.taxi_remove_test_only_clients(
                prefix_stream)

        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))
        prefix_stream = Padding(prefix_stream,
                                mask_sources=['latitude', 'longitude'])

        candidate_stream = self.candidate_stream(
            self.config.test_candidate_size, False)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)

        stream = transformers.Select(stream, tuple(req_vars))
        # stream = MultiProcessing(stream)

        return stream
Code example #14
 def test_two_sources(self):
     transformer = Padding(Batch(
         DataStream(
             IterableDataset(
                 dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
         ConstantScheme(2)))
     assert len(next(transformer.get_epoch_iterator())) == 4
Code example #15
def _get_align_stream(src_data, trg_data, src_vocab_size, trg_vocab_size,
                      seq_len, **kwargs):
    """Creates the stream which is used for the main loop.
    
    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT
                              model
        trg_vocab_size (int): Size of the target vocabulary in the NMT
                              model
        seq_len (int): Maximum length of any source or target sentence
    
    Returns:
        ExplicitNext. Alignment data stream which can be iterated
        explicitly
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})
    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)
    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))
    s = Batch(s, iteration_scheme=ConstantScheme(1))
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(masked_stream)
Code example #16
 def test_adds_batch_to_axis_labels(self):
     stream = DataStream(
         IterableDataset(
             {'features': [1, 2, 3, 4, 5]},
             axis_labels={'features': ('index',)}))
     transformer = Batch(stream, ConstantScheme(2), strictness=0)
     assert_equal(transformer.axis_labels, {'features': ('batch', 'index')})
Code example #17
    def candidate_stream(self, n_candidates, sortmap=True):
        candidate_stream = DataStream(self.train_dataset,
                                      iteration_scheme=ShuffledExampleScheme(
                                          self.train_dataset.num_examples))
        if not data.tvt:
            candidate_stream = transformers.TaxiExcludeTrips(
                candidate_stream, self.valid_trips_ids)
        candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
        candidate_stream = transformers.taxi_add_datetime(candidate_stream)

        if not data.tvt:
            candidate_stream = transformers.add_destination(candidate_stream)

        if sortmap:
            candidate_stream = transformers.balanced_batch(
                candidate_stream,
                key='latitude',
                batch_size=n_candidates,
                batch_sort_size=self.config.batch_sort_size)
        else:
            candidate_stream = Batch(
                candidate_stream,
                iteration_scheme=ConstantScheme(n_candidates))

        candidate_stream = Padding(candidate_stream,
                                   mask_sources=['latitude', 'longitude'])

        return candidate_stream
Code example #18
File: sqrt.py Project: raphael-forks/blocks
def get_data_stream(iterable):
    dataset = IterableDataset({'numbers': iterable})
    data_stream = Mapping(dataset.get_example_stream(),
                          _data_sqrt,
                          add_sources=('roots', ))
    data_stream = Mapping(data_stream, _array_tuple)
    return Batch(data_stream, ConstantScheme(20))
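The helpers `_data_sqrt` and `_array_tuple` are defined elsewhere in sqrt.py and not shown above. A self-contained approximation is sketched below; the two helper bodies are assumptions that merely make the pipeline runnable, not the project's actual code.

import numpy
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Mapping

def _data_sqrt(data):
    # `data` is the ('numbers',) example tuple; the returned tuple is
    # appended as the extra 'roots' source (assumed implementation).
    return (numpy.sqrt(data[0]),)

def _array_tuple(data):
    # Cast every source of the example to a numpy array (assumed implementation).
    return tuple(numpy.asarray(d) for d in data)

def get_data_stream(iterable):
    dataset = IterableDataset({'numbers': iterable})
    data_stream = Mapping(dataset.get_example_stream(),
                          _data_sqrt,
                          add_sources=('roots',))
    data_stream = Mapping(data_stream, _array_tuple)
    return Batch(data_stream, ConstantScheme(20))

numbers, roots = next(get_data_stream(range(100)).get_epoch_iterator())
assert numbers.shape == roots.shape == (20,)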
Code example #19
    def test(self, req_vars):
        prefix_stream = DataStream(self.test_dataset,
                                   iteration_scheme=SequentialExampleScheme(
                                       self.test_dataset.num_examples))
        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        prefix_stream = transformers.taxi_add_first_last_len(
            prefix_stream, self.config.n_begin_end_pts)

        if not data.tvt:
            prefix_stream = transformers.taxi_remove_test_only_clients(
                prefix_stream)

        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))

        candidate_stream = self.candidate_stream(
            self.config.test_candidate_size)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)
        stream = transformers.Select(stream, tuple(req_vars))
        stream = MultiProcessing(stream)
        return stream
Code example #20
    def train(self, req_vars):
        prefix_stream = DataStream(self.train_dataset,
                                   iteration_scheme=ShuffledExampleScheme(
                                       self.train_dataset.num_examples))

        if not data.tvt:
            prefix_stream = transformers.TaxiExcludeTrips(
                prefix_stream, self.valid_trips_ids)
        prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)
        prefix_stream = transformers.TaxiGenerateSplits(
            prefix_stream, max_splits=self.config.max_splits)
        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        prefix_stream = transformers.taxi_add_first_last_len(
            prefix_stream, self.config.n_begin_end_pts)
        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))

        candidate_stream = self.candidate_stream(
            self.config.train_candidate_size)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)
        stream = transformers.Select(stream, tuple(req_vars))
        stream = MultiProcessing(stream)
        return stream
Code example #21
 def test_value_error_on_request(self):
     transformer = Padding(Batch(
         DataStream(
             IterableDataset(
                 dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
         ConstantScheme(2)))
     assert_raises(ValueError, transformer.get_data, [0, 1])
Code example #22
def test_cache():
    dataset = IterableDataset(range(100))
    stream = DataStream(dataset)
    batched_stream = Batch(stream, ConstantScheme(11))
    cached_stream = Cache(batched_stream, ConstantScheme(7))
    epoch = cached_stream.get_epoch_iterator()

    # Make sure that cache is filled as expected
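    # (Batch pulls chunks of 11 from the child stream while Cache serves 7 at a
    # time, so the number of examples left over in the cache cycles through
    # 4, 8, 1, 5, 9, ... as asserted below.)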
    for (features, ), cache_size in zip(epoch,
                                        [4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 0, 4]):
        assert len(cached_stream.cache[0]) == cache_size

    # Make sure that the epoch finishes correctly
    for (features, ) in cached_stream.get_epoch_iterator():
        pass
    assert len(features) == 100 % 7
    assert not cached_stream.cache[0]

    # Ensure that the epoch transition is correct
    cached_stream = Cache(batched_stream, ConstantScheme(7, times=3))
    for _, epoch in zip(range(2), cached_stream.iterate_epochs()):
        cache_sizes = [4, 8, 1]
        for i, (features, ) in enumerate(epoch):
            assert len(cached_stream.cache[0]) == cache_sizes[i]
            assert len(features) == 7
            assert numpy.all(list(range(100))[i * 7:(i + 1) * 7] == features)
        assert i == 2
Code example #23
File: data.py Project: tombosc/cpae
    def get_stream(self,
                   part,
                   batch_size=None,
                   max_length=None,
                   seed=None,
                   remove_keys=False,
                   add_bos_=True,
                   remove_n_identical_keys=True):
        dataset = self.get_dataset(part, max_length)
        if self._layout == 'lambada' and part == 'train':
            stream = DataStream(dataset,
                                iteration_scheme=RandomSpanScheme(
                                    dataset.num_examples, max_length, seed))
            stream = Mapping(stream, listify)
        else:
            stream = dataset.get_example_stream()

        if add_bos_:
            stream = SourcewiseMapping(stream,
                                       functools.partial(
                                           add_bos, Vocabulary.BOS),
                                       which_sources=('words',))
        if max_length is not None:
            stream = SourcewiseMapping(stream,
                                       functools.partial(
                                           cut_if_too_long, max_length),
                                       which_sources=('words',))
        stream = SourcewiseMapping(stream, vectorize,
                                   which_sources=('words',))
        stream = SourcewiseMapping(stream,
                                   word_to_singleton_list,
                                   which_sources=('keys',))
        stream = SourcewiseMapping(stream, vectorize, which_sources=('keys',))
        stream = Flatten(stream, which_sources=('keys',))

        if self._layout == 'dict':
            if remove_keys:
                stream = FilterSources(
                    stream,
                    [source for source in stream.sources if source != 'keys'])
            if remove_n_identical_keys:
                print "remove identical keys"
                stream = FilterSources(stream, [
                    source for source in stream.sources
                    if source != 'n_identical_keys'
                ])
        if not batch_size:
            return stream

        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

        stream = Padding(stream, mask_sources=('words',))
        #stream = Flatten(stream, which_sources=('n_identical_keys'))

        #if self._layout == 'dict':
        #    stream = FilterSources(stream, [source for source in stream.sources
        #                                    if source != 'keys_mask'])
        #    stream = FilterSources(stream, [source for source in stream.sources
        #                                    if source != 'n_identical_keys_mask'])
        return stream
Code example #24
def get_test_stream(sfiles, svocab_dict):
    dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                       unk_token='<unk>', level='word', preprocess=None,
                       encoding='utf8')
    stream = Merge([dataset.get_example_stream()], ('source',))
    stream = Batch(stream, iteration_scheme=ConstantScheme(10))
    stream = Padding(stream)
    return stream
Code example #25
    def test(self, req_vars):
        stream = TaxiStream('test')

        stream = transformers.taxi_add_datetime(stream)
        # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
        stream = transformers.taxi_remove_test_only_clients(stream)

        return Batch(stream, iteration_scheme=ConstantScheme(1))
Code example #26
def test_unpack_transformer():
    data = range(10)
    stream = DataStream(IterableDataset(data))
    stream = Batch(stream, iteration_scheme=ConstantScheme(2))
    stream = Unpack(stream)
    epoch = stream.get_epoch_iterator()
    for i, v in enumerate(epoch):
        assert numpy.shape(v)[0] == 1
        assert v[0] == i
Code example #27
 def test_mask_sources(self):
     transformer = Padding(Batch(
         DataStream(
             IterableDataset(
                 OrderedDict([('features', [[1], [2, 3]]),
                              ('targets', [[4, 5, 6], [7]])]))),
         ConstantScheme(2)),
         mask_sources=('features',))
     assert_equal(len(next(transformer.get_epoch_iterator())), 3)
Code example #28
 def test_mask_dtype(self):
     transformer = Padding(Batch(
         DataStream(
             IterableDataset(
                 dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
         ConstantScheme(2)),
         mask_dtype='uint8')
     assert_equal(
         str(next(transformer.get_epoch_iterator())[1].dtype), 'uint8')
Code example #29
def load_parallel_data(src_file,
                       tgt_file,
                       batch_size,
                       sort_k_batches,
                       dictionary,
                       training=False):
    def preproc(s):
        s = s.replace('``', '"')
        s = s.replace('\'\'', '"')
        return s

    enc_dset = TextFile(files=[src_file],
                        dictionary=dictionary,
                        bos_token=None,
                        eos_token=None,
                        unk_token=CHAR_UNK_TOK,
                        level='character',
                        preprocess=preproc)
    dec_dset = TextFile(files=[tgt_file],
                        dictionary=dictionary,
                        bos_token=CHAR_SOS_TOK,
                        eos_token=CHAR_EOS_TOK,
                        unk_token=CHAR_UNK_TOK,
                        level='character',
                        preprocess=preproc)
    # NOTE merge encoder and decoder setup together
    stream = Merge(
        [enc_dset.get_example_stream(),
         dec_dset.get_example_stream()], ('source', 'target'))
    if training:
        # filter sequences that are too long
        stream = Filter(stream, predicate=TooLong(seq_len=CHAR_MAX_SEQ_LEN))
        # batch and read k batches ahead
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(batch_size *
                                                       sort_k_batches))
        # sort all samples in read-ahead batch
        stream = Mapping(stream, SortMapping(lambda x: len(x[1])))
        # turn back into stream
        stream = Unpack(stream)
    # batch again
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    masked_stream = Padding(stream)
    return masked_stream
Code example #30
File: toy_dataset.py Project: zxsted/CTC-LSTM
def setup_datastream(batch_size, **kwargs):
    ds = ToyDataset(**kwargs)
    stream = DataStream(ds,
                        iteration_scheme=SequentialExampleScheme(
                            kwargs['nb_examples']))

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream
Code example #31
def test_batch():
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    wrapper = Batch(stream, ConstantScheme(2))
    batches = list(wrapper.get_epoch_iterator())
    expected = [(numpy.array([1, 2]),),
                (numpy.array([3, 4]),),
                (numpy.array([5]),)]
    assert len(batches) == len(expected)
    for b, e in zip(batches, expected):
        assert (b[0] == e[0]).all()

    # Check the `strictness` flag
    def try_strict(strictness):
        return list(Batch(stream, ConstantScheme(2), strictness=strictness)
                    .get_epoch_iterator())
    assert_raises(ValueError, try_strict, 2)
    assert len(try_strict(1)) == 2
    stream2 = DataStream(IterableDataset([1, 2, 3, 4, 5, 6]))
    assert len(list(Batch(stream2, ConstantScheme(2), strictness=2)
                    .get_epoch_iterator())) == 3
Code example #32
 def test_2d_sequences(self):
     stream = Batch(
         DataStream(
             IterableDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 4))])),
         ConstantScheme(2))
     it = Padding(stream).get_epoch_iterator()
     data, mask = next(it)
     assert data.shape == (2, 3, 4)
     assert (data[0, :, :] == 1).all()
     assert (data[1, :2, :] == 2).all()
     assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()
Code example #33
from theano import tensor

from blocks.bricks import Linear

from fuel.transformers import Mapping, Batch
from fuel.schemes import ConstantScheme
from fuel.transformers import Flatten

from extensions.plot import Plot
from datasets.addition import AdditionTask

from numpy import swapaxes

def _transpose(data):
    return tuple(swapaxes(array,0,1) if len(array.shape) > 2 else array for array in data)

dataset = AdditionTask(1000)
train_stream = dataset.get_example_stream()
train_stream = Batch(train_stream, iteration_scheme=ConstantScheme(10))
train_stream = Mapping(train_stream, _transpose)

features_test, targets_test = next(train_stream.get_epoch_iterator())

x = tensor.tensor3('features')
y = tensor.matrix('targets')

n_batchs = 1000
h_dim = 2
x_dim = 2

encode = Linear(name='encode',
                input_dim=x_dim,
                output_dim=h_dim)
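The `_transpose` mapping above puts Fuel's batch-major arrays into the time-major layout that recurrent bricks expect, while leaving sources with two or fewer dimensions (targets, masks) untouched. A standalone illustration with toy shapes (not the actual `AdditionTask` data):

import numpy
from numpy import swapaxes

def _transpose(data):
    # Swap (batch, time, ...) to (time, batch, ...) for every array with more
    # than two dimensions; 2-D and 1-D sources pass through unchanged.
    return tuple(swapaxes(array, 0, 1) if len(array.shape) > 2 else array
                 for array in data)

features = numpy.zeros((10, 7, 2))  # (batch, time, feature)
targets = numpy.zeros((10, 7))      # (batch, time)
features_t, targets_t = _transpose((features, targets))
assert features_t.shape == (7, 10, 2)
assert targets_t.shape == (10, 7)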
Code example #34
File: lstm_text.py Project: AntHar/summerschool2015
def train_model(batch_size=100, n_h=50, n_epochs=40):

    # Load the datasets with Fuel
    dictionary = pkl.load(open(DICT_FILE, 'r'))
    dictionary['~'] = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())

    print("Loading the data")
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    train_stream = DataStream.default_stream(train)

    # organize data in batches and pad shorter sequences with zeros
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)

    # do the same for the validation text
    val = TextFile(files=[VAL_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    val_stream = DataStream.default_stream(val)

    # organize data in batches and pad shorter sequences with zeros
    val_stream = Batch(val_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    val_stream = Padding(val_stream)

    print('Building model')

    # Set the random number generator's seed for consistency
    rng = numpy.random.RandomState(12345)

    x = T.lmatrix('x')
    mask = T.matrix('mask')

    # Construct the LSTM layer
    recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h)

    logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],
                                      n_in=n_h, n_out=111)

    cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x,
                                             x[1:],
                                             mask[1:]) / batch_size

    # create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # update_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    learning_rate = 0.1
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    update_model = theano.function([x, mask], cost, updates=updates)

    evaluate_model = theano.function([x, mask], cost)

    # Define and compile a function for generating a sequence step by step.
    x_t = T.iscalar()
    h_p = T.vector()
    c_p = T.vector()
    h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)
    energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b

    energy_exp = T.exp(energy - T.max(energy, 1)[:, None])

    output = energy_exp / energy_exp.sum(1)[:, None]
    single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])

    start_time = time.clock()

    iteration = 0

    for epoch in range(n_epochs):
        print 'epoch:', epoch

        for x_, mask_ in train_stream.get_epoch_iterator():
            iteration += 1

            cross_entropy = update_model(x_.T, mask_.T)


            # Generate some text every 40 minibatches
            if iteration % 40 == 0:
                try:
                    prediction = numpy.ones(111, dtype=config.floatX) / 111.0
                    h_p = numpy.zeros((n_h,), dtype=config.floatX)
                    c_p = numpy.zeros((n_h,), dtype=config.floatX)
                    initial = 'the meaning of life is '
                    sentence = initial
                    for char in initial:
                        x_t = dictionary[char]
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                    sample = numpy.random.multinomial(1, prediction.flatten())
                    for i in range(450):
                        x_t = numpy.argmax(sample)
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                        sentence += reverse_mapping[x_t]
                        sample = numpy.random.multinomial(1, prediction.flatten())
                    print 'LSTM: "' + sentence + '"'
                except ValueError:
                    print 'Something went wrong during sentence generation.'

            if iteration % 40 == 0:
                print 'epoch:', epoch, '  minibatch:', iteration
                val_scores = []
                for x_val, mask_val in val_stream.get_epoch_iterator():
                    val_scores.append(evaluate_model(x_val.T, mask_val.T))
                print 'Average validation CE per sentence:', numpy.mean(val_scores)

    end_time = time.clock()
    print('Optimization complete.')
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))
Code example #35
File: test_transformers.py Project: markusnagel/fuel
 def test_strictness_2_error(self):
     stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
     transformer = Batch(stream, ConstantScheme(2), strictness=2)
     assert_raises(ValueError, list, transformer.get_epoch_iterator())
Code example #36
File: stream_morph2.py Project: gumaojie/rnnlm
def DStream(datatype, config):
    if datatype=='train':
        filename = config['train_file']
        filename_morph = config['train_morph_file']
        filename_rel = config['train_rel_file']
    elif datatype == 'valid':
        filename = config['valid_file']
        filename_morph = config['valid_morph_file']
        filename_rel = config['valid_rel_file']
    elif datatype == 'test':
        filename = config['test_file']
        filename_morph = config['test_morph_file']
        filename_rel = config['test_rel_file']
    else:
        logger.error('wrong datatype, train, valid, or test')
    data = TextFile(files=[filename],
                    dictionary=pickle.load(open(config['train_dic'],'rb')),
                    unk_token=config['unk_token'],
                    level='word',
                    bos_token=config['bos_token'],
                    eos_token=config['eos_token'])
    data_morph = TextFile(files=[filename_morph],
                    dictionary=pickle.load(open(config['train_morph_dic'],'rb')),
                    unk_token=config['unk_token'],
                    level='word',
                    bos_token=config['bos_token'],
                    eos_token=config['eos_token'])
    data_stream = DataStream.default_stream(data)
    data_stream.sources = ('sentence',)
    data_morph_stream = DataStream.default_stream(data_morph)
    data_morph_stream.sources = ('sentence',)
    # organize data in batches and pad shorter sequences with zeros
    batch_size = config['batch_size']
    rels_stream = []
    with open(filename_rel , "r") as fin:
        lines = fin.readlines()
        i = 0
        while i < len(lines):
            if i + batch_size < len(lines):
                rels_stream.append(padding(lines[i : i + batch_size]))
                i = i + batch_size
            else:
                rels_stream.append(padding(lines[i : len(lines)]))
                i = i + batch_size
    data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(batch_size))
    data_stream = Padding(data_stream)

    data_morph_stream = Batch(data_morph_stream, iteration_scheme=ConstantScheme(batch_size))
    data_morph_stream = Padding(data_morph_stream)
    data_morph_tensor3 = []
    mask_morph_tensor3 = []
    #data_morph_stream : batch_num * batch * sentence
    #rels_stream : batch_num * batch * sentence
    #data_morph_tensor3 : batch_num * batch * sentence * morph
    for data_morph_tuple , rel in zip(data_morph_stream.get_epoch_iterator() , rels_stream):
        data_morph , mask_morph = data_morph_tuple
        #data_morph : batch * sentence
        #rel : batch * sentence
        tmp = []
        tmp_mask = []
        for m , mask , r in zip(data_morph , mask_morph , rel):
            #m : sentence
            #r : sentence
            start = 0
            tmp2 = []
            tmp_mask2 = []
            for idx in r:
                tmp2.append(m[start:start+idx].tolist())
                tmp_mask2.append(mask[start:start+idx].tolist())
                #print m[start:start+idx]
                start = start + idx
            #print len(tmp)
            #print padding2(tmp2)
            tmp.append(tmp2)
            tmp_mask.append(tmp_mask2)
            #print len(tmp) , tmp
            #print m , r
            #print m.shape , r.shape
        #print padding2(tmp)
        data_morph_tensor3.append(np.array(padding2(tmp)))
        mask_morph_tensor3.append(np.array(padding2(tmp_mask) , dtype='float32'))
    return data_stream , data_morph_tensor3 , mask_morph_tensor3
Code example #37
from addition import AdditionTask

from fuel.transformers import Mapping, Batch
from fuel.schemes import ConstantScheme

from numpy import swapaxes

def _transpose(data):
    return tuple(swapaxes(array,0,1) for array in data if len(array.shape) > 2 )

dataset = AdditionTask(17)
data_stream = dataset.get_example_stream()
data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(14))
data_stream = Mapping(data_stream, _transpose)

print next(data_stream.get_epoch_iterator())[0].shape

Code example #38
def main(mode, save_path, steps, num_batches, load_params):
    chars = (list(string.ascii_uppercase) + [str(i) for i in range(10)] +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.iteritems()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')

    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))
    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them

    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size, source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(
        features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(
        batch_cost,
        features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(
        batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_parameters().items()],
                        width=120))

        train_stream = train_dataset.get_example_stream()
        train_stream = Mapping(train_stream, _truncate)
        train_stream = Batch(train_stream,
                             iteration_scheme=ConstantScheme(batch_size))
        train_stream = Padding(train_stream)
        train_stream = Mapping(train_stream, _transpose)

        parameters = model.get_parameter_dict()
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values())
        algorithm = GradientDescent(
            cost=batch_cost,
            parameters=parameters.values(),
            step_rule=CompositeRule([StepClipping(1000.), 
                AdaDelta(epsilon=1e-8) #, Restrict(VariableClipping(1.0, axis=0), maxnorm_subjects)
                                     ]))
        ft = features[:6, 0]
        ft.name = 'feature_example'

        observables = [cost, ppl, char_cost, length, bits_per_char]
        for name, param in parameters.items():
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements ** 0.5
            grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
            step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
            stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
            stats.name = name + '_stats'
            observables.append(stats)
        track_the_best_bpc = TrackTheBest('valid_bits_per_char')
        root_path, extension = os.path.splitext(save_path)

        this_step_monitoring = TrainingDataMonitoring(
            observables + [ft], prefix="this_step", after_batch=True)
        average_monitoring = TrainingDataMonitoring(
            observables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm], 
            prefix="average",
            every_n_batches=10)
        valid_monitoring = DataStreamMonitoring(
            observables, prefix="valid",
            every_n_batches=1500, before_training=False,
            data_stream=valid_stream)
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=model,
            extensions=[
                this_step_monitoring,
                average_monitoring,
                valid_monitoring,
                track_the_best_bpc,
                Checkpoint(save_path, ),
                Checkpoint(save_path,
                           every_n_batches=500,
                           save_separately=["model", "log"],
                           use_cpickle=True)
                    .add_condition(
                    ['after_epoch'],
                    OnLogRecord(track_the_best_bpc.notification_name),
                    (root_path + "_best" + extension,)),
                Timing(after_batch=True),
                Printing(every_n_batches=10),
                Plot(root_path,
                     [[average_monitoring.record_name(cost),
                       valid_monitoring.record_name(cost)],
                      [average_monitoring.record_name(algorithm.total_step_norm)],
                      [average_monitoring.record_name(algorithm.total_gradient_norm)],
                      [average_monitoring.record_name(ppl),
                       valid_monitoring.record_name(ppl)],
                      [average_monitoring.record_name(char_cost),
                       valid_monitoring.record_name(char_cost)],
                      [average_monitoring.record_name(bits_per_char),
                       valid_monitoring.record_name(bits_per_char)]],
                     every_n_batches=10)
            ])
        main_loop.run()

    elif mode == 'evaluate':
        with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f:
            raw_words = [line.split()[1:-1] for line in f.readlines()]
            words = [[char_to_ind[c] if c in char_to_ind else char_to_ind['<UNK>'] for c in w] 
                     for w in raw_words]
        max_word_length = max([len(w) for w in words])
        
        initial_states = tensor.matrix('init_states')
        cost_matrix_step = generator.cost_matrix(features, mask=features_mask,
                                                 states=initial_states)
        cg = ComputationGraph(cost_matrix_step)
        states = cg.auxiliary_variables[-2]
        compute_cost = theano.function([features, features_mask, initial_states], 
                                       [cost_matrix_step.sum(axis=0), states])

        cost_matrix = generator.cost_matrix(features, mask=features_mask)
        initial_cg = ComputationGraph(cost_matrix)
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)

        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]

        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                example = example[:(mask.sum())]
                spc_inds = list(numpy.where(example == char_to_ind[" "])[0])
                state = generator.transition.transition.initial_states_.get_value()[None, :]
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i+1):j, None]
                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

                    costs = numpy.exp(-compute_cost(
                        examples, all_masks, numpy.tile(state, [examples.shape[1], 1]))[0])

                    _, space_states = compute_cost(
                        single_space, numpy.ones_like(single_space, dtype=floatX), state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))
                    print("Average cost", total_word_cost / num_words)
                    print("PPL", numpy.exp(total_word_cost / num_words))

        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
Code example #39
File: stream.py Project: gumaojie/morphlm
def DStream(datatype, config):
    if datatype in ['train','valid','test']:
        filename = config[datatype + '_file']
        filename_morph = config[datatype + '_morph_file']
        filename_rel = config[datatype + '_rel_file']
    else:
        logger.error('wrong datatype, train, valid, or test')
    data_stream = getTextFile(filename, config['train_dic'], config)
    data_morph_stream = getTextFile(filename_morph, config['train_morph_dic'], config)
    # organize data in batches and pad shorter sequences with zeros
    batch_size = config['batch_size']
    rels_stream = []
    cnt = 0
    with open(filename_rel , "r") as fin:
        lines = fin.readlines()
        i = 0
        while i < len(lines):
            if i + batch_size < len(lines):
                rels_stream.append(padding(lines[i : i + batch_size]))
                i = i + batch_size
            else:
                rels_stream.append(padding(lines[i : len(lines)]))
                i = i + batch_size
    data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(batch_size))
    data_stream = Padding(data_stream)
    data_morph_stream = Batch(data_morph_stream, iteration_scheme=ConstantScheme(batch_size))
    data_morph_stream = Padding(data_morph_stream)
    data_morph_tensor3 = []
    mask_morph_tensor3 = []
    #data_morph_stream : batch_num * batch * sentence
    #rels_stream : batch_num * batch * sentence
    #data_morph_tensor3 : batch_num * batch * sentence * morph
    cnt = 0
    for data_morph_tuple , rel in zip(data_morph_stream.get_epoch_iterator() , rels_stream):
        data_morph , mask_morph = data_morph_tuple
        #data_morph : batch * sentence
        #rel : batch * sentence
        tmp = []
        tmp_mask = []
        for m , mask , r in zip(data_morph , mask_morph , rel):
            start = 0
            tmp2 = []
            tmp_mask2 = []
            for idx in r:
                tmp2.append(m[start:start+idx].tolist())
                tmp_mask2.append(mask[start:start+idx].tolist())
                #print m[start:start+idx]
                start = start + idx
            #print len(tmp)
            #print padding2(tmp2)
            tmp.append(tmp2)
            tmp_mask.append(tmp_mask2)
            #print len(tmp) , tmp
            #print m.shape , r.shape
        #print padding2(tmp)
        data_morph_tensor3.append(np.array(padding2(tmp)))
        mask_morph_tensor3.append(np.array(padding2(tmp_mask) , dtype='float32'))
        cnt += 1
    '''
    cnt = 0
    for a, b, c in zip(data_stream.get_epoch_iterator() , mask_morph_tensor3, mask_morph_tensor3):
        data , mask = a
        if data.shape[1] != b.shape[1]:
            print data.shape , b.shape, c.shape
            cnt2 = 0
            for i , d in enumerate(data):
                if cnt2 == 42:
                    print i , len(d) , d
                    dic2 = load_dic()
                    for key in d:
                        if key in dic2 and key != 0:
                            print dic2[key],
                cnt2 += 1
            print cnt
            #print data.shape , b[99]
            exit(0)
            print "###"
        cnt += 1
    exit(0)
    '''
    return data_stream , data_morph_tensor3 , mask_morph_tensor3