Example 1
 def test_filter_examples(self):
     data = [1, 2, 3]
     data_filtered = [1, 3]
     stream = DataStream(IterableDataset(data))
     wrapper = Filter(stream, lambda d: d[0] % 2 == 1)
     assert_equal(list(wrapper.get_epoch_iterator()),
                  list(zip(data_filtered)))
Example 2
 def test_filter_batches(self):
     data = [1, 2, 3, 4]
     data_filtered = [([3, 4],)]
     stream = DataStream(IndexableDataset(data),
                         iteration_scheme=SequentialScheme(4, 2))
     wrapper = Filter(stream, lambda d: d[0][0] % 3 == 0)
     assert_equal(list(wrapper.get_epoch_iterator()), data_filtered)
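These two tests pin down Filter's predicate contract: on an example stream the predicate receives a tuple with one value per source (hence the expected list(zip(data_filtered)), i.e. [(1,), (3,)]), while on a batched stream it receives a tuple of per-source batches and keeps or drops whole batches. A minimal sketch of both cases, assuming only the Fuel classes the tests above already rely on:

from fuel.datasets import IndexableDataset, IterableDataset
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream
from fuel.transformers import Filter

# Example level: the predicate sees a one-element tuple per example,
# so the surviving examples come out as [(1,), (3,)].
examples = Filter(DataStream(IterableDataset([1, 2, 3])),
                  lambda d: d[0] % 2 == 1)
assert list(examples.get_epoch_iterator()) == [(1,), (3,)]

# Batch level: the predicate sees ([1, 2],) and then ([3, 4],);
# only the second batch satisfies d[0][0] % 3 == 0.
batches = Filter(DataStream(IndexableDataset([1, 2, 3, 4]),
                            iteration_scheme=SequentialScheme(4, 2)),
                 lambda d: d[0][0] % 3 == 0)
assert list(batches.get_epoch_iterator()) == [([3, 4],)]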
Example 5
def _get_align_stream(src_data, trg_data, src_vocab_size, trg_vocab_size,
                      seq_len, **kwargs):
    """Creates the stream which is used for the main loop.
    
    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT
                              model
        trg_vocab_size (int): Size of the target vocabulary in the NMT
                              model
        seq_len (int): Maximum length of any source or target sentence
    
    Returns:
        ExplicitNext. Alignment data stream which can be iterated
        explicitly
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})
    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)
    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))
    s = Batch(s, iteration_scheme=ConstantScheme(1))
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(masked_stream)
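Several of the pipelines in this listing pass a _too_long(seq_len=...) callable to Filter without showing its definition. In the blocks machine-translation examples it is a small predicate class along these lines; the module it is referenced from here (stream._too_long) is whatever the surrounding project defines, so treat this as a sketch:

class _too_long(object):
    """Predicate: keep a sentence pair only if every sequence fits in seq_len."""
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all([len(sentence) <= self.seq_len
                    for sentence in sentence_pair])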
Example 6
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):

    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')

    # Merge source and target into (source, target) pairs
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=configuration['seq_len']))

    # No Mapping step needed here

    # Batch k batches ahead, sort by length, then re-batch
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       configuration['batch_size'] * configuration['sort_k_batches']))
    stream = Mapping(stream, SortMapping(_length))
    stream = Unpack(stream)
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(configuration['batch_size']))

    # Pad; note that </s> = 0 and Fuel's Padding pads with 0 by default
    masked_stream = Padding(stream)

    return masked_stream
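SortMapping(_length) above sorts each read-ahead chunk with a key function that is not shown. In the blocks examples it simply sorts by target length, assuming the target is the last source in each example tuple; a sketch under that assumption:

def _length(sentence_pair):
    """Sort key for SortMapping: length of the target (last) sequence."""
    return len(sentence_pair[-1])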
Example 7
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000,
                  unk_id=0, eos_id=1, bos_id=2, train_noise=0,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    src_stream = get_stream(src_vocab, src_data, src_vocab_size, unk_id, eos_id, bos_id, train_noise)
    trg_stream = get_stream(trg_vocab, trg_data, trg_vocab_size, unk_id, eos_id, bos_id, 0)

    # Merge them to get a source, target pair
    stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_not_too_long(seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    return PaddingWithEOS(stream, [eos_id, eos_id])
Example 8
 def test_axis_labels_are_passed_through(self):
     stream = DataStream(
         IndexableDataset(
             {'features': [1, 2, 3, 4]},
             axis_labels={'features': ('batch',)}),
         iteration_scheme=SequentialScheme(4, 2))
     wrapper = Filter(stream, lambda d: d[0][0] % 3 == 0)
     assert_equal(wrapper.axis_labels, stream.axis_labels)
Example 9
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
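_oov_to_unk replaces token ids that fall outside the vocabulary cut-offs with the unk id before batching. The version in the blocks machine-translation examples looks roughly like this; names and defaults are taken from the call sites above and should be treated as an assumption:

class _oov_to_unk(object):
    """Map out-of-vocabulary ids in a (source, target) pair to unk_id."""
    def __init__(self, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1):
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_pair):
        return ([x if x < self.src_vocab_size else self.unk_id
                 for x in sentence_pair[0]],
                [x if x < self.trg_vocab_size else self.unk_id
                 for x in sentence_pair[1]])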
Example 10
def get_sgnmt_tr_stream(src_data,
                        trg_data,
                        src_vocab_size=30000,
                        trg_vocab_size=30000,
                        unk_id=1,
                        seq_len=50,
                        batch_size=80,
                        sort_k_batches=12,
                        **kwargs):
    """Prepares the unshuffled training data stream. This corresponds 
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
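add_special_ids builds the dummy vocabulary that TextFile expects by reserving ids for the special tokens it looks up. Its body is not shown in this listing; a plausible sketch covering just what these call sites need, assuming the utils.UNK_ID and utils.EOS_ID constants used above:

def add_special_ids(vocab):
    """Add the unk and eos entries TextFile requires to a plain id map."""
    vocab['<UNK>'] = utils.UNK_ID
    vocab['</S>'] = utils.EOS_ID
    return vocab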
Example 11
    def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
                   num_examples=None, rng=None, seed=None):

        dataset = self.get_dataset(part, add_sources=add_sources)
        if num_examples is None:
            num_examples = dataset.num_examples

        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

        stream = DataStream(
            dataset, iteration_scheme=iteration_scheme)

        stream = FilterSources(stream, (self.recordings_source,
                                        self.labels_source)+tuple(add_sources))
        if self.add_eos:
            stream = Mapping(stream, _AddLabel(self.eos_label))
        if self.add_bos:
            stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                               times=self.add_bos))
        if self.preprocess_text:
            stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
        stream = Filter(stream, self.length_filter)
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_length))
            stream = Unpack(stream)

        if self.preprocess_features == 'log_spectrogram':
            stream = Mapping(
                stream, functools.partial(apply_preprocessing,
                                          log_spectrogram))
        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Example 12
def build_stream(dataset, n_grams, batch_size, times=None):
    example_stream = dataset.get_example_stream()

    example_stream = Filter(example_stream, reject_repeated_words)
    n_gram_stream = NGrams(n_grams, example_stream)

    batch_stream = Batch(n_gram_stream,
                         ConstantScheme(batch_size, times=times),
                         strictness=1)

    def reshape(batch):
        return batch[0].astype("int32"), batch[1][:, None].astype("int32")

    return Mapping(batch_stream, reshape)
Example 13
def get_sgnmt_shuffled_tr_stream(src_data,
                                 trg_data,
                                 src_vocab_size=30000,
                                 trg_vocab_size=30000,
                                 unk_id=1,
                                 seq_len=50,
                                 batch_size=80,
                                 sort_k_batches=12,
                                 **kwargs):
    """Prepares the shuffled training data stream. This is similar to 
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data, src_vocab,
                                        trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Example 14
    def train(self, req_vars):
        stream = TaxiDataset('train', data.traintest_ds)

        if hasattr(
                self.config,
                'use_cuts_for_training') and self.config.use_cuts_for_training:
            stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
        else:
            stream = DataStream(stream,
                                iteration_scheme=ShuffledExampleScheme(
                                    stream.num_examples))

        if not data.tvt:
            valid = TaxiDataset(data.valid_set,
                                data.valid_ds,
                                sources=('trip_id', ))
            valid_trips_ids = valid.get_data(None,
                                             slice(0, valid.num_examples))[0]
            stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)

        if hasattr(self.config, 'max_splits'):
            stream = transformers.TaxiGenerateSplits(
                stream, max_splits=self.config.max_splits)
        elif not data.tvt:
            stream = transformers.add_destination(stream)

        if hasattr(self.config, 'train_max_len'):
            idx = stream.sources.index('latitude')

            def max_len_filter(x):
                return len(x[idx]) <= self.config.train_max_len

            stream = Filter(stream, max_len_filter)

        stream = transformers.TaxiExcludeEmptyTrips(stream)
        stream = transformers.taxi_add_datetime(stream)
        stream = transformers.Select(
            stream, tuple(v for v in req_vars if not v.endswith('_mask')))

        stream = transformers.balanced_batch(
            stream,
            key='latitude',
            batch_size=self.config.batch_size,
            batch_sort_size=self.config.batch_sort_size)
        stream = Padding(stream, mask_sources=['latitude', 'longitude'])
        stream = transformers.Select(stream, req_vars)
        stream = MultiProcessing(stream)

        return stream
Example 15
def get_dev_stream_with_grdTruth(val_set_source=None,
                                 val_set_target=None,
                                 src_vocab=None,
                                 src_vocab_size=30000,
                                 trg_vocab=None,
                                 trg_vocab_size=30000,
                                 batch_size=128,
                                 unk_id=1,
                                 seq_len=50,
                                 **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
            trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_vocab_size - 1,
                                           unk_idx=unk_id)

        print val_set_source, type(src_vocab)
        dev_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)
        # Merge them to get a source, target pair
        dev_stream = Merge([
            dev_dataset.get_example_stream(),
            trg_dataset.get_example_stream()
        ], ('dev_source', 'dev_target'))
        # Filter sequences that are too long
        stream = Filter(dev_stream, predicate=_too_long(seq_len=seq_len))

        # Replace out of vocabulary tokens with unk token
        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=src_vocab_size,
                        trg_vocab_size=trg_vocab_size,
                        unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))
        # Pad sequences that are short
        dev_stream = PaddingWithEOS(
            stream, [src_vocab_size - 1, trg_vocab_size - 1])
    return dev_stream
Example 16
def load_parallel_data(src_file,
                       tgt_file,
                       batch_size,
                       sort_k_batches,
                       dictionary,
                       training=False):
    def preproc(s):
        s = s.replace('``', '"')
        s = s.replace('\'\'', '"')
        return s

    enc_dset = TextFile(files=[src_file],
                        dictionary=dictionary,
                        bos_token=None,
                        eos_token=None,
                        unk_token=CHAR_UNK_TOK,
                        level='character',
                        preprocess=preproc)
    dec_dset = TextFile(files=[tgt_file],
                        dictionary=dictionary,
                        bos_token=CHAR_SOS_TOK,
                        eos_token=CHAR_EOS_TOK,
                        unk_token=CHAR_UNK_TOK,
                        level='character',
                        preprocess=preproc)
    # NOTE merge encoder and decoder setup together
    stream = Merge(
        [enc_dset.get_example_stream(),
         dec_dset.get_example_stream()], ('source', 'target'))
    if training:
        # filter sequences that are too long
        stream = Filter(stream, predicate=TooLong(seq_len=CHAR_MAX_SEQ_LEN))
        # batch and read k batches ahead
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(batch_size *
                                                       sort_k_batches))
        # sort all samples in read-ahead batch
        stream = Mapping(stream, SortMapping(lambda x: len(x[1])))
        # turn back into stream
        stream = Unpack(stream)
    # batch again
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    masked_stream = Padding(stream)
    return masked_stream
Example 17
def get_sentence_stream(which_set, which_partitions, vocabulary):
    """Return an iterator over sentences

    Notes
    -----
    This reads the text files sequentially. However, note that the files are
    already shuffled.

    """
    # Construct data stream
    logger.info('Constructing data stream')
    dataset = OneBillionWord(which_set, which_partitions, vocabulary)
    data_stream = dataset.get_example_stream()

    # Get rid of long sentences that don't fit
    data_stream = Filter(data_stream, _filter_long)

    # Creates the dataset "targets"
    data_stream = Mapping(data_stream, _shift_words, add_sources=("targets",))

    return data_stream
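_filter_long here is an ordinary predicate over the example tuple, analogous to _too_long above but with a fixed threshold. A sketch in the style of the blocks OneBillionWord examples; the 100-token cut-off is an assumption:

def _filter_long(data):
    """Drop sentences longer than a fixed maximum length."""
    return len(data[0]) <= 100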
Example 18
    def get_stream(self, part, batches=True, shuffle=True,
                   add_sources=()):
        dataset = self.get_dataset(part, add_sources=add_sources)
        stream = (DataStream(dataset,
                             iteration_scheme=ShuffledExampleScheme(dataset.num_examples))
                  if shuffle
                  else dataset.get_example_stream())

        stream = FilterSources(stream, (self.recordings_source,
                                        self.labels_source)+tuple(add_sources))
        if self.add_eos:
            if self.prepend_eos:
                stream = Mapping(stream, _AddEosLabelBeginEnd(self.eos_label))
            else:
                stream = Mapping(stream, _AddEosLabelEnd(self.eos_label))
        if self.preprocess_text:
            stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
        stream = Filter(stream, self.length_filter)
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_length))
            stream = Unpack(stream)

        if self.preprocess_features == 'log_spectrogram':
            stream = Mapping(
                stream, functools.partial(apply_preprocessing,
                                          log_spectrogram))
        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        if not batches:
            return stream

        stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Example 19
def _get_sgnmt_tr_stream(data_stream,
                         src_vocab_size=30000,
                         trg_vocab_size=30000,
                         seq_len=50,
                         batch_size=80,
                         sort_k_batches=12,
                         src_sparse_feat_map='',
                         trg_sparse_feat_map='',
                         **kwargs):
    """Prepares the raw text file stream ``data_stream`` for the Blocks
    main loop. This includes handling UNKs, splitting ino batches, sort
    locally by sequence length, and masking. This roughly corresponds 
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples.
    
    The arguments to this method are given by the configuration dict.
    """

    # Filter sequences that are too long
    s = Filter(data_stream, predicate=stream._too_long(seq_len=seq_len))

    # Replacing out of vocabulary tokens with unk token already
    # handled in the `DataSet`s

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Example 20
def get_tr_stream(path, src_eos_idx, phones_sil, tgt_eos_idx, seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    sources = ('words', 'audio', 'words_ends', 'punctuation_marks', 'phones', 'phones_words_ends', 'phones_words_acoustic_ends')
    #sources = ('words', 'audio', 'words_ends', 'punctuation_marks', 'phones', 'phones_words_ends')
    dataset = H5PYDataset(path, which_sets=('train',), sources=sources, load_in_memory=False)
    print "creating example stream"
    stream = dataset.get_example_stream()
    print "example stream created"

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream, {
        'words': src_eos_idx,
        'phones': phones_sil,
        'punctuation_marks': tgt_eos_idx,
        'audio': 0,
        'words_ends': -1,
        'phones_words_ends': -1,
        'phones_words_acoustic_ends': -1,
    })

    return masked_stream
Example 21
def load_data(src_file, tgt_file, batch_size, sort_k_batches, training=False):
    src_dict, tgt_dict = load_dictionaries()

    src_dset = TextFile(files=[src_file], dictionary=src_dict,
            bos_token=None, eos_token=None, unk_token=WORD_UNK_TOK)
    tgt_dset = TextFile(files=[tgt_file], dictionary=tgt_dict,
            bos_token=WORD_EOS_TOK, eos_token=WORD_EOS_TOK, unk_token=WORD_UNK_TOK)

    stream = Merge([src_dset.get_example_stream(), tgt_dset.get_example_stream()],
            ('source', 'target'))
    # filter sequences that are too long
    if training:
        stream = Filter(stream, predicate=TooLong(seq_len=WORD_MAX_SEQ_LEN))
        # batch and read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size*sort_k_batches))
        # sort all samples in read-ahead batch
        stream = Mapping(stream, SortMapping(lambda x: len(x[1])))
        # turn back into stream
        stream = Unpack(stream)
    # batch again
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    # NOTE pads with zeros so eos_idx should be 0
    masked_stream = Padding(stream)
    return masked_stream, src_dict, tgt_dict
Example 22
    def get_one_stream(self, part, lang=None, batches=True, shuffle=True, add_sources=(),
                   num_examples=None, rng=None, seed=None, num_result=None,
                   soften_distributions=None, only_stream=False):
        assert lang in self.langs
        dataset = self.get_dataset(part, lang, add_sources=add_sources)
        if num_examples is None:
            num_examples = dataset.num_examples

        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

        if num_result is None:
            num_result = num_examples

        if lang != self.langs[0] and not only_stream:
            iteration_scheme = RandomExampleScheme(num_examples, num_result=num_result, rng=rng)

        stream = DataStream(
            dataset, iteration_scheme=iteration_scheme)

        if soften_distributions:
            stream = Mapping(stream, SoftenResult(self.default_sources, soften_distributions))

        for bconv in self._binary_convertable_data:
            if bconv in self.default_sources:
                stream = Mapping(stream, ConvertToMask(self.default_sources,
                                                       bconv,
                                                       self.num_features(bconv)))

        if self.add_eos:
            stream = Mapping(stream, _AddLabel(
                self.eos_label,
                index=stream.sources.index(self.sources_map['labels'])))
        if self.add_bos:
            if self.bos_label is None:
                raise Exception('No bos label given')
            stream = Mapping(stream, _AddLabel(
                self.bos_label, append=False, times=self.add_bos,
                index=stream.sources.index(self.sources_map['labels'])))

        if self.max_length:
            stream = Filter(stream, self.length_filter)

        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            #
            # Hardcode 0 for source on which to sort. This will be good, as
            # most source lengths are correlated and, furthermore, the
            # labels will typically be the last source, thus in a single-input
            # case this sorts on input lengths
            #
            stream = Mapping(stream, SortMapping(_Length(
                index=0)))
            stream = Unpack(stream)

        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        stream = Rename(stream,
                        names=dict_subset({v: k for (k, v)
                                           in self.sources_map.items()},
                                          stream.sources,
                                          must_have=False))
        if not batches:
            return stream, num_examples

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))

        stream._produces_examples = False
        return stream, num_examples
Example 23
    def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
                   num_examples=None, rng=None, seed=None):
        dataset = self.get_dataset(part, add_sources=add_sources)
        if num_examples is None:
            num_examples = dataset.num_examples

        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

        stream = DataStream(
            dataset, iteration_scheme=iteration_scheme)

        if self.add_eos:
            stream = Mapping(stream, _AddLabel(
                self.eos_label,
                index=stream.sources.index(self.sources_map['labels'])))
        if self.add_bos:
            if self.bos_label is None:
                raise Exception('No bos label given')
            stream = Mapping(stream, _AddLabel(
                self.bos_label, append=False, times=self.add_bos,
                index=stream.sources.index(self.sources_map['labels'])))

        if self.max_length:
            stream = Filter(stream, self.length_filter)

        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            #
            # Hardcode 0 for source on which to sort. This will be good, as
            # most source lengths are correlated and, furthermore, the
            # labels will typically be the last source, thus in a single-input
            # case this sorts on input lengths
            #
            stream = Mapping(stream, SortMapping(_Length(
                index=0)))
            stream = Unpack(stream)

        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        stream = Rename(stream,
                        names=dict_subset({v: k for (k, v)
                                           in self.sources_map.items()},
                                          stream.sources,
                                          must_have=False))
        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        stream._produces_examples = False
        return stream
Example 24
def test_filter():
    data = [1, 2, 3]
    data_filtered = [1, 3]
    stream = DataStream(IterableDataset(data))
    wrapper = Filter(stream, lambda d: d[0] % 2 == 1)
    assert list(wrapper.get_epoch_iterator()) == list(zip(data_filtered))
Example 25
    def get_stream(self,
                   part,
                   batches=True,
                   shuffle=True,
                   add_sources=(),
                   num_examples=None,
                   rng=None,
                   seed=None):
        dataset = self.get_dataset(part, add_sources=add_sources)
        iteration_scheme = None
        if self.use_iteration_scheme:
            if num_examples is None:
                num_examples = dataset.num_examples
            if shuffle:
                iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
            else:
                iteration_scheme = SequentialExampleScheme(num_examples)
        stream = DataStream(dataset, iteration_scheme=iteration_scheme)

        # Transformations before rearrangement
        labels_source = self.sources_map['labels']
        if self.add_eos:
            stream = _AddLabel(stream,
                               self.eos_label,
                               which_sources=[labels_source])
        if self.add_bos:
            if self.bos_label is None:
                raise Exception('No bos label given')
            stream = _AddLabel(stream,
                               self.bos_label,
                               append=False,
                               times=self.add_bos,
                               which_sources=[labels_source])
        if self.clip_length:
            stream = _Clip(stream,
                           self.clip_length,
                           force_eos=self.eos_label
                           if self.force_eos_when_clipping else None,
                           which_sources=[labels_source])

        # More efficient packing of examples in batches
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_Length(index=0)))
            stream = Unpack(stream)

        stream = Rearrange(
            stream,
            dict_subset(self.sources_map,
                        self.default_sources + list(add_sources)))

        # Tranformations after rearrangement
        if self.corrupt_sources:
            # Can only corrupt sources with the same alphabet
            # as labels
            for source, prob in zip(self.corrupt_sources['names'],
                                    self.corrupt_sources['probs']):
                stream = _Corrupt(stream,
                                  prob,
                                  self.token_map(source),
                                  self.eos_label,
                                  which_sources=[source])
        if self.max_length and part == 'train':
            # Filtering by the maximum length is only done
            # for the training set.
            self.length_filter = _LengthFilter(indices=[
                i for i, source in enumerate(stream.sources)
                if source in self.filter_by
            ],
                                               max_length=self.max_length)
            stream = Filter(stream, self.length_filter)
        stream = ForceFloatX(stream)

        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Example 26
def setup_model_and_stream(exp_config, source_vocab, target_vocab):

    # TODO: this line is a mess
    (sample_model, theano_sampling_source_input, theano_sampling_context_input,
     train_encoder, train_decoder, generated) = \
        get_sampling_model_and_input(exp_config)

    trg_vocab = target_vocab
    trg_vocab_size = exp_config['trg_vocab_size']
    src_vocab = source_vocab
    src_vocab_size = exp_config['src_vocab_size']

    theano_sample_func = sample_model.get_theano_function()
    sampling_func = SampleFunc(theano_sample_func, trg_vocab)

    # TODO: move stream creation to nn_imt.stream
    # def get_textfile_stream(source_file=None, src_vocab=None, src_vocab_size=30000,
    #                         unk_id=1, bos_token=None):
    src_stream = get_textfile_stream(
        source_file=exp_config['src_data'],
        src_vocab=exp_config['src_vocab'],
        src_vocab_size=exp_config['src_vocab_size'],
        unk_id=exp_config['unk_id'],
        bos_token='<S>')

    trg_stream = get_textfile_stream(
        source_file=exp_config['trg_data'],
        src_vocab=exp_config['trg_vocab'],
        src_vocab_size=exp_config['trg_vocab_size'],
        unk_id=exp_config['unk_id'],
        bos_token='<S>')

    # text file stream
    training_stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long (Note this may break)
    training_stream = Filter(
        training_stream, predicate=_too_long(seq_len=exp_config['seq_len']))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    training_stream = Mapping(
        training_stream,
        _oov_to_unk(src_vocab_size=exp_config['src_vocab_size'],
                    trg_vocab_size=exp_config['trg_vocab_size'],
                    unk_id=exp_config['unk_id']))

    # add in the prefix and suffix seqs
    # working: add the sample ratio
    logger.info('Sample ratio is: {}'.format(exp_config.get(
        'sample_ratio', 1.)))
    training_stream = Mapping(
        training_stream,
        PrefixSuffixStreamTransformer(
            sample_ratio=exp_config.get('sample_ratio', 1.)),
        add_sources=('target_prefix', 'target_suffix'))

    training_stream = Mapping(
        training_stream, CopySourceAndTargetToMatchPrefixes(training_stream))

    # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
    training_stream.produces_examples = False

    # flatten the stream back out into (source, target, target_prefix, target_suffix)
    training_stream = Unpack(training_stream)

    # METEOR
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

    # TODO: Implement smoothed BLEU
    # TODO: Implement first-word accuracy (bilingual language model)

    min_risk_score_func = exp_config.get('min_risk_score_func', 'bleu')

    if min_risk_score_func == 'meteor':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_meteor,
            num_samples=exp_config['n_samples'],
            trg_ivocab=trg_ivocab,
            lang=exp_config['target_lang'],
            meteor_directory=exp_config['meteor_directory'])
    elif min_risk_score_func == 'imt_f1':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_imt_f1,
            num_samples=exp_config['n_samples'])
    # BLEU is default
    else:
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_bleu,
            num_samples=exp_config['n_samples'])

    training_stream = Mapping(training_stream,
                              sampling_transformer,
                              add_sources=('samples', 'seq_probs', 'scores'))

    # now filter out segments whose samples are too good or too bad
    training_stream = Filter(training_stream, predicate=filter_by_sample_score)

    # Now make a very big batch that we can shuffle
    # Build a batched version of stream to read k batches ahead
    shuffle_batch_size = exp_config['shuffle_batch_size']
    training_stream = Batch(
        training_stream, iteration_scheme=ConstantScheme(shuffle_batch_size))

    training_stream = ShuffleBatchTransformer(training_stream)

    # unpack it again
    training_stream = Unpack(training_stream)

    # Build a batched version of stream to read k batches ahead
    batch_size = exp_config['batch_size']
    sort_k_batches = exp_config['sort_k_batches']
    training_stream = Batch(training_stream,
                            iteration_scheme=ConstantScheme(batch_size *
                                                            sort_k_batches))

    # Sort all samples in the read-ahead batch
    training_stream = Mapping(training_stream, SortMapping(_length))

    # Convert it into a stream again
    training_stream = Unpack(training_stream)

    # Construct batches from the stream with specified batch size
    training_stream = Batch(training_stream,
                            iteration_scheme=ConstantScheme(batch_size))

    # IDEA: add a transformer which flattens the target samples before we add the mask
    flat_sample_stream = FlattenSamples(training_stream)

    expanded_source_stream = CopySourceAndPrefixNTimes(
        flat_sample_stream, n_samples=exp_config['n_samples'])

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    # Note: we shouldn't need to pad the seq_probs because there is only one per sequence
    # TODO: DEVELOPMENT HACK
    exp_config['suffix_length'] = 1
    exp_config['truncate_sources'] = ['target_suffix']
    configurable_padding_args = {
        'suffix_length': exp_config.get('suffix_length', None),
        'truncate_sources': exp_config.get('truncate_sources', [])
    }
    masked_stream = PaddingWithEOS(expanded_source_stream, [
        src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1,
        trg_vocab_size - 1, trg_vocab_size - 1
    ],
                                   mask_sources=('source', 'target',
                                                 'target_prefix',
                                                 'target_suffix', 'samples'),
                                   **configurable_padding_args)

    return train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream
Example 27
        '/home/andrewsm/SEER/external/CoNLL2003/ner/eng.testb',
    ]  # 748Kb file
else:
    data_paths = [
        '/home/andrewsm/SEER/external/CoNLL2003/ner/eng.train',
    ]  # 3.3Mb file

## Achieved result: 50-epochs (GPU) training on eng.train => testb overall scores :
## accuracy:  96.42%; precision:  76.95%; recall:  80.26%; FB1:  78.57

dataset = CoNLLTextFile(data_paths,
                        dictionary=word2code,
                        unknown_token='<UNK>')

data_stream = DataStream(dataset)
data_stream = Filter(data_stream, _filter_long)
#data_stream = Mapping(data_stream, reverse_words, add_sources=("targets",))

data_stream = Batch(data_stream,
                    iteration_scheme=ConstantScheme(mini_batch_size))

#data_stream = Padding(data_stream, mask_sources=('tokens'))            # Adds a mask fields to this stream field, type='floatX'
data_stream = Padding(
    data_stream,
)  # Adds a mask fields to all of this stream's fields, type='floatX'
data_stream = Mapping(
    data_stream, _transpose
)  # Flips stream so that sentences run down columns, batches along rows (strangely)

if False:  # print sample for debugging Dataset / DataStream component
    #t=0
Example 28
def get_tr_stream(src_vocab,
                  trg_vocab,
                  src_data,
                  trg_data,
                  src_vocab_size=30000,
                  trg_vocab_size=30000,
                  unk_id=1,
                  seq_len=50,
                  batch_size=80,
                  sort_k_batches=12,
                  bos_token=None,
                  **kwargs):
    """Prepares the training data stream."""
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab)),
                                       bos_idx=0,
                                       eos_idx=src_vocab_size - 1,
                                       unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
        trg_vocab, dict) else cPickle.load(open(trg_vocab)),
                                       bos_idx=0,
                                       eos_idx=trg_vocab_size - 1,
                                       unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data],
                           src_vocab,
                           bos_token=bos_token,
                           eos_token=u'</S>',
                           unk_token=u'<UNK>',
                           encoding='utf8')
    trg_dataset = TextFile([trg_data],
                           trg_vocab,
                           bos_token=bos_token,
                           eos_token=u'</S>',
                           unk_token=u'<UNK>',
                           encoding='utf8')

    # Merge them to get a source, target pair
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(
        stream,
        _oov_to_unk(src_vocab_size=src_vocab_size,
                    trg_vocab_size=trg_vocab_size,
                    unk_id=unk_id))

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs.get('shuffle_batch_size', 1000)
    stream = Batch(stream, iteration_scheme=ConstantScheme(shuffle_batch_size))

    stream = ShuffleBatchTransformer(stream)

    # unpack it again
    stream = Unpack(stream)
    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream,
                                   [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream, src_vocab, trg_vocab
Example 29
def get_src_trg_stream(cg,
                       config,
                       src_datasets=None,
                       trg_datasets=None,
                       is_training=True,
                       src_vocabs=None,
                       trg_vocabs=None,
                       logprob_datasets=None):
    eid, did = p_(cg)
    if is_training:
        logger.info(' ... src:[{}] - [{}]'.format(eid,
                                                  src_datasets[cg].files[0]))
        logger.info(' ... trg:[{}] - [{}]'.format(did,
                                                  trg_datasets[cg].files[0]))
        stream = Merge([
            src_datasets[cg].get_example_stream(),
            trg_datasets[cg].get_example_stream()
        ], ('source', 'target'))
        stream = Filter(stream, predicate=_too_long(config['seq_len']))

        if 'min_seq_lens' in config and config['min_seq_lens'][cg] > 0:
            stream = Filter(stream,
                            predicate=_too_short(config['min_seq_lens'][cg]))

        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                        trg_vocab_size=config['trg_vocab_sizes'][did],
                        unk_id=config['unk_id']))
        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(config['batch_sizes'][cg] *
                                            config['sort_k_batches']))

        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           config['batch_sizes'][cg]))
    else:  # logprob stream
        src_dataset = TextFile([logprob_datasets[cg][0]],
                               src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([logprob_datasets[cg][1]],
                               trg_vocabs[p_(cg)[1]], None)
        stream = Merge([
            src_dataset.get_example_stream(),
            trg_dataset.get_example_stream()
        ], ('source', 'target'))
        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                        trg_vocab_size=config['trg_vocab_sizes'][did],
                        unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']
        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
Example 30
def get_tr_stream_with_topic_target(src_vocab, trg_vocab, topic_vocab_input,
                                    topic_vocab_output, src_data, trg_data,
                                    topical_data, src_vocab_size=30000,
                                    trg_vocab_size=30000,
                                    trg_topic_vocab_size=2000,
                                    source_topic_vocab_size=2000, unk_id=1,
                                    seq_len=50, batch_size=80,
                                    sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist

    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
    topic_vocab_input = cPickle.load(open(topic_vocab_input, 'rb'))
    # topic_vocab_output already has <UNK> and </S> in it
    topic_vocab_output = cPickle.load(open(topic_vocab_output, 'rb'))
    topic_binary_vocab = {}
    for k, v in topic_vocab_output.items():
        topic_binary_vocab[k] = 0 if k == '<UNK>' else 1


    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)
    src_topic_input = TextFile([topical_data], topic_vocab_input, None, None, 'rt')
    trg_topic_dataset = TextFile([trg_data], topic_vocab_output, None)
    trg_topic_binary_dataset = TextFile([trg_data], topic_binary_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream(),
                    src_topic_input.get_example_stream(),
                    trg_topic_dataset.get_example_stream(),
                    trg_topic_binary_dataset.get_example_stream()],
                   ('source', 'target', 'source_topical', 'target_topic',
                    'target_binary_topic'))


    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # The topical part are not contained of it, check~
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 src_topic_vocab_size=source_topic_vocab_size,
                                 trg_topic_vocab_size=trg_topic_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1,
                 source_topic_vocab_size - 1, trg_topic_vocab_size - 1,
                 trg_topic_vocab_size - 1])

    return masked_stream
Example 31
def test_data_stream_filter():
    data = [1, 2, 3]
    data_filtered = [1, 3]
    stream = DataStream(IterableDataset(data))
    wrapper = Filter(stream, lambda d: d[0] % 2 == 1)
    assert list(wrapper.get_epoch_iterator()) == list(zip(data_filtered))
Example 32
def get_tr_stream(src_vocab,
                  trg_vocab,
                  src_files,
                  trg_files,
                  encoding='UTF-8',
                  preprocess=to_lower_case,
                  src_vocab_size=30000,
                  trg_vocab_size=30000,
                  eos='</S>',
                  eos_id=0,
                  unk='<UNK>',
                  unk_id=1,
                  seq_len=50,
                  batch_size=80,
                  sort_k_batches=12,
                  **kwargs):
    """Prepares the training data stream."""

    src_dataset = TextFile(src_files,
                           src_vocab,
                           preprocess=preprocess,
                           bos_token=None,
                           eos_token=eos,
                           unk_token=unk,
                           encoding=encoding)
    trg_dataset = TextFile(trg_files,
                           trg_vocab,
                           preprocess=preprocess,
                           bos_token=None,
                           eos_token=eos,
                           unk_token=unk,
                           encoding=encoding)

    src_data_stream = DataStream(src_dataset)
    trg_data_stream = DataStream(trg_dataset)

    # Replace out of vocabulary tokens with unk token
    if src_vocab_size < len(src_vocab):
        src_data_stream = Mapping(
            src_data_stream,
            _oov_to_unk(vocab_size=src_vocab_size, unk_id=unk_id))

    if trg_vocab_size < len(trg_vocab):
        trg_data_stream = Mapping(
            trg_data_stream,
            _oov_to_unk(vocab_size=trg_vocab_size, unk_id=unk_id))

    # Merge them to get a source, target pair
    stream = Merge([src_data_stream, trg_data_stream], ('source', 'target'))

    # Filter sequences that are too long (either source or target)
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length(target_source_index=1)))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    stream = _PaddingWithToken(stream, eos_id)

    # Attach one-hot ground truth data stream
    stream = Mapping(stream,
                     _to_one_hot(target_source_index=2,
                                 vocabulary_size=trg_vocab_size),
                     add_sources=("one_hot_ground_truth", ))

    return stream
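A hedged usage sketch of get_tr_stream: the corpus file names and the tiny vocabularies below are hypothetical, and the helpers referenced above (_oov_to_unk, _too_long, _length, _PaddingWithToken, _to_one_hot, to_lower_case) are assumed to be defined in the same module.

# Tiny hypothetical vocabularies (token -> id); real ones come from the corpus
src_vocab = {'</S>': 0, '<UNK>': 1, 'le': 2, 'chat': 3}
trg_vocab = {'</S>': 0, '<UNK>': 1, 'the': 2, 'cat': 3}

train_stream = get_tr_stream(src_vocab, trg_vocab,
                             src_files=['train.src'],  # hypothetical files
                             trg_files=['train.trg'],
                             src_vocab_size=len(src_vocab),
                             trg_vocab_size=len(trg_vocab),
                             batch_size=2,
                             sort_k_batches=1)

# Each batch carries source/target matrices, their masks and the one-hot targets
for batch in train_stream.get_epoch_iterator(as_dict=True):
    print(batch.keys())
    break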
Esempio n. 33
0
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
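A hedged sketch of a command-line wrapper around main(mode, save_path, num_batches, data_path); the flag names are hypothetical and not part of the original example.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Train, sample from, or beam-search the word reverser")
    parser.add_argument("mode", choices=["train", "sample", "beam_search"])
    parser.add_argument("save_path",
                        help="Path used to checkpoint / load the model")
    parser.add_argument("--num-batches", type=int, default=100000,
                        help="Stop training after this many batches")
    parser.add_argument("--data-path", default=None,
                        help="Optional text file to train on instead of "
                             "One Billion Word")
    args = parser.parse_args()
    main(args.mode, args.save_path, args.num_batches, args.data_path)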