def test_filter_examples(self):
    data = [1, 2, 3]
    data_filtered = [1, 3]
    stream = DataStream(IterableDataset(data))
    wrapper = Filter(stream, lambda d: d[0] % 2 == 1)
    assert_equal(list(wrapper.get_epoch_iterator()),
                 list(zip(data_filtered)))
def test_filter_batches(self):
    data = [1, 2, 3, 4]
    data_filtered = [([3, 4],)]
    stream = DataStream(IndexableDataset(data),
                        iteration_scheme=SequentialScheme(4, 2))
    wrapper = Filter(stream, lambda d: d[0][0] % 3 == 0)
    assert_equal(list(wrapper.get_epoch_iterator()), data_filtered)
def _get_align_stream(src_data, trg_data, src_vocab_size, trg_vocab_size,
                      seq_len, **kwargs):
    """Creates the stream which is used for the main loop.

    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT model
        trg_vocab_size (int): Size of the target vocabulary in the NMT model
        seq_len (int): Maximum length of any source or target sentence

    Returns:
        ExplicitNext. Alignment data stream which can be iterated explicitly
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))
    s = Batch(s, iteration_scheme=ConstantScheme(1))
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(masked_stream)
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):

    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')

    # Merge
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter -- TODO
    stream = Filter(stream,
                    predicate=_too_long(seq_len=configuration['seq_len']))

    # Map - no need

    # Batch - Sort
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       configuration['batch_size'] *
                       configuration['sort_k_batches']))
    stream = Mapping(stream, SortMapping(_length))
    stream = Unpack(stream)
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(configuration['batch_size']))

    # Pad
    # Note that </s>=0. Fuel only allows padding 0 by default
    masked_stream = Padding(stream)

    return masked_stream
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000,
                  unk_id=0, eos_id=1, bos_id=2, train_noise=0,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):

    src_stream = get_stream(src_vocab, src_data, src_vocab_size,
                            unk_id, eos_id, bos_id, train_noise)
    trg_stream = get_stream(trg_vocab, trg_data, trg_vocab_size,
                            unk_id, eos_id, bos_id, 0)

    # Merge them to get a source, target pair
    stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_not_too_long(seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    return PaddingWithEOS(stream, [eos_id, eos_id])
def test_axis_labels_are_passed_through(self):
    stream = DataStream(
        IndexableDataset(
            {'features': [1, 2, 3, 4]},
            axis_labels={'features': ('batch',)}),
        iteration_scheme=SequentialScheme(4, 2))
    wrapper = Filter(stream, lambda d: d[0][0] % 3 == 0)
    assert_equal(wrapper.axis_labels, stream.axis_labels)
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
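Several of the snippets in this collection call helper callables such as `_too_long`, `_oov_to_unk` and `_length` without showing them. The sketch below gives typical definitions consistent with how they are called here (a filter predicate that keeps pairs which fit, a mapping that replaces out-of-vocabulary ids, and a sort key on the target length); the exact class and argument names are assumptions, since the originals live in stream modules that are not part of this excerpt.

# Hedged sketch of the helper callables assumed by the stream code above.
class _too_long(object):
    """Filter predicate: keep a (source, target) pair only if both fit."""
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all(len(sentence) <= self.seq_len
                   for sentence in sentence_pair)


class _oov_to_unk(object):
    """Mapping: replace word ids outside the vocabulary with the unk id."""
    def __init__(self, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1):
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_pair):
        return ([w if w < self.src_vocab_size else self.unk_id
                 for w in sentence_pair[0]],
                [w if w < self.trg_vocab_size else self.unk_id
                 for w in sentence_pair[1]])


def _length(sentence_pair):
    """Sort key used with SortMapping: length of the target sentence."""
    return len(sentence_pair[-1])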
def get_sgnmt_tr_stream(src_data, trg_data,
                        src_vocab_size=30000, trg_vocab_size=30000,
                        unk_id=1, seq_len=50, batch_size=80,
                        sort_k_batches=12, **kwargs):
    """Prepares the unshuffled training data stream. This corresponds
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    dataset = self.get_dataset(part, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples

    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)

    stream = DataStream(
        dataset, iteration_scheme=iteration_scheme)

    stream = FilterSources(stream, (self.recordings_source,
                                    self.labels_source) + tuple(add_sources))
    if self.add_eos:
        stream = Mapping(stream, _AddLabel(self.eos_label))
    if self.add_bos:
        stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                           times=self.add_bos))
    if self.preprocess_text:
        stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
    stream = Filter(stream, self.length_filter)
    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)

    if self.preprocess_features == 'log_spectrogram':
        stream = Mapping(
            stream, functools.partial(apply_preprocessing, log_spectrogram))
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    if not batches:
        return stream

    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                        else self.validation_batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def build_stream(dataset, n_grams, batch_size, times=None):
    example_stream = dataset.get_example_stream()
    example_stream = Filter(example_stream, reject_repeated_words)
    n_gram_stream = NGrams(n_grams, example_stream)

    batch_stream = Batch(n_gram_stream,
                         ConstantScheme(batch_size, times=times),
                         strictness=1)

    def reshape(batch):
        return batch[0].astype("int32"), batch[1][:, None].astype("int32")

    return Mapping(batch_stream, reshape)
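`reject_repeated_words` is referenced but not defined in this snippet. One plausible definition, consistent with its use as a Filter predicate over single-source examples, is sketched below; whether repeats are checked only in adjacent positions is an assumption.

# Hypothetical predicate for the Filter call above; the real definition is
# not shown in this excerpt.
def reject_repeated_words(example):
    sentence = example[0]
    # Keep the sentence only if no word is immediately repeated.
    return all(w1 != w2 for w1, w2 in zip(sentence, sentence[1:]))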
def get_sgnmt_shuffled_tr_stream(src_data, trg_data,
                                 src_vocab_size=30000, trg_vocab_size=30000,
                                 unk_id=1, seq_len=50, batch_size=80,
                                 sort_k_batches=12, **kwargs):
    """Prepares the shuffled training data stream. This is similar to
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data,
                                        src_vocab, trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def train(self, req_vars):
    stream = TaxiDataset('train', data.traintest_ds)

    if hasattr(self.config, 'use_cuts_for_training') \
            and self.config.use_cuts_for_training:
        stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
    else:
        stream = DataStream(stream,
                            iteration_scheme=ShuffledExampleScheme(
                                stream.num_examples))

    if not data.tvt:
        valid = TaxiDataset(data.valid_set, data.valid_ds,
                            sources=('trip_id',))
        valid_trips_ids = valid.get_data(
            None, slice(0, valid.num_examples))[0]
        stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)

    if hasattr(self.config, 'max_splits'):
        stream = transformers.TaxiGenerateSplits(
            stream, max_splits=self.config.max_splits)
    elif not data.tvt:
        stream = transformers.add_destination(stream)

    if hasattr(self.config, 'train_max_len'):
        idx = stream.sources.index('latitude')

        def max_len_filter(x):
            return len(x[idx]) <= self.config.train_max_len

        stream = Filter(stream, max_len_filter)

    stream = transformers.TaxiExcludeEmptyTrips(stream)
    stream = transformers.taxi_add_datetime(stream)
    stream = transformers.Select(
        stream, tuple(v for v in req_vars if not v.endswith('_mask')))

    stream = transformers.balanced_batch(
        stream, key='latitude',
        batch_size=self.config.batch_size,
        batch_sort_size=self.config.batch_sort_size)

    stream = Padding(stream, mask_sources=['latitude', 'longitude'])
    stream = transformers.Select(stream, req_vars)
    stream = MultiProcessing(stream)

    return stream
def get_dev_stream_with_grdTruth(val_set_source=None, val_set_target=None,
                                 src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000,
                                 batch_size=128, unk_id=1, seq_len=50,
                                 **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        print val_set_source, type(src_vocab)
        dev_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)

        # Merge them to get a source, target pair
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            trg_dataset.get_example_stream()],
                           ('dev_source', 'dev_target'))

        # Filter sequences that are too long
        stream = Filter(dev_stream, predicate=_too_long(seq_len=seq_len))

        # Replace out of vocabulary tokens with unk token
        stream = Mapping(stream,
                         _oov_to_unk(src_vocab_size=src_vocab_size,
                                     trg_vocab_size=trg_vocab_size,
                                     unk_id=unk_id))

        # Batch the dev stream (batch size 1, no read-ahead sorting)
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))

        # Pad sequences that are short
        masked_stream = PaddingWithEOS(
            stream, [src_vocab_size - 1, trg_vocab_size - 1])
        return masked_stream
def load_parallel_data(src_file, tgt_file, batch_size, sort_k_batches,
                       dictionary, training=False):
    def preproc(s):
        s = s.replace('``', '"')
        s = s.replace('\'\'', '"')
        return s

    enc_dset = TextFile(files=[src_file], dictionary=dictionary,
                        bos_token=None, eos_token=None,
                        unk_token=CHAR_UNK_TOK, level='character',
                        preprocess=preproc)
    dec_dset = TextFile(files=[tgt_file], dictionary=dictionary,
                        bos_token=CHAR_SOS_TOK, eos_token=CHAR_EOS_TOK,
                        unk_token=CHAR_UNK_TOK, level='character',
                        preprocess=preproc)

    # NOTE merge encoder and decoder setup together
    stream = Merge([enc_dset.get_example_stream(),
                    dec_dset.get_example_stream()],
                   ('source', 'target'))
    if training:
        # filter sequences that are too long
        stream = Filter(stream, predicate=TooLong(seq_len=CHAR_MAX_SEQ_LEN))
        # batch and read k batches ahead
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           batch_size * sort_k_batches))
        # sort all samples in read-ahead batch
        stream = Mapping(stream, SortMapping(lambda x: len(x[1])))
        # turn back into stream
        stream = Unpack(stream)

    # batch again
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    masked_stream = Padding(stream)
    return masked_stream
def get_sentence_stream(which_set, which_partitions, vocabulary):
    """Return an iterator over sentences

    Notes
    -----
    This reads the text files sequentially. However, note that the files
    are already shuffled.

    """
    # Construct data stream
    logger.info('Constructing data stream')
    dataset = OneBillionWord(which_set, which_partitions, vocabulary)
    data_stream = dataset.get_example_stream()

    # Get rid of long sentences that don't fit
    data_stream = Filter(data_stream, _filter_long)

    # Creates the dataset "targets"
    data_stream = Mapping(data_stream, _shift_words, add_sources=("targets",))

    return data_stream
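`_filter_long` and `_shift_words` are defined elsewhere in the original file; the sketch below shows typical shapes for them, where the 100-token cut-off and the exact shifting convention are assumptions made only for illustration.

# Assumed helpers for the snippet above; cut-off and shift direction are
# illustrative only.
def _filter_long(example):
    # Keep sentences short enough to fit the model's maximum length.
    return len(example[0]) <= 100


def _shift_words(example):
    # Build the "targets" source as the input sentence shifted by one token.
    sentence = example[0]
    return (sentence[1:],)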
def get_stream(self, part, batches=True, shuffle=True, add_sources=()):
    dataset = self.get_dataset(part, add_sources=add_sources)
    stream = (DataStream(dataset,
                         iteration_scheme=ShuffledExampleScheme(
                             dataset.num_examples))
              if shuffle
              else dataset.get_example_stream())

    stream = FilterSources(stream, (self.recordings_source,
                                    self.labels_source) + tuple(add_sources))
    if self.add_eos:
        if self.prepend_eos:
            stream = Mapping(stream, _AddEosLabelBeginEnd(self.eos_label))
        else:
            stream = Mapping(stream, _AddEosLabelEnd(self.eos_label))
    if self.preprocess_text:
        stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
    stream = Filter(stream, self.length_filter)
    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)

    if self.preprocess_features == 'log_spectrogram':
        stream = Mapping(
            stream, functools.partial(apply_preprocessing, log_spectrogram))
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    if not batches:
        return stream

    stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def _get_sgnmt_tr_stream(data_stream, src_vocab_size=30000,
                         trg_vocab_size=30000, seq_len=50,
                         batch_size=80, sort_k_batches=12,
                         src_sparse_feat_map='', trg_sparse_feat_map='',
                         **kwargs):
    """Prepares the raw text file stream ``data_stream`` for the Blocks
    main loop. This includes handling UNKs, splitting into batches, sorting
    locally by sequence length, and masking. This roughly corresponds to
    ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the blocks
    examples. The arguments to this method are given by the configuration
    dict.
    """

    # Filter sequences that are too long
    s = Filter(data_stream, predicate=stream._too_long(seq_len=seq_len))

    # Replacing out of vocabulary tokens with unk token is already
    # handled in the `DataSet`s

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
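Because every variant above relies on project-specific vocabularies and corpora, here is a minimal, self-contained sketch of the recurring Filter / read-ahead Batch / sort / Unpack / Batch / Padding idiom on toy integer sequences, using only Fuel's public API. The toy data, the length cut-off of 4 and the batch size of 2 are illustrative choices, not values taken from any snippet in this collection.

from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import (Batch, Filter, Mapping, Padding, SortMapping,
                               Unpack)

# Toy "sentences" of token ids; the default source name is 'data'.
toy_sentences = [[1, 2, 3], [4, 5], [6], [7, 8, 9, 10], [11, 12, 13, 14, 15]]
stream = DataStream(IterableDataset(toy_sentences))

# Drop examples longer than four tokens; the predicate sees the example tuple.
stream = Filter(stream, predicate=lambda example: len(example[0]) <= 4)
# Read two mini-batches worth of examples ahead ...
stream = Batch(stream, iteration_scheme=ConstantScheme(2 * 2))
# ... sort the read-ahead chunk by length so similar lengths land together ...
stream = Mapping(stream, SortMapping(lambda example: len(example[0])))
# ... and turn it back into single examples before re-batching.
stream = Unpack(stream)
stream = Batch(stream, iteration_scheme=ConstantScheme(2))
# Zero-pad each batch to its longest sequence and add a 'data_mask' source.
stream = Padding(stream)

for batch in stream.get_epoch_iterator(as_dict=True):
    print(batch['data'])
    print(batch['data_mask'])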
def get_tr_stream(path, src_eos_idx, phones_sil, tgt_eos_idx, seq_len=50,
                  batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    sources = ('words', 'audio', 'words_ends', 'punctuation_marks',
               'phones', 'phones_words_ends', 'phones_words_acoustic_ends')
    #sources = ('words', 'audio', 'words_ends', 'punctuation_marks',
    #           'phones', 'phones_words_ends')
    dataset = H5PYDataset(path, which_sets=('train',), sources=sources,
                          load_in_memory=False)
    print "creating example stream"
    stream = dataset.get_example_stream()
    print "example stream created"

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream, {
        'words': src_eos_idx,
        'phones': phones_sil,
        'punctuation_marks': tgt_eos_idx,
        'audio': 0,
        'words_ends': -1,
        'phones_words_ends': -1,
        'phones_words_acoustic_ends': -1,
    })

    return masked_stream
def load_data(src_file, tgt_file, batch_size, sort_k_batches, training=False):
    src_dict, tgt_dict = load_dictionaries()
    src_dset = TextFile(files=[src_file], dictionary=src_dict,
                        bos_token=None, eos_token=None,
                        unk_token=WORD_UNK_TOK)
    tgt_dset = TextFile(files=[tgt_file], dictionary=tgt_dict,
                        bos_token=WORD_EOS_TOK, eos_token=WORD_EOS_TOK,
                        unk_token=WORD_UNK_TOK)

    stream = Merge([src_dset.get_example_stream(),
                    tgt_dset.get_example_stream()],
                   ('source', 'target'))

    # filter sequences that are too long
    if training:
        stream = Filter(stream, predicate=TooLong(seq_len=WORD_MAX_SEQ_LEN))
        # batch and read k batches ahead
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           batch_size * sort_k_batches))
        # sort all samples in read-ahead batch
        stream = Mapping(stream, SortMapping(lambda x: len(x[1])))
        # turn back into stream
        stream = Unpack(stream)

    # batch again
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # NOTE pads with zeros so eos_idx should be 0
    masked_stream = Padding(stream)

    return masked_stream, src_dict, tgt_dict
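`TooLong`, used here and in `load_parallel_data` above, is not shown in this excerpt. A plausible definition matching its use as a Filter predicate over (source, target) pairs follows; note that, as with `_too_long` earlier, the call sites keep the name even though the predicate must return True for pairs that are short enough to keep.

# Assumed predicate class for the training-time Filter calls above.
class TooLong(object):
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        # Keep the pair only if both source and target fit the limit.
        return all(len(sentence) <= self.seq_len
                   for sentence in sentence_pair)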
def get_one_stream(self, part, lang=None, batches=True, shuffle=True,
                   add_sources=(), num_examples=None, rng=None, seed=None,
                   num_result=None, soften_distributions=None,
                   only_stream=False):
    assert lang in self.langs
    dataset = self.get_dataset(part, lang, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples

    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)

    if num_result is None:
        num_result = num_examples

    if lang != self.langs[0] and not only_stream:
        iteration_scheme = RandomExampleScheme(num_examples,
                                               num_result=num_result, rng=rng)

    stream = DataStream(
        dataset, iteration_scheme=iteration_scheme)

    if soften_distributions:
        stream = Mapping(stream, SoftenResult(self.default_sources,
                                              soften_distributions))

    for bconv in self._binary_convertable_data:
        if bconv in self.default_sources:
            stream = Mapping(stream, ConvertToMask(self.default_sources,
                                                   bconv,
                                                   self.num_features(bconv)))

    if self.add_eos:
        stream = Mapping(stream, _AddLabel(
            self.eos_label,
            index=stream.sources.index(self.sources_map['labels'])))
    if self.add_bos:
        if self.bos_label is None:
            raise Exception('No bos label given')
        stream = Mapping(stream, _AddLabel(
            self.bos_label, append=False, times=self.add_bos,
            index=stream.sources.index(self.sources_map['labels'])))

    if self.max_length:
        stream = Filter(stream, self.length_filter)

    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        #
        # Hardcode 0 for source on which to sort. This will be good, as
        # most source lengths are correlated and, furthermore, the
        # labels will typically be the last source, thus in a single-input
        # case this sorts on input lengths
        #
        stream = Mapping(stream, SortMapping(_Length(index=0)))
        stream = Unpack(stream)

    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    stream = Rename(stream,
                    names=dict_subset({v: k for (k, v)
                                       in self.sources_map.items()},
                                      stream.sources, must_have=False))
    if not batches:
        return stream, num_examples

    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                        else self.validation_batch_size))
    stream._produces_examples = False
    return stream, num_examples
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    dataset = self.get_dataset(part, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples

    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)

    stream = DataStream(
        dataset, iteration_scheme=iteration_scheme)

    if self.add_eos:
        stream = Mapping(stream, _AddLabel(
            self.eos_label,
            index=stream.sources.index(self.sources_map['labels'])))
    if self.add_bos:
        if self.bos_label is None:
            raise Exception('No bos label given')
        stream = Mapping(stream, _AddLabel(
            self.bos_label, append=False, times=self.add_bos,
            index=stream.sources.index(self.sources_map['labels'])))

    if self.max_length:
        stream = Filter(stream, self.length_filter)

    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        #
        # Hardcode 0 for source on which to sort. This will be good, as
        # most source lengths are correlated and, furthermore, the
        # labels will typically be the last source, thus in a single-input
        # case this sorts on input lengths
        #
        stream = Mapping(stream, SortMapping(_Length(index=0)))
        stream = Unpack(stream)

    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    stream = Rename(stream,
                    names=dict_subset({v: k for (k, v)
                                       in self.sources_map.items()},
                                      stream.sources, must_have=False))
    if not batches:
        return stream

    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                        else self.validation_batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    stream._produces_examples = False
    return stream
def test_filter():
    data = [1, 2, 3]
    data_filtered = [1, 3]
    stream = DataStream(IterableDataset(data))
    wrapper = Filter(stream, lambda d: d[0] % 2 == 1)
    assert list(wrapper.get_epoch_iterator()) == list(zip(data_filtered))
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    dataset = self.get_dataset(part, add_sources=add_sources)

    iteration_scheme = None
    if self.use_iteration_scheme:
        if num_examples is None:
            num_examples = dataset.num_examples
        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

    stream = DataStream(dataset, iteration_scheme=iteration_scheme)

    # Transformations before rearrangement
    labels_source = self.sources_map['labels']
    if self.add_eos:
        stream = _AddLabel(stream, self.eos_label,
                           which_sources=[labels_source])
    if self.add_bos:
        if self.bos_label is None:
            raise Exception('No bos label given')
        stream = _AddLabel(stream, self.bos_label,
                           append=False, times=self.add_bos,
                           which_sources=[labels_source])
    if self.clip_length:
        stream = _Clip(stream, self.clip_length,
                       force_eos=self.eos_label
                       if self.force_eos_when_clipping else None,
                       which_sources=[labels_source])

    # More efficient packing of examples in batches
    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_Length(index=0)))
        stream = Unpack(stream)

    stream = Rearrange(
        stream, dict_subset(self.sources_map,
                            self.default_sources + list(add_sources)))

    # Transformations after rearrangement
    if self.corrupt_sources:
        # Can only corrupt sources with the same alphabet
        # as labels
        for source, prob in zip(self.corrupt_sources['names'],
                                self.corrupt_sources['probs']):
            stream = _Corrupt(stream, prob,
                              self.token_map(source), self.eos_label,
                              which_sources=[source])

    if self.max_length and part == 'train':
        # Filtering by the maximum length is only done
        # for the training set.
        self.length_filter = _LengthFilter(
            indices=[i for i, source in enumerate(stream.sources)
                     if source in self.filter_by],
            max_length=self.max_length)
        stream = Filter(stream, self.length_filter)

    stream = ForceFloatX(stream)

    if not batches:
        return stream

    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                        else self.validation_batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def setup_model_and_stream(exp_config, source_vocab, target_vocab):
    # TODO: this line is a mess
    sample_model, theano_sampling_source_input, \
        theano_sampling_context_input, train_encoder, \
        train_decoder, generated = \
        get_sampling_model_and_input(exp_config)

    trg_vocab = target_vocab
    trg_vocab_size = exp_config['trg_vocab_size']
    src_vocab = source_vocab
    src_vocab_size = exp_config['src_vocab_size']

    theano_sample_func = sample_model.get_theano_function()
    sampling_func = SampleFunc(theano_sample_func, trg_vocab)

    # TODO: move stream creation to nn_imt.stream
    # def get_textfile_stream(source_file=None, src_vocab=None,
    #                         src_vocab_size=30000, unk_id=1, bos_token=None):
    src_stream = get_textfile_stream(
        source_file=exp_config['src_data'],
        src_vocab=exp_config['src_vocab'],
        src_vocab_size=exp_config['src_vocab_size'],
        unk_id=exp_config['unk_id'],
        bos_token='<S>')
    trg_stream = get_textfile_stream(
        source_file=exp_config['trg_data'],
        src_vocab=exp_config['trg_vocab'],
        src_vocab_size=exp_config['trg_vocab_size'],
        unk_id=exp_config['unk_id'],
        bos_token='<S>')

    # text file stream
    training_stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long (Note this may break)
    training_stream = Filter(
        training_stream, predicate=_too_long(seq_len=exp_config['seq_len']))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    training_stream = Mapping(
        training_stream,
        _oov_to_unk(src_vocab_size=exp_config['src_vocab_size'],
                    trg_vocab_size=exp_config['trg_vocab_size'],
                    unk_id=exp_config['unk_id']))

    # add in the prefix and suffix seqs
    # working: add the sample ratio
    logger.info('Sample ratio is: {}'.format(
        exp_config.get('sample_ratio', 1.)))
    training_stream = Mapping(
        training_stream,
        PrefixSuffixStreamTransformer(
            sample_ratio=exp_config.get('sample_ratio', 1.)),
        add_sources=('target_prefix', 'target_suffix'))

    training_stream = Mapping(
        training_stream, CopySourceAndTargetToMatchPrefixes(training_stream))

    # changing stream.produces_examples is a little hack which lets us use
    # Unpack to flatten
    training_stream.produces_examples = False
    # flatten the stream back out into
    # (source, target, target_prefix, target_suffix)
    training_stream = Unpack(training_stream)

    # METEOR
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

    # TODO: Implement smoothed BLEU
    # TODO: Implement first-word accuracy (bilingual language model)
    min_risk_score_func = exp_config.get('min_risk_score_func', 'bleu')

    if min_risk_score_func == 'meteor':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_meteor,
            num_samples=exp_config['n_samples'],
            trg_ivocab=trg_ivocab,
            lang=exp_config['target_lang'],
            meteor_directory=exp_config['meteor_directory'])
    elif min_risk_score_func == 'imt_f1':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_imt_f1,
            num_samples=exp_config['n_samples'])
    # BLEU is default
    else:
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_bleu,
            num_samples=exp_config['n_samples'])

    training_stream = Mapping(training_stream, sampling_transformer,
                              add_sources=('samples', 'seq_probs', 'scores'))

    # now filter out segments whose samples are too good or too bad
    training_stream = Filter(training_stream,
                             predicate=filter_by_sample_score)

    # Now make a very big batch that we can shuffle
    # Build a batched version of stream to read k batches ahead
    shuffle_batch_size = exp_config['shuffle_batch_size']
    training_stream = Batch(
        training_stream,
        iteration_scheme=ConstantScheme(shuffle_batch_size))
    training_stream = ShuffleBatchTransformer(training_stream)

    # unpack it again
    training_stream = Unpack(training_stream)

    # Build a batched version of stream to read k batches ahead
    batch_size = exp_config['batch_size']
    sort_k_batches = exp_config['sort_k_batches']
    training_stream = Batch(
        training_stream,
        iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    training_stream = Mapping(training_stream, SortMapping(_length))

    # Convert it into a stream again
    training_stream = Unpack(training_stream)

    # Construct batches from the stream with specified batch size
    training_stream = Batch(
        training_stream, iteration_scheme=ConstantScheme(batch_size))

    # IDEA: add a transformer which flattens the target samples before we
    # add the mask
    flat_sample_stream = FlattenSamples(training_stream)

    expanded_source_stream = CopySourceAndPrefixNTimes(
        flat_sample_stream, n_samples=exp_config['n_samples'])

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the
    # target_suffix?
    # Note: we shouldn't need to pad the seq_probs because there is only one
    # per sequence
    # TODO: DEVELOPMENT HACK
    exp_config['suffix_length'] = 1
    exp_config['truncate_sources'] = ['target_suffix']
    configurable_padding_args = {
        'suffix_length': exp_config.get('suffix_length', None),
        'truncate_sources': exp_config.get('truncate_sources', [])
    }
    import ipdb
    ipdb.set_trace()
    masked_stream = PaddingWithEOS(
        expanded_source_stream,
        [src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1,
         trg_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target', 'target_prefix', 'target_suffix',
                      'samples'),
        **configurable_padding_args)

    return (train_encoder, train_decoder, theano_sampling_source_input,
            theano_sampling_context_input, generated, masked_stream)
        '/home/andrewsm/SEER/external/CoNLL2003/ner/eng.testb',
    ]  # 748Kb file
else:
    data_paths = [
        '/home/andrewsm/SEER/external/CoNLL2003/ner/eng.train',
    ]  # 3.3Mb file
    ## Achieved result: 50-epochs (GPU) training on eng.train => testb overall scores:
    ##   accuracy: 96.42%; precision: 76.95%; recall: 80.26%; FB1: 78.57

dataset = CoNLLTextFile(data_paths, dictionary=word2code,
                        unknown_token='<UNK>')

data_stream = DataStream(dataset)
data_stream = Filter(data_stream, _filter_long)
#data_stream = Mapping(data_stream, reverse_words, add_sources=("targets",))

data_stream = Batch(data_stream,
                    iteration_scheme=ConstantScheme(mini_batch_size))
#data_stream = Padding(data_stream, mask_sources=('tokens'))  # Adds a mask field to this stream field, type='floatX'
data_stream = Padding(data_stream)  # Adds a mask field to all of this stream's fields, type='floatX'
data_stream = Mapping(data_stream, _transpose)  # Flips stream so that sentences run down columns, batches along rows (strangely)

if False:  # print sample for debugging Dataset / DataStream component
    #t=0
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12,
                  bos_token=None, **kwargs):
    """Prepares the training data stream."""
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab,
                           bos_token=bos_token,
                           eos_token=u'</S>',
                           unk_token=u'<UNK>',
                           encoding='utf8')
    trg_dataset = TextFile([trg_data], trg_vocab,
                           bos_token=bos_token,
                           eos_token=u'</S>',
                           unk_token=u'<UNK>',
                           encoding='utf8')

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs.get('shuffle_batch_size', 1000)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(shuffle_batch_size))
    stream = ShuffleBatchTransformer(stream)

    # unpack it again
    stream = Unpack(stream)

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream,
                                   [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream, src_vocab, trg_vocab
def get_src_trg_stream(cg, config, src_datasets=None, trg_datasets=None,
                       is_training=True, src_vocabs=None, trg_vocabs=None,
                       logprob_datasets=None):
    eid, did = p_(cg)
    if is_training:
        logger.info(' ... src:[{}] - [{}]'.format(
            eid, src_datasets[cg].files[0]))
        logger.info(' ... trg:[{}] - [{}]'.format(
            did, trg_datasets[cg].files[0]))
        stream = Merge([src_datasets[cg].get_example_stream(),
                        trg_datasets[cg].get_example_stream()],
                       ('source', 'target'))
        stream = Filter(stream, predicate=_too_long(config['seq_len']))

        if 'min_seq_lens' in config and config['min_seq_lens'][cg] > 0:
            stream = Filter(stream,
                            predicate=_too_short(config['min_seq_lens'][cg]))

        stream = Mapping(stream, _oov_to_unk(
            src_vocab_size=config['src_vocab_sizes'][eid],
            trg_vocab_size=config['trg_vocab_sizes'][did],
            unk_id=config['unk_id']))
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           config['batch_sizes'][cg] *
                           config['sort_k_batches']))

        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           config['batch_sizes'][cg]))
    else:  # logprob stream
        src_dataset = TextFile([logprob_datasets[cg][0]],
                               src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([logprob_datasets[cg][1]],
                               trg_vocabs[p_(cg)[1]], None)
        stream = Merge([src_dataset.get_example_stream(),
                        trg_dataset.get_example_stream()],
                       ('source', 'target'))
        stream = Mapping(stream, _oov_to_unk(
            src_vocab_size=config['src_vocab_sizes'][eid],
            trg_vocab_size=config['trg_vocab_sizes'][did],
            unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']
        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream, _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                                      (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
def get_tr_stream_with_topic_target(src_vocab, trg_vocab, topic_vocab_input,
                                    topic_vocab_output, src_data, trg_data,
                                    topical_data, src_vocab_size=30000,
                                    trg_vocab_size=30000,
                                    trg_topic_vocab_size=2000,
                                    source_topic_vocab_size=2000,
                                    unk_id=1, seq_len=50, batch_size=80,
                                    sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
    topic_vocab_input = cPickle.load(open(topic_vocab_input, 'rb'))
    # already has <UNK> and </S> in it
    topic_vocab_output = cPickle.load(open(topic_vocab_output, 'rb'))
    topic_binary_vocab = {}
    for k, v in topic_vocab_output.items():
        if k == '<UNK>':
            topic_binary_vocab[k] = 0
        else:
            topic_binary_vocab[k] = 1

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)
    src_topic_input = TextFile([topical_data], topic_vocab_input,
                               None, None, 'rt')
    trg_topic_dataset = TextFile([trg_data], topic_vocab_output, None)
    trg_topic_binary_dataset = TextFile([trg_data], topic_binary_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream(),
                    src_topic_input.get_example_stream(),
                    trg_topic_dataset.get_example_stream(),
                    trg_topic_binary_dataset.get_example_stream()],
                   ('source', 'target', 'source_topical',
                    'target_topic', 'target_binary_topic'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # (the topical sources are not covered by this, check)
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 src_topic_vocab_size=source_topic_vocab_size,
                                 trg_topic_vocab_size=trg_topic_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1,
                 source_topic_vocab_size - 1, trg_topic_vocab_size - 1,
                 trg_topic_vocab_size - 1])

    return masked_stream
def test_data_stream_filter():
    data = [1, 2, 3]
    data_filtered = [1, 3]
    stream = DataStream(IterableDataset(data))
    wrapper = Filter(stream, lambda d: d[0] % 2 == 1)
    assert list(wrapper.get_epoch_iterator()) == list(zip(data_filtered))
def get_tr_stream(src_vocab, trg_vocab, src_files, trg_files,
                  encoding='UTF-8', preprocess=to_lower_case,
                  src_vocab_size=30000, trg_vocab_size=30000,
                  eos='</S>', eos_id=0, unk='<UNK>', unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""
    src_dataset = TextFile(src_files, src_vocab, preprocess=preprocess,
                           bos_token=None, eos_token=eos, unk_token=unk,
                           encoding=encoding)
    trg_dataset = TextFile(trg_files, trg_vocab, preprocess=preprocess,
                           bos_token=None, eos_token=eos, unk_token=unk,
                           encoding=encoding)
    src_data_stream = DataStream(src_dataset)
    trg_data_stream = DataStream(trg_dataset)

    # Replace out of vocabulary tokens with unk token
    if src_vocab_size < len(src_vocab):
        src_data_stream = Mapping(
            src_data_stream,
            _oov_to_unk(vocab_size=src_vocab_size, unk_id=unk_id))
    if trg_vocab_size < len(trg_vocab):
        trg_data_stream = Mapping(
            trg_data_stream,
            _oov_to_unk(vocab_size=trg_vocab_size, unk_id=unk_id))

    # Merge them to get a source, target pair
    stream = Merge([src_data_stream, trg_data_stream], ('source', 'target'))

    # Filter sequences that are too long (either source or target)
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length(target_source_index=1)))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    stream = _PaddingWithToken(stream, eos_id)

    # Attach one-hot ground truth data stream
    stream = Mapping(stream,
                     _to_one_hot(target_source_index=2,
                                 vacabuary_size=trg_vocab_size),
                     add_sources=("one_hot_ground_truth",))

    return stream
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)