def get_sgnmt_tr_stream(src_data, trg_data,
                        src_vocab_size=30000, trg_vocab_size=30000,
                        unk_id=1, seq_len=50, batch_size=80,
                        sort_k_batches=12, **kwargs):
    """Prepares the unshuffled training data stream. This corresponds
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def get_sgnmt_shuffled_tr_stream(src_data, trg_data,
                                 src_vocab_size=30000, trg_vocab_size=30000,
                                 unk_id=1, seq_len=50, batch_size=80,
                                 sort_k_batches=12, **kwargs):
    """Prepares the shuffled training data stream. This is similar to
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data,
                                        src_vocab, trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
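# Usage sketch (illustrative, not part of the original module): how the two
# stream builders above might be called from a training script. The data
# paths and the helper name are hypothetical placeholders; the keyword
# arguments are taken from the signatures above.
def _example_sgnmt_streams(src_path='train.ids.src', trg_path='train.ids.trg'):
    tr_stream = get_sgnmt_tr_stream(src_path, trg_path,
                                    src_vocab_size=30000, trg_vocab_size=30000,
                                    seq_len=50, batch_size=80, sort_k_batches=12)
    shuffled_stream = get_sgnmt_shuffled_tr_stream(src_path, trg_path,
                                                   batch_size=80)
    # Each batch holds padded 'source'/'target' index arrays plus their masks
    for batch in shuffled_stream.get_epoch_iterator():
        pass
    return tr_stream, shuffled_stream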
def setup_model_and_stream(exp_config, source_vocab, target_vocab):
    # TODO: this line is a mess
    sample_model, theano_sampling_source_input, theano_sampling_context_input, \
        train_encoder, train_decoder, generated = \
        get_sampling_model_and_input(exp_config)

    trg_vocab = target_vocab
    trg_vocab_size = exp_config['trg_vocab_size']
    src_vocab = source_vocab
    src_vocab_size = exp_config['src_vocab_size']

    theano_sample_func = sample_model.get_theano_function()
    sampling_func = SampleFunc(theano_sample_func, trg_vocab)

    # TODO: move stream creation to nn_imt.stream
    # def get_textfile_stream(source_file=None, src_vocab=None, src_vocab_size=30000,
    #                         unk_id=1, bos_token=None):
    src_stream = get_textfile_stream(source_file=exp_config['src_data'],
                                     src_vocab=exp_config['src_vocab'],
                                     src_vocab_size=exp_config['src_vocab_size'],
                                     unk_id=exp_config['unk_id'],
                                     bos_token='<S>')
    trg_stream = get_textfile_stream(source_file=exp_config['trg_data'],
                                     src_vocab=exp_config['trg_vocab'],
                                     src_vocab_size=exp_config['trg_vocab_size'],
                                     unk_id=exp_config['unk_id'],
                                     bos_token='<S>')

    # Merge the text file streams to get a (source, target) pair
    training_stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long (Note this may break)
    training_stream = Filter(training_stream,
                             predicate=_too_long(seq_len=exp_config['seq_len']))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    training_stream = Mapping(
        training_stream,
        _oov_to_unk(src_vocab_size=exp_config['src_vocab_size'],
                    trg_vocab_size=exp_config['trg_vocab_size'],
                    unk_id=exp_config['unk_id']))

    # Add in the prefix and suffix seqs
    # WORKING: add the sample ratio
    logger.info('Sample ratio is: {}'.format(exp_config.get('sample_ratio', 1.)))
    training_stream = Mapping(
        training_stream,
        PrefixSuffixStreamTransformer(
            sample_ratio=exp_config.get('sample_ratio', 1.)),
        add_sources=('target_prefix', 'target_suffix'))

    training_stream = Mapping(training_stream,
                              CopySourceAndTargetToMatchPrefixes(training_stream))

    # Changing stream.produces_examples is a little hack which lets us use Unpack to flatten
    training_stream.produces_examples = False
    # Flatten the stream back out into (source, target, target_prefix, target_suffix)
    training_stream = Unpack(training_stream)

    # METEOR
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

    # TODO: Implement smoothed BLEU
    # TODO: Implement first-word accuracy (bilingual language model)
    min_risk_score_func = exp_config.get('min_risk_score_func', 'bleu')

    if min_risk_score_func == 'meteor':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_meteor,
            num_samples=exp_config['n_samples'],
            trg_ivocab=trg_ivocab,
            lang=exp_config['target_lang'],
            meteor_directory=exp_config['meteor_directory'])
    elif min_risk_score_func == 'imt_f1':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_imt_f1,
            num_samples=exp_config['n_samples'])
    # BLEU is the default
    else:
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_bleu,
            num_samples=exp_config['n_samples'])

    training_stream = Mapping(training_stream, sampling_transformer,
                              add_sources=('samples', 'seq_probs', 'scores'))

    # Now filter out segments whose samples are too good or too bad
    training_stream = Filter(training_stream, predicate=filter_by_sample_score)

    # Now make a very big batch that we can shuffle
    # Build a batched version of stream to read k batches ahead
    shuffle_batch_size = exp_config['shuffle_batch_size']
    training_stream = Batch(training_stream,
                            iteration_scheme=ConstantScheme(shuffle_batch_size))

    training_stream = ShuffleBatchTransformer(training_stream)

    # Unpack it again
    training_stream = Unpack(training_stream)

    # Build a batched version of stream to read k batches ahead
    batch_size = exp_config['batch_size']
    sort_k_batches = exp_config['sort_k_batches']
    training_stream = Batch(
        training_stream,
        iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    training_stream = Mapping(training_stream, SortMapping(_length))

    # Convert it into a stream again
    training_stream = Unpack(training_stream)

    # Construct batches from the stream with specified batch size
    training_stream = Batch(training_stream,
                            iteration_scheme=ConstantScheme(batch_size))

    # IDEA: add a transformer which flattens the target samples before we add the mask
    flat_sample_stream = FlattenSamples(training_stream)

    expanded_source_stream = CopySourceAndPrefixNTimes(
        flat_sample_stream, n_samples=exp_config['n_samples'])

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    # Note: we shouldn't need to pad the seq_probs because there is only one per sequence
    # TODO: DEVELOPMENT HACK
    exp_config['suffix_length'] = 1
    exp_config['truncate_sources'] = ['target_suffix']
    configurable_padding_args = {
        'suffix_length': exp_config.get('suffix_length', None),
        'truncate_sources': exp_config.get('truncate_sources', [])
    }

    import ipdb
    ipdb.set_trace()

    masked_stream = PaddingWithEOS(
        expanded_source_stream,
        [src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1,
         trg_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target', 'target_prefix', 'target_suffix', 'samples'),
        **configurable_padding_args)

    return (train_encoder, train_decoder, theano_sampling_source_input,
            theano_sampling_context_input, generated, masked_stream)
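# Illustrative sketch (not from the original module): the exp_config keys read
# by setup_model_and_stream above, gathered in one place. The values are
# hypothetical placeholders; only the key names are taken from the code.
_example_exp_config = {
    'src_data': 'train.src', 'trg_data': 'train.trg',
    'src_vocab': 'vocab.src.pkl', 'trg_vocab': 'vocab.trg.pkl',
    'src_vocab_size': 30000, 'trg_vocab_size': 30000,
    'unk_id': 1, 'seq_len': 50,
    'sample_ratio': 1.0,
    'min_risk_score_func': 'meteor',   # 'bleu' (default), 'meteor', or 'imt_f1'
    'n_samples': 5,
    'target_lang': 'de',               # only needed for METEOR
    'meteor_directory': '/path/to/meteor',
    'shuffle_batch_size': 5000,
    'batch_size': 80,
    'sort_k_batches': 12,
}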
sampling_func = SampleFunc(theano_sample_func, trg_vocab)

src_stream = get_textfile_stream(source_file=exp_config['src_data'],
                                 src_vocab=exp_config['src_vocab'],
                                 src_vocab_size=exp_config['src_vocab_size'])
trg_stream = get_textfile_stream(source_file=exp_config['trg_data'],
                                 src_vocab=exp_config['trg_vocab'],
                                 src_vocab_size=exp_config['trg_vocab_size'])

# Merge them to get a source, target pair
training_stream = Merge([src_stream, trg_stream], ('source', 'target'))

# Filter sequences that are too long
training_stream = Filter(training_stream,
                         predicate=_too_long(seq_len=exp_config['seq_len']))

# TODO: configure min-risk score func from the yaml config
# BLEU
# sampling_transformer = MTSampleStreamTransformer(sampling_func,
#                                                  sentence_level_bleu,
#                                                  num_samples=exp_config['n_samples'])

# METEOR
trg_ivocab = {v: k for k, v in trg_vocab.items()}

# WORKING: pass kwargs through the SampleStreamTransformer to the scoring function
sampling_transformer = MTSampleStreamTransformer(
    sampling_func,
    sentence_level_meteor,
def get_tr_stream_with_context_features(src_vocab, trg_vocab, src_data, trg_data,
                                        context_features,
                                        src_vocab_size=30000, trg_vocab_size=30000,
                                        unk_id=1, seq_len=50, batch_size=80,
                                        sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    # Now add the source with the image features
    # Create the image datastream (iterate over a file line-by-line)
    train_features = _get_np_array(context_features)
    train_feature_dataset = IterableDataset(train_features)
    train_image_stream = DataStream(train_feature_dataset)

    stream = Merge([stream, train_image_stream],
                   ('source', 'target', 'initial_context'))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target'))

    return masked_stream, src_vocab, trg_vocab
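# Assumption sketch (not from the original module): _get_np_array above reads
# numpy.load(filename)['arr_0'], which matches an .npz file written by
# numpy.savez with a single positional argument. The path and feature
# dimensionality below are hypothetical placeholders.
import numpy

def _example_write_context_features(path='train.context_features.npz',
                                    n_examples=1000, feature_dim=2048):
    features = numpy.zeros((n_examples, feature_dim), dtype='float32')
    numpy.savez(path, features)  # positional arg is stored under key 'arr_0'
    return path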
def get_tr_stream_with_prefixes(src_vocab, trg_vocab, src_data, trg_data,
                                src_vocab_size=30000, trg_vocab_size=30000,
                                unk_id=1, seq_len=50, batch_size=80,
                                sort_k_batches=12, **kwargs):
    """Prepares the IMT training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # TODO: should training stream actually have begin and end tokens?
    # Note: this actually depends upon how the system was pre-trained, but
    # Note: systems used for initialization should _always_ have BOS tokens

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab,
                           bos_token='<S>', eos_token='</S>', unk_token='<UNK>')
    trg_dataset = TextFile([trg_data], trg_vocab,
                           bos_token='<S>', eos_token='</S>', unk_token='<UNK>')

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    stream = Mapping(stream,
                     PrefixSuffixStreamTransformer(
                         sample_ratio=kwargs.get('train_sample_ratio', 1.)),
                     add_sources=('target_prefix', 'target_suffix'))

    stream = Mapping(stream, CopySourceAndTargetToMatchPrefixes(stream))

    # Changing stream.produces_examples is a little hack which lets us use Unpack to flatten
    stream.produces_examples = False
    # Flatten the stream back out into (source, target, target_prefix, target_suffix)
    stream = Unpack(stream)

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs['shuffle_batch_size']
    stream = Batch(stream, iteration_scheme=ConstantScheme(shuffle_batch_size))

    stream = ShuffleBatchTransformer(stream)

    # Unpack it again
    stream = Unpack(stream)

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    configurable_padding_args = {
        'suffix_length': kwargs.get('suffix_length', None),
        'truncate_sources': kwargs.get('truncate_sources', [])
    }
    logger.info('Training suffix length is: {}'.format(
        configurable_padding_args['suffix_length']))
    logger.info('I will mask the following sources after <suffix_length>: {}'.format(
        configurable_padding_args['truncate_sources']))

    masked_stream = PaddingWithEOS(
        stream,
        [src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1,
         trg_vocab_size - 1],
        mask_sources=('source', 'target', 'target_prefix', 'target_suffix'),
        **configurable_padding_args)

    return masked_stream, src_vocab, trg_vocab
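# Usage sketch (illustrative, not from the original module): calling the IMT
# stream builder above. Paths and values are hypothetical placeholders; the
# keyword names come from the signature and kwargs lookups in the code above.
def _example_prefix_stream():
    masked_stream, src_vocab, trg_vocab = get_tr_stream_with_prefixes(
        'vocab.src.pkl', 'vocab.trg.pkl', 'train.src', 'train.trg',
        src_vocab_size=30000, trg_vocab_size=30000,
        unk_id=1, seq_len=50, batch_size=80, sort_k_batches=12,
        train_sample_ratio=1.0,      # read via kwargs.get('train_sample_ratio', 1.)
        shuffle_batch_size=5000,     # required: kwargs['shuffle_batch_size']
        suffix_length=None,
        truncate_sources=[])
    return masked_stream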