Example 1
def get_sgnmt_tr_stream(src_data,
                        trg_data,
                        src_vocab_size=30000,
                        trg_vocab_size=30000,
                        unk_id=1,
                        seq_len=50,
                        batch_size=80,
                        sort_k_batches=12,
                        **kwargs):
    """Prepares the unshuffled training data stream. This corresponds 
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
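
The function above only builds the pipeline lazily; nothing is read from disk until the stream is iterated. Below is a minimal usage sketch, assuming integer-encoded training files (the paths train.ids.src and train.ids.trg are hypothetical placeholders) and the standard Fuel iteration API:

# Minimal usage sketch; the data paths are hypothetical placeholders.
tr_stream = get_sgnmt_tr_stream(src_data='train.ids.src',
                                trg_data='train.ids.trg',
                                src_vocab_size=30000,
                                trg_vocab_size=30000,
                                seq_len=50,
                                batch_size=80)

for batch in tr_stream.get_epoch_iterator(as_dict=True):
    # PaddingWithEOS yields padded integer matrices plus 0/1 masks,
    # i.e. 'source', 'source_mask', 'target' and 'target_mask'
    print(batch['source'].shape, batch['target'].shape)
    break
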
Example 2
def get_sgnmt_shuffled_tr_stream(src_data,
                                 trg_data,
                                 src_vocab_size=30000,
                                 trg_vocab_size=30000,
                                 unk_id=1,
                                 seq_len=50,
                                 batch_size=80,
                                 sort_k_batches=12,
                                 **kwargs):
    """Prepares the shuffled training data stream. This is similar to 
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data, src_vocab,
                                        trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
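
Both functions rely on an add_special_ids helper that is not shown here. A minimal sketch of what it presumably does, assuming the same utils module used above and the Blocks-style token strings <S>, </S> and <UNK> (the exact token strings and the GO_ID constant are assumptions):

# Sketch only: map the Blocks/Fuel special tokens onto SGNMT's reserved ids
# so that TextFile accepts the dummy vocabularies built above.
def add_special_ids(vocab):
    vocab['<S>'] = utils.GO_ID     # assumed begin-of-sentence id in utils
    vocab['</S>'] = utils.EOS_ID   # end-of-sentence id used for padding above
    vocab['<UNK>'] = utils.UNK_ID  # unknown-word id used by _oov_to_unk above
    return vocab
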
Example 3
def get_sgnmt_tr_stream(src_data, trg_data,
                       src_vocab_size=30000, trg_vocab_size=30000,
                       unk_id=1, seq_len=50, batch_size=80, 
                       sort_k_batches=12, **kwargs):
    """Prepares the unshuffled training data stream. This corresponds 
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i) : i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i) : i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                               trg_vocab_size=trg_vocab_size,
                               unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
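
For reference, the helpers stream._too_long, stream._oov_to_unk and stream._length used throughout these examples follow the machine_translation/stream module of the blocks examples. The sketch below is consistent with how they are called here; the actual implementations may differ in detail:

class _too_long(object):
    # Predicate for Filter: keep a pair only if every sentence fits in seq_len
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all([len(sentence) <= self.seq_len
                    for sentence in sentence_pair])


class _oov_to_unk(object):
    # Mapping: replace word ids outside the vocabulary with the unk id
    def __init__(self, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1):
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_pair):
        return ([x if x < self.src_vocab_size else self.unk_id
                 for x in sentence_pair[0]],
                [x if x < self.trg_vocab_size else self.unk_id
                 for x in sentence_pair[1]])


def _length(sentence_pair):
    # Sort key for SortMapping: the length of the target (last) sentence
    return len(sentence_pair[-1])
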
Example 4
def get_sgnmt_shuffled_tr_stream(src_data, trg_data,
                                src_vocab_size=30000, trg_vocab_size=30000,
                                unk_id=1, seq_len=50, batch_size=80, 
                                sort_k_batches=12, **kwargs):
    """Prepares the shuffled training data stream. This is similar to 
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i) : i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i) : i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data,
                                        src_vocab, trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                               trg_vocab_size=trg_vocab_size,
                               unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
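
PaddingWithEOS pads every sequence in a batch up to the longest one using the given end-of-sequence id (here utils.EOS_ID for both sources) and attaches a 0/1 mask per source. A small standalone sketch of that padding behaviour for a single source, illustrative only and not the actual transformer:

import numpy

def pad_with_eos(sequences, eos_id):
    # Pad variable-length int sequences with eos_id and build the 0/1 mask
    # that a Padding-style transformer would attach as '<source>_mask'.
    max_len = max(len(seq) for seq in sequences)
    padded = numpy.full((len(sequences), max_len), eos_id, dtype='int64')
    mask = numpy.zeros((len(sequences), max_len), dtype='float32')
    for i, seq in enumerate(sequences):
        padded[i, :len(seq)] = seq
        mask[i, :len(seq)] = 1.
    return padded, mask

# e.g. pad_with_eos([[4, 7, 2], [9, 2]], eos_id=2)
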
Example 5
def setup_model_and_stream(exp_config, source_vocab, target_vocab):

    (sample_model, theano_sampling_source_input,
     theano_sampling_context_input, train_encoder, train_decoder,
     generated) = get_sampling_model_and_input(exp_config)

    trg_vocab = target_vocab
    trg_vocab_size = exp_config['trg_vocab_size']
    src_vocab = source_vocab
    src_vocab_size = exp_config['src_vocab_size']

    theano_sample_func = sample_model.get_theano_function()
    sampling_func = SampleFunc(theano_sample_func, trg_vocab)

    # TODO: move stream creation to nn_imt.stream
    # def get_textfile_stream(source_file=None, src_vocab=None, src_vocab_size=30000,
    #                         unk_id=1, bos_token=None):
    src_stream = get_textfile_stream(
        source_file=exp_config['src_data'],
        src_vocab=exp_config['src_vocab'],
        src_vocab_size=exp_config['src_vocab_size'],
        unk_id=exp_config['unk_id'],
        bos_token='<S>')

    trg_stream = get_textfile_stream(
        source_file=exp_config['trg_data'],
        src_vocab=exp_config['trg_vocab'],
        src_vocab_size=exp_config['trg_vocab_size'],
        unk_id=exp_config['unk_id'],
        bos_token='<S>')

    # Merge the source and target text streams into a (source, target) pair
    training_stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long (Note this may break)
    training_stream = Filter(
        training_stream, predicate=_too_long(seq_len=exp_config['seq_len']))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    training_stream = Mapping(
        training_stream,
        _oov_to_unk(src_vocab_size=exp_config['src_vocab_size'],
                    trg_vocab_size=exp_config['trg_vocab_size'],
                    unk_id=exp_config['unk_id']))

    # Add the target prefix and suffix sources, sampling split points
    # according to the configured sample ratio
    logger.info('Sample ratio is: {}'.format(exp_config.get(
        'sample_ratio', 1.)))
    training_stream = Mapping(
        training_stream,
        PrefixSuffixStreamTransformer(
            sample_ratio=exp_config.get('sample_ratio', 1.)),
        add_sources=('target_prefix', 'target_suffix'))

    training_stream = Mapping(
        training_stream, CopySourceAndTargetToMatchPrefixes(training_stream))

    # changing training_stream.produces_examples is a little hack which lets
    # us use Unpack to flatten
    training_stream.produces_examples = False

    # flatten the stream back out into (source, target, target_prefix, target_suffix)
    training_stream = Unpack(training_stream)

    # METEOR
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

    # TODO: Implement smoothed BLEU
    # TODO: Implement first-word accuracy (bilingual language model)

    min_risk_score_func = exp_config.get('min_risk_score_func', 'bleu')

    if min_risk_score_func == 'meteor':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_meteor,
            num_samples=exp_config['n_samples'],
            trg_ivocab=trg_ivocab,
            lang=exp_config['target_lang'],
            meteor_directory=exp_config['meteor_directory'])
    elif min_risk_score_func == 'imt_f1':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_imt_f1,
            num_samples=exp_config['n_samples'])
    # BLEU is default
    else:
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_bleu,
            num_samples=exp_config['n_samples'])

    training_stream = Mapping(training_stream,
                              sampling_transformer,
                              add_sources=('samples', 'seq_probs', 'scores'))

    # now filter out segments whose samples are too good or too bad
    training_stream = Filter(training_stream, predicate=filter_by_sample_score)

    # Collect a very big batch so that examples can be shuffled across segments
    shuffle_batch_size = exp_config['shuffle_batch_size']
    training_stream = Batch(
        training_stream, iteration_scheme=ConstantScheme(shuffle_batch_size))

    training_stream = ShuffleBatchTransformer(training_stream)

    # unpack it again
    training_stream = Unpack(training_stream)

    # Build a batched version of stream to read k batches ahead
    batch_size = exp_config['batch_size']
    sort_k_batches = exp_config['sort_k_batches']
    training_stream = Batch(training_stream,
                            iteration_scheme=ConstantScheme(batch_size *
                                                            sort_k_batches))

    # Sort all samples in the read-ahead batch
    training_stream = Mapping(training_stream, SortMapping(_length))

    # Convert it into a stream again
    training_stream = Unpack(training_stream)

    # Construct batches from the stream with specified batch size
    training_stream = Batch(training_stream,
                            iteration_scheme=ConstantScheme(batch_size))

    # IDEA: add a transformer which flattens the target samples before we add the mask
    flat_sample_stream = FlattenSamples(training_stream)

    expanded_source_stream = CopySourceAndPrefixNTimes(
        flat_sample_stream, n_samples=exp_config['n_samples'])

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    # Note: we shouldn't need to pad the seq_probs because there is only one per sequence
    # TODO: DEVELOPMENT HACK
    exp_config['suffix_length'] = 1
    exp_config['truncate_sources'] = ['target_suffix']
    configurable_padding_args = {
        'suffix_length': exp_config.get('suffix_length', None),
        'truncate_sources': exp_config.get('truncate_sources', [])
    }
    masked_stream = PaddingWithEOS(expanded_source_stream, [
        src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1,
        trg_vocab_size - 1, trg_vocab_size - 1
    ],
                                   mask_sources=('source', 'target',
                                                 'target_prefix',
                                                 'target_suffix', 'samples'),
                                   **configurable_padding_args)

    return train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream
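
For orientation, these are the configuration keys that setup_model_and_stream reads from exp_config; the values below are illustrative placeholders only:

# Illustrative placeholder values; only the keys are taken from the code above.
exp_config = {
    'src_data': 'train.ids.src',            # hypothetical paths
    'trg_data': 'train.ids.trg',
    'src_vocab': 'vocab.src.pkl',
    'trg_vocab': 'vocab.trg.pkl',
    'src_vocab_size': 30000,
    'trg_vocab_size': 30000,
    'unk_id': 1,
    'seq_len': 50,
    'sample_ratio': 1.,                     # optional, defaults to 1.
    'min_risk_score_func': 'bleu',          # or 'meteor' / 'imt_f1'
    'n_samples': 5,
    'target_lang': 'de',                    # only needed for METEOR
    'meteor_directory': '/path/to/meteor',  # only needed for METEOR
    'shuffle_batch_size': 1000,
    'batch_size': 20,
    'sort_k_batches': 12,
    # 'suffix_length' and 'truncate_sources' are also read, but are currently
    # overridden by the development hack above
}
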
Example 6
def get_tr_stream_with_context_features(src_vocab, trg_vocab, src_data, trg_data, context_features,
                                        src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                                        seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # now add the source with the image features
    # create the image datastream (iterate over a file line-by-line)
    train_features = _get_np_array(context_features)
    train_feature_dataset = IterableDataset(train_features)
    train_image_stream = DataStream(train_feature_dataset)

    stream = Merge([stream, train_image_stream], ('source', 'target', 'initial_context'))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1], mask_sources=('source', 'target'))

    return masked_stream, src_vocab, trg_vocab
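
_get_np_array expects an .npz archive whose first unnamed array (stored under the key 'arr_0') holds one context feature vector per training line, aligned with the sentences in src_data and trg_data. Such a file can be produced with numpy.savez; the shapes below are hypothetical:

import numpy

# Hypothetical: one 2048-dimensional feature vector per training sentence.
features = numpy.random.rand(29000, 2048).astype('float32')

# numpy.savez stores the first positional argument under the key 'arr_0',
# which is exactly what _get_np_array reads back.
numpy.savez('train_context_features.npz', features)
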
Example 7
def get_tr_stream_with_prefixes(src_vocab, trg_vocab, src_data, trg_data, src_vocab_size=30000,
                                trg_vocab_size=30000, unk_id=1, seq_len=50,
                                batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the IMT training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # TODO: should the training stream actually have begin and end tokens?
    # Note: this depends on how the system was pre-trained, but systems used
    # for initialization should _always_ have BOS tokens

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab,
                           bos_token='<S>',
                           eos_token='</S>',
                           unk_token='<UNK>')
    trg_dataset = TextFile([trg_data], trg_vocab,
                           bos_token='<S>',
                           eos_token='</S>',
                           unk_token='<UNK>')

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    stream = Mapping(
        stream,
        PrefixSuffixStreamTransformer(
            sample_ratio=kwargs.get('train_sample_ratio', 1.)),
        add_sources=('target_prefix', 'target_suffix'))

    stream = Mapping(stream, CopySourceAndTargetToMatchPrefixes(stream))

    # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
    stream.produces_examples = False
    # flatten the stream back out into (source, target, target_prefix, target_suffix)
    stream = Unpack(stream)

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs['shuffle_batch_size']
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(shuffle_batch_size)
                   )

    stream = ShuffleBatchTransformer(stream)

    # unpack it again
    stream = Unpack(stream)

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches)
                   )

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    configurable_padding_args = {
        'suffix_length': kwargs.get('suffix_length', None),
        'truncate_sources': kwargs.get('truncate_sources', [])
    }
    logger.info('Training suffix length is: {}'.format(configurable_padding_args['suffix_length']))
    logger.info('I will mask the following sources after <suffix_length>: {}'.format(configurable_padding_args['truncate_sources']))
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target', 'target_prefix', 'target_suffix'), **configurable_padding_args)

    return masked_stream, src_vocab, trg_vocab
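
A minimal usage sketch of the IMT stream (paths and sizes are hypothetical; note that shuffle_batch_size must be supplied via kwargs, while train_sample_ratio, suffix_length and truncate_sources are optional):

imt_stream, src_vocab, trg_vocab = get_tr_stream_with_prefixes(
    src_vocab='vocab.src.pkl',
    trg_vocab='vocab.trg.pkl',
    src_data='train.src',
    trg_data='train.trg',
    src_vocab_size=30000,
    trg_vocab_size=30000,
    batch_size=80,
    sort_k_batches=12,
    shuffle_batch_size=1000,
    train_sample_ratio=1.,
    suffix_length=None,
    truncate_sources=[])

for batch in imt_stream.get_epoch_iterator(as_dict=True):
    # Each batch carries 'source', 'target', 'target_prefix', 'target_suffix'
    # plus a matching '<source>_mask' for each of them.
    print(sorted(batch.keys()))
    break
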