Example no. 1
    def __init__(self, exp_config):

        self.beam_search, self.sampling_input = load_params_and_get_beam_search(
            exp_config)

        self.exp_config = exp_config
        # how many hyps should be output (only used in file prediction mode)
        self.n_best = exp_config.get('n_best', 1)

        self.source_lang = exp_config.get('source_lang', 'en')
        self.target_lang = exp_config.get('target_lang', 'es')

        tokenize_script = exp_config.get('tokenize_script', None)
        detokenize_script = exp_config.get('detokenize_script', None)
        if tokenize_script is not None and detokenize_script is not None:
            self.tokenizer_cmd = [
                tokenize_script, '-l', self.source_lang, '-q', '-',
                '-no-escape', '1'
            ]
            self.detokenizer_cmd = [
                detokenize_script, '-l', self.target_lang, '-q', '-'
            ]
        else:
            self.tokenizer_cmd = None
            self.detokenizer_cmd = None

        # this index will get overwritten with the EOS token by _ensure_special_tokens
        # IMPORTANT: the index must be created in the same way it was for training,
        # otherwise the predicted indices will be nonsense
        # Make sure that src_vocab_size and trg_vocab_size are correct in your configuration
        self.src_eos_idx = exp_config['src_vocab_size'] - 1
        self.trg_eos_idx = exp_config['trg_vocab_size'] - 1

        self.unk_idx = exp_config['unk_id']

        # Get vocabularies and inverse indices
        # Note: _ensure_special_tokens will _overwrite_ anything at bos_idx, eos_idx, and unk_idx
        self.src_vocab = _ensure_special_tokens(pickle.load(
            open(exp_config['src_vocab'], 'rb')),
                                                bos_idx=0,
                                                eos_idx=self.src_eos_idx,
                                                unk_idx=self.unk_idx)
        self.src_ivocab = {v: k for k, v in self.src_vocab.items()}
        self.trg_vocab = _ensure_special_tokens(pickle.load(
            open(exp_config['trg_vocab'], 'rb')),
                                                bos_idx=0,
                                                eos_idx=self.trg_eos_idx,
                                                unk_idx=self.unk_idx)
        self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}

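A note on the comments above: _ensure_special_tokens must reproduce exactly the special-token layout used at training time. The hypothetical sketch below (for illustration only, not the library implementation) shows the behaviour the constructor relies on: whatever currently occupies the special indices is dropped, and '<S>', '</S>' and '<UNK>' are written in at bos_idx, eos_idx and unk_idx.

# Illustrative sketch only -- the real _ensure_special_tokens lives in the codebase utilities
def _ensure_special_tokens_sketch(vocab, bos_idx, eos_idx, unk_idx):
    """Return a copy of vocab with '<S>', '</S>', '<UNK>' pinned to the given indices."""
    # drop whatever currently maps to the special indices
    vocab = {word: idx for word, idx in vocab.items()
             if idx not in (bos_idx, eos_idx, unk_idx)}
    # overwrite the special indices -- these must match the training-time layout
    vocab['<S>'] = bos_idx
    vocab['</S>'] = eos_idx
    vocab['<UNK>'] = unk_idx
    return vocab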
Example no. 2
def get_dev_stream_with_context_features(val_context_features=None, val_set=None, src_vocab=None,
                                         src_vocab_size=30000, unk_id=1, **kwargs):
    """Setup development set stream if necessary."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']


    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)

        # now pair the source with the context (image) features
        # create the context-feature datastream (iterate over the pre-computed feature array)
        con_features = _get_np_array(val_context_features)
        con_feature_dataset = IterableDataset(con_features)
        valid_image_stream = DataStream(con_feature_dataset)

        dev_stream = Merge([dev_dataset.get_example_stream(),
                            valid_image_stream], ('source', 'initial_context'))

    return dev_stream
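
A hypothetical usage sketch of the function above; the file names are placeholders and the keyword names are the ones in the signature:

# Usage sketch (paths are placeholders)
dev_stream = get_dev_stream_with_context_features(
    val_context_features='dev.context_features.npz',  # .npz file with the features under 'arr_0'
    val_set='dev.source.txt',
    src_vocab='src_vocab.pkl',
    src_vocab_size=30000,
    unk_id=1)
if dev_stream is not None:
    for source, initial_context in dev_stream.get_epoch_iterator():
        pass  # each example pairs a source index sequence with its context feature vector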
Example no. 3
def get_dev_stream_with_prefix_file(val_set=None, val_set_grndtruth=None, val_set_prefixes=None, val_set_suffixes=None,
                                    src_vocab=None, src_vocab_size=30000, trg_vocab=None, trg_vocab_size=30000, unk_id=1,
                                    return_vocab=False, **kwargs):
    """Setup development stream with user-provided source, target, prefixes, and suffixes"""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None and val_set_prefixes is not None and val_set_suffixes is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict) else
            cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        # Note: the user should already have included the EOS token in the suffix data representation
        # Note: EOS tokens are needed in the reference files because IMT metrics count prediction of the
        # Note: </S> token, and the evaluation scripts are run directly on those files
        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_prefix_dataset = TextFile([val_set_prefixes], trg_vocab,
                                      bos_token='<S>',
                                      eos_token=None,
                                      unk_token='<UNK>')
        dev_suffix_dataset = TextFile([val_set_suffixes], trg_vocab,
                                      bos_token=None,
                                      eos_token=None,
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream(),
                            dev_prefix_dataset.get_example_stream(),
                            dev_suffix_dataset.get_example_stream()],
                           ('source', 'target', 'target_prefix', 'target_suffix'))

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
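
A hypothetical invocation of the function above with return_vocab=True; the file names are placeholders:

# Usage sketch (paths are placeholders)
dev_stream, src_vocab, trg_vocab = get_dev_stream_with_prefix_file(
    val_set='dev.source.txt',
    val_set_grndtruth='dev.target.txt',
    val_set_prefixes='dev.target_prefixes.txt',
    val_set_suffixes='dev.target_suffixes.txt',  # suffixes should already end with '</S>' (see note above)
    src_vocab='src_vocab.pkl', src_vocab_size=30000,
    trg_vocab='trg_vocab.pkl', trg_vocab_size=30000,
    unk_id=1, return_vocab=True)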
Example no. 4
def get_dev_stream_with_prefixes(val_set=None, val_set_grndtruth=None, src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000, unk_id=1, return_vocab=False, **kwargs):
    """Setup development set stream if necessary."""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict) else
            cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream()],
                           ('source', 'target'))

        # now add prefix and suffixes to this stream
        dev_stream = Mapping(dev_stream, PrefixSuffixStreamTransformer(sample_ratio=kwargs.get('dev_sample_ratio', 1.)),
                             add_sources=('target_prefix', 'target_suffix'))

        dev_stream = Mapping(dev_stream, CopySourceAndTargetToMatchPrefixes(dev_stream))

        # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
        dev_stream.produces_examples = False
        # flatten the stream back out into (source, target, target_prefix, target_suffix)
        dev_stream = Unpack(dev_stream)

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
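
A hypothetical usage sketch; here the prefixes and suffixes are generated on the fly from each (source, target) pair rather than read from files, and dev_sample_ratio is an optional kwarg forwarded to PrefixSuffixStreamTransformer:

# Usage sketch (paths are placeholders)
dev_stream = get_dev_stream_with_prefixes(
    val_set='dev.source.txt',
    val_set_grndtruth='dev.target.txt',
    src_vocab='src_vocab.pkl', trg_vocab='trg_vocab.pkl',
    dev_sample_ratio=1.)
if dev_stream is not None:
    # after Unpack, each example is (source, target, target_prefix, target_suffix)
    example = next(dev_stream.get_epoch_iterator())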
Example no. 5
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in GroundHog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(
                cg, enc_params+dec_params, config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(str(shape), count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(str(value.get_value().shape), name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()])
        )

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(
            config['test_set'], config['src_vocab'],
            config['src_vocab_size'], config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(
            pickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0,
            eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):

            seq = sutils._oov_to_unk(
                line[0], config['src_vocab_size'], unk_idx)
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                beam_search.search(
                    input_values={sampling_input: input_},
                    max_length=3*len(seq), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]

                # convert idx to words
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i+1))
                trans_out = '<UNK>'

            print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()
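
For context, a hypothetical sketch of the configuration the translate branch above reads; every key appears in the code, but the values are placeholders:

# Invocation sketch -- keys are the ones read above, values are placeholders
config = {
    'src_vocab_size': 30000, 'trg_vocab_size': 30000, 'unk_id': 1,
    'enc_embed': 620, 'dec_embed': 620, 'enc_nhids': 1000, 'dec_nhids': 1000,
    'test_set': 'test.source.txt',
    'src_vocab': 'src_vocab.pkl', 'trg_vocab': 'trg_vocab.pkl',
    'saveto': 'saved_model_dir', 'beam_size': 12, 'normalized_bleu': True,
}
main('translate', config)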
Example no. 6
    logger.info("Creating Sampling Model...")
    sampling_model = Model(generated)

    # TODO: update clients with sampling_context_input
    return sampling_model, sampling_source_input, sampling_context_input, encoder, decoder

sample_model, theano_sampling_source_input, theano_sampling_context_input, train_encoder, train_decoder = \
    get_sampling_model_and_input(exp_config)

trg_vocab = cPickle.load(open(exp_config['trg_vocab']))
trg_vocab_size = exp_config['trg_vocab_size'] - 1
src_vocab = cPickle.load(open(exp_config['src_vocab']))
src_vocab_size = exp_config['src_vocab_size'] - 1

src_vocab = _ensure_special_tokens(src_vocab,
                                   bos_idx=0,
                                   eos_idx=src_vocab_size,
                                   unk_idx=exp_config['unk_id'])
trg_vocab = _ensure_special_tokens(trg_vocab,
                                   bos_idx=0,
                                   eos_idx=trg_vocab_size,
                                   unk_idx=exp_config['unk_id'])

theano_sample_func = sample_model.get_theano_function()
sampling_func = SampleFunc(theano_sample_func, trg_vocab)

src_stream = get_textfile_stream(source_file=exp_config['src_data'],
                                 src_vocab=exp_config['src_vocab'],
                                 src_vocab_size=exp_config['src_vocab_size'])

trg_stream = get_textfile_stream(source_file=exp_config['trg_data'],
                                 src_vocab=exp_config['trg_vocab'],
                                 src_vocab_size=exp_config['trg_vocab_size'])
Example no. 7
    def __init__(self, exp_config):

        theano_variables = load_params_and_get_beam_search(
            exp_config,
            brick_delimiter=exp_config.get('brick_delimiter', None))
        self.beam_search, search_model, samples, self.source_sampling_input, self.target_sampling_input = theano_variables

        self.exp_config = exp_config
        # how many hyps should be output (only used in file prediction mode)
        self.n_best = exp_config.get('n_best', 1)

        self.source_lang = exp_config.get('source_lang', 'en')
        self.target_lang = exp_config.get('target_lang', 'es')

        # persistent tokenizers and detokenizers
        tokenize_script = exp_config.get('tokenize_script', None)
        detokenize_script = exp_config.get('detokenize_script', None)
        if tokenize_script is not None and detokenize_script is not None:
            # Note: the '-b' option is _essential_ here, otherwise the tokenizer will hang forever
            self.source_tokenizer_cmd = [
                tokenize_script, '-l', self.source_lang, '-q', '-', '-b',
                '-no-escape', '1'
            ]
            self.target_tokenizer_cmd = [
                tokenize_script, '-l', self.target_lang, '-q', '-', '-b',
                '-no-escape', '1'
            ]
            self.detokenizer_cmd = [
                detokenize_script, '-l', self.target_lang, '-q', '-', '-b'
            ]

            self.source_tokenizer = Popen(self.source_tokenizer_cmd,
                                          stdin=PIPE,
                                          stdout=PIPE,
                                          bufsize=1)
            self.target_tokenizer = Popen(self.target_tokenizer_cmd,
                                          stdin=PIPE,
                                          stdout=PIPE,
                                          bufsize=1)
            self.target_detokenizer = Popen(self.detokenizer_cmd,
                                            stdin=PIPE,
                                            stdout=PIPE,
                                            bufsize=1)
        else:
            self.source_tokenizer_cmd = None
            self.target_tokenizer_cmd = None
            self.detokenizer_cmd = None

        # persistent subword encoding
        subword_codes = exp_config.get('subword_codes', None)
        self.BPE = None
        if subword_codes is not None:
            from lib.apply_bpe import BPE
            self.BPE = BPE(codecs.open(subword_codes, encoding='utf8'))

        # the maximum length of predictions -- this can be shortened to make prediction more efficient
        self.max_length = exp_config.get('n_steps', None)

        # this index will get overwritten with the EOS token by _ensure_special_tokens
        # IMPORTANT: the index must be created in the same way it was for training,
        # otherwise the predicted indices will be nonsense
        # Make sure that src_vocab_size and trg_vocab_size are correct in your configuration
        self.src_eos_idx = exp_config['src_vocab_size'] - 1
        self.trg_eos_idx = exp_config['trg_vocab_size'] - 1

        self.unk_idx = exp_config['unk_id']

        # Get vocabularies and inverse indices
        self.src_vocab = _ensure_special_tokens(pickle.load(
            open(exp_config['src_vocab'], 'rb')),
                                                bos_idx=0,
                                                eos_idx=self.src_eos_idx,
                                                unk_idx=self.unk_idx)
        self.src_ivocab = {v: k for k, v in self.src_vocab.items()}
        self.trg_vocab = _ensure_special_tokens(pickle.load(
            open(exp_config['trg_vocab'], 'rb')),
                                                bos_idx=0,
                                                eos_idx=self.trg_eos_idx,
                                                unk_idx=self.unk_idx)
        self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}

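The tokenizer and detokenizer above are long-running Moses-script subprocesses wired up with line-buffered pipes. A hypothetical helper (not part of the class; encoding handling omitted) showing the write/flush/readline round trip those pipes are set up for, assuming the script emits one output line per input line:

# Illustrative helper only -- assumes one output line per input line
def _pipe_through(process, text):
    process.stdin.write(text.strip() + '\n')
    process.stdin.flush()
    return process.stdout.readline().strip()

# e.g. inside the class: tokenized = _pipe_through(self.source_tokenizer, 'a raw source sentence')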
Example no. 8
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in GroundHog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(str(shape), count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(str(value.get_value().shape), name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(sampling_input,
                                         sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(bricks=[decoder.sequence_generator],
                                        name="outputs")(ComputationGraph(
                                            generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model,
                        data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input,
                              samples=samples,
                              config=config,
                              model=search_model,
                              data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En',
                     channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(config['test_set'], config['src_vocab'],
                                     config['src_vocab_size'],
                                     config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(pickle.load(
            open(config['trg_vocab'], 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_eos_idx,
                                           unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):

            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                beam_search.search(
                    input_values={sampling_input: input_},
                    max_length=3*len(seq), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]

                # convert idx to words
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'

            print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info("Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()
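
One detail worth spelling out from the training branch above: eval(config['step_rule'])() looks the step rule class up by name. Assuming, for illustration, that the configuration sets step_rule to 'AdaDelta' and step_clipping to 1.0, the composed rule is equivalent to:

# Illustrative expansion of the step rule construction (placeholder values)
from blocks.algorithms import AdaDelta, CompositeRule, StepClipping

step_rule = CompositeRule([StepClipping(1.0),  # config['step_clipping']
                           AdaDelta()])        # eval(config['step_rule'])()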
Example no. 9
def get_tr_stream_with_context_features(src_vocab, trg_vocab, src_data, trg_data, context_features,
                                        src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                                        seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))


    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # now pair the source with the context (image) features
    # create the context-feature datastream (iterate over the pre-computed feature array)
    train_features = _get_np_array(context_features)
    train_feature_dataset = IterableDataset(train_features)
    train_image_stream = DataStream(train_feature_dataset)

    stream = Merge([stream, train_image_stream], ('source', 'target', 'initial_context'))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1], mask_sources=('source', 'target'))

    return masked_stream, src_vocab, trg_vocab
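
A hypothetical call to the training-stream builder above; the paths are placeholders, and the returned vocabularies already have the special tokens ensured:

# Usage sketch (paths are placeholders)
masked_stream, src_vocab, trg_vocab = get_tr_stream_with_context_features(
    src_vocab='src_vocab.pkl', trg_vocab='trg_vocab.pkl',
    src_data='train.source.txt', trg_data='train.target.txt',
    context_features='train.context_features.npz',  # features stored under 'arr_0'
    src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
    seq_len=50, batch_size=80, sort_k_batches=12)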
Example no. 10
def get_tr_stream_with_prefixes(src_vocab, trg_vocab, src_data, trg_data, src_vocab_size=30000,
                                trg_vocab_size=30000, unk_id=1, seq_len=50,
                                batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the IMT training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # TODO: should training stream actually have begin and end tokens?
    # Note: this actually depends upon how the system was pre-trained, but systems used for initialization
    # Note: should _always_ have BOS tokens

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab,
                           bos_token='<S>',
                           eos_token='</S>',
                           unk_token='<UNK>')
    trg_dataset = TextFile([trg_data], trg_vocab,
                           bos_token='<S>',
                           eos_token='</S>',
                           unk_token='<UNK>')

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    stream = Mapping(stream, PrefixSuffixStreamTransformer(sample_ratio=kwargs.get('train_sample_ratio', 1.)),
                     add_sources=('target_prefix', 'target_suffix'))

    stream = Mapping(stream, CopySourceAndTargetToMatchPrefixes(stream))

    # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
    stream.produces_examples = False
    # flatten the stream back out into (source, target, target_prefix, target_suffix)
    stream = Unpack(stream)

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs['shuffle_batch_size']
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(shuffle_batch_size)
                   )

    stream = ShuffleBatchTransformer(stream)

    # unpack it again
    stream = Unpack(stream)

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches)
                   )

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    configurable_padding_args = {
        'suffix_length': kwargs.get('suffix_length', None),
        'truncate_sources': kwargs.get('truncate_sources', [])
    }
    logger.info('Training suffix length is: {}'.format(configurable_padding_args['suffix_length']))
    logger.info('I will mask the following sources after <suffix_length>: {}'.format(configurable_padding_args['truncate_sources']))
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target', 'target_prefix', 'target_suffix'), **configurable_padding_args)

    return masked_stream, src_vocab, trg_vocab
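
A hypothetical call to the IMT training-stream builder above; the paths are placeholders, and shuffle_batch_size must be supplied via kwargs (the function indexes kwargs directly for it):

# Usage sketch (paths are placeholders)
masked_stream, src_vocab, trg_vocab = get_tr_stream_with_prefixes(
    src_vocab='src_vocab.pkl', trg_vocab='trg_vocab.pkl',
    src_data='train.source.txt', trg_data='train.target.txt',
    src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
    seq_len=50, batch_size=80, sort_k_batches=12,
    shuffle_batch_size=20000,   # size of the big batch that gets shuffled
    train_sample_ratio=1.,      # forwarded to PrefixSuffixStreamTransformer
    suffix_length=None, truncate_sources=[])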