Ejemplo n.º 1
0
def test_beam_search():
    """Check beam search costs against the model's own cost computation.

    The model is the word reverser from the reverse_words demo with
    randomly initialized parameters (a trained model would be better),
    so this only verifies that the costs returned by the search are
    correct, not that the search finds the best output sequence.

    """
    alphabet_size = 20
    beam_size = 10
    length = 15
    rng = numpy.random.RandomState(1234)

    # Build and randomly initialize the model.
    reverser = WordReverser(10, alphabet_size)
    reverser.weights_init = reverser.biases_init = IsotropicGaussian(0.5)
    reverser.initialize()

    # Pick the generator's "outputs" variable out of the sampling graph.
    inputs = tensor.lmatrix('inputs')
    output_filter = VariableFilter(
        bricks=[reverser.generator], name="outputs")
    samples, = output_filter(ComputationGraph(reverser.generate(inputs)))

    # One random word, repeated in every column of the beam.
    word = rng.randint(alphabet_size, size=(length,))
    input_vals = numpy.tile(word, (beam_size, 1)).T

    search = BeamSearch(beam_size, samples)
    results, mask, costs = search.search(
        {inputs: input_vals}, 0, 3 * length)

    # Recompute the costs of the found sequences with the model itself.
    input_mask = numpy.ones((length, beam_size), dtype=floatX)
    step_costs = reverser.cost(input_vals, input_mask, results, mask).eval()
    true_costs = (step_costs * mask).sum(axis=0)
    assert_allclose(costs, true_costs, rtol=1e-5)
Ejemplo n.º 2
0
def test_beam_search():
    """Check beam search results and costs on a small random generator.

    The model resembles the one from the reverse_words demo but is only
    randomly initialized (a trained model would be better), so the test
    verifies the correctness of the returned costs rather than the
    ability to find the best output sequence.

    """
    alphabet_size = 20
    beam_size = 10
    length = 15
    rng = numpy.random.RandomState(1234)

    # Build and randomly initialize the model.
    simple_generator = SimpleGenerator(10, alphabet_size, seed=1234)
    simple_generator.weights_init = IsotropicGaussian(0.5)
    simple_generator.biases_init = IsotropicGaussian(0.5)
    simple_generator.initialize()

    # Pick the "outputs" of the generate application out of the graph.
    inputs = tensor.lmatrix('inputs')
    output_filter = VariableFilter(
        applications=[simple_generator.generator.generate],
        name="outputs")
    samples, = output_filter(
        ComputationGraph(simple_generator.generate(inputs)))

    # One random sequence, repeated in every column of the beam.
    sequence = rng.randint(alphabet_size, size=(length, ))
    input_vals = numpy.tile(sequence, (beam_size, 1)).T

    search = BeamSearch(samples)
    results, mask, costs = search.search(
        {inputs: input_vals}, 0, 3 * length, as_arrays=True)
    # Just check sum
    assert results.sum() == 2816

    # Recompute the costs of the found sequences with the model itself.
    input_mask = numpy.ones((length, beam_size),
                            dtype=theano.config.floatX)
    step_costs = simple_generator.cost(
        input_vals, input_mask, results, mask).eval()
    true_costs = (step_costs * mask).sum(axis=0)
    assert_allclose(costs.sum(axis=0), true_costs, rtol=1e-5)

    # Without `as_arrays` the search returns trimmed lists; each must
    # equal the unmasked part of the matching array column.
    results2, costs2 = search.search({inputs: input_vals}, 0, 3 * length)
    for i, trimmed in enumerate(results2):
        assert trimmed == list(results.T[i, :mask.T[i].sum()])
Ejemplo n.º 3
0
def test_beam_search():
    """Run beam search on a randomly initialized generator and verify it.

    The model is similar to the one in the reverse_words demo. Since it
    is not trained, the test cannot judge whether the best sequence is
    found; it only checks that the returned costs agree with the costs
    the model itself assigns to the returned sequences.

    """
    seed = 1234
    rng = numpy.random.RandomState(seed)
    alphabet_size, beam_size, length = 20, 10, 15

    simple_generator = SimpleGenerator(10, alphabet_size, seed=1234)
    simple_generator.weights_init = IsotropicGaussian(0.5)
    simple_generator.biases_init = IsotropicGaussian(0.5)
    simple_generator.initialize()

    inputs = tensor.lmatrix('inputs')
    find_outputs = VariableFilter(
        applications=[simple_generator.generator.generate],
        name="outputs")
    samples, = find_outputs(
        ComputationGraph(simple_generator.generate(inputs)))

    # Every beam column receives the same random input sequence.
    input_vals = numpy.tile(
        rng.randint(alphabet_size, size=(length,)), (beam_size, 1)).T
    search_inputs = {inputs: input_vals}
    max_length = 3 * length

    search = BeamSearch(samples)
    results, mask, costs = search.search(
        search_inputs, 0, max_length, as_arrays=True)
    # Just check sum
    assert results.sum() == 2816

    ones_mask = numpy.ones((length, beam_size), dtype=theano.config.floatX)
    true_costs = simple_generator.cost(
        input_vals, ones_mask, results, mask).eval()
    true_costs = (true_costs * mask).sum(axis=0)
    assert_allclose(costs.sum(axis=0), true_costs, rtol=1e-5)

    # The default (list) output must match the masked array output.
    results2, costs2 = search.search(search_inputs, 0, max_length)
    index = 0
    while index < len(results2):
        expected = list(results.T[index, :mask.T[index].sum()])
        assert results2[index] == expected
        index += 1
Ejemplo n.º 4
0
        def generate(input_):
            """Produce trimmed output sequences and their costs for an input.

            Hides the differences between sampling and beam search from
            the caller.

            Returns
            -------
            outputs : list of lists
                Output sequences, each cut after its first '</S>' code
                (or left whole when it contains none).
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode != "beam_search":
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                costs = costs.T
            else:
                output_filter = VariableFilter(
                    bricks=[reverser.generator], name="outputs")
                samples, = output_filter(ComputationGraph(generated[1]))
                # NOTE: a new `BeamSearch` object is built on every call,
                # which recompiles the beam search functions each time
                # the user presses Enter. Reuse a single object if speed
                # matters.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, _, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])

            # Work on one sequence at a time.
            outputs = list(outputs.T)
            costs = list(costs)
            for i, sequence in enumerate(outputs):
                sequence = list(sequence)
                try:
                    true_length = sequence.index(char2code['</S>']) + 1
                except ValueError:
                    # No end-of-sequence code: keep the whole sequence.
                    true_length = len(sequence)
                outputs[i] = sequence[:true_length]
                if mode == "sample":
                    # Sampling yields per-step costs; sum the kept steps.
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs
Ejemplo n.º 5
0
        def generate(input_):
            """Produce trimmed output sequences and their costs for an input.

            Hides the differences between sampling and beam search from
            the caller.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                output_filter = VariableFilter(
                    bricks=[reverser.generator], name="outputs")
                samples, = output_filter(ComputationGraph(generated[1]))
                # NOTE: a new `BeamSearch` object is built on every call,
                # which recompiles the beam search functions each time
                # the user presses Enter. Reuse a single object if speed
                # matters.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
                # Beam search results are returned as they are.
                return outputs, costs

            # Sampling: cut every sequence after its first '</S>' code and
            # sum the per-step costs over the part that is kept.
            _1, outputs, _2, _3, costs = (
                model.get_theano_function()(input_))
            outputs = list(outputs.T)
            costs = list(costs.T)
            for i, sequence in enumerate(outputs):
                sequence = list(sequence)
                try:
                    true_length = sequence.index(char2code['</S>']) + 1
                except ValueError:
                    # No end-of-sequence code: keep the whole sequence.
                    true_length = len(sequence)
                outputs[i] = sequence[:true_length]
                costs[i] = costs[i][:true_length].sum()
            return outputs, costs
Ejemplo n.º 6
0
    # Special-token indices: the unknown-word id comes from the config and
    # each end-of-sequence id is the last index of its vocabulary.
    unk_idx = config['unk_id']
    src_eos_idx = config['src_vocab_size'] - 1
    trg_eos_idx = config['trg_vocab_size'] - 1

    # Translation output file at a hard-coded absolute path; the third
    # positional argument of the built-in open() is the buffering policy.
    ftrans = open('/Users/lqy/Documents/transout.txt','w',0)

    # NOTE(review): the third positional argument of gzip.open() is
    # `compresslevel`, not buffering, so 0 here presumably disables
    # compression rather than buffering -- confirm this is intended.
    falign = gzip.open('/Users/lqy/Documents/alignmentout','w',0)

        
    for i, line in enumerate(validate_stream.get_epoch_iterator()):
        source_line = line[0]
        #line_tok = mergeSplit(source_token[i])
        seq = nmt._oov_to_unk(line[0], config['src_vocab_size'], unk_idx)
        # Repeat the sequence once per beam. (Translated original note:
        # "produces a 12-row, 1-column matrix whose elements are one
        # sequence each".)
        input_ = numpy.tile(seq, (config['beam_size'], 1))
        #print "input_: ",input_[3]
        # NOTE(review): the source EOS index is passed as `eol_symbol`
        # while `trg_eos_idx` is unused in the visible code -- confirm.
        trans,costs = beam_search.search(input_values={source_sentence: input_[:]},max_length=3*len(seq), eol_symbol=src_eos_idx,ignore_first_eol=True)

        # Normalize each hypothesis cost by the hypothesis length.
        lengths = numpy.array([len(s) for s in trans])
        costs = costs / lengths

        # Index of the hypothesis with the lowest normalized cost.
        best = numpy.argsort(costs)[0]

        trans_out = trans[best]

        # Map index sequences back to text, then split on single spaces.
        source_word = nmt._idx_to_word(line[0],nmt.src_ivocab)
        trans_out_word = nmt._idx_to_word(trans_out, nmt.trg_ivocab)
        trans_out_word_str = trans_out_word.split(" ")
        source_word_str = source_word.split(" ")

        # Both sequences get a leading axis of length one via [None, :]
        # before being handed to getAlignment.
        alignment = numpy.asarray(getAlignment(numpy.array(source_line)[None, :],numpy.array(trans_out)[None, :]))
Ejemplo n.º 7
0
class IMT_F1_Validator(SimpleExtension, SamplingBase):
    """Validate with the IMT F1 score and checkpoint the best models.

    On every callback after the configured burn-in, each validation
    example is decoded with beam search from its source sentence and
    target prefix, the 1-best hypotheses are scored against the
    reference file with ``imt_f1_from_files``, the score is written to
    the main loop log, and the model parameters are saved when the score
    beats the worst model currently tracked.

    Parameters
    ----------
    source_sentence
        Variable used as the beam search input key for the source batch.
    target_prefix
        Variable used as the beam search input key for the prefix batch.
    samples
        Variable handed to ``BeamSearch`` as ``samples``.
    model
        Stored on the instance; not used by the code in this class.
    data_stream
        Validation stream. Element 0 of each example is read as the
        source sequence and element 2 as the target prefix.
    config : dict
        Reads ``'target_lang'``, ``'saveto'``, ``'reload'``,
        ``'val_burn_in'``, ``'val_set_grndtruth'``, ``'src_vocab_size'``,
        ``'beam_size'`` and, optionally, ``'val_set_out'`` (when set, the
        1-best hypotheses are also written to that path).
    src_vocab, trg_vocab : optional
        Vocabularies. ``trg_vocab`` must map ``'<UNK>'`` and ``'</S>'``
        to indices by the time validation runs.
    n_best : int, optional
        Number of lowest-cost hypotheses whose cost is accumulated; only
        the best one is written out.
    track_n_models : int, optional
        Number of best models to keep track of.
    normalize : bool, optional
        Whether hypothesis costs are divided by hypothesis length.

    """
    def __init__(self,
                 source_sentence,
                 target_prefix,
                 samples,
                 model,
                 data_stream,
                 config,
                 src_vocab=None,
                 trg_vocab=None,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 **kwargs):
        super(IMT_F1_Validator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.target_prefix = target_prefix

        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        # Path of the optional hypothesis dump; falsy disables the dump.
        self.verbose = config.get('val_set_out', None)

        # Helpers
        # `best_models` is kept sorted by ascending score, so index 0 is
        # always the worst tracked model (see `_save_model`).
        self.best_models = []
        self.val_imt_f1_curve = []
        self.beam_search = BeamSearch(samples=samples)

        # Target language, as given in the config
        self.target_language = self.config['target_lang']

        # Create save directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                imt_f1_score = numpy.load(
                    os.path.join(self.config['saveto'],
                                 'val_imt_f1_scores.npz'))
                self.val_imt_f1_curve = imt_f1_score['imt_f1_scores'].tolist()

                # Track the n best previous IMT F1 scores
                for i, imt_f1_val in enumerate(
                        sorted(self.val_imt_f1_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(
                            ModelInfo(imt_f1_val, key='IMT_F1'))
                # Restore the ascending-score invariant: the loop above
                # appends best-first, but `_save_model` evicts index 0.
                self.best_models.sort(key=operator.attrgetter('score'))
                logger.info("IMT_F1_Scores Reloaded")
            except Exception:
                # Reloading is best-effort, but KeyboardInterrupt and
                # SystemExit must not be swallowed here.
                logger.info("IMT_F1_Scores not found")

    def do(self, which_callback, *args):
        """Run validation, log the score and save the model if it improved."""
        # Track validation burn in
        iterations_done = self.main_loop.status['iterations_done']
        if iterations_done <= self.config['val_burn_in']:
            return

        # Evaluate the model
        imt_f1_score = self._evaluate_model()
        # add an entry to the log
        self.main_loop.log.current_row[
            'validation_set_imt_f1_score'] = imt_f1_score
        # save if necessary
        self._save_model(imt_f1_score)

    # TODO: if we are evaluating both BLEU and METEOR, we shouldn't need to
    # translate twice!!
    def _evaluate_model(self):
        """Decode the validation set and return its IMT F1 score.

        The score is also appended to ``self.val_imt_f1_curve`` so that
        the history saved by ``_save_model`` reflects this run.
        """
        # Set in the superclass -- SamplingBase
        if not hasattr(self, 'target_dataset'):
            self._initialize_dataset_info()

        self.unk_sym = '<UNK>'
        self.eos_sym = '</S>'
        self.unk_idx = self.trg_vocab[self.unk_sym]
        self.eos_idx = self.trg_vocab[self.eos_sym]

        logger.info("Started Validation: ")
        val_start_time = time.time()

        ref_file = self.config['val_set_grndtruth']

        # Only the unique path is needed; close the handle right away so
        # it does not leak (the file survives because of delete=False).
        trg_hyp_file = tempfile.NamedTemporaryFile(delete=False)
        trg_hyp_file.close()

        ftrans = None
        try:
            if self.verbose:
                ftrans = codecs.open(self.config['val_set_out'],
                                     'w',
                                     encoding='utf8')

            with codecs.open(trg_hyp_file.name, 'w',
                             encoding='utf8') as hyps_out:
                total_cost = self._decode_validation_set(hyps_out, ftrans)

            logger.info("Total cost of the validation: {}".format(total_cost))
            self.data_stream.reset()

            imt_f1_score, imt_precision, imt_recall = imt_f1_from_files(
                trg_hyp_file.name, ref_file)
        finally:
            if ftrans is not None:
                ftrans.close()
            # The hypothesis file is scratch space for the scorer only.
            if os.path.exists(trg_hyp_file.name):
                os.remove(trg_hyp_file.name)

        self.val_imt_f1_curve.append(imt_f1_score)

        logger.info("IMT F1 Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        logger.info("IMT F1: {}, Precision: {}, Recall: {}".format(
            imt_f1_score, imt_precision, imt_recall))

        return imt_f1_score

    def _decode_validation_set(self, hyps_out, ftrans=None):
        """Beam-search every validation example and write the 1-best.

        Hypotheses go to `hyps_out` and, when given, to `ftrans` as well.
        Returns the accumulated cost of the selected n-best hypotheses.
        """
        total_cost = 0.0
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            # Load the sentence, retrieve the sample, write to file

            # TODO: the section with beam search and translation is
            # shared by all validators
            # WORKING: switch this to IMT prefix validation
            # Note that the indices of source and target in the
            # datastream are hard-coded; currently our datastream is
            # (source, target, prefix, suffix)
            seq = self._oov_to_unk(line[0], self.config['src_vocab_size'],
                                   self.unk_idx)

            target_prefix = line[2]

            # One copy of the source and of the prefix per beam.
            input_ = numpy.tile(seq, (self.config['beam_size'], 1))
            prefix_input_ = numpy.tile(target_prefix,
                                       (self.config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty
            # string back
            trans, costs = self.beam_search.search(
                input_values={
                    self.source_sentence: input_,
                    self.target_prefix: prefix_input_
                },
                max_length=3 * len(seq),
                eol_symbol=self.eos_idx,
                ignore_first_eol=False)

            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    trans_out = self._idx_to_word(trans_out,
                                                  self.trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(
                            i + 1))
                    trans_out = '<UNK>'

                if j == 0:
                    # Only the best hypothesis is written out.
                    hyps_out.write(trans_out.decode('utf8') + '\n')
                    if ftrans is not None:
                        print(trans_out.decode('utf8'), file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of validation set...".format(i))

        return total_cost

    def _is_valid_to_save(self, imt_f1_score):
        """Return True if the score beats the worst tracked model."""
        if not self.best_models:
            return True
        worst = min(self.best_models, key=operator.attrgetter('score'))
        return worst.score < imt_f1_score

    def _save_model(self, imt_f1_score):
        """Save parameters and score history if `imt_f1_score` qualifies."""
        if not self._is_valid_to_save(imt_f1_score):
            return

        model = ModelInfo(imt_f1_score,
                          self.config['saveto'],
                          key='IMT_F1')

        # Manage n-best model list first: evict the worst tracked model.
        if len(self.best_models) >= self.track_n_models:
            old_model = self.best_models[0]
            if old_model.path and os.path.isfile(old_model.path):
                logger.info("Deleting old model %s" % old_model.path)
                os.remove(old_model.path)
            self.best_models.remove(old_model)

        self.best_models.append(model)
        self.best_models.sort(key=operator.attrgetter('score'))

        # Save the model here. SIGINT is ignored while writing so that a
        # Ctrl-C cannot leave a half-written checkpoint; the previous
        # handler is restored even if saving fails.
        s = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            logger.info("Saving new model {}".format(model.path))

            SaveLoadUtils.save_parameter_values(
                self.main_loop.model.get_parameter_values(), model.path)
            numpy.savez(os.path.join(self.config['saveto'],
                                     'val_imt_f1_scores.npz'),
                        imt_f1_scores=self.val_imt_f1_curve)
        finally:
            signal.signal(signal.SIGINT, s)
class BeamSearchEvaluator(object):
    def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
                 phoneme_dict=None, black_list=None):
        if black_list is None:
            self.black_list = []
        else:
            self.black_list = black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict

    def evaluate(self, data_stream, train=False, file_pred=None,
                 file_targets=None):
        loss = 0.
        num_examples = 0
        iterator = data_stream.get_epoch_iterator()
        if train:
            print 'Train evaluation started'
        i = 0
        for inputs in iterator:
            inputs = dict(zip(data_stream.sources, inputs))
            x_mask_val = inputs['features_mask']
            x_val = inputs['features']
            y_val = inputs['phonemes']
            y_mask_val = inputs['phonemes_mask']
            for batch_ind in xrange(inputs['features'].shape[1]):
                if x_val.ndim == 2:
                    input_beam = numpy.tile(x_val[:, batch_ind][:, None],
                        (1, self.beam_size))
                else:
                    input_beam = numpy.tile(x_val[:, batch_ind, :][:, None, :],
                                            (1, self.beam_size, 1))
                input_mask_beam = numpy.tile(x_mask_val[:, batch_ind][:, None],
                                             (1, self.beam_size))
                predictions, _ = self.beam_search.search(
                    {self.x: input_beam,
                     self.x_mask: input_mask_beam},
                    self.eol_symbol, 100)
                predictions = [self.phoneme_dict[phone_ind] for phone_ind
                             in predictions[0]
                             if self.phoneme_dict[phone_ind] not in
                             self.black_list][1:-1]

                targets = y_val[:sum(y_mask_val[:, batch_ind]), batch_ind]
                targets = [self.phoneme_dict[phone_ind] for phone_ind
                             in targets
                             if self.phoneme_dict[phone_ind] not in
                             self.black_list][1:-1]
                predictions = [x[0] for x in groupby(predictions)]
                targets = [x[0] for x in groupby(targets)]
                i += 1
                if file_pred:
                    file_pred.write(' '.join(predictions) + '(%d)\n' % i)
                if file_targets:
                    file_targets.write(' '.join(targets) + '(%d)\n' %i)

                loss += Evaluation.wer([predictions], [targets])
                num_examples += 1

            print '.. found sequence example:', ' '.join(predictions)
            print '.. real output was:       ', ' '.join(targets)
            if train:
                break
        if train:
            print 'Train evaluation finished'
        per = loss.sum() / num_examples
        return {'per': per}
Ejemplo n.º 9
0
def main(mode, config, use_bokeh=False):
    """Train an RNN encoder-decoder or translate a test set with it.

    Parameters
    ----------
    mode : str
        ``"train"`` builds the cost graph, the extensions and the main
        loop and runs training. ``"translate"`` loads saved parameters
        from ``config['saveto']`` and beam-search decodes
        ``config['test_set']``, writing one translation per line to
        ``config['test_set'] + '.trans.out'``. With any other value only
        the encoder and decoder bricks are constructed.
    config : dict
        Experiment configuration. Model sizes are read from
        ``src_vocab_size``, ``trg_vocab_size``, ``enc_embed``,
        ``enc_nhids``, ``dec_embed`` and ``dec_nhids``. Training also
        reads ``weight_scale``, ``dropout``, ``weight_noise_ff``,
        ``finish_after``, ``saveto``, ``save_freq``, ``hook_samples``,
        ``sampling_freq``, ``bleu_script``, ``normalized_bleu``,
        ``bleu_val_freq``, ``reload``, ``step_clipping`` and
        ``step_rule``, and passes the whole dict to the stream
        factories. Translation reads ``test_set``, ``src_vocab``,
        ``trg_vocab``, ``unk_id``, ``beam_size``, ``normalized_bleu``
        and ``saveto``.
    use_bokeh : bool, optional
        Add a live cost plot when Bokeh is available.

    """
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        # Set after pushing the config so the recurrent weights keep
        # their orthogonal initialization.
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            # `get_parameters` matches the Selector API used for the
            # parameter listing below; list() keeps `+=` working when
            # `.values()` returns a view instead of a list.
            enc_params = list(
                Selector(encoder.lookup).get_parameters().values())
            enc_params += Selector(encoder.fwd_fork).get_parameters().values()
            enc_params += Selector(encoder.back_fork).get_parameters().values()
            dec_params = list(Selector(
                decoder.sequence_generator.readout).get_parameters().values())
            dec_params += Selector(
                decoder.sequence_generator.fork).get_parameters().values()
            dec_params += Selector(
                decoder.state_init).get_parameters().values()
            cg = apply_noise(
                cg, enc_params+dec_params, config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        # SECURITY NOTE: eval() executes whatever expression the config
        # holds in 'step_rule'; only run this with trusted configurations.
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()])
        )

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(
            config['test_set'], config['src_vocab'],
            config['src_vocab_size'], config['unk_id'])

        # The output file is still opened before the model is built, so
        # an unwritable path fails early; the with-block guarantees it is
        # closed even when translation raises.
        with open(config['test_set'] + '.trans.out', 'w') as ftrans:

            # Helper utilities
            sutils = SamplingBase()
            unk_idx = config['unk_id']
            src_eos_idx = config['src_vocab_size'] - 1
            trg_eos_idx = config['trg_vocab_size'] - 1

            # Get beam search
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))
            # generated[1] is next_outputs
            beam_search = BeamSearch(samples=samples)

            logger.info("Loading the model..")
            model = Model(generated)
            loader = LoadNMT(config['saveto'])
            loader.set_model_parameters(model, loader.load_parameters())

            # Get target vocabulary
            # SECURITY NOTE: unpickling runs arbitrary code; the
            # vocabulary file must come from a trusted source.
            with open(config['trg_vocab'], 'rb') as vocab_file:
                raw_trg_vocab = pickle.load(vocab_file)
            trg_vocab = _ensure_special_tokens(
                raw_trg_vocab, bos_idx=0,
                eos_idx=trg_eos_idx, unk_idx=unk_idx)
            trg_ivocab = {v: k for k, v in trg_vocab.items()}

            logger.info("Started translation: ")
            total_cost = 0.0

            for i, line in enumerate(test_stream.get_epoch_iterator()):

                seq = sutils._oov_to_unk(
                    line[0], config['src_vocab_size'], unk_idx)
                # One copy of the source sentence per beam.
                input_ = numpy.tile(seq, (config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                # NOTE(review): the *source* EOS index is passed as
                # `eol_symbol` for the generated target -- confirm.
                trans, costs = \
                    beam_search.search(
                        input_values={sampling_input: input_},
                        max_length=3*len(seq), eol_symbol=src_eos_idx,
                        ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if config['normalized_bleu']:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                best = numpy.argsort(costs)[0]
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(
                            i+1))
                    trans_out = '<UNK>'

                print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of test set...".format(i))

            logger.info("Total cost of the test: {}".format(total_cost))
Ejemplo n.º 10
0
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config.

    """
    def __init__(
            self,
            input_dims,
            input_num_chars,
            eos_label,
            num_phonemes,
            dim_dec,
            dims_bidir,
            enc_transition,
            dec_transition,
            use_states_for_readout,
            attention_type,
            criterion,
            bottom,
            lm=None,
            character_map=None,
            bidir=True,
            subsample=None,
            dims_top=None,
            prior=None,
            conv_n=None,
            post_merge_activation=None,
            post_merge_dims=None,
            dim_matcher=None,
            embed_outputs=True,
            dim_output_embedding=None,
            dec_stack=1,
            conv_num_filters=1,
            data_prepend_eos=True,
            # softmax is the default set in SequenceContentAndConvAttention
            energy_normalizer=None,
            # for speech this is the approximate phoneme duration in frames
            max_decoded_length_scale=1,
            **kwargs):
        """Assemble the bottom, the encoder and two sequence generators.

        Two generators are built with identical settings and stored in
        ``self.generators``; ``self.generator`` is the first of them.
        ``bottom`` must contain a ``'bottom_class'`` entry, the remaining
        entries are forwarded to that class.  When ``lm`` has a truthy
        ``'path'`` entry, each generator gets its own ``LanguageModel``
        and a ``ShallowFusionReadout``.  Neither ``bottom`` nor ``lm`` is
        modified.

        Raises
        ------
        ValueError
            If ``attention_type`` or ``criterion['name']`` is not one of
            the supported values.

        """
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN.  A copy is used so that the
        # caller's configuration dict keeps its 'bottom_class' entry.
        bottom_config = dict(bottom)
        bottom_class = bottom_config.pop('bottom_class')
        bottom = bottom_class(input_dims=input_dims,
                              input_num_chars=input_num_chars,
                              name='bottom',
                              **bottom_config)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition,
                          dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample,
                          bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The language model options are parsed once, from a copy of
        # ``lm``.  Popping them from ``lm`` itself inside the loop below
        # would leave only the defaults for the second generator.
        use_lm = bool(lm and lm.get('path'))
        if use_lm:
            lm_config = dict(lm)
            lm_weight = lm_config.pop('weight', 0.0)
            normalize_am_weights = lm_config.pop('normalize_am_weights', True)
            normalize_lm_weights = lm_config.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm_config.pop(
                'normalize_tot_weights', False)
            am_beta = lm_config.pop('am_beta', 1.0)
            if (normalize_am_weights + normalize_lm_weights +
                    normalize_tot_weights) < 1:
                logger.warning(
                    "Beam search is prone to fail with no log-prob normalization"
                )

        generators = [None, None]
        for i in range(2):
            # The top part, on top of BiRNN but before the attention
            if dims_top:
                top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded],
                          name="top{}".format(i))
            else:
                top = Identity(name='top{}'.format(i))

            if dec_stack == 1:
                transition = self.dec_transition(dim=dim_dec,
                                                 activation=Tanh(),
                                                 name="transition{}".format(i))
            else:
                transitions = [
                    self.dec_transition(dim=dim_dec,
                                        activation=Tanh(),
                                        name="transition_{}_{}".format(
                                            i, trans_level))
                    for trans_level in range(dec_stack)
                ]
                transition = RecurrentStack(transitions=transitions,
                                            skip_connections=True)
            # Choose attention mechanism according to the configuration
            if attention_type == "content":
                attention = SequenceContentAttention(
                    state_names=transition.apply.states,
                    attended_dim=dim_encoded,
                    match_dim=dim_matcher,
                    # format() is required here: ``"cont_att" + i`` would
                    # raise a TypeError because ``i`` is an int.
                    name="cont_att{}".format(i))
            elif attention_type == "content_and_conv":
                attention = SequenceContentAndConvAttention(
                    state_names=transition.apply.states,
                    conv_n=conv_n,
                    conv_num_filters=conv_num_filters,
                    attended_dim=dim_encoded,
                    match_dim=dim_matcher,
                    prior=prior,
                    energy_normalizer=energy_normalizer,
                    name="conv_att{}".format(i))
            else:
                raise ValueError(
                    "Unknown attention type {}".format(attention_type))
            if embed_outputs:
                feedback = LookupFeedback(
                    num_phonemes + 1, dim_dec
                    if dim_output_embedding is None else dim_output_embedding)
            else:
                feedback = OneOfNFeedback(num_phonemes + 1)
            if criterion['name'] == 'log_likelihood':
                emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                         name="emitter{}".format(i))
                if lm:
                    # In case we use LM it is Readout that is responsible
                    # for normalization.
                    emitter = LMEmitter()
            elif criterion['name'].startswith('mse'):
                emitter = RewardRegressionEmitter(criterion['name'],
                                                  eos_label,
                                                  num_phonemes,
                                                  criterion.get(
                                                      'min_reward', -1.0),
                                                  name="emitter")
            else:
                raise ValueError("Unknown criterion {}".format(
                    criterion['name']))
            readout_config = dict(
                readout_dim=num_phonemes,
                source_names=(transition.apply.states if use_states_for_readout
                              else []) + [attention.take_glimpses.outputs[0]],
                emitter=emitter,
                feedback_brick=feedback,
                name="readout{}".format(i))
            if post_merge_dims:
                readout_config['merged_dim'] = post_merge_dims[0]
                readout_config['post_merge'] = InitializableSequence(
                    [
                        Bias(post_merge_dims[0]).apply,
                        post_merge_activation.apply,
                        MLP(
                            [post_merge_activation] *
                            (len(post_merge_dims) - 1) + [Identity()],
                            # MLP was designed to support Maxout is activation
                            # (because Maxout in a way is not one). However
                            # a single layer Maxout network works with the trick below.
                            # For deeper Maxout network one has to use the
                            # Sequence brick.
                            [
                                d //
                                getattr(post_merge_activation, 'num_pieces', 1)
                                for d in post_merge_dims
                            ] + [num_phonemes]).apply,
                    ],
                    name='post_merge{}'.format(i))
            readout = Readout(**readout_config)

            language_model = None
            if use_lm:
                language_model = LanguageModel(nn_char_map=character_map,
                                               **lm_config)
                readout = ShallowFusionReadout(
                    lm_costs_name='lm_add',
                    lm_weight=lm_weight,
                    normalize_am_weights=normalize_am_weights,
                    normalize_lm_weights=normalize_lm_weights,
                    normalize_tot_weights=normalize_tot_weights,
                    am_beta=am_beta,
                    **readout_config)

            generators[i] = SequenceGenerator(readout=readout,
                                              transition=transition,
                                              attention=attention,
                                              language_model=language_model,
                                              name="generator{}".format(i))

        self.generator = generators[0]

        self.forward_to_backward = Linear(dim_dec, dim_dec)

        # Remember child bricks
        # NOTE(review): a ``top`` brick is created on each loop iteration
        # but only the last one is kept and registered as a child.
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generators = generators
        self.children = [self.forward_to_backward, encoder, top, bottom
                         ] + generators

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')

    def push_initialization_config(self):
        """Push the optional recurrent and initial-state initializers.

        After the regular push done by the parent class,
        ``rec_weights_init`` (if set) is pushed as both ``weights_init``
        and ``recurrent_weights_init`` with ``BaseRecurrent`` as the
        filter argument, and ``initial_states_init`` (if set) is pushed
        without a filter.

        """
        super(SpeechRecognizer, self).push_initialization_config()

        rec_init = self.rec_weights_init
        if rec_init:
            global_push_initialization_config(
                self,
                dict(weights_init=rec_init,
                     recurrent_weights_init=rec_init),
                BaseRecurrent)

        states_init = self.initial_states_init
        if states_init:
            global_push_initialization_config(
                self, dict(initial_states_init=states_init))

    @application
    def cost(self, application_call, **kwargs):
        """Build the combined cost of the forward and backward generators.

        Parameters
        ----------
        inputs_mask, labels, labels_mask
            Required keyword arguments.  ``labels_mask`` may be ``None``,
            in which case no masking is applied to the state-matching
            term.
        \*\*kwargs
            Everything else is passed to ``self.bottom.apply``.

        Returns
        -------
        ``costs_forward + costs_backward + 1.5 * l2_cost`` where
        ``l2_cost`` is the squared difference, averaged over the last
        axis, between a linear map of the forward states and the backward
        states.  Auxiliary variables named ``l2_cost_aux`` and
        ``costs_forward_aux`` are attached to the application call.

        """
        # pop inputs we know about
        inputs_mask = kwargs.pop('inputs_mask')
        labels = kwargs.pop('labels')
        labels_mask = kwargs.pop('labels_mask')
        has_labels_mask = labels_mask is not None

        # the rest is for bottom
        bottom_processed = self.bottom.apply(**kwargs)
        encoded, encoded_mask = self.encoder.apply(input_=bottom_processed,
                                                   mask=inputs_mask)
        encoded = self.top.apply(encoded)
        outs_forward = self.generators[0].evaluate(labels,
                                                   labels_mask,
                                                   attended=encoded,
                                                   attended_mask=encoded_mask)
        costs_forward, states_forward, _, _, _, _ = outs_forward

        # The second generator sees labels, masks and the attended
        # sequence reversed along the first axis; its outputs are flipped
        # back so that they line up with the forward ones.
        outs_backward = self.generators[1].evaluate(
            labels[::-1],
            labels_mask[::-1] if has_labels_mask else None,
            attended=encoded[::-1],
            attended_mask=encoded_mask[::-1])
        costs_backward, states_backward, _, _, _, _ = outs_backward
        costs_backward = costs_backward[::-1]
        states_backward = states_backward[::-1]

        # Apply the linear map to the forward states with the first two
        # axes flattened, then restore the original shape.
        states_shape = states_forward.shape
        backward_predicted = self.forward_to_backward.apply(
            states_forward.reshape((states_shape[0] * states_shape[1], -1)))
        backward_predicted = backward_predicted.reshape(states_shape)

        # The backward states act as a fixed target for this term.
        states_backward = gradient.disconnected_grad(states_backward)

        # Without a mask there is nothing to zero out; subscripting
        # ``None`` here used to raise a TypeError.
        if has_labels_mask:
            state_mask = labels_mask[:, :, None]
            backward_predicted = backward_predicted * state_mask
            states_backward = states_backward * state_mask

        l2_cost = ((backward_predicted - states_backward)**2).mean(axis=2)
        l2_cost.name = 'l2_cost_aux'
        application_call.add_auxiliary_variable(
            l2_cost.sum(axis=0).mean().copy(name='l2_cost_aux'))
        costs_forward_aux = (costs_forward.sum(axis=0).mean()).copy(
            name='costs_forward_aux')
        application_call.add_auxiliary_variable(costs_forward_aux)
        return costs_forward + costs_backward + 1.5 * l2_cost

    @application
    def generate(self, **kwargs):
        """Run the bottom, encoder and top, then generate with ``self.generator``.

        ``inputs_mask`` and ``n_steps`` are required keyword arguments;
        the remaining ones are passed to ``self.bottom.apply``.  When
        ``n_steps`` is ``None`` the symbolic ``self.n_steps`` is used
        instead.  The generator is called with ``as_dict=True``.

        """
        inputs_mask = kwargs.pop('inputs_mask')
        n_steps = kwargs.pop('n_steps')

        bottom_processed = self.bottom.apply(**kwargs)
        encoded, encoded_mask = self.encoder.apply(
            input_=bottom_processed, mask=inputs_mask)
        attended = self.top.apply(encoded)

        if n_steps is None:
            n_steps = self.n_steps
        return self.generator.generate(
            n_steps=n_steps,
            batch_size=attended.shape[1],
            attended=attended,
            attended_mask=encoded_mask,
            as_dict=True)

    def load_params(self, path):
        """Load parameter values from ``path`` into this recognizer.

        The generation graph is built first so that a ``Model`` wrapping
        its ``'outputs'`` entry can receive the loaded values.

        """
        generated = self.get_generate_graph()
        # Saved parameters are binary data, so the file is opened in
        # binary mode (text mode breaks on Python 3 and on Windows).
        # NOTE(review): assumes ``load_parameters`` is the Blocks
        # serialization helper -- confirm against the file's imports.
        with open(path, 'rb') as src:
            param_values = load_parameters(src)
        Model(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self, use_mask=True, n_steps=None):
        """Apply ``generate`` to the batch input variables.

        ``self.inputs_mask`` is passed as the inputs mask when
        ``use_mask`` is true, ``None`` otherwise.

        """
        mask = self.inputs_mask if use_mask else None
        return self.generate(n_steps=n_steps,
                             inputs_mask=mask,
                             **self.inputs)

    def get_cost_graph(self,
                       batch=True,
                       prediction=None,
                       prediction_mask=None):
        """Build the computation graph of ``cost``.

        Parameters
        ----------
        batch : bool
            If true, the batch input and label variables are used.
            Otherwise the single-example inputs are converted with
            ``bottom.single_to_batch_inputs``, ``single_labels`` gets a
            second axis and no label mask is used.
        prediction : optional
            Labels to score; defaults to the groundtruth labels.
        prediction_mask : optional
            Mask for ``prediction``; defaults to the groundtruth mask.

        Returns
        -------
        A ``ComputationGraph`` of the cost.  For criteria whose name
        starts with ``"mse"`` the variable with Theano name
        ``'groundtruth'`` is replaced by the groundtruth labels.

        """
        if batch:
            inputs = self.inputs
            inputs_mask = self.inputs_mask
            groundtruth = self.labels
            groundtruth_mask = self.labels_mask
        else:
            inputs, inputs_mask = self.bottom.single_to_batch_inputs(
                self.single_inputs)
            groundtruth = self.single_labels[:, None]
            groundtruth_mask = None

        # Test for None explicitly: the truth value of a symbolic
        # variable or of an array is not a reliable "was it given" check.
        if prediction is None:
            prediction = groundtruth
        if prediction_mask is None:
            prediction_mask = groundtruth_mask

        cost = self.cost(inputs_mask=inputs_mask,
                         labels=prediction,
                         labels_mask=prediction_mask,
                         **inputs)
        cost_cg = ComputationGraph(cost)
        if self.criterion['name'].startswith("mse"):
            placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
            cost_cg = cost_cg.replace({placeholder: groundtruth})
        return cost_cg

    def analyze(self, inputs, groundtruth, prediction=None):
        """Compute cost and alignment.

        Parameters
        ----------
        inputs
            Anything accepted by ``dict()``; the resulting items are
            passed to the compiled function as keyword arguments.
        groundtruth
            Value passed under the name ``'groundtruth'``.
        prediction : optional
            When given, passed under the name ``'prediction'`` and used
            as the labels whose cost is computed.

        Returns
        -------
        The outputs of the compiled Theano function: ``cost[:, 0]``,
        ``weights[:, 0, :]`` and the energies sliced the same way (zeros
        shaped like the weights when no ``energies`` variable is found).

        """
        input_values_dict = dict(inputs)
        input_values_dict['groundtruth'] = groundtruth
        with_prediction = prediction is not None
        if with_prediction:
            input_values_dict['prediction'] = prediction

        # The inputs of the compiled function depend on whether a
        # prediction is supplied, so a function compiled for the other
        # mode cannot be reused and is rebuilt.
        stale = (not hasattr(self, "_analyze") or
                 getattr(self, "_analyze_with_prediction", None)
                 != with_prediction)
        if stale:
            input_variables = list(self.single_inputs.values())
            input_variables.append(self.single_labels.copy(name='groundtruth'))

            prediction_variable = tensor.lvector('prediction')
            if with_prediction:
                input_variables.append(prediction_variable)
                cg = self.get_cost_graph(batch=False,
                                         prediction=prediction_variable[:,
                                                                        None])
            else:
                cg = self.get_cost_graph(batch=False)
            cost = cg.outputs[0]

            weights, = VariableFilter(bricks=[self.generator],
                                      name="weights")(cg)

            energies = VariableFilter(bricks=[self.generator],
                                      name="energies")(cg)
            energies_output = [
                energies[0][:,
                            0, :] if energies else tensor.zeros_like(weights)
            ]

            # Unused below, but the unpacking still checks that exactly
            # one such variable exists in the graph.
            states, = VariableFilter(applications=[self.encoder.apply],
                                     roles=[OUTPUT],
                                     name="encoded")(cg)

            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]

            self._analyze = theano.function(
                input_variables, [cost[:, 0], weights[:, 0, :]] +
                energies_output + ctc_matrix_output,
                on_unused_input='warn')
            self._analyze_with_prediction = with_prediction
        return self._analyze(**input_values_dict)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        Nothing is done when a beam search object already exists and was
        built for the same ``beam_size``.

        See Blocks issue #500.

        """
        already_built = hasattr(self, '_beam_search')
        if already_built and self.beam_size == beam_size:
            # Only recompile if the user wants a different beam size
            return

        self.beam_size = beam_size
        generated = self.get_generate_graph(use_mask=False, n_steps=3)
        graph = ComputationGraph(generated.values())
        output_filter = VariableFilter(
            applications=[self.generator.generate], name="outputs")
        samples, = output_filter(graph)
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, inputs, **kwargs):
        """Run beam search for a single example.

        Every array in ``inputs`` gets a new second axis before being
        handed to the search; the maximum output length is the number of
        input time steps divided by ``max_decoded_length_scale``.  Extra
        keyword arguments go to the underlying ``search`` call.

        Raises
        ------
        Exception
            If ``inputs`` contains entries that match no input variable.

        """
        # When a recognizer is unpickled, self.beam_size is available
        # but beam search has to be recompiled.
        self.init_beam_search(self.beam_size)

        remaining = dict(inputs)
        num_steps = self.bottom.num_time_steps(**remaining)
        max_length = int(num_steps / self.max_decoded_length_scale)

        search_inputs = {
            var: remaining.pop(var.name)[:, numpy.newaxis, ...]
            for var in self.inputs.values()
        }
        if remaining:
            raise Exception('Unknown inputs passed to beam search: {}'.format(
                remaining.keys()))

        outputs, search_costs = self._beam_search.search(
            search_inputs,
            self.eos_label,
            max_length,
            ignore_first_eol=self.data_prepend_eos,
            **kwargs)
        return outputs, search_costs

    def init_generate(self):
        """Compile the function used by ``sample`` and store it.

        The function is built from the ``'outputs'`` entry of the
        mask-free generation graph.

        """
        outputs = self.get_generate_graph(use_mask=False)['outputs']
        self._do_generate = ComputationGraph(outputs).get_theano_function()

    def sample(self, inputs, n_steps=None):
        """Generate an output for a single example.

        The generation function is compiled on first use.  When
        ``n_steps`` is ``None`` it is set to the number of input time
        steps divided by ``max_decoded_length_scale``.  The first output
        of the compiled function is returned.

        """
        if not hasattr(self, '_do_generate'):
            self.init_generate()

        batch, unused_mask = self.bottom.single_to_batch_inputs(inputs)
        if n_steps is None:
            n_steps = int(self.bottom.num_time_steps(**batch) /
                          self.max_decoded_length_scale)
        batch['n_steps'] = n_steps
        return self._do_generate(**batch)[0]

    def __getstate__(self):
        """Return the instance state without the compiled functions.

        ``_analyze``, ``_beam_search`` and ``_do_generate`` are all
        created lazily by the methods that use them, so they are left out
        of the pickled state and rebuilt on demand.

        """
        state = dict(self.__dict__)
        for attr in ('_analyze', '_beam_search', '_do_generate'):
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        """Restore the state and drop the emitter's cached ``_theano_rng``."""
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            # Best effort: nothing to delete when any attribute along
            # the chain is missing.  Other errors are not silenced.
            pass
Ejemplo n.º 11
0
class BlocksNMTVanillaDecoder(Decoder):
    """Adaptor class for blocks.search.BeamSearch. We implement the
    ``Decoder`` class but ignore functionality for predictors or
    heuristics. Instead, we pass through decoding directly to the
    blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system
    NMT decoding. Note that this decoder supports sparse feat maps
    on both source and target side.
    """

    def __init__(self, nmt_model_path, config, decoder_args):
        """Set up the NMT model used by the decoder.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
            config (dict): NMT configuration
            decoder_args (object): Decoder configuration passed through
                                   from configuration API.
        """
        super(BlocksNMTVanillaDecoder, self).__init__(decoder_args)
        self.config = config
        self.set_up_decoder(nmt_model_path)
        # The source feature map only exists after set_up_decoder().
        self.src_eos = self.src_sparse_feat_map.word2dense(utils.EOS_ID)

    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model, load its weights and create the beam
        search object. This method basically corresponds to
        ``blocks.machine_translation.main``.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        self.nmt_model = NMTModel(self.config)
        self.nmt_model.set_up()
        loader = LoadNMTUtils(nmt_model_path,
                              self.config['saveto'],
                              self.nmt_model.search_model)
        loader.load_weights()

        # Fall back to flat feature maps when none is configured.
        self.src_sparse_feat_map = (self.config['src_sparse_feat_map']
                                    or FlatSparseFeatMap())
        if self.config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=self.nmt_model.samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=self.nmt_model.samples)

    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the
        sentence level, not on the word level: the whole score is
        attributed to the first position. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.

        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence

        Returns:
            list. A list of ``Hypothesis`` instances, in the order in
            which the beam search returned them.
        """
        seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                src_sentence,
                self.config['src_vocab_size'])) + [self.src_eos]
        beam_size = self.config['beam_size']
        if self.src_sparse_feat_map.dim > 1:  # sparse src feats
            input_ = np.transpose(np.tile(seq, (beam_size, 1, 1)),
                                  (2, 0, 1))
        else:  # word ids on the source side
            input_ = np.tile(seq, (beam_size, 1))
        trans, costs = self.beam_search.search(
                    input_values={self.nmt_model.sampling_input: input_},
                    max_length=3*len(src_sentence),
                    eol_symbol=utils.EOS_ID,
                    ignore_first_eol=True)
        hypos = []
        max_len = 0
        for sentence, cost in zip(trans, costs):
            score = -cost
            n_words = len(sentence)
            max_len = max(max_len, n_words)
            hypo = Hypothesis(sentence, score)
            hypo.score_breakdown = n_words * [[(0.0, 1.0)]]
            # An empty hypothesis has no first position to carry the
            # sentence score; indexing it would raise an IndexError.
            if n_words:
                hypo.score_breakdown[0] = [(score, 1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * beam_size
        return hypos

    def has_predictors(self):
        """Always returns true. """
        return True
Ejemplo n.º 12
0
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config.

    """

    def __init__(self,
                 input_dims,
                 input_num_chars,
                 eos_label,
                 num_phonemes,
                 dim_dec, dims_bidir,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 criterion,
                 bottom,
                 lm=None, character_map=None,
                 bidir=True,
                 subsample=None,
                 dims_top=None,
                 prior=None, conv_n=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dim_output_embedding=None,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in SequenceContentAndConvAttention
                 energy_normalizer=None,
                 # for speech this is the approximate phoneme duration in frames
                 max_decoded_length_scale=1,
                 **kwargs):
        """Assemble the bottom, encoder, top and sequence generator.

        ``bottom`` must contain a ``'bottom_class'`` entry; the remaining
        entries are forwarded to that class.  When ``lm`` has a truthy
        ``'path'`` entry, a ``LanguageModel`` is created and the readout
        is a ``ShallowFusionReadout``.  Neither ``bottom`` nor ``lm`` is
        modified.

        Raises
        ------
        ValueError
            If ``attention_type`` or ``criterion['name']`` is not one of
            the supported values.

        """
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN.  A copy is used so that the
        # caller's configuration dict keeps its 'bottom_class' entry.
        bottom_config = dict(bottom)
        bottom_class = bottom_config.pop('bottom_class')
        bottom = bottom_class(
            input_dims=input_dims, input_num_chars=input_num_chars,
            name='bottom',
            **bottom_config)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample, bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded], name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            transitions = [self.dec_transition(dim=dim_dec,
                                               activation=Tanh(),
                                               name="transition_{}".format(trans_level))
                           for trans_level in range(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1,
                                      dim_dec if
                                      dim_output_embedding is None
                                      else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes, name="emitter")
            if lm:
                # In case we use LM it is Readout that is responsible
                # for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(
                criterion['name'], eos_label, num_phonemes,
                criterion.get('min_reward', -1.0),
                name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states if use_states_for_readout else [])
                         + [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                    # MLP was designed to support Maxout is activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick below.
                    # For deeper Maxout network one has to use the
                    # Sequence brick.
                    [d//getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            # Pop the fusion options from a copy; what remains is passed
            # on to the language model.
            lm_config = dict(lm)
            lm_weight = lm_config.pop('weight', 0.0)
            normalize_am_weights = lm_config.pop('normalize_am_weights', True)
            normalize_lm_weights = lm_config.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm_config.pop('normalize_tot_weights', False)
            am_beta = lm_config.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warning("Beam search is prone to fail with no log-prob normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm_config)
            readout = ShallowFusionReadout(lm_costs_name='lm_add',
                                           lm_weight=lm_weight,
                                           normalize_am_weights=normalize_am_weights,
                                           normalize_lm_weights=normalize_lm_weights,
                                           normalize_tot_weights=normalize_tot_weights,
                                           am_beta=am_beta,
                                           **readout_config)

        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')

    def push_initialization_config(self):
        """Push initialization schemes down to the child bricks.

        Runs the parent class push first. Then, when ``rec_weights_init``
        is set, it is pushed as both ``weights_init`` and
        ``recurrent_weights_init`` with ``BaseRecurrent`` given as the
        third argument of ``global_push_initialization_config``
        (presumably restricting the push to recurrent bricks -- confirm
        against that helper). When ``initial_states_init`` is set, it is
        pushed without such an argument.
        """
        super(SpeechRecognizer, self).push_initialization_config()

        recurrent_init = self.rec_weights_init
        if recurrent_init:
            overrides = dict(weights_init=recurrent_init,
                             recurrent_weights_init=recurrent_init)
            global_push_initialization_config(self, overrides, BaseRecurrent)

        states_init = self.initial_states_init
        if states_init:
            global_push_initialization_config(
                self, dict(initial_states_init=states_init))

    @application
    def cost(self, **kwargs):
        """Build the cost matrix of the labels given the inputs.

        ``inputs_mask``, ``labels`` and ``labels_mask`` must be present
        among the keyword arguments; every other keyword argument is
        forwarded unchanged to ``self.bottom.apply``.
        """
        # Consume the arguments handled here; what is left is for bottom.
        inputs_mask = kwargs.pop('inputs_mask')
        labels = kwargs.pop('labels')
        labels_mask = kwargs.pop('labels_mask')

        features = self.bottom.apply(**kwargs)
        attended, attended_mask = self.encoder.apply(input_=features,
                                                     mask=inputs_mask)
        attended = self.top.apply(attended)
        return self.generator.cost_matrix(labels,
                                          labels_mask,
                                          attended=attended,
                                          attended_mask=attended_mask)

    @application
    def generate(self, **kwargs):
        """Build the generation graph for the given inputs.

        ``inputs_mask`` and ``n_steps`` must be present among the keyword
        arguments; the rest is forwarded to ``self.bottom.apply``. When
        ``n_steps`` is ``None`` the symbolic ``self.n_steps`` is used.
        The generator is asked for a dictionary (``as_dict=True``).
        """
        inputs_mask = kwargs.pop('inputs_mask')
        n_steps = kwargs.pop('n_steps')

        features = self.bottom.apply(**kwargs)
        attended, attended_mask = self.encoder.apply(input_=features,
                                                     mask=inputs_mask)
        attended = self.top.apply(attended)

        if n_steps is None:
            n_steps = self.n_steps
        return self.generator.generate(n_steps=n_steps,
                                       batch_size=attended.shape[1],
                                       attended=attended,
                                       attended_mask=attended_mask,
                                       as_dict=True)

    def load_params(self, path):
        """Load parameter values from ``path`` into this recognizer.

        The generation graph is built first so that a ``Model`` wrapping
        its ``'outputs'`` entry can receive the values read by
        ``load_parameters``.

        Parameters
        ----------
        path : str
            Location of the saved parameter file.
        """
        generated = self.get_generate_graph()
        # The parameter file is a binary archive: text mode would decode
        # or translate its bytes on Python 3 and on Windows.
        with open(path, 'rb') as src:
            param_values = load_parameters(src)
        Model(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self, use_mask=True, n_steps=None):
        """Call ``generate`` on the recognizer's own input variables.

        ``self.inputs`` is expanded into keyword arguments. The input
        mask is ``self.inputs_mask`` when ``use_mask`` is true and
        ``None`` otherwise; ``n_steps`` is passed through as given.
        """
        mask = self.inputs_mask if use_mask else None
        return self.generate(n_steps=n_steps, inputs_mask=mask, **self.inputs)

    def get_cost_graph(self, batch=True,
                       prediction=None, prediction_mask=None):
        """Build the computation graph of the cost.

        Parameters
        ----------
        batch : bool
            When true, the batch variables (``self.inputs``,
            ``self.inputs_mask``, ``self.labels``, ``self.labels_mask``)
            are used. Otherwise the single-example inputs are converted
            with ``self.bottom.single_to_batch_inputs`` and
            ``self.single_labels`` gets a second axis; no label mask is
            used in that case.
        prediction : variable, optional
            Label sequence to compute the cost for. Defaults to the
            ground truth.
        prediction_mask : variable, optional
            Mask for ``prediction``. Defaults to the ground truth mask.

        Returns
        -------
        The ``ComputationGraph`` of the cost. If the criterion name
        starts with ``"mse"``, the variable named ``'groundtruth'`` in
        that graph is replaced by the ground truth labels.
        """
        if batch:
            inputs = self.inputs
            inputs_mask = self.inputs_mask
            groundtruth = self.labels
            groundtruth_mask = self.labels_mask
        else:
            inputs, inputs_mask = self.bottom.single_to_batch_inputs(
                self.single_inputs)
            groundtruth = self.single_labels[:, None]
            groundtruth_mask = None

        # Compare against None explicitly: the arguments are symbolic
        # variables (or arrays), whose truth value is not a reliable
        # "was it given" test and may even raise.
        if prediction is None:
            prediction = groundtruth
        if prediction_mask is None:
            prediction_mask = groundtruth_mask

        cost = self.cost(inputs_mask=inputs_mask,
                         labels=prediction,
                         labels_mask=prediction_mask,
                         **inputs)
        cost_cg = ComputationGraph(cost)
        if self.criterion['name'].startswith("mse"):
            placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
            cost_cg = cost_cg.replace({placeholder: groundtruth})
        return cost_cg

    def analyze(self, inputs, groundtruth, prediction=None):
        """Compute cost and alignment.

        Parameters
        ----------
        inputs : mapping or iterable of pairs
            Values for the single-example input variables, keyed by name.
        groundtruth
            Value for the ``'groundtruth'`` input.
        prediction : optional
            Value for the ``'prediction'`` input. When given, the cost is
            computed for this sequence instead of the ground truth.

        Returns
        -------
        The outputs of the compiled function: ``cost[:, 0]``,
        ``weights[:, 0, :]`` and the energies (or zeros shaped like the
        weights when the graph has no ``"energies"`` variable).

        The function compiled with a ``prediction`` input differs from
        the one compiled without it, so one function per variant is kept
        in the ``_analyze`` dictionary. Caching a single function would
        make the accepted inputs depend on how the first call was made.
        """
        input_values_dict = dict(inputs)
        input_values_dict['groundtruth'] = groundtruth
        with_prediction = prediction is not None
        if with_prediction:
            input_values_dict['prediction'] = prediction

        compiled = getattr(self, '_analyze', None)
        if not isinstance(compiled, dict):
            compiled = self._analyze = {}

        if with_prediction not in compiled:
            input_variables = list(self.single_inputs.values())
            input_variables.append(self.single_labels.copy(name='groundtruth'))

            if with_prediction:
                prediction_variable = tensor.lvector('prediction')
                input_variables.append(prediction_variable)
                cg = self.get_cost_graph(
                    batch=False, prediction=prediction_variable[:, None])
            else:
                cg = self.get_cost_graph(batch=False)
            cost = cg.outputs[0]

            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)

            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            energies_output = [energies[0][:, 0, :] if energies
                               else tensor.zeros_like(weights)]

            # Filtering doubles as a check that the graph contains
            # exactly one encoder output named "encoded".
            states, = VariableFilter(
                applications=[self.encoder.apply], roles=[OUTPUT],
                name="encoded")(cg)

            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]

            compiled[with_prediction] = theano.function(
                input_variables,
                [cost[:, 0], weights[:, 0, :]] + energies_output + ctc_matrix_output,
                on_unused_input='warn')
        return compiled[with_prediction](**input_values_dict)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        Nothing happens when a beam search already exists and was set up
        for the same ``beam_size``. Otherwise the generation graph is
        built without an input mask, its ``"outputs"`` variable is
        handed to a new ``BeamSearch``, and that object is compiled.

        See Blocks issue #500.

        """
        already_built = hasattr(self, '_beam_search')
        if already_built and self.beam_size == beam_size:
            # Recompilation is only needed for a different beam size.
            return

        self.beam_size = beam_size
        graph_outputs = self.get_generate_graph(use_mask=False,
                                                n_steps=3).values()
        find_samples = VariableFilter(applications=[self.generator.generate],
                                      name="outputs")
        samples, = find_samples(ComputationGraph(graph_outputs))
        searcher = BeamSearch(beam_size, samples)
        self._beam_search = searcher
        searcher.compile()

    def beam_search(self, inputs, **kwargs):
        """Decode one utterance with beam search.

        ``inputs`` must hold exactly one value per variable in
        ``self.inputs``, keyed by variable name; anything extra raises an
        ``Exception``. Each value is indexed with
        ``[:, numpy.newaxis, ...]`` before being given to the search. The
        maximum output length is the number of time steps reported by the
        bottom brick divided by ``max_decoded_length_scale``. Additional
        keyword arguments go to the search itself.

        Returns the ``(outputs, costs)`` pair produced by the search.
        """
        # After unpickling only beam_size survives, so the search may
        # have to be compiled again at this point.
        self.init_beam_search(self.beam_size)

        remaining = dict(inputs)
        num_steps = self.bottom.num_time_steps(**remaining)
        max_length = int(num_steps / self.max_decoded_length_scale)

        search_inputs = dict(
            (variable, remaining.pop(variable.name)[:, numpy.newaxis, ...])
            for variable in self.inputs.values())
        if remaining:
            raise Exception(
                'Unknown inputs passed to beam search: {}'.format(
                    remaining.keys()))

        found = self._beam_search.search(
            search_inputs, self.eos_label,
            max_length,
            ignore_first_eol=self.data_prepend_eos,
            **kwargs)
        outputs, search_costs = found
        return outputs, search_costs

    def init_generate(self):
        """Compile the generation function and keep it as ``_do_generate``.

        The graph is built without an input mask and only its
        ``'outputs'`` entry is compiled.
        """
        outputs = self.get_generate_graph(use_mask=False)['outputs']
        self._do_generate = ComputationGraph(outputs).get_theano_function()

    def sample(self, inputs, n_steps=None):
        """Run the compiled generation function on a single example.

        The function is compiled on first use. ``inputs`` is converted
        to a batch by the bottom brick. When ``n_steps`` is ``None`` it
        is derived from the number of input time steps divided by
        ``max_decoded_length_scale``. The first output of the compiled
        function is returned.
        """
        if not hasattr(self, '_do_generate'):
            self.init_generate()

        batch, _ = self.bottom.single_to_batch_inputs(inputs)
        if n_steps is None:
            n_steps = int(self.bottom.num_time_steps(**batch) /
                          self.max_decoded_length_scale)
        batch['n_steps'] = n_steps

        generated = self._do_generate(**batch)
        return generated[0]

    def __getstate__(self):
        """Return the state to pickle, without the compiled functions.

        ``_analyze``, ``_beam_search`` and ``_do_generate`` are created
        lazily by ``analyze``, ``init_beam_search`` and ``init_generate``
        and are rebuilt on demand, so they are left out of a copy of the
        instance dictionary. The instance itself is not modified.
        """
        state = dict(self.__dict__)
        for attr in ('_analyze', '_beam_search', '_do_generate'):
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        """Restore the pickled state and drop the emitter's cached RNG.

        The emitter's ``_theano_rng`` attribute is deleted when present.
        A missing generator, readout, emitter or ``_theano_rng`` is not
        an error; any other failure is allowed to propagate.
        """
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            # Nothing cached to remove.
            pass
Ejemplo n.º 13
0
class AccValidator(SimpleExtension):
    """Implements early stopping based on accuracy score.

    The validation set is decoded with beam search, the output file is
    scored by the external script named in ``config['bleu_script']`` and
    the parameters of the ``track_n_models`` best models are kept in
    ``config['saveto']``.
    """

    def __init__(self,
                 source_sentence,
                 samples,
                 model,
                 data_stream,
                 config,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 store_full_main_loop=False,
                 **kwargs):
        """Creates a new extension which adds model selection based on
            the accuracy score to the training main loop.

        Args:
            source_sentence (Variable): Input variable to the sampling
            computation graph
            samples (Variable): Samples variable of the CG
            model (NMTModel): See the model module
            data_stream (DataStream): Data stream to the development
            set
            config (dict): NMT configuration
            n_best (int): beam size
            track_n_models (int): Number of n-best models for which to
            create checkpoints.
            normalize (boolean): Enables length normalization
            store_full_main_loop (boolean): Stores the iteration state
            in the old style of
            Blocks 0.1. Not recommended
            """
        super(AccValidator, self).__init__(**kwargs)
        self.store_full_main_loop = store_full_main_loop
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        # Kept sorted by score, worst first (see _save_model).
        self.best_models = []
        self.val_bleu_curve = []

        self.src_sparse_feat_map = (config['src_sparse_feat_map']
                                    or FlatSparseFeatMap())
        if config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=samples)

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                                     'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()
                # Track the n best previous scores. They are stored worst
                # first because _save_model evicts the entry at index 0.
                top_scores = [
                    bleu for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True))
                    if i < self.track_n_models]
                self.best_models.extend(
                    ModelInfo(bleu) for bleu in reversed(top_scores))
                logging.info("BleuScores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file just
                # means there is no history to resume from.
                logging.info("BleuScores not Found")

        self.verbose = self.config.get('val_set_out', None)
        utils.load_trg_wmap(self.config['trg_wmap'])
        self.trg_wmap = utils.trg_wmap

    def do(self, which_callback, *args):
        """Decodes the dev set and stores checkpoints in case the BLEU
        score has improved.
        """
        # The burn-in period is counted in epochs.
        if self.main_loop.status['epochs_done'] <= self.config['val_burn_in']:
            return
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Decode the validation set and return its score.

        Every source line is decoded with beam search. When
        ``config['val_set_out']`` is set, the best hypothesis of each
        line is written to that file. The score is the float printed by
        the external scoring script; it is also appended to
        ``val_bleu_curve``.
        """
        logging.info("Started Validation: ")
        val_start_time = time.time()
        total_cost = 0.0
        beam_size = self.config['beam_size']

        ftrans = None
        if self.verbose:
            ftrans = codecs.open(self.config['val_set_out'], 'w', 'utf-8')
        try:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                seq = self.src_sparse_feat_map.words2dense(
                    utils.oov_to_unk(line[0], self.config['src_vocab_size']))
                if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                    input_ = numpy.transpose(
                        numpy.tile(seq, (beam_size, 1, 1)), (2, 0, 1))
                else:  # word ids on the source side
                    input_ = numpy.tile(seq, (beam_size, 1))

                trans, costs = self.beam_search.search(
                    input_values={self.source_sentence: input_},
                    max_length=3 * len(line[0]),
                    eol_symbol=utils.EOS_ID,
                    ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        # A separate name keeps the full hypothesis list
                        # intact for the remaining n-best entries.
                        hypothesis = trans[best]
                        if hypothesis and hypothesis[-1] == utils.EOS_ID:
                            hypothesis = hypothesis[:-1]
                    except ValueError:
                        logging.info(
                            "Can NOT find a translation for line: {}".format(
                                i + 1))
                        # An empty hypothesis keeps the output file
                        # aligned line by line with the reference.
                        hypothesis = []
                    if j == 0 and self.verbose:
                        print(utils.apply_trg_wmap(hypothesis, self.trg_wmap),
                              file=ftrans)

                if i != 0 and i % 100 == 0:
                    logging.info(
                        "Translated {} lines of validation set...".format(i))
        finally:
            # The file has to be closed (and flushed) before the scoring
            # script reads it, and must not leak when decoding fails.
            if ftrans is not None:
                ftrans.close()

        logging.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
        logging.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))

        script_args = "{} {} {} {}".format(
            self.config['bleu_script'], self.config['val_set_out'],
            self.config['val_set_grndtruth'], self.config['results_out'])
        logging.info(script_args)
        # NOTE: the command line goes through the shell, so these config
        # values must come from a trusted source.
        bleu_score = float(subprocess.check_output(
            "python2.7 " + script_args, shell=True).decode("utf-8"))
        self.val_bleu_curve.append(bleu_score)
        logging.info(bleu_score)
        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Tell whether ``bleu_score`` beats the worst tracked model.

        Always true while no model is tracked yet.
        """
        if not self.best_models:
            return True
        worst = min(self.best_models, key=operator.attrgetter('bleu_score'))
        return bool(worst.bleu_score < bleu_score)

    def save_parameter_values(self, param_values, path):
        """Save ``param_values`` to ``path`` with ``numpy.savez``.

        Slashes in parameter names are replaced by dashes. This method
        is copied from blocks.machine_translation.checkpoint.
        """
        param_values = {name.replace("/", "-"): param
                        for name, param in param_values.items()}
        numpy.savez(path, **param_values)

    def _save_model(self, bleu_score):
        """Checkpoint the current parameters if the score is good enough.

        When the list of tracked models is full, its worst entry is
        dropped and its file deleted. SIGINT is ignored while the files
        are written and the previous handler is restored afterwards, also
        when writing fails.
        """
        if not self._is_valid_to_save(bleu_score):
            return

        model = ModelInfo(bleu_score, self.config['saveto'])
        # Manage n-best model list first
        if self.best_models and len(self.best_models) >= self.track_n_models:
            old_model = self.best_models[0]
            if old_model.path and os.path.isfile(old_model.path):
                logging.info("Deleting old model %s" % old_model.path)
                os.remove(old_model.path)
            self.best_models.remove(old_model)
        self.best_models.append(model)
        self.best_models.sort(key=operator.attrgetter('bleu_score'))

        # Save the model here
        previous_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            # fs439: introduce store_full_main_loop and
            # storing best_bleu_params_* files
            if self.store_full_main_loop:
                logging.info(
                    "Saving full main loop model {}".format(model.path))
                numpy.savez(model.path,
                            **self.main_loop.model.get_parameter_dict())
            else:
                logging.info(
                    "Saving model parameters {}".format(model.path))
                params_to_save = self.main_loop.model.get_parameter_values()
                self.save_parameter_values(params_to_save, model.path)
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_bleu_scores.npz'),
                bleu_scores=self.val_bleu_curve)
        finally:
            signal.signal(signal.SIGINT, previous_handler)
Ejemplo n.º 14
0
class F1Validator(SimpleExtension, SamplingBase):
    """Implements early stopping based on F1 score."""
    # TODO: a lot has been changed in NMT, sync respectively

    # Tokens that take part in the F1 computation.
    _KEYWORDS = frozenset([
        '<FULL_STOP>', '<COMMA>', '<QUESTION_MARK>',
        '<EXCLAMATION_MARK>', '<DOTS>'
    ])

    def __init__(self,
                 samples,
                 model,
                 data_stream,
                 config,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 **kwargs):
        """Set up the validator.

        ``samples`` is handed to ``BeamSearch``; ``model.inputs`` and
        ``data_stream`` provide the validation inputs; ``config`` holds
        the vocabularies, special tokens, beam size, burn-in, output and
        saving locations. ``n_best`` limits how many hypotheses per line
        contribute to the total cost, ``track_n_models`` is the number of
        checkpoints kept and ``normalize`` enables length normalization
        of the costs.
        """
        # TODO: change config structure
        super(F1Validator, self).__init__(**kwargs)
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.vocab = config["src_vocab"]
        self.unk_sym = config["unk_token"]
        self.eos_sym = config["eos_token"]
        self.trg_vocab = config["trg_vocab"]
        self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}
        self.trg_eos_idx = self.trg_vocab[config["eos_token"]]
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        # Kept sorted by score, worst first (see _save_model).
        self.best_models = []
        self.val_f1_curve = []
        self.beam_search = BeamSearch(samples=samples)

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                f1_score = numpy.load(
                    os.path.join(self.config['saveto'], 'val_f1_scores.npz'))
                self.val_f1_curve = f1_score['f1_scores'].tolist()

                # Track the n best previous scores. They are stored worst
                # first because _save_model evicts the entry at index 0.
                top_scores = [
                    f1 for i, f1 in enumerate(
                        sorted(self.val_f1_curve, reverse=True))
                    if i < self.track_n_models]
                self.best_models.extend(
                    ModelInfo(f1) for f1 in reversed(top_scores))
                logger.info("F1Scores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file just
                # means there is no history to resume from.
                logger.info("F1Scores not Found")

    def do(self, which_callback, *args):
        """Validate and checkpoint once the burn-in period is over."""
        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return

        # Evaluate and save if necessary
        self._save_model(self._evaluate_model())

    @staticmethod
    def _tile_for_beam(x, beam_size):
        """Repeat ``x`` ``beam_size`` times along a new leading axis."""
        return numpy.tile(x, (beam_size, ) + (1, ) * x.ndim)

    @classmethod
    def _count_errors(cls, ref_tokens, hyp_tokens):
        """Compare two token lists position by position.

        Returns the number of correct keywords, substituted keywords,
        insertions and deletions. Reference keywords beyond the end of a
        shorter hypothesis are counted as deletions.
        """
        keywords = cls._KEYWORDS
        correct = substituted = inserted = deleted = 0
        for ref, hyp in zip(ref_tokens, hyp_tokens):
            if ref == hyp:
                if ref in keywords:
                    correct += 1
            elif ref in keywords and hyp in keywords:
                substituted += 1
            elif ref not in keywords:
                inserted += 1
            else:
                # Reference keyword answered by a non-keyword token.
                deleted += 1
        # If beam returns too short answer
        deleted += sum(1 for ref in ref_tokens[len(hyp_tokens):]
                       if ref in keywords)
        return correct, substituted, inserted, deleted

    def _evaluate_model(self):
        """Decode the validation set and return its F1 score.

        Each line is decoded with beam search and the best hypothesis is
        compared token by token with the reference punctuation sequence.
        When ``config['val_set_out']`` is set, the best hypothesis of
        each line is written to that file. The score is appended to
        ``val_f1_curve``.
        """
        logger.info("Started Validation: ")
        val_start_time = time.time()
        total_cost = 0.0
        beam_size = self.config['beam_size']

        # Running counts: correct, substituted, inserted, deleted.
        C = S = I = D = 0
        # Defined up front so the summary also works for an empty stream.
        i = 0

        ftrans = None
        if self.verbose:
            ftrans = open(self.config['val_set_out'], 'w')
        try:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file
                available_inputs = dict(
                    zip(["sampling_%s" % x for x in self.data_stream.sources],
                        line))
                input_values = OrderedDict(
                    (variable,
                     self._tile_for_beam(available_inputs[variable.name],
                                         beam_size))
                    for variable in self.model.inputs)
                seq = available_inputs["sampling_words"]
                reference = self._idx_to_word(
                    available_inputs["sampling_punctuation_marks"],
                    self.trg_ivocab)

                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = self.beam_search.search(
                    input_values=input_values,
                    max_length=len(seq), eol_symbol=self.trg_eos_idx,
                    ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        # convert idx to words
                        trans_out = self._idx_to_word(trans[best],
                                                      self.trg_ivocab)
                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i + 1))
                        trans_out = '<UNK>'

                    if j == 0:
                        # Compute F-Measure
                        correct, substituted, inserted, deleted = \
                            self._count_errors(reference.split(),
                                               trans_out.split())
                        C += correct
                        S += substituted
                        I += inserted
                        D += deleted

                        if self.verbose:
                            print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    f1_score = self.compute_f1_score(C, S, I, D)
                    logger.info(
                        "Translated {} lines of validation set... F1 = {}, {}, {}, {}, {}"
                        .format(i, f1_score, C, S, I, D))
        finally:
            # Do not leak the output file when decoding fails.
            if ftrans is not None:
                ftrans.close()

        # extract the score
        f1_score = self.compute_f1_score(C, S, I, D)
        self.val_f1_curve.append(f1_score)
        logger.info(f1_score)

        logger.info("Total cost of the validation: {}".format(total_cost))
        logger.info(
            "Translated {} lines of validation set... F1 = {}, {}, {}, {}, {}".
            format(i, f1_score, C, S, I, D))
        self.data_stream.reset()

        logger.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))

        return f1_score

    def compute_f1_score(self, C, S, I, D):
        """Return the F1 score for the given counts.

        ``C`` is smoothed by 0.0001 so that none of the divisions can be
        by zero. Precision is ``C / (C + S + I)`` and recall is
        ``C / (C + S + D)``.
        """
        C += 0.0001
        precision = float(C) / (C + S + I)
        recall = float(C) / (C + S + D)
        f1 = (2.0 * precision * recall) / (precision + recall)

        return f1

    def _is_valid_to_save(self, f1_score):
        """Tell whether ``f1_score`` beats the worst tracked model.

        Always true while no model is tracked yet.
        """
        if not self.best_models:
            return True
        worst = min(self.best_models, key=operator.attrgetter('f1_score'))
        return bool(worst.f1_score < f1_score)

    def _save_model(self, f1_score):
        """Checkpoint the current parameters if the score is good enough.

        When the list of tracked models is full, its worst entry is
        dropped and its file deleted. SIGINT is ignored while the files
        are written and the previous handler is restored afterwards, also
        when writing fails.
        """
        if not self._is_valid_to_save(f1_score):
            return

        model = ModelInfo(f1_score, self.config['saveto'])

        # Manage n-best model list first
        if self.best_models and len(self.best_models) >= self.track_n_models:
            old_model = self.best_models[0]
            if old_model.path and os.path.isfile(old_model.path):
                logger.info("Deleting old model %s" % old_model.path)
                os.remove(old_model.path)
            self.best_models.remove(old_model)

        self.best_models.append(model)
        self.best_models.sort(key=operator.attrgetter('f1_score'))

        # Save the model here
        previous_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            logger.info("Saving new model {}".format(model.path))
            params_to_save = self.main_loop.model.get_parameter_values()
            param_values = {
                name.replace("/", BRICK_DELIMITER): param
                for name, param in params_to_save.items()
            }
            numpy.savez(model.path, **param_values)

            numpy.savez(os.path.join(self.config['saveto'],
                                     'val_f1_scores.npz'),
                        f1_scores=self.val_f1_curve)
        finally:
            signal.signal(signal.SIGINT, previous_handler)
Ejemplo n.º 15
0
# Locate the sampled outputs in the generation graph and wrap them in a
# beam search.
_, samples = VariableFilter(bricks=[decoder.sequence_generator],
                            name="outputs")(ComputationGraph(generated[1]))
beam_search = BeamSearch(samples=samples)

# Read from standard input
stream = get_stdin_stream(**config)

vocab = get_vocab(config['trg_vocab'], config['trg_vocab_size'],
                  config['unk_id'], config['eos_id'], config['bos_id'])
# Index -> token; items() works on both Python 2 and Python 3.
inv_vocab = {v: k for k, v in vocab.items()}

unk_id = config['unk_id']
eos_id = config['eos_id']

for sample in stream.get_epoch_iterator():
    seq = sample[0]
    # One copy of the source sequence per beam entry.
    input_ = np.tile(seq, (config['beam_size'], 1))

    trans, costs = beam_search.search(input_values={sampling_input: input_},
                                      max_length=3 * len(seq),
                                      eol_symbol=eos_id,
                                      ignore_first_eol=True)

    # NOTE(review): the first hypothesis is taken as the translation;
    # presumably the search returns them best first -- confirm.
    trans_indices = [idx for idx in trans[0]
                     if idx != eos_id]  # remove </S> from output
    trans_out = ' '.join(
        inv_vocab.get(idx, config['unk_token']) for idx in trans_indices)

    # Single-argument call form: same output as a print statement on
    # Python 2, and valid syntax on Python 3.
    print(trans_out)
Ejemplo n.º 16
0
class BlocksVanillaDecoder(cam.sgnmt.decoding.core.Decoder):
    """Adaptor class for blocks.search.BeamSearch. We implement the
    ``Decoder`` class but ignore functionality for predictors or
    heuristics. Instead, we pass through decoding directly to the
    blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system
    NMT decoding.
    """
    def __init__(self, nmt_model_path, config):
        """Set up the NMT model used by the decoder.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
            config (dict): NMT configuration
        """
        super(BlocksVanillaDecoder, self).__init__()
        self.config = config
        self.set_up_decoder(nmt_model_path)

    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to
        ``blocks.machine_translation.main``.

        On return ``self.source_sentence``, ``self.samples``,
        ``self.model`` and ``self.beam_search`` are set and, if
        ``config['reload']`` is true, the weights have been loaded from
        ``nmt_model_path``.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        # Create Theano variables
        logging.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Construct model
        logging.info('Building RNN encoder-decoder')
        encoder = BidirectionalEncoder(self.config['src_vocab_size'],
                                       self.config['enc_embed'],
                                       self.config['enc_nhids'])
        decoder = Decoder(self.config['trg_vocab_size'],
                          self.config['dec_embed'], self.config['dec_nhids'],
                          self.config['enc_nhids'] * 2)
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logging.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model (TODO: do i really need this?)
        logging.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            self.config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        # The recurrent weights override the pushed Gaussian init, so this
        # has to happen after push_initialization_config().
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization (TODO: remove?)
        if self.config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logging.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, self.config['dropout'])

        # Apply weight noise for regularization (TODO: remove?)
        if self.config['weight_noise_ff'] > 0.0:
            logging.info('Applying weight noise to ff layers')
            # Use get_parameters() like the parameter-name dump below does;
            # the lists are materialised so that concatenation also works
            # when values() returns a view.
            enc_params = list(
                Selector(encoder.lookup).get_parameters().values())
            enc_params += Selector(
                encoder.fwd_fork).get_parameters().values()
            enc_params += Selector(
                encoder.back_fork).get_parameters().values()
            dec_params = list(Selector(
                decoder.sequence_generator.readout).get_parameters().values())
            dec_params += Selector(
                decoder.sequence_generator.fork).get_parameters().values()
            dec_params += Selector(
                decoder.state_init).get_parameters().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             self.config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logging.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            # '!s' converts the shape tuple to str before padding; a bare
            # width spec on a tuple is rejected by Python 3.
            logging.info('    {!s:15}: {}'.format(shape, count))
        logging.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logging.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logging.info(
                '    {!s:15}: {}'.format(value.get_value().shape, name))
        logging.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logging.info("Building model")

        # Set extensions
        logging.info("Initializing extensions")

        # Set up beam search and sampling computation graphs if necessary
        logging.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

        # Compare with blocks.machine_translation.BleuValidator.__init__
        self.source_sentence = sampling_input
        self.samples = samples
        self.model = search_model
        self.normalize = True
        self.verbose = self.config.get('val_set_out', None)

        # Reload model if necessary
        if self.config['reload']:
            loader = LoadNMT(nmt_model_path, self.config['saveto'],
                             search_model)
            loader.load_weights()

        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)

    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the
        sentence level, not on the word level: the full (negated) cost
        is attributed to the first position and all other positions
        get a score of 0. For finer grained NMT scores you need to use
        the nmt predictor. ``src_sentence`` is a list of source word
        ids representing the source sentence without <S> or </S>
        symbols. As blocks expects to see </S>, this method adds it
        automatically.

        As a side effect ``self.apply_predictors_count`` is set to the
        length of the longest hypothesis times the beam size.

        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence

        Returns:
            list. A list of ``Hypothesis`` instances, one per sequence
            returned by the beam search and in the same order.
        """
        seq = self._oov_to_unk(src_sentence, self.config['src_vocab_size'],
                               utils.UNK_ID) + [utils.EOS_ID]
        # One copy of the source sequence per beam entry.
        input_ = np.tile(seq, (self.config['beam_size'], 1))
        trans, costs = self.beam_search.search(
            input_values={self.source_sentence: input_},
            max_length=3 * len(src_sentence),
            eol_symbol=utils.EOS_ID,
            ignore_first_eol=True)
        hypos = []
        max_len = 0
        for hypo_trans, hypo_cost in zip(trans, costs):
            max_len = max(max_len, len(hypo_trans))
            score = -hypo_cost
            hypo = Hypothesis(hypo_trans, score)
            # Build an independent entry per position so that modifying
            # one position later cannot leak into the others.
            breakdown = [[(0.0, 1.0)] for _ in hypo_trans]
            if breakdown:  # an empty hypothesis has no first position
                breakdown[0] = [(score, 1.0)]
            hypo.score_breakdown = breakdown
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.config['beam_size']
        return hypos

    def _oov_to_unk(self, seq, vocab_size, unk_idx):
        """Return a copy of ``seq`` in which every id that is not
        smaller than ``vocab_size`` is replaced by ``unk_idx``.
        """
        return [x if x < vocab_size else unk_idx for x in seq]

    def has_predictors(self):
        """Always returns true. """
        return True
Ejemplo n.º 17
0
class BleuValidator(SimpleExtension, SamplingBase):
    # TODO: a lot has been changed in NMT, sync respectively
    """Implements early stopping based on BLEU score.

    On every callback the validation stream is translated with beam
    search, the output is scored by an external BLEU script and the
    model parameters are saved if the score is among the
    ``track_n_models`` best ones seen so far.
    """

    def __init__(self, source_sentence, samples, model, data_stream,
                 config, n_best=1, track_n_models=1,
                 normalize=True, **kwargs):
        """Create the validator.

        Args:
            source_sentence: Input variable of the sampling graph.
            samples: Samples variable handed to ``BeamSearch``.
            model: Sampling model (stored, not used by this class).
            data_stream: Stream over the validation set.
            config (dict): Configuration. Keys read by this class are
                ``bleu_script``, ``val_set_grndtruth``, ``saveto``,
                ``reload``, ``val_burn_in``, ``src_vocab_size``,
                ``beam_size`` and optionally ``val_set_out``.
            n_best (int): Number of lowest-cost hypotheses considered
                per sentence; only the best one is written out.
            track_n_models (int): Number of best models to keep.
            normalize (bool): Divide costs by the hypothesis lengths.
        """
        # TODO: change config structure
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              self.config['val_set_grndtruth'], '<']

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                        'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                # _save_model evicts best_models[0] as the worst entry, so
                # the list has to be in ascending order of BLEU score.
                self.best_models.sort(key=operator.attrgetter('bleu_score'))
                logger.info("BleuScores Reloaded")
            except Exception:
                # Reloading is best effort; interpreter-exit signals such
                # as KeyboardInterrupt are no longer swallowed here.
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Validate and checkpoint once the burn-in period is over."""

        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return

        # Evaluate and save if necessary
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Translate the validation stream and return its BLEU score.

        The best hypothesis of every sentence is piped to the BLEU
        script and, if ``val_set_out`` is configured, also written to
        that file. The score is appended to ``self.val_bleu_curve``.

        Returns:
            float. The score parsed from the first output line of the
            BLEU script.

        Raises:
            AssertionError: If that line does not start with
                ``BLEU = <number>``.
        """

        logger.info("Started Validation: ")
        val_start_time = time.time()
        # Text-mode pipes: translations are written as str and the score
        # line is matched against a str pattern.
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE,
                              universal_newlines=True)
        total_cost = 0.0

        # Get target vocabulary
        sources = self._get_attr_rec(self.main_loop, 'data_stream')
        trg_vocab = sources.data_streams[1].dataset.dictionary
        self.trg_ivocab = {v: k for k, v in trg_vocab.items()}
        trg_eos_sym = sources.data_streams[1].dataset.eos_token
        self.trg_eos_idx = trg_vocab[trg_eos_sym]

        ftrans = None
        if self.verbose:
            ftrans = open(self.config['val_set_out'], 'w')

        try:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file
                seq = self._oov_to_unk(
                    line[0], self.config['src_vocab_size'], self.unk_idx)
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = \
                    self.beam_search.search(
                        input_values={self.source_sentence: input_},
                        max_length=3*len(seq), eol_symbol=self.trg_eos_idx,
                        ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # convert idx to words
                        trans_out = self._idx_to_word(trans_out,
                                                      self.trg_ivocab)

                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i+1))
                        trans_out = '<UNK>'

                    if j == 0:
                        # Write to subprocess and file if it exists
                        print(trans_out, file=mb_subprocess.stdin)
                        if ftrans is not None:
                            print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()
        finally:
            # Do not leak the output file if translation fails half-way.
            if ftrans is not None:
                ftrans.close()

        logger.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logger.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logger.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logger.info(bleu_score)
        mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Return True if no model is tracked yet or ``bleu_score``
        beats the lowest tracked score.
        """
        if not self.best_models:
            return True
        worst = min(self.best_models,
                    key=operator.attrgetter('bleu_score'))
        return worst.bleu_score < bleu_score

    def _save_model(self, bleu_score):
        """Save the current parameters if ``bleu_score`` qualifies.

        When the tracking list is full, its first (lowest scoring)
        entry is dropped and its file, if any, is deleted. The BLEU
        curve is written to ``val_bleu_scores.npz`` alongside.
        """
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config['saveto'])

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logger.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('bleu_score'))

            # Save the model here; SIGINT is ignored while writing so that
            # Ctrl-C cannot leave a truncated file behind.
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            logger.info("Saving new model {}".format(model.path))
            numpy.savez(
                model.path, **self.main_loop.model.get_parameter_dict())
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_bleu_scores.npz'),
                bleu_scores=self.val_bleu_curve)
            signal.signal(signal.SIGINT, s)
Ejemplo n.º 18
0
class BleuValidator(SimpleExtension):
    """Implements early stopping based on BLEU score. This class is
    still very similar to the ``BleuValidator`` in the NMT Blocks
    example.

    TODO: Refactor, make this more similar to the rest of SGNMT, use
    vanilla_decoder.py
    """

    def __init__(self,
                 source_sentence,
                 samples,
                 model,
                 data_stream,
                 config,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 store_full_main_loop=False,
                 **kwargs):
        """Creates a new extension which adds model selection based on
        the BLEU score to the training main loop.

        Args:
            source_sentence (Variable): Input variable to the sampling
                                        computation graph
            samples (Variable): Samples variable of the CG
            model (NMTModel): See the model module
            data_stream (DataStream): Data stream to the development
                                      set
            config (dict): NMT configuration
            n_best (int): Number of lowest-cost hypotheses considered
                          per sentence. Only the best one is written
                          out; the beam size itself is taken from
                          ``config['beam_size']``.
            track_n_models (int): Number of n-best models for which to
                                  create checkpoints.
            normalize (boolean): Enables length normalization
            store_full_main_loop (boolean): Stores the iteration state
                                            in the old style of
                                            Blocks 0.1. Not recommended
        """
        super(BleuValidator, self).__init__(**kwargs)
        self.store_full_main_loop = store_full_main_loop
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.best_models = []
        self.val_bleu_curve = []
        # The configured script is a command template with one '%s' slot
        # for the reference file.
        self.multibleu_cmd = (self.config['bleu_script']
                              % self.config['val_set_grndtruth']).split()
        logging.debug("BLEU command: %s" % self.multibleu_cmd)

        if config['src_sparse_feat_map']:
            self.src_sparse_feat_map = config['src_sparse_feat_map']
        else:
            self.src_sparse_feat_map = FlatSparseFeatMap()
        if config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                                 samples=samples,
                                 trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=samples)

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                        'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()
                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                # _save_model evicts best_models[0] as the worst entry, so
                # the list has to be in ascending order of BLEU score.
                self.best_models.sort(key=operator.attrgetter('bleu_score'))
                logging.info("BleuScores Reloaded")
            except Exception:
                # Reloading is best effort; interpreter-exit signals such
                # as KeyboardInterrupt are no longer swallowed here.
                logging.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Decodes the dev set and stores checkpoints in case the BLEU
        score has improved.
        """
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Decode the dev set and return its BLEU score.

        The best hypothesis of every sentence is written as a line of
        space separated word ids to the BLEU script and to
        ``<saveto>/validation_out.txt``. The score is appended to
        ``self.val_bleu_curve``.

        Returns:
            float. The score parsed from the first output line of the
            BLEU script.

        Raises:
            AssertionError: If that line does not start with
                ``BLEU = <number>``.
        """
        logging.info("Started Validation: ")
        val_start_time = time.time()
        # Text-mode pipes: translations are written as str and the score
        # line is matched against a str pattern.
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE,
                              universal_newlines=True)
        total_cost = 0.0
        # The context manager closes the output file even if decoding
        # fails half-way.
        with open(self.config['saveto'] + '/validation_out.txt',
                  'w') as ftrans:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                    line[0], self.config['src_vocab_size']))
                if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                    input_ = numpy.transpose(
                        numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                        (2, 0, 1))
                else:  # word ids on the source side
                    input_ = numpy.tile(seq, (self.config['beam_size'], 1))
                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = \
                    self.beam_search.search(
                        input_values={self.source_sentence: input_},
                        max_length=3*len(seq), eol_symbol=utils.EOS_ID,
                        ignore_first_eol=True)
                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        # Keep `trans` intact: it is indexed again for the
                        # next entry of the n-best list.
                        hypo = trans[best]
                        if hypo and hypo[-1] == utils.EOS_ID:
                            hypo = hypo[:-1]
                        trans_out = ' '.join([str(w) for w in hypo])
                    except ValueError:
                        logging.info(
                            "Can NOT find a translation for line: {}".format(
                                i+1))
                        trans_out = '<UNK>'
                    if j == 0:
                        # Write to subprocess and file if it exists
                        print(trans_out, file=mb_subprocess.stdin)
                        print(trans_out, file=ftrans)
                if i != 0 and i % 100 == 0:
                    logging.info(
                        "Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()
        logging.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logging.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logging.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None
        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logging.info(bleu_score)
        mb_subprocess.terminate()
        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Return True if no model is tracked yet or ``bleu_score``
        beats the lowest tracked score.
        """
        if not self.best_models:
            return True
        worst = min(self.best_models,
                    key=operator.attrgetter('bleu_score'))
        return worst.bleu_score < bleu_score

    def save_parameter_values(self, param_values, path):
        """Store ``param_values`` as an npz file at ``path``, with "/"
        in the parameter names replaced by "-".

        This method is copied from blocks.machine_translation.checkpoint
        """
        param_values = {name.replace("/", "-"): param
                        for name, param in param_values.items()}
        numpy.savez(path, **param_values)

    def _save_model(self, bleu_score):
        """Save the current parameters if ``bleu_score`` qualifies.

        When the tracking list is full, its first (lowest scoring)
        entry is dropped and its file, if any, is deleted. The BLEU
        curve is written to ``val_bleu_scores.npz`` alongside.
        """
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config['saveto'])
            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logging.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)
            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('bleu_score'))
            # Save the model here; SIGINT is ignored while writing so that
            # Ctrl-C cannot leave a truncated file behind.
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            # fs439: introduce store_full_main_loop and
            # storing best_bleu_params_* files
            if self.store_full_main_loop:
                logging.info(
                    "Saving full main loop model {}".format(model.path))
                numpy.savez(model.path,
                            **self.main_loop.model.get_parameter_dict())
            else:
                logging.info("Saving model parameters {}".format(model.path))
                params_to_save = self.main_loop.model.get_parameter_values()
                self.save_parameter_values(params_to_save, model.path)
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_bleu_scores.npz'),
                bleu_scores=self.val_bleu_curve)
            signal.signal(signal.SIGINT, s)
Ejemplo n.º 19
0
class BleuValidator(SimpleExtension, SamplingBase):
    def __init__(
        self,
        source_sentence,
        samples,
        model,
        data_stream,
        config,
        n_best=1,
        track_n_models=1,
        trg_ivocab=None,
        src_eos_idx=-1,
        trg_eos_idx=-1,
        **kwargs
    ):
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.verbose = config.get("val_set_out", None)

        self.src_eos_idx = src_eos_idx
        self.trg_eos_idx = trg_eos_idx

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.trg_ivocab = trg_ivocab
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.src_eos_idx  # self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(beam_size=self.config["beam_size"], samples=samples)
        self.multibleu_cmd = ["perl", self.config["bleu_script"], self.config["val_set_grndtruth"], "<"]

        # Create saving directory if it does not exist
        if not os.path.exists(self.config["saveto"]):
            os.makedirs(self.config["saveto"])

        if self.config["reload"]:
            try:
                bleu_score = numpy.load(os.path.join(self.config["saveto"], "val_bleu_scores.npz"))
                self.val_bleu_curve = bleu_score["bleu_scores"].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logger.info("BleuScores Reloaded")
            except:
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):

        # Track validation burn in
        if self.main_loop.status["iterations_done"] <= self.config["val_burn_in"]:
            return

        # Get current model parameters
        self.model.set_param_values(self.main_loop.model.get_param_values())

        # Evaluate and save if necessary
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):

        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        # Get target vocabulary
        if not self.trg_ivocab:
            sources = self._get_attr_rec(self.main_loop, "data_stream")
            trg_vocab = sources.data_streams[1].dataset.dictionary
            self.trg_ivocab = {v: k for k, v in trg_vocab.items()}

        if self.verbose:
            ftrans = open(self.config["val_set_out"], "w")

        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            """
            Load the sentence, retrieve the sample, write to file
            """

            line[0][-1] = self.src_eos_idx
            seq = self._oov_to_unk(line[0])
            input_ = numpy.tile(seq, (self.config["beam_size"], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = self.beam_search.search(
                input_values={self.source_sentence: input_},
                max_length=3 * len(seq),
                eol_symbol=self.trg_eos_idx,
                ignore_first_eol=True,
            )

            nbest_idx = numpy.argsort(costs)[: self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    trans_out = self._idx_to_word(trans_out[:-1], self.trg_ivocab)

                except ValueError:
                    print "Can NOT find a translation for line: {}".format(i + 1)
                    trans_out = "<UNK>"

                if j == 0:
                    # Write to subprocess and file if it exists
                    print >> mb_subprocess.stdin, trans_out
                    if self.verbose:
                        print >> ftrans, trans_out

            if i != 0 and i % 100 == 0:
                print "Translated {} lines of validation set...".format(i)

            mb_subprocess.stdin.flush()

        print "Total cost of the validation: {}".format(total_cost)
        self.data_stream.reset()
        if self.verbose:
            ftrans.close()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        print "output ", stdout
        out_parse = re.match(r"BLEU = [-.0-9]+", stdout)
        logger.info("Validation Took: {} minutes".format(float(time.time() - val_start_time) / 60.0))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        print bleu_score
        mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        if not self.best_models or min(self.best_models, key=operator.attrgetter("bleu_score")).bleu_score < bleu_score:
            return True
        return False

    def _save_model(self, bleu_score):
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config["saveto"])

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logger.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter("bleu_score"))

            # Save the model here
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            logger.info("Saving new model {}".format(model.path))
            numpy.savez(model.path, **self.main_loop.model.get_param_values())
            numpy.savez(os.path.join(self.config["saveto"], "val_bleu_scores.npz"), bleu_scores=self.val_bleu_curve)
            signal.signal(signal.SIGINT, s)
Ejemplo n.º 20
0
class BleuTester(TrainingExtension, SamplingBase):
    """Implements Testing BLEU score.

    Every source line of ``data_stream`` is decoded with beam search, the
    lowest-cost hypothesis is piped to an external ``perl`` BLEU script,
    and the score parsed from the script's first output line is returned
    by :meth:`_evaluate_model`.

    """
    # TODO: a lot has been changed in NMT, sync respectively

    def __init__(self, source_char_seq, source_sample_matrix, source_char_aux,
                 source_word_mask, samples, model, data_stream,
                 config, testing_model, n_best=1, track_n_models=1,
                 normalize=True, **kwargs):
        """Store graph inputs, data stream and configuration.

        Parameters
        ----------
        source_char_seq, source_sample_matrix, source_char_aux, \
        source_word_mask
            Graph input variables; used as keys of the ``input_values``
            dict handed to beam search.
        samples
            Variable passed to ``BeamSearch`` as its ``samples`` argument.
        model
            Stored on the instance; not used by the methods of this class.
        data_stream
            Stream providing ``dataset.dictionary``, ``dataset.unk_token``,
            ``dataset.eos_token``, ``trg_vocab`` and ``eos_token``.
        config
            Mapping read for the keys ``bleu_script``,
            ``test_set_grndtruth``, ``test_set_out``, ``src_vocab_size``
            and ``beam_size``.
        testing_model
            Directory in which the translation output file is created.
        n_best
            Number of lowest-cost hypotheses iterated per line (only the
            best one is written out).
        track_n_models
            Stored on the instance; not used by the methods of this class.
        normalize
            If true, hypothesis costs are divided by hypothesis length.
        """
        # TODO: change config structure
        super(BleuTester, self).__init__(**kwargs)
        self.source_char_seq = source_char_seq
        self.source_sample_matrix = source_sample_matrix
        self.source_char_aux = source_char_aux
        self.source_word_mask = source_word_mask
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.testing_model = testing_model
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = True

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.src_ivocab = {v: k for k, v in self.vocab.items()}
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              self.config['test_set_grndtruth'], '<']

    def before_training(self):
        """Run the BLEU evaluation once, before training starts."""
        self._evaluate_model()

    def _evaluate_model(self):
        """Translate the whole test stream and return its BLEU score.

        Returns
        -------
        float
            The number following ``"BLEU = "`` on the first line printed
            by the BLEU script.

        Raises
        ------
        AssertionError
            If the first line of the script output does not start with
            ``BLEU = <number>``.

        The translation file (written when ``self.verbose`` is true) and
        the BLEU subprocess are released even when decoding or parsing
        fails.
        """
        logger.info("Started Test: ")
        test_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE, universal_newlines=True)
        ftrans = None
        total_cost = 0.0

        try:
            # Get target vocabulary
            trg_vocab = self.data_stream.trg_vocab
            self.trg_vocab = trg_vocab
            self.trg_ivocab = {v: k for k, v in trg_vocab.items()}
            trg_eos_sym = self.data_stream.eos_token
            self.trg_eos_idx = trg_vocab[trg_eos_sym]

            if self.verbose:
                ftrans = open(os.path.join(self.testing_model, self.config['test_set_out']), 'w')

            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file
                seq = self._oov_to_unk(
                    line[0], self.config['src_vocab_size'], self.unk_idx)
                _, input_dict = self.build_input_dict(numpy.asarray(seq), self.vocab, self.config['beam_size'])

                # draw sample, checking to ensure we don't get an empty string back
                result = \
                    self.beam_search.search(
                        input_values={self.source_char_seq: input_dict['source_char_seq'],
                                      self.source_sample_matrix: input_dict['source_sample_matrix'],
                                      self.source_word_mask: input_dict['source_word_mask'],
                                      self.source_char_aux: input_dict['source_char_aux']},
                        max_length=3 * len(seq), eol_symbol=self.trg_eos_idx, as_arrays=True,
                        ignore_first_eol=False)

                trans, costs = result_to_lists(result)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # convert idx to words; cut the hypothesis at the
                        # first '</S>', or at the source length when the
                        # hypothesis contains no '</S>' at all
                        try:
                            sample_length = trans_out.index(self.trg_vocab['</S>'])
                        except ValueError:
                            sample_length = len(seq)
                        trans_out = trans_out[:sample_length]
                        trans_out = self._idx_to_word(trans_out, self.trg_ivocab)

                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(i + 1))
                        trans_out = '<UNK>'

                    if j == 0:
                        # Write to subprocess and file if it exists
                        print("Line:", i)
                        print("Input : ", self._idx_to_word(line[0], self.src_ivocab))
                        print("Sample: ", trans_out)
                        print("Error:", costs[best])
                        print()

                        print(trans_out, file=mb_subprocess.stdin)
                        if self.verbose:
                            print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of test set...".format(i))

                mb_subprocess.stdin.flush()

            logger.info("Total cost of the test: {}".format(total_cost))
            self.data_stream.reset()
            if ftrans is not None:
                ftrans.close()

            # send end of file, read output.
            mb_subprocess.stdin.close()
            stdout = mb_subprocess.stdout.readline()
            logger.info(stdout)
            out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
            logger.info("Test Took: {} minutes".format(
                float(time.time() - test_start_time) / 60.))
            assert out_parse is not None, \
                "unexpected BLEU script output: {!r}".format(stdout)

            # extract the score (skip the 6-character "BLEU =" prefix)
            bleu_score = float(out_parse.group()[6:])
            logger.info(bleu_score)
        finally:
            # Release the output file and the scorer on every exit path;
            # closing an already closed file is a no-op.
            if ftrans is not None:
                ftrans.close()
            mb_subprocess.terminate()

        return bleu_score
Ejemplo n.º 21
0
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config.

    """
    def __init__(
        self,
        recordings_source,
        labels_source,
        eos_label,
        num_features,
        num_phonemes,
        dim_dec,
        dims_bidir,
        dims_bottom,
        enc_transition,
        dec_transition,
        use_states_for_readout,
        attention_type,
        lm=None,
        character_map=None,
        subsample=None,
        dims_top=None,
        prior=None,
        conv_n=None,
        bottom_activation=None,
        post_merge_activation=None,
        post_merge_dims=None,
        dim_matcher=None,
        embed_outputs=True,
        dec_stack=1,
        conv_num_filters=1,
        data_prepend_eos=True,
        energy_normalizer=None,  # softmax is the default set in SequenceContentAndConvAttention
        **kwargs):
        """Build the child bricks and the symbolic input variables.

        ``attention_type`` must be ``"content"`` or ``"content_and_conv"``;
        any other value raises ``ValueError``. When ``lm`` is given, its
        ``weight``, ``normalize_*_weights`` and ``am_beta`` entries are
        popped from the passed dict (the caller's dict is modified) and the
        remaining entries are forwarded to ``LanguageModel``.
        """
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        # Optional initialization schemes; when set by the user they are
        # propagated by push_initialization_config.
        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        # Use the same truthiness test as above so that ``dims_bottom=None``
        # (accepted by the Identity branch) does not crash on ``len(None)``.
        encoder = Encoder(
            self.enc_transition, dims_bidir,
            dims_bottom[-1] if dims_bottom else num_features, subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            # NOTE(review): a single activation is passed together with
            # len(dims_top) + 2 dims; this looks inconsistent for any
            # non-empty dims_top -- confirm against the MLP brick contract.
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition")
        else:
            transitions = [
                self.dec_transition(dim=dim_dec,
                                    activation=Tanh(),
                                    name="transition_{}".format(trans_level))
                for trans_level in range(dec_stack)
            ]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")
        readout_config = dict(readout_dim=num_phonemes,
                              source_names=(transition.apply.states if
                                            use_states_for_readout else []) +
                              [attention.take_glimpses.outputs[0]],
                              emitter=emitter,
                              feedback_brick=feedback,
                              name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was not designed to support Maxout as an
                        # activation (because Maxout in a way is not one).
                        # However a single layer Maxout network works with
                        # the trick below. For deeper Maxout network one
                        # has to use the Sequence brick.
                        [
                            d //
                            getattr(post_merge_activation, 'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warning(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      language_model=language_model,
                                      name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        # The four symbolic batch inputs, in the order expected by ``cost``.
        # (The second entry must be the mask variable, not the source name.)
        self.batch_inputs = [
            self.recordings, self.recordings_mask, self.labels,
            self.labels_mask
        ]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)

    def push_initialization_config(self):
        """Push the optional recurrent-weight and initial-state schemes.

        ``rec_weights_init`` is pushed to ``BaseRecurrent`` bricks only;
        ``initial_states_init`` is pushed without a brick-class filter.
        """
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {
                'weights_init': self.rec_weights_init,
                'recurrent_weights_init': self.rec_weights_init
            }
            global_push_initialization_config(self, rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(
                self, {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, recordings, recordings_mask, labels, labels_mask):
        """Return the generator's cost matrix for the given batch.

        The recordings go through bottom, encoder and top bricks; the
        result is attended by the generator while scoring ``labels``.
        """
        bottom_processed = self.bottom.apply(recordings)
        encoded, encoded_mask = self.encoder.apply(input_=bottom_processed,
                                                   mask=recordings_mask)
        encoded = self.top.apply(encoded)
        return self.generator.cost_matrix(labels,
                                          labels_mask,
                                          attended=encoded,
                                          attended_mask=encoded_mask)

    @application
    def generate(self, recordings):
        """Return the generator outputs (as a dict) for ``recordings``.

        No input mask is given to the encoder. The number of generation
        steps equals ``recordings.shape[0]`` and the batch size equals
        ``recordings.shape[1]``.
        """
        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(recordings))
        encoded = self.top.apply(encoded)
        return self.generator.generate(n_steps=recordings.shape[0],
                                       batch_size=recordings.shape[1],
                                       attended=encoded,
                                       attended_mask=encoded_mask,
                                       as_dict=True)

    def load_params(self, path):
        """Load parameter values from ``path`` into this model."""
        generated = self.get_generate_graph()
        param_values = load_parameter_values(path)
        SpeechModel(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self):
        """Return the generation graph built on ``self.recordings``."""
        result = self.generate(self.recordings)
        return result

    def get_cost_graph(self, batch=True):
        """Return the cost graph for a batch or for a single example.

        With ``batch=False`` the single-example variables get a batch axis
        of size one, the recording mask is all ones and no label mask is
        used.
        """
        if batch:
            return self.cost(self.recordings, self.recordings_mask,
                             self.labels, self.labels_mask)
        recordings = self.single_recording[:, None, :]
        labels = self.single_transcription[:, None]
        return self.cost(recordings, tensor.ones_like(recordings[:, :, 0]),
                         labels, None)

    def analyze(self, recording, transcription):
        """Compute cost and alignment for a recording/transcription pair.

        The Theano function is compiled on first use and cached in
        ``self._analyze``. Its outputs are the per-step cost, the attention
        weights and the energies (zeros when the graph exposes no variable
        named "energies").
        """
        if not hasattr(self, "_analyze"):
            cost = self.get_cost_graph(batch=False)
            cg = ComputationGraph(cost)
            energies = VariableFilter(bricks=[self.generator],
                                      name="energies")(cg)
            energies_output = [
                energies[0][:, 0, :] if energies else tensor.zeros(
                    (self.single_transcription.shape[0],
                     self.single_recording.shape[0]))
            ]
            # Unpacking asserts that exactly one such variable exists.
            states, = VariableFilter(applications=[self.encoder.apply],
                                     roles=[OUTPUT],
                                     name="encoded")(cg)
            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]
            weights, = VariableFilter(bricks=[self.generator],
                                      name="weights")(cg)
            self._analyze = theano.function(
                [self.single_recording, self.single_transcription],
                [cost[:, 0], weights[:, 0, :]] + energies_output +
                ctc_matrix_output)
        return self._analyze(recording, transcription)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        self.beam_size = beam_size
        generated = self.get_generate_graph()
        samples, = VariableFilter(applications=[self.generator.generate],
                                  name="outputs")(ComputationGraph(
                                      generated['outputs']))
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, recording, char_discount=0.0):
        """Decode one recording with beam search.

        The beam search is compiled lazily with the previously stored
        ``self.beam_size``. The output length is capped at a third of the
        number of input frames.

        Returns the ``(outputs, search_costs)`` pair produced by the
        underlying search.
        """
        if not hasattr(self, '_beam_search'):
            self.init_beam_search(self.beam_size)
        input_ = recording[:, numpy.newaxis, :]
        outputs, search_costs = self._beam_search.search(
            {self.recordings: input_},
            self.eos_label,
            # Floor division keeps the length limit an integer regardless
            # of the division semantics in effect.
            input_.shape[0] // 3,
            ignore_first_eol=self.data_prepend_eos,
            char_discount=char_discount)
        return outputs, search_costs

    def __getstate__(self):
        """Return the instance dict without the compiled-function caches."""
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        """Restore the instance dict and drop the emitter's cached RNG."""
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            # Nothing cached (or no such brick): nothing to reset.
            pass
Ejemplo n.º 22
0
def main(mode, config, use_bokeh=False):
    """Train an RNN encoder-decoder or translate a test set with it.

    Parameters
    ----------
    mode
        ``"train"`` builds the cost graph, extensions and main loop and
        runs training; ``"translate"`` loads saved parameters and writes
        beam-search translations of ``config['test_set']`` to
        ``config['test_set'] + '.trans.out'``. Any other value only
        constructs the encoder and decoder bricks.
    config
        Mapping with the model, data and training settings read below.
    use_bokeh
        If true (and ``BOKEH_AVAILABLE``), a live cost plot is added in
        train mode.
    """

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # NOTE(review): the graphs returned by apply_dropout/apply_noise
        # below are only used for ``cg.parameters``; the algorithm still
        # optimizes the original ``cost`` -- confirm this is intended.

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(sampling_input,
                                         sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(bricks=[decoder.sequence_generator],
                                        name="outputs")(ComputationGraph(
                                            generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model,
                        data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input,
                              samples=samples,
                              config=config,
                              model=search_model,
                              data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En',
                     channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        # NOTE(review): eval() runs whatever expression the config holds in
        # 'step_rule'; only use configs from trusted sources.
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(config['test_set'], config['src_vocab'],
                                     config['src_vocab_size'],
                                     config['unk_id'])

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary. The file is read in binary mode and closed
        # right away. NOTE(review): unpickling executes code from the file;
        # only load vocabularies from trusted sources.
        with open(config['trg_vocab'], 'rb') as vocab_file:
            raw_trg_vocab = pickle.load(vocab_file)
        trg_vocab = _ensure_special_tokens(raw_trg_vocab,
                                           bos_idx=0,
                                           eos_idx=trg_eos_idx,
                                           unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        # The context manager closes the output file even if decoding fails.
        with open(config['test_set'] + '.trans.out', 'w') as ftrans:
            for i, line in enumerate(test_stream.get_epoch_iterator()):

                seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                         unk_idx)
                input_ = numpy.tile(seq, (config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back. Generated sequences are target-side, so the
                # search must stop on the target end-of-sentence index.
                trans, costs = \
                    beam_search.search(
                        input_values={sampling_input: input_},
                        max_length=3*len(seq), eol_symbol=trg_eos_idx,
                        ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if config['normalized_bleu']:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                best = numpy.argsort(costs)[0]
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(i + 1))
                    trans_out = '<UNK>'

                print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info("Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
Ejemplo n.º 23
0
class BeamSearchEvaluator(object):
    """Decode a data stream with beam search and report the error rate.

    Each example is decoded separately; decoded and reference index
    sequences are mapped through ``phoneme_dict``, filtered by
    ``black_list``, stripped of their first and last symbol, collapsed
    over consecutive repeats and compared with ``Evaluation.wer``.
    """

    def __init__(self,
                 eol_symbol,
                 beam_size,
                 x,
                 x_mask,
                 samples,
                 phoneme_dict=None,
                 black_list=None,
                 language_model=False):
        """Compile the beam search.

        Parameters
        ----------
        eol_symbol
            End-of-sequence symbol passed to every search.
        beam_size
            Beam width; inputs are tiled this many times.
        x, x_mask
            Graph variables used as keys of the search input dict.
        samples
            Variable handed to the beam search object.
        phoneme_dict
            Index-to-symbol mapping. ``evaluate`` indexes it
            unconditionally, so the ``None`` default cannot be used there.
        black_list
            Symbols dropped from predictions and targets (default: none).
        language_model
            If true, a ``BeamSearchLM`` with a ``TrigramLanguageModel`` is
            used instead of the plain ``BeamSearch``.
        """
        if black_list is None:
            self.black_list = []
        else:
            self.black_list = black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        if language_model:
            lm = TrigramLanguageModel()
            ind_to_word = dict(enumerate(lm.unigrams))
            self.beam_search = BeamSearchLM(lm, 1., ind_to_word, beam_size,
                                            samples)
        else:
            self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict

    def evaluate(self,
                 data_stream,
                 train=False,
                 file_pred=None,
                 file_targets=None):
        """Return ``{'per': error_rate}`` averaged over evaluated examples.

        Parameters
        ----------
        data_stream
            Stream whose batches provide the sources ``features``,
            ``features_mask``, ``phonemes`` and ``phonemes_mask``.
        train
            If true, only the first batch is evaluated.
        file_pred, file_targets
            Optional writable files receiving one line per example with
            the predicted / reference symbols and the example number.

        Raises
        ------
        ValueError
            If the stream yields no example at all.
        """
        loss = 0.
        num_examples = 0
        # Last decoded example, shown after each batch; stays None until
        # the first example has been processed.
        predictions = targets = None
        iterator = data_stream.get_epoch_iterator()
        if train:
            print('Train evaluation started')
        i = 0
        for inputs in iterator:
            inputs = dict(zip(data_stream.sources, inputs))
            x_mask_val = inputs['features_mask']
            x_val = inputs['features']
            y_val = inputs['phonemes']
            y_mask_val = inputs['phonemes_mask']
            for batch_ind in range(inputs['features'].shape[1]):
                # Replicate the single example beam_size times along the
                # batch axis (axis 1).
                if x_val.ndim == 2:
                    input_beam = numpy.tile(x_val[:, batch_ind][:, None],
                                            (1, self.beam_size))
                else:
                    input_beam = numpy.tile(x_val[:, batch_ind, :][:, None, :],
                                            (1, self.beam_size, 1))
                input_mask_beam = numpy.tile(x_mask_val[:, batch_ind][:, None],
                                             (1, self.beam_size))
                predictions, _ = self.beam_search.search(
                    {
                        self.x: input_beam,
                        self.x_mask: input_mask_beam
                    }, self.eol_symbol, 100)
                predictions = [
                    self.phoneme_dict[phone_ind]
                    for phone_ind in predictions[0]
                    if self.phoneme_dict[phone_ind] not in self.black_list
                ][1:-1]

                # The mask sum may be a float; a slice bound must be an int.
                num_targets = int(sum(y_mask_val[:, batch_ind]))
                targets = y_val[:num_targets, batch_ind]
                targets = [
                    self.phoneme_dict[phone_ind] for phone_ind in targets
                    if self.phoneme_dict[phone_ind] not in self.black_list
                ][1:-1]
                # Collapse runs of identical consecutive symbols.
                predictions = [x[0] for x in groupby(predictions)]
                targets = [x[0] for x in groupby(targets)]
                i += 1
                if file_pred:
                    file_pred.write(' '.join(predictions) + '(%d)\n' % i)
                if file_targets:
                    file_targets.write(' '.join(targets) + '(%d)\n' % i)

                loss += Evaluation.wer([predictions], [targets])
                num_examples += 1

            if predictions is not None:
                # Single-argument calls print the same bytes under both the
                # print statement and the print function.
                print('.. found sequence example: ' + ' '.join(predictions))
                print('.. real output was:        ' + ' '.join(targets))
            if train:
                break
        if train:
            print('Train evaluation finished')
        if num_examples == 0:
            raise ValueError(
                "data stream produced no examples; cannot compute the "
                "error rate")
        per = numpy.sum(loss) / num_examples
        return {'per': per}
Ejemplo n.º 24
0
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config.

    """
    def __init__(self, recordings_source, labels_source, eos_label,
                 num_features, num_phonemes,
                 dim_dec, dims_bidir, dims_bottom,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 lm=None, character_map=None,
                 subsample=None,
                 dims_top=None,
                 prior=None, conv_n=None,
                 bottom_activation=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 energy_normalizer=None,  # softmax is the default set in SequenceContentAndConvAttention
                 **kwargs):
        """Assemble the child bricks and create the input variables.

        The constructor builds, in order: the bottom network, the
        encoder, the top network, the decoder transition, the attention,
        the readout (optionally fused with a language model) and finally
        the sequence generator. It then creates the Theano variables
        used as inputs of the computation graphs.

        Parameters
        ----------
        recordings_source, labels_source : str
            Names given to the input variables; the mask variables get
            the same names with a ``"_mask"`` suffix.
        eos_label
            Stored and later passed to beam search as the end-of-sequence
            symbol.
        dims_bottom : list or None
            Layer sizes of the bottom MLP. When empty or None an
            ``Identity`` brick is used and the encoder input dimension
            is `num_features`.
        subsample : list, optional
            Passed to the encoder; defaults to ``[1] * len(dims_bidir)``.
        dims_top : list, optional
            Hidden sizes of the top MLP; an ``Identity`` brick is used
            when empty or None.
        attention_type : str
            Either ``"content"`` or ``"content_and_conv"``.
        lm : dict, optional
            Language model configuration. The keys ``weight``,
            ``normalize_am_weights``, ``normalize_lm_weights``,
            ``normalize_tot_weights`` and ``am_beta`` configure the
            ``ShallowFusionReadout``; all remaining keys are passed to
            ``LanguageModel``. The dictionary is not modified.
        bottom_activation, post_merge_activation : brick, optional
            Default to ``Tanh()``.
        dim_matcher : int, optional
            Match dimension of the attention; defaults to `dim_dec`.
        embed_outputs : bool
            Selects ``LookupFeedback`` (True) or ``OneOfNFeedback``.
        dec_stack : int
            Number of decoder transitions; more than one are wrapped in
            a ``RecurrentStack`` with skip connections.

        Raises
        ------
        ValueError
            If `attention_type` is not one of the supported values.

        """
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        # Use a truthiness test (not len()) so that dims_bottom=None is
        # handled the same way as in the bottom network selection above.
        encoder = Encoder(self.enc_transition, dims_bidir,
                          dims_bottom[-1] if dims_bottom else num_features,
                          subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            # NOTE(review): a single activation is paired with
            # len(dims_top) + 1 layer transitions here -- confirm that MLP
            # accepts this when dims_top is non-empty.
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]], name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            transitions = [self.dec_transition(dim=dim_dec,
                                               activation=Tanh(),
                                               name="transition_{}".format(trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes, name="emitter")
        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states if use_states_for_readout else [])
                         + [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                    # MLP was designed to support Maxout as an activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick below.
                    # For deeper Maxout network one has to use the
                    # Sequence brick.
                    [d//getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            # Work on a copy: the pops below must not strip keys from the
            # configuration dictionary owned by the caller.
            lm = dict(lm)
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warning("Beam search is prone to fail with no log-prob normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(lm_costs_name='lm_add',
                                           lm_weight=lm_weight,
                                           normalize_am_weights=normalize_am_weights,
                                           normalize_lm_weights=normalize_lm_weights,
                                           normalize_tot_weights=normalize_tot_weights,
                                           am_beta=am_beta,
                                           **readout_config)

        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        # The four variables consumed by `cost`, in its argument order.
        self.batch_inputs = [self.recordings, self.recordings_mask,
                             self.labels, self.labels_mask]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)

    def push_initialization_config(self):
        """Push initialization schemes down to the child bricks.

        The parent class implementation runs first. Afterwards, if
        ``rec_weights_init`` is set it is pushed as both ``weights_init``
        and ``recurrent_weights_init`` with ``BaseRecurrent`` as the third
        argument (presumably a brick class filter -- confirm against
        ``global_push_initialization_config``), and if
        ``initial_states_init`` is set it is pushed without that argument.

        """
        super(SpeechRecognizer, self).push_initialization_config()

        recurrent_init = self.rec_weights_init
        if recurrent_init:
            global_push_initialization_config(
                self,
                dict(weights_init=recurrent_init,
                     recurrent_weights_init=recurrent_init),
                BaseRecurrent)

        states_init = self.initial_states_init
        if states_init:
            global_push_initialization_config(
                self, dict(initial_states_init=states_init))

    @application
    def cost(self, recordings, recordings_mask, labels, labels_mask):
        """Return the generator's cost matrix for the given labels.

        The recordings go through the bottom network, the encoder (with
        `recordings_mask`) and the top network; the result is what the
        generator attends to while scoring `labels`.

        """
        features = self.bottom.apply(recordings)
        attended, attended_mask = self.encoder.apply(
            input_=features, mask=recordings_mask)
        attended = self.top.apply(attended)
        return self.generator.cost_matrix(
            labels, labels_mask,
            attended=attended, attended_mask=attended_mask)

    @application
    def generate(self, recordings):
        """Generate output sequences for `recordings`.

        The encoder is applied without a mask. The number of generation
        steps is the first dimension of `recordings` and the batch size
        is its second dimension. The generator result is returned as a
        dictionary.

        """
        features = self.bottom.apply(recordings)
        attended, attended_mask = self.encoder.apply(input_=features)
        attended = self.top.apply(attended)
        generation_kwargs = dict(
            n_steps=recordings.shape[0],
            batch_size=recordings.shape[1],
            attended=attended,
            attended_mask=attended_mask,
            as_dict=True)
        return self.generator.generate(**generation_kwargs)

    def load_params(self, path):
        """Load parameter values from `path` and set them on the model.

        The model is a ``SpeechModel`` built around the ``'outputs'``
        variable of the generation graph.

        """
        graph = self.get_generate_graph()
        values = load_parameter_values(path)
        model = SpeechModel(graph['outputs'])
        model.set_parameter_values(values)

    def get_generate_graph(self):
        """Return the result of `generate` applied to ``self.recordings``."""
        return self.generate(self.recordings)

    def get_cost_graph(self, batch=True):
        """Return the cost graph for a batch or for a single example.

        With ``batch=True`` the batch input variables and their masks are
        used. Otherwise the single-example variables get a batch axis of
        length one inserted at position 1, the recording mask is all
        ones and no labels mask is given.

        """
        if not batch:
            recordings = self.single_recording[:, None, :]
            labels = self.single_transcription[:, None]
            full_mask = tensor.ones_like(recordings[:, :, 0])
            return self.cost(recordings, full_mask, labels, None)
        return self.cost(
            self.recordings, self.recordings_mask,
            self.labels, self.labels_mask)

    def analyze(self, recording, transcription):
        """Compute cost and alignment for a recording/transcription pair.

        The Theano function is compiled on the first call and cached in
        ``self._analyze``. Its outputs are, in order: the cost and the
        attention weights of the only batch element, followed by the
        attention energies (or a zero matrix when the graph has no
        variable named "energies").

        """
        if hasattr(self, "_analyze"):
            return self._analyze(recording, transcription)

        cost = self.get_cost_graph(batch=False)
        cg = ComputationGraph(cost)

        found_energies = VariableFilter(
            bricks=[self.generator], name="energies")(cg)
        if found_energies:
            energies = found_energies[0][:, 0, :]
        else:
            energies = tensor.zeros((self.single_transcription.shape[0],
                                     self.single_recording.shape[0]))

        # The unpacking requires exactly one "encoded" output of the
        # encoder, even though the value is not used further below.
        states, = VariableFilter(
            applications=[self.encoder.apply], roles=[OUTPUT],
            name="encoded")(cg)
        # Temporarily disabled for compatibility with LM code
        # if len(self.generator.readout.source_names) == 1:
        #    ctc_matrix_output = [
        #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]
        ctc_matrix_output = []

        weights, = VariableFilter(
            bricks=[self.generator], name="weights")(cg)

        function_inputs = [self.single_recording, self.single_transcription]
        function_outputs = ([cost[:, 0], weights[:, 0, :]]
                            + [energies] + ctc_matrix_output)
        self._analyze = theano.function(function_inputs, function_outputs)
        return self._analyze(recording, transcription)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        The samples variable is the single variable named "outputs"
        produced by the generator's ``generate`` application in the
        generation graph.

        See Blocks issue #500.

        """
        self.beam_size = beam_size
        outputs = self.get_generate_graph()['outputs']
        select_outputs = VariableFilter(
            applications=[self.generator.generate], name="outputs")
        samples, = select_outputs(ComputationGraph(outputs))
        # Store the search object before compiling, so that the attribute
        # exists even if compilation fails.
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, recording, char_discount=0.0):
        """Run beam search on a single recording.

        The beam search is compiled lazily with ``self.beam_size`` if
        `init_beam_search` has not been called yet. A batch axis of
        length one is inserted at position 1 of `recording` before it
        is passed to the search.

        Parameters
        ----------
        recording : numpy array
            Indexed as ``recording[:, numpy.newaxis, :]``, so it is
            expected to be two-dimensional.
        char_discount : float
            Forwarded to the search unchanged.

        Returns
        -------
        outputs, search_costs
            The two values returned by the search.

        """
        if not hasattr(self, '_beam_search'):
            self.init_beam_search(self.beam_size)
        input_ = recording[:, numpy.newaxis, :]
        # Floor division keeps the maximum length an integer even when
        # true division is in effect.
        max_length = input_.shape[0] // 3
        outputs, search_costs = self._beam_search.search(
            {self.recordings: input_}, self.eos_label, max_length,
            ignore_first_eol=self.data_prepend_eos,
            char_discount=char_discount)
        return outputs, search_costs

    def __getstate__(self):
        """Return the instance state without the compiled helpers.

        The cached ``_analyze`` function and the ``_beam_search`` object
        are left out of the returned dictionary.

        """
        transient = ('_analyze', '_beam_search')
        return dict((name, value)
                    for name, value in self.__dict__.items()
                    if name not in transient)

    def __setstate__(self, state):
        """Restore the instance state and drop the emitter's cached RNG.

        Removing ``_theano_rng`` from the emitter is best effort: when
        the generator, readout, emitter or the attribute itself is
        missing, the state is restored without further action.

        """
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            # Nothing to remove. Only attribute lookups and a deletion are
            # attempted above, so unrelated errors (and KeyboardInterrupt)
            # are no longer silenced.
            pass
Ejemplo n.º 25
0
class BleuValidator(SimpleExtension, SamplingBase):
    """Implements early stopping based on BLEU score."""
    def __init__(self,
                 source_sentence,
                 target_prefix,
                 samples,
                 model,
                 data_stream,
                 config,
                 src_vocab=None,
                 trg_vocab=None,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 **kwargs):
        """Store the validation setup and optionally reload past scores.

        Parameters
        ----------
        source_sentence, target_prefix
            Variables used as keys of the beam search input values.
        samples
            Passed to ``BeamSearch``.
        data_stream
            Stream iterated over during validation.
        config : dict
            Must contain ``bleu_script``, ``val_set_grndtruth``,
            ``saveto`` and ``reload``; ``val_set_out`` is optional and
            enables writing the translations to that file.
        n_best : int
            Number of lowest-cost hypotheses considered per sentence.
        track_n_models : int
            Number of best models to keep track of.
        normalize : bool
            Whether costs are divided by the hypothesis lengths.

        """
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.target_prefix = target_prefix

        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = [
            'perl', self.config['bleu_script'],
            self.config['val_set_grndtruth'], '<'
        ]

        # Create save directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(
                    os.path.join(self.config['saveto'], 'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu, key='BLEU'))
                # Keep the list in ascending score order, the same order
                # that is maintained whenever a new model is saved.
                self.best_models.sort(key=operator.attrgetter('score'))
                logger.info("BleuScores Reloaded")
            except Exception:
                # Reloading is best effort, but interpreter exits and
                # keyboard interrupts must not be swallowed.
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Validate the model, log the BLEU score and save if warranted.

        Nothing happens while the number of iterations done does not
        exceed ``config['val_burn_in']``.

        """
        # Track validation burn in
        iterations_done = self.main_loop.status['iterations_done']
        if iterations_done <= self.config['val_burn_in']:
            return

        score = self._evaluate_model()
        # Add an entry to the log, then save the model if necessary.
        self.main_loop.log.current_row['validation_set_bleu_score'] = score
        self._save_model(score)

    def _evaluate_model(self):
        """Translate the validation stream and return its BLEU score.

        Every example of the data stream is translated with beam search;
        the lowest-cost hypothesis is written to the standard input of
        the BLEU script and, when ``config['val_set_out']`` is set, to
        that file as well. The score is parsed from the first line the
        script prints and is appended to ``self.val_bleu_curve``.

        Returns
        -------
        float
            The BLEU score reported by the script.

        Raises
        ------
        AssertionError
            If the script output does not start with ``BLEU = <number>``.

        """
        # Set in the superclass -- SamplingBase
        if not hasattr(self, 'target_dataset'):
            self._initialize_dataset_info()

        # The special symbols are hard-coded here instead of being read
        # from the target dataset.
        self.unk_sym = '<UNK>'
        self.eos_sym = '</S>'
        self.unk_idx = self.trg_vocab[self.unk_sym]
        self.eos_idx = self.trg_vocab[self.eos_sym]

        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        ftrans = None
        if self.verbose:
            ftrans = open(self.config['val_set_out'], 'w')

        try:
            # NOTE(review): this iterates over a whole epoch only to
            # report its length.
            print('LENGTH OF DEV STREAM: {}'.format(
                len(list(self.data_stream.get_epoch_iterator()))))
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file

                # Note that the indices of source and target in the datastream are hard-coded
                # currently our datastream is (source,target,prefix,suffix)
                seq = self._oov_to_unk(line[0], self.config['src_vocab_size'],
                                       self.unk_idx)

                target_prefix = line[2]

                input_ = numpy.tile(seq, (self.config['beam_size'], 1))
                prefix_input_ = numpy.tile(target_prefix,
                                           (self.config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty string back
                trans, costs = self.beam_search.search(
                    input_values={
                        self.source_sentence: input_,
                        self.target_prefix: prefix_input_
                    },
                    max_length=3 * len(seq),
                    eol_symbol=self.eos_idx,
                    ignore_first_eol=False)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]

                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # convert idx to words
                        trans_out = self._idx_to_word(trans_out,
                                                      self.trg_ivocab)

                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i + 1))
                        trans_out = '<UNK>'

                    if j == 0:
                        # Write to subprocess and file if it exists
                        print(trans_out, file=mb_subprocess.stdin)
                        if ftrans is not None:
                            print(trans_out, file=ftrans)

                # Report progress and flush once per sentence rather than
                # once per n-best hypothesis.
                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()
        finally:
            # Close the translations file even if translation fails.
            if ftrans is not None:
                ftrans.close()

        logger.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logger.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logger.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logger.info(bleu_score)
        mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Tell whether a model with `bleu_score` should be saved.

        This is the case when no model is tracked yet or when the score
        is strictly higher than the lowest tracked score.

        """
        if not self.best_models:
            return True
        worst = min(self.best_models, key=operator.attrgetter('score'))
        return bool(worst.score < bleu_score)

    def _save_model(self, bleu_score):
        """Save the current parameters if `bleu_score` qualifies.

        When the list of tracked models is full, the lowest-scoring
        entry is dropped and its file, if any, is deleted. The new
        parameters and the BLEU curve are then written under
        ``config['saveto']`` with SIGINT ignored for the duration of
        the write.

        """
        if not self._is_valid_to_save(bleu_score):
            return

        model = ModelInfo(bleu_score, self.config['saveto'], key='BLEU')

        # Manage n-best model list first
        if self.best_models and \
                len(self.best_models) >= self.track_n_models:
            # Select the lowest score explicitly instead of relying on
            # the list being sorted in ascending order.
            old_model = min(self.best_models,
                            key=operator.attrgetter('score'))
            if old_model.path and os.path.isfile(old_model.path):
                logger.info("Deleting old model %s" % old_model.path)
                os.remove(old_model.path)
            self.best_models.remove(old_model)

        self.best_models.append(model)
        self.best_models.sort(key=operator.attrgetter('score'))

        # Save the model here; ignore Ctrl-C while the files are written.
        s = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            logger.info("Saving new model {}".format(model.path))

            SaveLoadUtils.save_parameter_values(
                self.main_loop.model.get_parameter_values(), model.path)
            numpy.savez(os.path.join(self.config['saveto'],
                                     'val_bleu_scores.npz'),
                        bleu_scores=self.val_bleu_curve)
        finally:
            # Restore the previous handler even if saving fails.
            signal.signal(signal.SIGINT, s)
Ejemplo n.º 26
0
class BleuEvaluator(SimpleExtension, SamplingBase):
    def __init__(self, source_sentence, samples, model, data_stream, ground_truth, config,
                 val_out=None, val_best_out=None, n_best=1, normalize=True, **kwargs):
        """Store the evaluation setup and build the beam search.

        `config` must provide ``unk_id``, ``eos_id`` and ``bleu_script``.
        `ground_truth` is passed as an argument to the BLEU script.
        `val_best_out` is only kept when `val_out` is given as well.

        """
        # TODO: change config structure
        super(BleuEvaluator, self).__init__(**kwargs)

        # Inputs of the evaluation
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config

        # Search and scoring options
        self.n_best = n_best
        self.normalize = normalize

        # Output files and score history
        self.val_out = val_out
        self.val_best_out = val_out and val_best_out
        self.bleu_scores = []

        # The inverse target vocabulary is filled in on first evaluation.
        self.trg_ivocab = None
        self.unk_id = config['unk_id']
        self.eos_id = config['eos_id']
        self.beam_search = BeamSearch(samples=samples)

        script = self.config['bleu_script']
        self.multibleu_cmd = ['perl', script, ground_truth, '<']

    def do(self, which_callback, *args):
        """Evaluate the model once the burn-in period is over.

        The evaluation is skipped while the number of iterations done
        does not exceed ``config['val_burn_in']``.

        """
        # Track validation burn in
        iterations_done = self.main_loop.status['iterations_done']
        if iterations_done > self.config['val_burn_in']:
            self._evaluate_model()

    def _evaluate_model(self):
        """Translate the validation stream and return its BLEU score.

        Every example of the data stream is translated with beam search;
        the lowest-cost hypothesis is written to the standard input of
        the BLEU script and, when ``self.val_out`` is set, to that file.
        The score is parsed from the first line printed by the script
        and appended to ``self.bleu_scores``. When the new score equals
        the best score so far and ``self.val_best_out`` is set, the
        output file is copied there.

        Returns
        -------
        float
            The BLEU score reported by the script.

        Raises
        ------
        AssertionError
            If the script output does not start with ``BLEU = <number>``.

        """
        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        if self.trg_ivocab is None:
            # Build the index -> word mapping from the dictionary of the
            # second data stream found through the main loop.
            sources = self._get_attr_rec(self.main_loop, 'data_stream')
            trg_vocab = sources.data_streams[1].dataset.dictionary
            self.trg_ivocab = {v: k for k, v in trg_vocab.items()}

        output_file = None
        if self.val_out:
            output_file = open(self.val_out, 'w')

        try:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file
                seq = self._oov_to_unk(line[0], self.config['src_vocab_size'], self.unk_id)
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty string back
                trans, costs = self.beam_search.search(
                    input_values={self.source_sentence: input_},
                    max_length=3 * len(seq), eol_symbol=self.eos_id,
                    ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # keeping eos tokens reduces BLEU score
                        if self.config['remove_eos']:
                            trans_out = [idx for idx in trans_out if idx != self.eos_id]
                        # however keeping unk tokens might be a good idea (avoids brevity penalty)
                        if self.config['remove_unk']:
                            trans_out = [idx for idx in trans_out if idx != self.unk_id]

                        # convert idx to words
                        trans_out = self._idx_to_word(trans_out, self.trg_ivocab)

                    except ValueError:
                        logger.info("Can NOT find a translation for line: {}".format(i + 1))
                        trans_out = '<UNK>'

                    if j == 0:
                        # Write to subprocess and file if it exists
                        print(trans_out, file=mb_subprocess.stdin)
                        if output_file is not None:
                            print(trans_out, file=output_file)

                if i != 0 and i % 100 == 0:
                    logger.info("Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()
        finally:
            # Close the output file even if translation fails.
            if output_file is not None:
                output_file.close()

        logger.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logger.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logger.info("Validation Took: {} minutes".format(float(time.time() - val_start_time) / 60.))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        logger.info(bleu_score)
        mb_subprocess.terminate()

        self.bleu_scores.append(bleu_score)
        if self.val_best_out and bleu_score == max(self.bleu_scores):
            shutil.copy(self.val_out, self.val_best_out)

        return bleu_score
class BlocksNMTVanillaDecoder(Decoder):
    """Adaptor class for blocks.search.BeamSearch. We implement the
    ``Decoder`` class but ignore functionality for predictors or
    heuristics. Instead, we pass through decoding directly to the 
    blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system
    NMT decoding. Note that this decoder supports sparse feat maps
    on both source and target side.
    """
    def __init__(self, nmt_model_path, config, decoder_args):
        """Initialize the decoder and load the NMT model.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
            config (dict): NMT configuration
            decoder_args (object): Decoder configuration passed through
                                   from configuration API.
        """
        super(BlocksNMTVanillaDecoder, self).__init__(decoder_args)
        self.config = config
        self.set_up_decoder(nmt_model_path)
        # The source feature map is created by set_up_decoder, so the
        # dense end-of-sentence representation is looked up afterwards.
        src_feat_map = self.src_sparse_feat_map
        self.src_eos = src_feat_map.word2dense(utils.EOS_ID)

    def set_up_decoder(self, nmt_model_path):
        """Build the NMT model from ``self.config``, load its weights
        and create the beam search object. This method basically
        corresponds to ``blocks.machine_translation.main``.

        A ``SparseBeamSearch`` is used when the configuration provides a
        target sparse feature map, a plain ``BeamSearch`` otherwise.
        Missing feature maps are replaced by ``FlatSparseFeatMap``.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        self.nmt_model = NMTModel(self.config)
        self.nmt_model.set_up()
        LoadNMTUtils(nmt_model_path,
                     self.config['saveto'],
                     self.nmt_model.search_model).load_weights()

        self.src_sparse_feat_map = (self.config['src_sparse_feat_map']
                                    or FlatSparseFeatMap())

        if not self.config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=self.nmt_model.samples)
            return

        self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
        self.beam_search = SparseBeamSearch(
            samples=self.nmt_model.samples,
            trg_sparse_feat_map=self.trg_sparse_feat_map)

    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the
        sentence level, not on the word level. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.

        The sentence-level score (the negated search cost) is stored in
        the first position of each score breakdown; all other positions
        hold ``(0.0, 1.0)``. As a side effect
        ``self.apply_predictors_count`` is set to the length of the
        longest hypothesis times the beam size.

        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence

        Returns:
            list. A list of ``Hypothesis`` instances ordered by their
            score.
        """
        seq = self.src_sparse_feat_map.words2dense(
            utils.oov_to_unk(src_sentence,
                             self.config['src_vocab_size'])) + [self.src_eos]
        if self.src_sparse_feat_map.dim > 1:  # sparse src feats
            input_ = np.transpose(
                np.tile(seq, (self.config['beam_size'], 1, 1)), (2, 0, 1))
        else:  # word ids on the source side
            input_ = np.tile(seq, (self.config['beam_size'], 1))
        trans, costs = self.beam_search.search(
            input_values={self.nmt_model.sampling_input: input_},
            max_length=3 * len(src_sentence),
            eol_symbol=utils.EOS_ID,
            ignore_first_eol=True)
        hypos = []
        max_len = 0
        for hypo_words, cost in zip(trans, costs):
            max_len = max(max_len, len(hypo_words))
            hypo = Hypothesis(hypo_words, -cost)
            # Build one independent entry per position; multiplying a
            # list would make every position share the same inner list.
            breakdown = [[(0.0, 1.0)] for _ in hypo_words]
            if breakdown:
                # Guarded so that an empty hypothesis does not raise.
                breakdown[0] = [(-cost, 1.0)]
            hypo.score_breakdown = breakdown
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.config['beam_size']
        return hypos

    def has_predictors(self):
        """Return ``True`` unconditionally."""
        return True
Ejemplo n.º 28
0
# Copy the stored parameter values into the parameters of the search model.
params = search_model.get_parameter_dict()
param_values = SaveLoadUtils().load_parameter_values(os.path.join(config['saveto'], 'params.npz'))
for k in params:
    params[k].set_value(param_values[k])

# The filter is expected to yield exactly two variables named "outputs";
# the second one is used for beam search.
_, samples = VariableFilter(bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(generated[1]))
beam_search = BeamSearch(samples=samples)

# Read from standard input
stream = get_stdin_stream(**config)

vocab = get_vocab(config['trg_vocab'], config['trg_vocab_size'], config['unk_id'], config['eos_id'], config['bos_id'])
# items() and the print() call below behave the same on Python 2 and 3.
inv_vocab = {v: k for k, v in vocab.items()}

unk_id = config['unk_id']
eos_id = config['eos_id']

for sample in stream.get_epoch_iterator():
    seq = sample[0]
    # One copy of the source sequence per beam entry.
    input_ = np.tile(seq, (config['beam_size'], 1))

    trans, costs = beam_search.search(
            input_values={sampling_input: input_},
            max_length=3 * len(seq), eol_symbol=eos_id,
            ignore_first_eol=True)

    # Only the first returned hypothesis is printed.
    trans_indices = [idx for idx in trans[0] if idx != eos_id]  # remove </S> from output
    trans_out = ' '.join(inv_vocab.get(idx, config['unk_token']) for idx in trans_indices)

    print(trans_out)