def __init__(self, source_char_seq, source_sample_matrix, source_char_aux,
             source_word_mask, samples, model, data_stream, config,
             testing_model, n_best=1, track_n_models=1, normalize=True,
             **kwargs):
    """Set up the BLEU tester.

    Every argument is stored on the instance under its own name.
    ``samples`` is additionally used to build the ``BeamSearch`` helper,
    and the vocabulary plus the unknown/end-of-sequence tokens are read
    from ``data_stream.dataset``. Remaining keyword arguments are
    forwarded to the parent constructor.
    """
    # TODO: change config structure
    super(BleuTester, self).__init__(**kwargs)

    self.source_char_seq = source_char_seq
    self.source_sample_matrix = source_sample_matrix
    self.source_char_aux = source_char_aux
    self.source_word_mask = source_word_mask
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.testing_model = testing_model
    self.n_best = n_best
    self.track_n_models = track_n_models
    self.normalize = normalize
    self.verbose = True

    # Helpers
    dataset = data_stream.dataset
    self.vocab = dataset.dictionary
    # Inverse mapping: vocabulary value -> vocabulary key.
    self.src_ivocab = dict(
        (index, token) for token, index in self.vocab.items())
    self.unk_sym = dataset.unk_token
    self.eos_sym = dataset.eos_token
    self.unk_idx = self.vocab[self.unk_sym]
    self.eos_idx = self.vocab[self.eos_sym]
    self.beam_search = BeamSearch(samples=samples)

    # The trailing '<' is stored as a literal list element.
    bleu_script = self.config['bleu_script']
    reference = self.config['test_set_grndtruth']
    self.multibleu_cmd = ['perl', bleu_script, reference, '<']
def test_beam_search():
    """Test beam search using the model from the reverse_words demo.

    Ideally this test should be done with a trained model, but so far
    only with a randomly initialized one. So it does not really test
    the ability to find the best output sequence, but only correctness
    of returned costs.

    """
    rng = numpy.random.RandomState(1234)
    alphabet_size = 20
    beam_size = 10
    length = 15

    reverser = WordReverser(10, alphabet_size)
    reverser.weights_init = reverser.biases_init = IsotropicGaussian(0.5)
    reverser.initialize()

    inputs = tensor.lmatrix('inputs')
    samples, = VariableFilter(
        bricks=[reverser.generator], name="outputs")(
            ComputationGraph(reverser.generate(inputs)))

    # One random sequence, repeated once per beam entry (one per column).
    input_vals = numpy.tile(rng.randint(alphabet_size, size=(length,)),
                            (beam_size, 1)).T

    # Use the named beam size instead of a second literal 10, so the
    # search width can never drift away from the tiling above.
    search = BeamSearch(beam_size, samples)
    results, mask, costs = search.search({inputs: input_vals},
                                         0, 3 * length)

    # Recompute the costs of the returned sequences with the model itself
    # and compare them with what the search reported.
    true_costs = reverser.cost(
        input_vals, numpy.ones((length, beam_size), dtype=floatX),
        results, mask).eval()
    true_costs = (true_costs * mask).sum(axis=0)
    assert_allclose(costs, true_costs, rtol=1e-5)
def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
             phoneme_dict=None, black_list=None, language_model=False):
    """Store the decoding inputs and compile the beam search.

    When ``language_model`` is true a ``BeamSearchLM`` backed by a
    ``TrigramLanguageModel`` is built, otherwise a plain ``BeamSearch``.
    In both cases the search object is compiled before returning.
    ``black_list`` defaults to a fresh empty list.
    """
    self.black_list = [] if black_list is None else black_list
    self.x = x
    self.x_mask = x_mask
    self.eol_symbol = eol_symbol
    self.beam_size = beam_size

    if not language_model:
        self.beam_search = BeamSearch(beam_size, samples)
    else:
        lm = TrigramLanguageModel()
        # Position in ``lm.unigrams`` -> unigram entry.
        ind_to_word = {
            position: word for position, word in enumerate(lm.unigrams)}
        self.beam_search = BeamSearchLM(
            lm, 1., ind_to_word, beam_size, samples)
    self.beam_search.compile()

    self.phoneme_dict = phoneme_dict
def __init__(self, source_sentence, samples, model, data_stream, config,
             n_best=1, track_n_models=1, trg_ivocab=None, normalize=True,
             store_full_main_loop=False, **kwargs):
    """Set up the BLEU validator.

    Stores the arguments on the instance, builds the ``BeamSearch``
    helper from ``samples`` and the BLEU command from ``config``, creates
    ``config['saveto']`` if it is missing and, when ``config['reload']``
    is set, restores earlier scores from ``val_bleu_scores.npz``.
    Remaining keyword arguments are forwarded to the parent constructor.
    """
    # TODO: change config structure
    super(BleuValidator, self).__init__(**kwargs)
    self.store_full_main_loop = store_full_main_loop
    self.source_sentence = source_sentence
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.n_best = n_best
    self.track_n_models = track_n_models
    self.normalize = normalize
    self.verbose = config.get('val_set_out', None)

    # Helpers
    # The indices used to be looked up through
    # data_stream.dataset.dictionary / unk_token / eos_token; that lookup
    # is disabled and the values are hard-coded instead.
    self.trg_ivocab = trg_ivocab
    self.unk_idx = 0  # fs439: TODO hardcoded
    self.eos_idx = 2  # fs439: TODO hardcoded
    self.best_models = []
    self.val_bleu_curve = []
    self.beam_search = BeamSearch(samples=samples)
    # config['bleu_script'] is a %-format template that receives the
    # ground-truth path; the result is split on whitespace.
    self.multibleu_cmd = (self.config['bleu_script'] %
                          self.config['val_set_grndtruth']).split()
    print("BLEU command: %s" % self.multibleu_cmd)

    # Create saving directory if it does not exist
    if not os.path.exists(self.config['saveto']):
        os.makedirs(self.config['saveto'])

    if self.config['reload']:
        try:
            bleu_score = numpy.load(
                os.path.join(self.config['saveto'],
                             'val_bleu_scores.npz'))
            self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

            # Track n best previous bleu scores
            for i, bleu in enumerate(
                    sorted(self.val_bleu_curve, reverse=True)):
                if i < self.track_n_models:
                    self.best_models.append(ModelInfo(bleu))
            logger.info("BleuScores Reloaded")
        except Exception:
            # Reloading stays best-effort, but a bare ``except:`` would
            # also swallow KeyboardInterrupt and SystemExit.
            logger.info("BleuScores not Found")
def __init__(self, source_sentence, initial_state_context, samples, model,
             data_stream, config, src_vocab=None, trg_vocab=None, n_best=1,
             track_n_models=1, normalize=True, **kwargs):
    """Set up the METEOR validator.

    Stores the arguments on the instance (``initial_state_context`` is
    kept as ``self.initial_context``), builds the ``BeamSearch`` helper
    from ``samples``, reads the METEOR settings from ``config``, creates
    ``config['saveto']`` if it is missing and, when ``config['reload']``
    is set, restores earlier scores from ``val_meteor_scores.npz``.
    Remaining keyword arguments are forwarded to the parent constructor.
    """
    super(MeteorValidator, self).__init__(**kwargs)
    self.source_sentence = source_sentence
    self.initial_context = initial_state_context
    self.src_vocab = src_vocab
    self.trg_vocab = trg_vocab
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.n_best = n_best
    self.track_n_models = track_n_models
    self.normalize = normalize
    self.verbose = config.get('val_set_out', None)

    # Helpers
    self.best_models = []
    self.val_meteor_curve = []
    self.beam_search = BeamSearch(samples=samples)

    # Info for Meteor
    self.target_language = self.config['target_lang']
    self.meteor_directory = self.config['meteor_directory']

    # Create save directory if it does not exist
    if not os.path.exists(self.config['saveto']):
        os.makedirs(self.config['saveto'])

    if self.config['reload']:
        try:
            meteor_score = numpy.load(
                os.path.join(self.config['saveto'],
                             'val_meteor_scores.npz'))
            self.val_meteor_curve = meteor_score['meteor_scores'].tolist()

            # Track n best previous meteor scores
            for i, meteor in enumerate(
                    sorted(self.val_meteor_curve, reverse=True)):
                if i < self.track_n_models:
                    self.best_models.append(
                        ModelInfo(meteor, key='METEOR'))
            logger.info("MeteorScores Reloaded")
        except Exception:
            # Reloading stays best-effort, but a bare ``except:`` would
            # also swallow KeyboardInterrupt and SystemExit.
            logger.info("MeteorScores not Found")
def __init__(self, source_sentence, target_prefix, samples, model,
             data_stream, config, src_vocab=None, trg_vocab=None, n_best=1,
             track_n_models=1, normalize=True, **kwargs):
    """Set up the BLEU validator.

    Stores the arguments on the instance, builds the ``BeamSearch``
    helper from ``samples`` and the BLEU command from ``config``, creates
    ``config['saveto']`` if it is missing and, when ``config['reload']``
    is set, restores earlier scores from ``val_bleu_scores.npz``.
    Remaining keyword arguments are forwarded to the parent constructor.
    """
    super(BleuValidator, self).__init__(**kwargs)
    self.source_sentence = source_sentence
    self.target_prefix = target_prefix
    self.src_vocab = src_vocab
    self.trg_vocab = trg_vocab
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.n_best = n_best
    self.track_n_models = track_n_models
    self.normalize = normalize
    self.verbose = config.get('val_set_out', None)

    # Helpers
    self.best_models = []
    self.val_bleu_curve = []
    self.beam_search = BeamSearch(samples=samples)
    # The trailing '<' is stored as a literal list element.
    self.multibleu_cmd = [
        'perl', self.config['bleu_script'],
        self.config['val_set_grndtruth'], '<'
    ]

    # Create save directory if it does not exist
    if not os.path.exists(self.config['saveto']):
        os.makedirs(self.config['saveto'])

    if self.config['reload']:
        try:
            bleu_score = numpy.load(
                os.path.join(self.config['saveto'],
                             'val_bleu_scores.npz'))
            self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

            # Track n best previous bleu scores
            for i, bleu in enumerate(
                    sorted(self.val_bleu_curve, reverse=True)):
                if i < self.track_n_models:
                    self.best_models.append(ModelInfo(bleu, key='BLEU'))
            logger.info("BleuScores Reloaded")
        except Exception:
            # Reloading stays best-effort, but a bare ``except:`` would
            # also swallow KeyboardInterrupt and SystemExit.
            logger.info("BleuScores not Found")
def init_beam_search(self, beam_size):
    """Compile beam search and set the beam size.

    The ``outputs`` variable produced by ``self.generator.generate`` is
    located in the generation graph and handed to a new ``BeamSearch``,
    which is compiled immediately.

    See Blocks issue #500.

    """
    self.beam_size = beam_size
    generated = self.get_generate_graph()
    find_outputs = VariableFilter(
        applications=[self.generator.generate], name="outputs")
    graph = ComputationGraph(generated['outputs'])
    # Exactly one matching variable is expected; unpacking enforces it.
    samples, = find_outputs(graph)
    self._beam_search = BeamSearch(beam_size, samples)
    self._beam_search.compile()
def __init__(self, samples, model, data_stream, config, n_best=1,
             track_n_models=1, normalize=True, **kwargs):
    """Set up the F1 validator.

    Stores the arguments on the instance, reads the vocabularies and
    special tokens from ``config``, builds the ``BeamSearch`` helper from
    ``samples``, creates ``config['saveto']`` if it is missing and, when
    ``config['reload']`` is set, restores earlier scores from
    ``val_f1_scores.npz``. Remaining keyword arguments are forwarded to
    the parent constructor.
    """
    # TODO: change config structure
    super(F1Validator, self).__init__(**kwargs)
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.n_best = n_best
    self.track_n_models = track_n_models
    self.normalize = normalize
    self.verbose = config.get('val_set_out', None)

    # Helpers
    self.vocab = config["src_vocab"]
    self.unk_sym = config["unk_token"]
    self.eos_sym = config["eos_token"]
    self.trg_vocab = config["trg_vocab"]
    # Inverse mapping: target vocabulary value -> key.
    self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}
    self.trg_eos_idx = self.trg_vocab[config["eos_token"]]
    self.unk_idx = self.vocab[self.unk_sym]
    self.eos_idx = self.vocab[self.eos_sym]
    self.best_models = []
    self.val_f1_curve = []
    self.beam_search = BeamSearch(samples=samples)

    # Create saving directory if it does not exist
    if not os.path.exists(self.config['saveto']):
        os.makedirs(self.config['saveto'])

    if self.config['reload']:
        try:
            f1_score = numpy.load(
                os.path.join(self.config['saveto'], 'val_f1_scores.npz'))
            self.val_f1_curve = f1_score['f1_scores'].tolist()

            # Track n best previous f1 scores
            for i, f1 in enumerate(
                    sorted(self.val_f1_curve, reverse=True)):
                if i < self.track_n_models:
                    self.best_models.append(ModelInfo(f1))
            logger.info("F1Scores Reloaded")
        except Exception:
            # Reloading stays best-effort, but a bare ``except:`` would
            # also swallow KeyboardInterrupt and SystemExit.
            logger.info("F1Scores not Found")
def __init__(self, source_char_seq, source_sample_matrix, source_char_aux,
             source_word_mask, samples, model, data_stream, config,
             n_best=1, track_n_models=1, normalize=True, **kwargs):
    """Set up the BLEU validator.

    Stores the arguments on the instance, reads the vocabulary and
    special tokens from ``data_stream.dataset``, builds the ``BeamSearch``
    helper from ``samples`` and the BLEU command from ``config``, creates
    ``config['saveto']`` if it is missing and, when ``config['reload']``
    is set, restores earlier scores from ``val_bleu_scores.npz``.
    Remaining keyword arguments are forwarded to the parent constructor.
    """
    # TODO: change config structure
    super(BleuValidator, self).__init__(**kwargs)
    self.source_char_seq = source_char_seq
    self.source_sample_matrix = source_sample_matrix
    self.source_char_aux = source_char_aux
    self.source_word_mask = source_word_mask
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.n_best = n_best
    self.track_n_models = track_n_models
    self.normalize = normalize
    self.verbose = config.get('val_set_out', None)

    # Helpers
    self.vocab = data_stream.dataset.dictionary
    # Inverse mapping: vocabulary value -> vocabulary key.
    self.src_ivocab = {v: k for k, v in self.vocab.items()}
    self.unk_sym = data_stream.dataset.unk_token
    self.eos_sym = data_stream.dataset.eos_token
    self.unk_idx = self.vocab[self.unk_sym]
    self.eos_idx = self.vocab[self.eos_sym]
    self.best_models = []
    self.val_bleu_curve = []
    self.beam_search = BeamSearch(samples=samples)
    # The trailing '<' is stored as a literal list element.
    self.multibleu_cmd = ['perl', self.config['bleu_script'],
                          self.config['val_set_grndtruth'], '<']

    # Create saving directory if it does not exist
    if not os.path.exists(self.config['saveto']):
        os.makedirs(self.config['saveto'])

    if self.config['reload']:
        try:
            bleu_score = numpy.load(
                os.path.join(self.config['saveto'],
                             'val_bleu_scores.npz'))
            self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

            # Track n best previous bleu scores. Each curve entry is a
            # mapping; its first value is used as the score.
            for i, bleu in enumerate(
                    sorted([list(v.values())[0]
                            for v in self.val_bleu_curve], reverse=True)):
                if i < self.track_n_models:
                    self.best_models.append(
                        ModelInfo(bleu, self.config['saveto']))
            logger.info("BleuScores Reloaded")
        except Exception:
            # Reloading stays best-effort, but a bare ``except:`` would
            # also swallow KeyboardInterrupt and SystemExit.
            logger.info("BleuScores not Found")
def init_beam_search(self, beam_size):
    """Compile beam search and set the beam size.

    Nothing happens when a beam search already exists for the requested
    ``beam_size``; otherwise a new ``BeamSearch`` is built from the
    generation graph and compiled.

    See Blocks issue #500.

    """
    already_built = hasattr(self, '_beam_search')
    if already_built and self.beam_size == beam_size:
        # Only recompile if the user wants a different beam size
        return

    self.beam_size = beam_size
    generated = self.get_generate_graph(use_mask=False, n_steps=3)
    cg = ComputationGraph(generated.values())
    find_outputs = VariableFilter(
        applications=[self.generator.generate], name="outputs")
    # Exactly one matching variable is expected; unpacking enforces it.
    samples, = find_outputs(cg)
    self._beam_search = BeamSearch(beam_size, samples)
    self._beam_search.compile()
def test_beam_search():
    """Test beam search using the model similar to the reverse_words demo.

    Ideally this test should be done with a trained model, but so far
    only with a randomly initialized one. So it does not really test
    the ability to find the best output sequence, but only correctness
    of returned costs.

    """
    rng = numpy.random.RandomState(1234)
    alphabet_size = 20
    beam_size = 10
    length = 15

    simple_generator = SimpleGenerator(10, alphabet_size, seed=1234)
    simple_generator.weights_init = IsotropicGaussian(0.5)
    simple_generator.biases_init = IsotropicGaussian(0.5)
    simple_generator.initialize()

    inputs = tensor.lmatrix('inputs')
    samples, = VariableFilter(
        applications=[simple_generator.generator.generate],
        name="outputs")(
            ComputationGraph(simple_generator.generate(inputs)))

    # One random sequence, repeated once per beam entry (one per column).
    input_vals = numpy.tile(rng.randint(alphabet_size, size=(length,)),
                            (beam_size, 1)).T

    search = BeamSearch(samples)
    results, mask, costs = search.search(
        {inputs: input_vals}, 0, 3 * length, as_arrays=True)
    # Just check sum
    assert results.sum() == 2816

    # Recompute the costs of the returned sequences with the model itself
    # and compare them with what the search reported.
    true_costs = simple_generator.cost(
        input_vals, numpy.ones((length, beam_size),
                               dtype=theano.config.floatX),
        results, mask).eval()
    true_costs = (true_costs * mask).sum(axis=0)
    assert_allclose(costs.sum(axis=0), true_costs, rtol=1e-5)

    # Test the default list output (``as_arrays`` not requested): every
    # list must equal the array row cut at its masked length.
    results2, _costs2 = search.search({inputs: input_vals},
                                      0, 3 * length)
    for i, hypothesis in enumerate(results2):
        # The mask sum may be a float; a slice bound must be an int.
        true_length = int(mask.T[i].sum())
        assert hypothesis == list(results.T[i, :true_length])
def test_beam_search():
    """Test beam search using the model similar to the reverse_words demo.

    Ideally this test should be done with a trained model, but so far
    only with a randomly initialized one. So it does not really test
    the ability to find the best output sequence, but only correctness
    of returned costs.

    """
    rng = numpy.random.RandomState(1234)
    alphabet_size = 20
    beam_size = 10
    length = 15

    simple_generator = SimpleGenerator(10, alphabet_size, seed=1234)
    simple_generator.weights_init = IsotropicGaussian(0.5)
    simple_generator.biases_init = IsotropicGaussian(0.5)
    simple_generator.initialize()

    inputs = tensor.lmatrix('inputs')
    samples, = VariableFilter(
        applications=[simple_generator.generator.generate],
        name="outputs")(ComputationGraph(simple_generator.generate(inputs)))

    # One random sequence, repeated once per beam entry (one per column).
    input_vals = numpy.tile(
        rng.randint(alphabet_size, size=(length, )), (beam_size, 1)).T

    search = BeamSearch(samples)
    results, mask, costs = search.search({inputs: input_vals},
                                         0,
                                         3 * length,
                                         as_arrays=True)
    # Just check sum
    assert results.sum() == 2816

    # Recompute the costs of the returned sequences with the model itself
    # and compare them with what the search reported.
    true_costs = simple_generator.cost(
        input_vals,
        numpy.ones((length, beam_size), dtype=theano.config.floatX),
        results, mask).eval()
    true_costs = (true_costs * mask).sum(axis=0)
    assert_allclose(costs.sum(axis=0), true_costs, rtol=1e-5)

    # Test the default list output (``as_arrays`` not requested): every
    # list must equal the array row cut at its masked length.
    results2, _costs2 = search.search({inputs: input_vals}, 0, 3 * length)
    for i, hypothesis in enumerate(results2):
        # The mask sum may be a float; a slice bound must be an int.
        true_length = int(mask.T[i].sum())
        assert hypothesis == list(results.T[i, :true_length])
def generate(input_):
    """Generate output sequences for an input sequence.

    Encapsulates most of the difference between sampling and
    beam search.

    Returns
    -------
    outputs : list of lists
        Trimmed output sequences.
    costs : list
        The negative log-likelihood of generating the respective
        sequences.

    """
    if mode == "beam_search":
        find_outputs = VariableFilter(
            bricks=[reverser.generator], name="outputs")
        samples, = find_outputs(ComputationGraph(generated[1]))
        # NOTE: this will recompile beam search functions
        # every time user presses Enter. Do not create
        # a new `BeamSearch` object every time if
        # speed is important for you.
        beam_search = BeamSearch(input_.shape[1], samples)
        outputs, _, costs = beam_search.search(
            {chars: input_}, char2code['</S>'],
            3 * input_.shape[0])
    else:
        sample_fn = model.get_theano_function()
        _1, outputs, _2, _3, costs = sample_fn(input_)
        costs = costs.T

    # One entry per generated sequence (the columns of ``outputs``).
    outputs = [list(column) for column in outputs.T]
    costs = list(costs)
    for i, sequence in enumerate(outputs):
        # Keep everything up to and including the first end marker; keep
        # the whole sequence when no end marker was produced.
        try:
            true_length = sequence.index(char2code['</S>']) + 1
        except ValueError:
            true_length = len(sequence)
        outputs[i] = sequence[:true_length]
        if mode == "sample":
            # Per-step costs are summed over the kept prefix only.
            costs[i] = costs[i][:true_length].sum()
    return outputs, costs
def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
             phoneme_dict=None, black_list=None, language_model=False):
    """Store the decoding inputs and compile the beam search.

    With ``language_model`` set, the search is a ``BeamSearchLM`` driven
    by a ``TrigramLanguageModel``; otherwise it is a plain
    ``BeamSearch``. The chosen search object is compiled right away.
    ``black_list`` falls back to a new empty list when omitted.
    """
    self.x = x
    self.x_mask = x_mask
    self.eol_symbol = eol_symbol
    self.beam_size = beam_size
    self.phoneme_dict = phoneme_dict
    self.black_list = black_list if black_list is not None else []

    if language_model:
        lm = TrigramLanguageModel()
        # Position in ``lm.unigrams`` -> unigram entry.
        ind_to_word = dict(
            (position, word) for position, word in enumerate(lm.unigrams))
        searcher = BeamSearchLM(lm, 1., ind_to_word, beam_size, samples)
    else:
        searcher = BeamSearch(beam_size, samples)
    self.beam_search = searcher
    self.beam_search.compile()
def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
             phoneme_dict=None, black_list=None):
    """Store the decoding inputs and compile a ``BeamSearch``.

    ``black_list`` falls back to a new empty list when omitted; the
    other arguments are stored unchanged. The beam search is built from
    ``beam_size`` and ``samples`` and compiled before returning.
    """
    self.black_list = [] if black_list is None else black_list
    self.x = x
    self.x_mask = x_mask
    self.eol_symbol = eol_symbol
    self.beam_size = beam_size

    self.beam_search = BeamSearch(beam_size, samples)
    self.beam_search.compile()

    self.phoneme_dict = phoneme_dict
def __init__(
    self,
    source_sentence,
    samples,
    model,
    data_stream,
    config,
    n_best=1,
    track_n_models=1,
    trg_ivocab=None,
    src_eos_idx=-1,
    trg_eos_idx=-1,
    **kwargs
):
    """Set up the BLEU validator.

    Stores the arguments on the instance, reads the vocabulary and the
    unknown/end-of-sequence tokens from ``data_stream.dataset``, builds a
    ``BeamSearch`` of width ``config["beam_size"]`` from ``samples`` and
    the BLEU command from ``config``, creates ``config["saveto"]`` if it
    is missing and, when ``config["reload"]`` is set, restores earlier
    scores from ``val_bleu_scores.npz``. Remaining keyword arguments are
    forwarded to the parent constructor.
    """
    super(BleuValidator, self).__init__(**kwargs)
    self.source_sentence = source_sentence
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.n_best = n_best
    self.track_n_models = track_n_models
    self.verbose = config.get("val_set_out", None)
    self.src_eos_idx = src_eos_idx
    self.trg_eos_idx = trg_eos_idx

    # Helpers
    self.vocab = data_stream.dataset.dictionary
    self.trg_ivocab = trg_ivocab
    self.unk_sym = data_stream.dataset.unk_token
    self.eos_sym = data_stream.dataset.eos_token
    self.unk_idx = self.vocab[self.unk_sym]
    # The explicit source EOS index is used instead of
    # self.vocab[self.eos_sym].
    self.eos_idx = self.src_eos_idx
    self.best_models = []
    self.val_bleu_curve = []
    self.beam_search = BeamSearch(beam_size=self.config["beam_size"],
                                  samples=samples)
    # The trailing "<" is stored as a literal list element.
    self.multibleu_cmd = ["perl", self.config["bleu_script"],
                          self.config["val_set_grndtruth"], "<"]

    # Create saving directory if it does not exist
    if not os.path.exists(self.config["saveto"]):
        os.makedirs(self.config["saveto"])

    if self.config["reload"]:
        try:
            bleu_score = numpy.load(
                os.path.join(self.config["saveto"],
                             "val_bleu_scores.npz"))
            self.val_bleu_curve = bleu_score["bleu_scores"].tolist()

            # Track n best previous bleu scores
            for i, bleu in enumerate(
                    sorted(self.val_bleu_curve, reverse=True)):
                if i < self.track_n_models:
                    self.best_models.append(ModelInfo(bleu))
            logger.info("BleuScores Reloaded")
        except Exception:
            # Reloading stays best-effort, but a bare ``except:`` would
            # also swallow KeyboardInterrupt and SystemExit.
            logger.info("BleuScores not Found")
def __init__(self, source_sentence, samples, model, data_stream,
             ground_truth, config, val_out=None, val_best_out=None,
             n_best=1, normalize=True, **kwargs):
    """Set up the BLEU evaluator.

    Stores the arguments on the instance, reads the unknown and
    end-of-sequence ids from ``config``, builds the ``BeamSearch`` helper
    from ``samples`` and the BLEU command that scores against
    ``ground_truth``. Remaining keyword arguments are forwarded to the
    parent constructor.
    """
    # TODO: change config structure
    super(BleuEvaluator, self).__init__(**kwargs)

    self.source_sentence = source_sentence
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.n_best = n_best
    self.normalize = normalize

    self.val_out = val_out
    # ``val_best_out`` only takes effect when ``val_out`` is truthy;
    # otherwise the falsy ``val_out`` value itself is stored.
    self.val_best_out = val_best_out if val_out else val_out

    self.bleu_scores = []
    self.trg_ivocab = None
    self.unk_id = config['unk_id']
    self.eos_id = config['eos_id']
    self.beam_search = BeamSearch(samples=samples)

    # The trailing '<' is stored as a literal list element.
    bleu_script = self.config['bleu_script']
    self.multibleu_cmd = ['perl', bleu_script, ground_truth, '<']
def generate(input_):
    """Generate output sequences for an input sequence.

    Encapsulates most of the difference between sampling and
    beam search.

    Returns
    -------
    outputs : list of lists
        Trimmed output sequences.
    costs : list
        The negative log-likelihood of generating the respective
        sequences.

    """
    if mode == "beam_search":
        find_outputs = VariableFilter(
            bricks=[reverser.generator], name="outputs")
        samples, = find_outputs(ComputationGraph(generated[1]))
        # NOTE: this will recompile beam search functions
        # every time user presses Enter. Do not create
        # a new `BeamSearch` object every time if
        # speed is important for you.
        beam_search = BeamSearch(input_.shape[1], samples)
        # The search result is returned unchanged in this mode.
        outputs, costs = beam_search.search(
            {chars: input_}, char2code['</S>'],
            3 * input_.shape[0])
    else:
        sample_fn = model.get_theano_function()
        _1, outputs, _2, _3, costs = sample_fn(input_)
        # One entry per generated sequence (the columns of the arrays).
        outputs = [list(column) for column in outputs.T]
        costs = list(costs.T)
        for i, sequence in enumerate(outputs):
            # Keep everything up to and including the first end marker;
            # keep the whole sequence when none was produced.
            try:
                true_length = sequence.index(char2code['</S>']) + 1
            except ValueError:
                true_length = len(sequence)
            outputs[i] = sequence[:true_length]
            # Per-step costs are summed over the kept prefix only.
            costs[i] = costs[i][:true_length].sum()
    return outputs, costs
def init_beam_search(self, beam_size):
    """Compile beam search and set the beam size.

    Finds the ``outputs`` variable of ``self.generator.generate`` in the
    generation graph, wraps it in a ``BeamSearch`` of the given width and
    compiles it.

    See Blocks issue #500.

    """
    self.beam_size = beam_size
    generated = self.get_generate_graph()
    output_filter = VariableFilter(
        applications=[self.generator.generate], name="outputs")
    # Exactly one matching variable is expected; unpacking enforces it.
    samples, = output_filter(ComputationGraph(generated['outputs']))
    self._beam_search = BeamSearch(beam_size, samples)
    self._beam_search.compile()
def set_up_decoder(self, nmt_model_path):
    """Initialize the NMT model described by ``self.config``.

    This method basically corresponds to
    ``blocks.machine_translation.main``. After the weights are loaded,
    the source and target sparse feature maps are taken from the
    configuration (falling back to ``FlatSparseFeatMap``) and the
    matching beam search object is created.

    Args:
        nmt_model_path (string): Path to the NMT model file (.npz)
    """
    self.nmt_model = NMTModel(self.config)
    self.nmt_model.set_up()
    loader = LoadNMTUtils(nmt_model_path,
                          self.config['saveto'],
                          self.nmt_model.search_model)
    loader.load_weights()

    # A falsy configuration entry means "no sparse features".
    self.src_sparse_feat_map = (self.config['src_sparse_feat_map']
                                or FlatSparseFeatMap())
    if not self.config['trg_sparse_feat_map']:
        self.trg_sparse_feat_map = FlatSparseFeatMap()
        self.beam_search = BeamSearch(samples=self.nmt_model.samples)
    else:
        self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
        self.beam_search = SparseBeamSearch(
            samples=self.nmt_model.samples,
            trg_sparse_feat_map=self.trg_sparse_feat_map)
def __init__(self, source_sentence, samples, model, data_stream, config,
             n_best=1, track_n_models=1, trg_ivocab=None, normalize=True,
             store_full_main_loop=False, **kwargs):
    """Set up the BLEU validator.

    Stores the arguments on the instance, builds the ``BeamSearch``
    helper from ``samples`` and the BLEU command from ``config``, creates
    ``config['saveto']`` if it is missing and, when ``config['reload']``
    is set, restores earlier scores from ``val_bleu_scores.npz``.
    Remaining keyword arguments are forwarded to the parent constructor.
    """
    # TODO: change config structure
    super(BleuValidator, self).__init__(**kwargs)
    self.store_full_main_loop = store_full_main_loop
    self.source_sentence = source_sentence
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.n_best = n_best
    self.track_n_models = track_n_models
    self.normalize = normalize
    self.verbose = config.get('val_set_out', None)

    # Helpers
    # The indices used to be looked up through
    # data_stream.dataset.dictionary / unk_token / eos_token; that lookup
    # is disabled and the values are hard-coded instead.
    self.trg_ivocab = trg_ivocab
    self.unk_idx = 0  # fs439: TODO hardcoded
    self.eos_idx = 2  # fs439: TODO hardcoded
    self.best_models = []
    self.val_bleu_curve = []
    self.beam_search = BeamSearch(samples=samples)
    # config['bleu_script'] is a %-format template that receives the
    # ground-truth path; the result is split on whitespace.
    self.multibleu_cmd = (self.config['bleu_script'] %
                          self.config['val_set_grndtruth']).split()
    print("BLEU command: %s" % self.multibleu_cmd)

    # Create saving directory if it does not exist
    if not os.path.exists(self.config['saveto']):
        os.makedirs(self.config['saveto'])

    if self.config['reload']:
        try:
            bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                    'val_bleu_scores.npz'))
            self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

            # Track n best previous bleu scores
            for i, bleu in enumerate(
                    sorted(self.val_bleu_curve, reverse=True)):
                if i < self.track_n_models:
                    self.best_models.append(ModelInfo(bleu))
            logger.info("BleuScores Reloaded")
        except Exception:
            # Reloading stays best-effort, but a bare ``except:`` would
            # also swallow KeyboardInterrupt and SystemExit.
            logger.info("BleuScores not Found")
def load_params_and_get_beam_search(exp_config):
    """Build the sampling graph, load saved parameters, return the search.

    An encoder and an initial-context decoder are created from
    ``exp_config``, the generation graph is built on two fresh Theano
    variables, and the parameters stored under
    ``exp_config['saved_parameters']`` are loaded into the model.

    Returns a tuple ``(beam_search, sampling_input, sampling_context)``.
    """
    encoder = BidirectionalEncoder(exp_config['src_vocab_size'],
                                   exp_config['enc_embed'],
                                   exp_config['enc_nhids'])

    # let user specify the target transition class name in config,
    # eval it and pass to decoder
    # NOTE(review): eval() executes whatever string the config holds;
    # only safe while the experiment config is trusted.
    target_transition_name = exp_config.get(
        'target_transition', 'GRUInitialStateWithInitialStateSumContext')
    target_transition = eval(target_transition_name)

    decoder = InitialContextDecoder(exp_config['trg_vocab_size'],
                                    exp_config['dec_embed'],
                                    exp_config['dec_nhids'],
                                    exp_config['enc_nhids'] * 2,
                                    exp_config['context_dim'],
                                    target_transition)

    # Create Theano variables
    logger.info('Creating theano variables')
    sampling_input = tensor.lmatrix('source')
    sampling_context = tensor.matrix('context_input')

    logger.info("Building sampling model")
    source_mask = tensor.ones(sampling_input.shape)
    sampling_representation = encoder.apply(sampling_input, source_mask)
    generated = decoder.generate(sampling_input, sampling_representation,
                                 sampling_context)

    # generated[1] is next_outputs
    output_filter = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")
    # Two variables match; the second one is handed to the beam search.
    _, samples = output_filter(ComputationGraph(generated[1]))
    beam_search = BeamSearch(samples=samples)

    # Set the parameters
    logger.info("Creating Model...")
    model = Model(generated)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))

    # load the parameter values from an .npz file
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'])
    LoadNMT.set_model_parameters(model, param_values)

    return beam_search, sampling_input, sampling_context
def init_beam_search(self, beam_size):
    """Compile beam search and set the beam size.

    Returns immediately when a beam search for the same ``beam_size`` is
    already present; otherwise builds a new ``BeamSearch`` from the
    generation graph and compiles it.

    See Blocks issue #500.

    """
    if hasattr(self, '_beam_search'):
        if self.beam_size == beam_size:
            # Only recompile if the user wants a different beam size
            return

    self.beam_size = beam_size
    generated = self.get_generate_graph(use_mask=False, n_steps=3)
    cg = ComputationGraph(generated.values())
    output_filter = VariableFilter(
        applications=[self.generator.generate], name="outputs")
    # Exactly one matching variable is expected; unpacking enforces it.
    samples, = output_filter(cg)
    self._beam_search = BeamSearch(beam_size, samples)
    self._beam_search.compile()
def set_up_decoder(self, nmt_specs):
    """Set up one NMT model and one ``BeamSearch`` per ensemble member.

    Note that we do not use the ``BeamSearch.search`` method for
    ensemble decoding directly.

    Args:
        nmt_specs (iterable): Pairs ``(nmt_model_path, nmt_config)``.
            Each pair supplies the path handed to the weight loader and
            the configuration used to build that model.
    """
    self.nmt_models = []
    self.beam_searches = []
    for model_path, model_config in nmt_specs:
        member = NMTModel(model_config)
        member.set_up()
        weight_loader = LoadNMTUtils(model_path,
                                     model_config['saveto'],
                                     member.search_model)
        weight_loader.load_weights()
        # Models and searches are kept in the same order.
        self.nmt_models.append(member)
        self.beam_searches.append(BeamSearch(samples=member.samples))
def load_params_and_get_beam_search(exp_config):
    """Build the sampling graph, load saved parameters, return the search.

    An encoder and a decoder are created from ``exp_config``, the
    generation graph is built on a fresh source variable, and the
    parameters stored under ``exp_config['saved_parameters']`` are loaded
    into the model.

    Returns a tuple ``(beam_search, sampling_input)``.
    """
    encoder = BidirectionalEncoder(exp_config['src_vocab_size'],
                                   exp_config['enc_embed'],
                                   exp_config['enc_nhids'])
    decoder = Decoder(exp_config['trg_vocab_size'],
                      exp_config['dec_embed'],
                      exp_config['dec_nhids'],
                      exp_config['enc_nhids'] * 2)

    # Create Theano variables
    logger.info('Creating theano variables')
    sampling_input = tensor.lmatrix('source')

    # Get beam search
    logger.info("Building sampling model")
    source_mask = tensor.ones(sampling_input.shape)
    sampling_representation = encoder.apply(sampling_input, source_mask)
    generated = decoder.generate(sampling_input, sampling_representation)

    # generated[1] is next_outputs
    output_filter = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")
    # Two variables match; the second one is handed to the beam search.
    _, samples = output_filter(ComputationGraph(generated[1]))
    beam_search = BeamSearch(samples=samples)

    # Set the parameters
    logger.info("Creating Model...")
    model = Model(generated)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))

    # Load the parameter values from an .npz file. The
    # `saved_parameters` entry is read unconditionally; only
    # `brick_delimiter` is optional.
    delimiter = exp_config.get('brick_delimiter', None)
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'], brick_delimiter=delimiter)
    LoadNMT.set_model_parameters(model, param_values)

    return beam_search, sampling_input
class BlocksNMTVanillaDecoder(Decoder):
    """Adaptor class for blocks.search.BeamSearch.

    We implement the ``Decoder`` class but ignore functionality for
    predictors or heuristics. Instead, we pass through decoding directly
    to the blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system NMT
    decoding. Note that this decoder supports sparse feat maps on both
    source and target side.
    """

    def __init__(self, nmt_model_path, config, decoder_args):
        """Set up the NMT model used by the decoder.

        Args:
            nmt_model_path (string): Path to the NMT model file (.npz)
            config (dict): NMT configuration
            decoder_args (object): Decoder configuration passed through
                                   from configuration API.
        """
        super(BlocksNMTVanillaDecoder, self).__init__(decoder_args)
        self.config = config
        self.set_up_decoder(nmt_model_path)
        # Dense representation of the source-side end-of-sentence symbol.
        self.src_eos = self.src_sparse_feat_map.word2dense(utils.EOS_ID)

    def set_up_decoder(self, nmt_model_path):
        """Initialize the NMT model described by ``self.config``.

        This method basically corresponds to
        ``blocks.machine_translation.main``. After the weights are
        loaded, the sparse feature maps are taken from the configuration
        (falling back to ``FlatSparseFeatMap``) and the matching beam
        search object is created.

        Args:
            nmt_model_path (string): Path to the NMT model file (.npz)
        """
        self.nmt_model = NMTModel(self.config)
        self.nmt_model.set_up()
        loader = LoadNMTUtils(nmt_model_path,
                              self.config['saveto'],
                              self.nmt_model.search_model)
        loader.load_weights()

        # A falsy configuration entry means "no sparse features".
        self.src_sparse_feat_map = (self.config['src_sparse_feat_map']
                                    or FlatSparseFeatMap())
        if not self.config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=self.nmt_model.samples)
        else:
            self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=self.nmt_model.samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)

    def decode(self, src_sentence):
        """Decode one source sentence with the blocks beam search.

        Does not use predictors. Note that the score breakdowns in
        returned hypotheses are only on the sentence level, not on the
        word level. For finer grained NMT scores you need to use the nmt
        predictor. ``src_sentence`` is a list of source word ids
        representing the source sentence without <S> or </S> symbols. As
        blocks expects to see </S>, this method adds it automatically.

        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence

        Returns:
            list. A list of ``Hypothesis`` instances ordered by their
            score.
        """
        beam_size = self.config['beam_size']
        known_words = utils.oov_to_unk(src_sentence,
                                       self.config['src_vocab_size'])
        seq = self.src_sparse_feat_map.words2dense(known_words) \
            + [self.src_eos]

        # The source sequence is repeated once per beam entry.
        if self.src_sparse_feat_map.dim > 1:  # sparse src feats
            tiled = np.tile(seq, (beam_size, 1, 1))
            input_ = np.transpose(tiled, (2, 0, 1))
        else:  # word ids on the source side
            input_ = np.tile(seq, (beam_size, 1))

        trans, costs = self.beam_search.search(
            input_values={self.nmt_model.sampling_input: input_},
            max_length=3*len(src_sentence),
            eol_symbol=utils.EOS_ID,
            ignore_first_eol=True)

        hypos = []
        max_len = 0
        for translation, cost in zip(trans, costs):
            score = -cost
            n_words = len(translation)
            if n_words > max_len:
                max_len = n_words
            hypo = Hypothesis(translation, score)
            # Only a sentence-level score is available, so it is
            # attached to the first position; the rest stay neutral.
            hypo.score_breakdown = n_words * [[(0.0, 1.0)]]
            hypo.score_breakdown[0] = [(score, 1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.config['beam_size']
        return hypos

    def has_predictors(self):
        """Always returns true. """
        return True
def __init__(self, source_sentence, samples, model, data_stream, config,
             n_best=1, track_n_models=1, trg_ivocab=None, normalize=True,
             **kwargs):
    """Set up the validator.

    Stores the arguments on the instance, builds the ``BeamSearch``
    helper from ``samples`` and creates ``config['saveto']`` if it is
    missing. ``trg_ivocab`` is accepted but not stored, and
    ``config['reload']`` is not consulted. Remaining keyword arguments
    are forwarded to the parent constructor.
    """
    # TODO: change config structure
    super(BleuValidator, self).__init__(**kwargs)
    self.source_sentence = source_sentence
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.n_best = n_best
    self.track_n_models = track_n_models
    self.normalize = normalize
    # TODO: set this to a file and True for a sentence output
    self.verbose = config.get('val_set_out', None)

    # Helpers
    # The vocabulary helpers (vocab, trg_ivocab, unk/eos symbols and
    # indices), best_models, val_bleu_curve and multibleu_cmd are
    # deliberately not set up here. They used to be "commented out" with
    # a string literal, which is a statement and not a comment.
    self.beam_search = BeamSearch(samples=samples)
    self.eow_idx = 2  # TODO: this is a hack

    # Create saving directory if it does not exist
    if not os.path.exists(self.config['saveto']):
        os.makedirs(self.config['saveto'])

    # Reloading earlier BLEU scores is switched off. The former
    # ``if False:`` branch could never run, hid errors behind a bare
    # ``except:`` and read ``self.best_models``, which is never
    # initialised in this constructor. Restore it together with those
    # attributes if reloading is needed again.
encoder.push_initialization_config() # push_initialization_config 已经被预先定义在Initializable里的方法 decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() sampling_representation = encoder.apply( source_sentence, tensor.ones(source_sentence.shape)) generated = decoder.generate(source_sentence, sampling_representation) # modified here to add the functions. search_model = Model(generated) _, samples = VariableFilter(bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(generated[1])) weights = VariableFilter(bricks=[decoder.sequence_generator],name="weights")(cg.variables) getAlignment = function([source_sentence, target_sentence], weights) beam_search = BeamSearch(samples=samples) saveTo = "/Users/lqy/Documents/search_model_fr2en_backup/" load_model = loadNMTfromFile(saveTo) model = load_model.load_to(search_model) nmt = translateSentence(config=config,model=search_model, data_stream=validate_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size']) nmt.initialValue(tr_stream) unk_idx = config['unk_id'] src_eos_idx = config['src_vocab_size'] - 1 trg_eos_idx = config['trg_vocab_size'] - 1
def decode(self, src_sentence):
    """This is a generalization to NMT ensembles of
    ``BeamSearch.search``.

    The values returned by ``compute_logprobs`` of every network are
    summed and the beam is expanded with ``BeamSearch._smallest`` on the
    accumulated costs. As a side effect ``self.apply_predictors_count``
    is set to the longest translation length times the beam size.

    Args:
        src_sentence (list): List of source word ids without <S> or
                             </S> which make up the source sentence

    Returns:
        list. A list of ``Hypothesis`` instances ordered by their
        score.
    """
    # Compile the beam searches lazily on first use.
    for search in self.beam_searches:
        if not search.compiled:
            search.compile()
    # Map OOV ids to UNK, convert to the dense representation and append
    # the source end-of-sentence symbol.
    seq = self.src_sparse_feat_map.words2dense(
        utils.oov_to_unk(src_sentence, self.src_vocab_size)) + [self.src_eos]
    # Replicate the source sentence once per beam entry.
    if self.src_sparse_feat_map.dim > 1:  # sparse src feats
        input_ = np.transpose(np.tile(seq, (self.beam_size, 1, 1)),
                              (2, 0, 1))
    else:  # word ids on the source side
        input_ = np.tile(seq, (self.beam_size, 1))

    # One (contexts, states, search) triple per network in the ensemble.
    contexts_and_states = []
    for sys_idx in xrange(self.n_networks):
        contexts, states, _ = \
            self.beam_searches[sys_idx].compute_initial_states_and_contexts(
                {self.nmt_models[sys_idx].sampling_input: input_})
        contexts_and_states.append(
            (contexts, states, self.beam_searches[sys_idx]))

    # This array will store all generated outputs, including those from
    # previous step and those from already finished sequences.
    # NOTE: ``states`` here is the state dict of the last network handled
    # by the loop above.
    all_outputs = states['outputs'][None, :]
    all_masks = np.ones_like(all_outputs, dtype=config.floatX)
    all_costs = np.zeros_like(all_outputs, dtype=config.floatX)

    # At most 3 * len(src_sentence) expansion steps.
    for i in range(3 * len(src_sentence)):
        # Stop as soon as every hypothesis in the beam is finished.
        if all_masks[-1].sum() == 0:
            break

        # Sum the per-network values over the ensemble.
        logprobs_lst = []
        for contexts, states, search in contexts_and_states:
            logprobs_lst.append(search.compute_logprobs(contexts, states))

        logprobs = np.sum(logprobs_lst, axis=0)
        # Finished hypotheses (mask 0) do not accumulate further cost.
        next_costs = (all_costs[-1, :, None] +
                      logprobs * all_masks[-1, :, None])
        # Finished hypotheses may only be continued with EOS: every other
        # column gets infinite cost.
        (finished, ) = np.where(all_masks[-1] == 0)
        next_costs[finished, :utils.EOS_ID] = np.inf
        next_costs[finished, utils.EOS_ID + 1:] = np.inf

        # The `i == 0` is required because at the first step the beam
        # size is effectively only 1.
        (indexes, outputs), chosen_costs = BeamSearch._smallest(
            next_costs, self.beam_size, only_first_row=i == 0)

        # Reorder the history so that it follows the surviving entries.
        all_outputs = all_outputs[:, indexes]
        all_masks = all_masks[:, indexes]
        all_costs = all_costs[:, indexes]

        # Rearrange everything
        for contexts, states, search in contexts_and_states:
            # The state dicts are updated in place, so the triples stored
            # in ``contexts_and_states`` see the new values.
            for name in states:
                states[name] = states[name][indexes]
            states.update(
                search.compute_next_states(contexts, states, outputs))

        all_outputs = np.vstack([all_outputs, outputs[None, :]])
        all_costs = np.vstack([all_costs, chosen_costs[None, :]])
        # A hypothesis is marked finished once it emits EOS, except at the
        # very first step where the mask is forced to 1.
        mask = outputs != utils.EOS_ID
        if i == 0:
            mask[:] = 1
        all_masks = np.vstack([all_masks, mask[None, :]])

    # Drop the initial outputs row, align the masks with the outputs and
    # turn the cumulative costs into per-step costs.
    all_outputs = all_outputs[1:]
    all_masks = all_masks[:-1]
    all_costs = all_costs[1:] - all_costs[:-1]
    result = all_outputs, all_masks, all_costs
    trans, costs = BeamSearch.result_to_lists(result)
    hypos = []
    max_len = 0
    for idx in xrange(len(trans)):
        max_len = max(max_len, len(trans[idx]))
        # Scores are negated costs.
        hypo = Hypothesis(trans[idx], -costs[idx])
        # The whole score is attributed to the first position; all other
        # positions get a zero entry.
        hypo.score_breakdown = len(trans[idx]) * [[(0.0, 1.0)]]
        hypo.score_breakdown[0] = [(-costs[idx], 1.0)]
        hypos.append(hypo)
    self.apply_predictors_count = max_len * self.beam_size
    return hypos
class IMT_F1_Validator(SimpleExtension, SamplingBase):
    """Implements early stopping based on the IMT F1 score.

    On every callback (after the burn-in period) the validation stream is
    translated with beam search given a target prefix, the hypotheses are
    scored with ``imt_f1_from_files`` against the reference file and the
    model parameters are saved when the score enters the n-best list.
    """

    def __init__(self, source_sentence, target_prefix, samples, model,
                 data_stream, config, src_vocab=None, trg_vocab=None,
                 n_best=1, track_n_models=1, normalize=True, **kwargs):
        """Store the settings, build the beam search and reload old scores.

        ``config`` is read for ``'val_set_out'`` (optional),
        ``'target_lang'``, ``'saveto'`` and ``'reload'``. Remaining keyword
        arguments are forwarded to the parent constructor.
        """
        super(IMT_F1_Validator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.target_prefix = target_prefix
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.best_models = []
        self.val_imt_f1_curve = []
        self.beam_search = BeamSearch(samples=samples)

        # Info for Meteor
        self.target_language = self.config['target_lang']

        # Create save directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                imt_f1_score = numpy.load(
                    os.path.join(self.config['saveto'],
                                 'val_imt_f1_scores.npz'))
                self.val_imt_f1_curve = imt_f1_score['imt_f1_scores'].tolist()

                # Track n best previous IMT F1 scores
                for i, imt_f1_val in enumerate(
                        sorted(self.val_imt_f1_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(
                            ModelInfo(imt_f1_val, key='IMT_F1'))
                logger.info("IMT_F1_Scores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file is not
                # fatal, but KeyboardInterrupt / SystemExit must propagate.
                logger.info("IMT_F1_Scores not found")

    def do(self, which_callback, *args):
        """Evaluate, log the score and save the model if it qualifies."""
        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return

        # Evaluate the model
        imt_f1_score = self._evaluate_model()
        # add an entry to the log
        self.main_loop.log.current_row[
            'validation_set_imt_f1_score'] = imt_f1_score
        # save if necessary
        self._save_model(imt_f1_score)

    # TODO: if we are evaluating both BLEU and METEOR, we shouldn't need to
    # translate twice!!
    def _evaluate_model(self):
        """Translate the validation stream and return its IMT F1 score.

        The best hypothesis of every line is written to a temporary file
        (removed afterwards) and, when ``val_set_out`` is configured, to
        that file as well. The score is appended to
        ``self.val_imt_f1_curve``.
        """
        # Set in the superclass -- SamplingBase
        if not hasattr(self, 'target_dataset'):
            self._initialize_dataset_info()

        self.unk_sym = '<UNK>'
        self.eos_sym = '</S>'
        self.unk_idx = self.trg_vocab[self.unk_sym]
        self.eos_idx = self.trg_vocab[self.eos_sym]

        logger.info("Started Validation: ")
        val_start_time = time.time()

        ref_file = self.config['val_set_grndtruth']
        # Only the name of the temporary file is needed: close the handle
        # right away and reopen the file with the utf8 codec below.
        trg_hyp_file = tempfile.NamedTemporaryFile(delete=False)
        trg_hyp_file.close()

        try:
            ftrans = None
            if self.verbose:
                ftrans = codecs.open(self.config['val_set_out'], 'w',
                                     encoding='utf8')
            try:
                total_cost = self._translate_validation_set(
                    trg_hyp_file.name, ftrans)
            finally:
                if ftrans is not None:
                    ftrans.close()

            logger.info(
                "Total cost of the validation: {}".format(total_cost))
            self.data_stream.reset()

            imt_f1_score, imt_precision, imt_recall = imt_f1_from_files(
                trg_hyp_file.name, ref_file)
        finally:
            # The hypothesis file is only an intermediate artifact.
            try:
                os.remove(trg_hyp_file.name)
            except OSError:
                logger.info("Could not remove temporary file {}".format(
                    trg_hyp_file.name))

        logger.info("IMT F1 Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        logger.info("IMT F1: {}, Precision: {}, Recall: {}".format(
            imt_f1_score, imt_precision, imt_recall))

        # Record the score so that the curve written by ``_save_model``
        # actually grows over time.
        self.val_imt_f1_curve.append(imt_f1_score)
        return imt_f1_score

    def _translate_validation_set(self, hyp_path, ftrans):
        """Write the best hypothesis per line; return the summed cost."""
        total_cost = 0.0
        with codecs.open(hyp_path, 'w', encoding='utf8') as hyps_out:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file.
                # TODO: the section with beam search and translation is
                # shared by all validators
                # WORKING: switch this to IMT prefix validation
                # Note that the indices of source and target in the
                # datastream are hard-coded; currently our datastream is
                # (source, target, prefix, suffix)
                # NOTE(review): ``self.unk_idx`` comes from the target
                # vocabulary but is applied to the source sequence --
                # presumably both vocabularies share the UNK id; confirm.
                seq = self._oov_to_unk(
                    line[0], self.config['src_vocab_size'], self.unk_idx)
                target_prefix = line[2]

                input_ = numpy.tile(seq, (self.config['beam_size'], 1))
                prefix_input_ = numpy.tile(
                    target_prefix, (self.config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                # beam search param names come from WHERE??
                trans, costs = self.beam_search.search(
                    input_values={
                        self.source_sentence: input_,
                        self.target_prefix: prefix_input_
                    },
                    max_length=3 * len(seq),
                    eol_symbol=self.eos_idx,
                    ignore_first_eol=False)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # convert idx to words
                        # NOTE(review): ``self.trg_ivocab`` is not set in
                        # this class -- presumably provided by
                        # ``_initialize_dataset_info``; confirm.
                        trans_out = self._idx_to_word(
                            trans_out, self.trg_ivocab)
                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i + 1))
                        trans_out = '<UNK>'

                    # Only the single best hypothesis is written out.
                    if j == 0:
                        hyps_out.write(trans_out.decode('utf8') + '\n')
                        if ftrans is not None:
                            print(trans_out.decode('utf8'), file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of validation set...".format(i))
        return total_cost

    def _is_valid_to_save(self, imt_f1_score):
        """Return True if the score beats the worst tracked model."""
        if not self.best_models:
            return True
        worst = min(self.best_models, key=operator.attrgetter('score'))
        return worst.score < imt_f1_score

    def _save_model(self, imt_f1_score):
        """Save the parameters and the score curve if the score qualifies."""
        if not self._is_valid_to_save(imt_f1_score):
            return

        model = ModelInfo(imt_f1_score, self.config['saveto'], key='IMT_F1')

        # Manage n-best model list first; the list is kept sorted by score,
        # so the first entry is the worst tracked model.
        if len(self.best_models) >= self.track_n_models:
            old_model = self.best_models[0]
            if old_model.path and os.path.isfile(old_model.path):
                logger.info("Deleting old model %s" % old_model.path)
                os.remove(old_model.path)
            self.best_models.remove(old_model)

        self.best_models.append(model)
        self.best_models.sort(key=operator.attrgetter('score'))

        # Save the model here; SIGINT is ignored while writing so that the
        # files are not left half-written, and the previous handler is
        # restored even if saving fails.
        s = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            logger.info("Saving new model {}".format(model.path))
            SaveLoadUtils.save_parameter_values(
                self.main_loop.model.get_parameter_values(), model.path)
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_imt_f1_scores.npz'),
                imt_f1_scores=self.val_imt_f1_curve)
        finally:
            signal.signal(signal.SIGINT, s)
def decode(self, src_sentence):
    """Beam search over an ensemble of NMT networks.

    Generalizes ``BeamSearch.search`` to several networks whose
    ``compute_logprobs`` outputs are summed at every step.

    Args:
        src_sentence (list): List of source word ids without <S> or
                             </S> which make up the source sentence

    Returns:
        list. A list of ``Hypothesis`` instances ordered by their
        score.
    """
    for beam_search in self.beam_searches:
        if not beam_search.compiled:
            beam_search.compile()

    unk_mapped = utils.oov_to_unk(src_sentence, self.src_vocab_size)
    seq = self.src_sparse_feat_map.words2dense(unk_mapped) + [self.src_eos]
    if self.src_sparse_feat_map.dim > 1:
        # Sparse source features: tile, then move the feature axis first.
        tiled = np.tile(seq, (self.beam_size, 1, 1))
        input_ = np.transpose(tiled, (2, 0, 1))
    else:
        # Plain word ids on the source side.
        input_ = np.tile(seq, (self.beam_size, 1))

    # (contexts, states, search) for every member of the ensemble.
    ensemble = []
    for sys_idx in xrange(self.n_networks):
        member_search = self.beam_searches[sys_idx]
        feed = {self.nmt_models[sys_idx].sampling_input: input_}
        contexts, states, _ = \
            member_search.compute_initial_states_and_contexts(feed)
        ensemble.append((contexts, states, member_search))

    # History of outputs, masks and accumulated costs; row 0 holds the
    # initial outputs of the last network set up above.
    all_outputs = states['outputs'][None, :]
    all_masks = np.ones_like(all_outputs, dtype=config.floatX)
    all_costs = np.zeros_like(all_outputs, dtype=config.floatX)

    max_steps = 3 * len(src_sentence)
    for i in range(max_steps):
        last_mask = all_masks[-1]
        if last_mask.sum() == 0:
            break

        member_logprobs = [
            member_search.compute_logprobs(member_contexts, member_states)
            for member_contexts, member_states, member_search in ensemble
        ]
        logprobs = np.sum(member_logprobs, axis=0)

        next_costs = (all_costs[-1, :, None] +
                      logprobs * last_mask[:, None])
        # Finished entries can only be extended with EOS.
        (finished,) = np.where(last_mask == 0)
        next_costs[finished, :utils.EOS_ID] = np.inf
        next_costs[finished, utils.EOS_ID + 1:] = np.inf

        # At step 0 the beam effectively contains a single entry, hence
        # only the first row is considered.
        first_step = i == 0
        (indexes, outputs), chosen_costs = BeamSearch._smallest(
            next_costs, self.beam_size, only_first_row=first_step)

        # Let the history follow the surviving beam entries.
        all_outputs = all_outputs[:, indexes]
        all_masks = all_masks[:, indexes]
        all_costs = all_costs[:, indexes]

        # Reorder and advance the states of every network (in place).
        for member_contexts, member_states, member_search in ensemble:
            for state_name in member_states:
                member_states[state_name] = \
                    member_states[state_name][indexes]
            member_states.update(member_search.compute_next_states(
                member_contexts, member_states, outputs))

        all_outputs = np.vstack([all_outputs, outputs[None, :]])
        all_costs = np.vstack([all_costs, chosen_costs[None, :]])
        mask = outputs != utils.EOS_ID
        if first_step:
            mask[:] = 1
        all_masks = np.vstack([all_masks, mask[None, :]])

    # Strip the initial row and convert cumulative to per-step costs.
    result = (all_outputs[1:],
              all_masks[:-1],
              all_costs[1:] - all_costs[:-1])
    trans, costs = BeamSearch.result_to_lists(result)

    hypos = []
    max_len = 0
    for idx, sentence in enumerate(trans):
        score = -costs[idx]
        max_len = max(max_len, len(sentence))
        hypo = Hypothesis(sentence, score)
        # The full score is booked on the first target position.
        hypo.score_breakdown = len(sentence) * [[(0.0, 1.0)]]
        hypo.score_breakdown[0] = [(score, 1.0)]
        hypos.append(hypo)
    self.apply_predictors_count = max_len * self.beam_size
    return hypos
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config.

    """

    def __init__(self, recordings_source, labels_source, eos_label,
                 num_features, num_phonemes, dim_dec, dims_bidir, dims_bottom,
                 enc_transition, dec_transition, use_states_for_readout,
                 attention_type, lm=None, character_map=None, subsample=None,
                 dims_top=None, prior=None, conv_n=None,
                 bottom_activation=None, post_merge_activation=None,
                 post_merge_dims=None, dim_matcher=None, embed_outputs=True,
                 dec_stack=1, conv_num_filters=1, data_prepend_eos=True,
                 # softmax is the default set in
                 # SequenceContentAndConvAttention
                 energy_normalizer=None,
                 **kwargs):
        # Tanh is the default activation for both optional activations.
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        # Truthiness (rather than ``len``) keeps ``dims_bottom=None``
        # working, consistent with the ``if dims_bottom:`` check above.
        encoder = Encoder(self.enc_transition, dims_bidir,
                          dims_bottom[-1] if dims_bottom else num_features,
                          subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            transitions = [
                self.dec_transition(
                    dim=dim_dec, activation=Tanh(),
                    name="transition_{}".format(trans_level))
                for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)

        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))

        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)

        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")
        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states
                          if use_states_for_readout else []) +
                         [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) +
                    [Identity()],
                    # MLP was designed to support Maxout is activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick
                    # below. For deeper Maxout network one has to use the
                    # Sequence brick.
                    [d // getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ], name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            # NOTE: the pops below consume entries of the caller's ``lm``
            # mapping; what is left is forwarded to ``LanguageModel``.
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + \
                    normalize_tot_weights < 1:
                logger.warning("Beam search is prone to fail with no "
                               "log-prob normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        # The batch inputs are the four symbolic variables; the second
        # entry used to be the source *name* (a string) by mistake.
        self.batch_inputs = [self.recordings, self.recordings_mask,
                             self.labels, self.labels_mask]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)

    def push_initialization_config(self):
        """Push the optional recurrent / initial-state initializers."""
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {
                'weights_init': self.rec_weights_init,
                'recurrent_weights_init': self.rec_weights_init}
            global_push_initialization_config(self,
                                              rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(
                self, {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, recordings, recordings_mask, labels, labels_mask):
        """Return the generator's cost matrix for the given batch."""
        bottom_processed = self.bottom.apply(recordings)
        encoded, encoded_mask = self.encoder.apply(
            input_=bottom_processed,
            mask=recordings_mask)
        encoded = self.top.apply(encoded)
        return self.generator.cost_matrix(
            labels, labels_mask,
            attended=encoded, attended_mask=encoded_mask)

    @application
    def generate(self, recordings):
        """Run the generator for as many steps as there are input frames."""
        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(recordings))
        encoded = self.top.apply(encoded)
        return self.generator.generate(
            n_steps=recordings.shape[0], batch_size=recordings.shape[1],
            attended=encoded,
            attended_mask=encoded_mask,
            as_dict=True)

    def load_params(self, path):
        """Load parameter values from ``path`` into the generation graph."""
        generated = self.get_generate_graph()
        param_values = load_parameter_values(path)
        SpeechModel(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self):
        """Return the generation graph built on ``self.recordings``."""
        result = self.generate(self.recordings)
        return result

    def get_cost_graph(self, batch=True):
        """Return the cost graph for a batch or for a single example."""
        if batch:
            return self.cost(
                self.recordings, self.recordings_mask,
                self.labels, self.labels_mask)
        # Single example: insert a batch axis of size one and use an
        # all-ones recording mask; no label mask is given.
        recordings = self.single_recording[:, None, :]
        labels = self.single_transcription[:, None]
        return self.cost(
            recordings, tensor.ones_like(recordings[:, :, 0]),
            labels, None)

    def analyze(self, recording, transcription):
        """Compute cost and aligment for a recording/transcription pair."""
        # The Theano function is compiled once and cached on the instance.
        if not hasattr(self, "_analyze"):
            cost = self.get_cost_graph(batch=False)
            cg = ComputationGraph(cost)
            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            # Fall back to zeros when the graph has no "energies" variable.
            energies_output = [energies[0][:, 0, :] if energies
                               else tensor.zeros(
                                   (self.single_transcription.shape[0],
                                    self.single_recording.shape[0]))]
            states, = VariableFilter(
                applications=[self.encoder.apply], roles=[OUTPUT],
                name="encoded")(cg)
            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(
            #            weighted_averages=states)[:, 0, :]]
            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)
            self._analyze = theano.function(
                [self.single_recording, self.single_transcription],
                [cost[:, 0], weights[:, 0, :]] + energies_output +
                ctc_matrix_output)
        return self._analyze(recording, transcription)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        self.beam_size = beam_size
        generated = self.get_generate_graph()
        samples, = VariableFilter(
            applications=[self.generator.generate], name="outputs")(
                ComputationGraph(generated['outputs']))
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, recording, char_discount=0.0):
        """Run beam search on one recording; return (outputs, costs)."""
        # The compiled search is not pickled, so rebuild it on demand.
        if not hasattr(self, '_beam_search'):
            self.init_beam_search(self.beam_size)
        input_ = recording[:, numpy.newaxis, :]
        # The output length is limited to a third of the number of input
        # frames; floor division keeps this an integer on Python 3 too.
        outputs, search_costs = self._beam_search.search(
            {self.recordings: input_}, self.eos_label,
            input_.shape[0] // 3,
            ignore_first_eol=self.data_prepend_eos,
            char_discount=char_discount)
        return outputs, search_costs

    def __getstate__(self):
        """Return the instance dict without the compiled Theano objects."""
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        """Restore the state and drop the emitter's cached Theano RNG."""
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except Exception:
            # Best effort: nothing to drop if the attribute chain is
            # incomplete. KeyboardInterrupt / SystemExit still propagate.
            pass
class BleuValidator(SimpleExtension, SamplingBase):
    """Validate with BLEU and keep the parameters of the n best models.

    After the burn-in period every callback translates the validation
    stream with beam search, pipes the best hypothesis of each line to the
    BLEU script and saves the model when the score enters the n-best list.
    """

    def __init__(
        self,
        source_sentence,
        samples,
        model,
        data_stream,
        config,
        n_best=1,
        track_n_models=1,
        trg_ivocab=None,
        src_eos_idx=-1,
        trg_eos_idx=-1,
        **kwargs
    ):
        """Store the settings, build the beam search, reload old scores.

        ``config`` is read for ``"val_set_out"`` (optional),
        ``"beam_size"``, ``"bleu_script"``, ``"val_set_grndtruth"``,
        ``"saveto"`` and ``"reload"``. Remaining keyword arguments are
        forwarded to the parent constructor.
        """
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.verbose = config.get("val_set_out", None)
        self.src_eos_idx = src_eos_idx
        self.trg_eos_idx = trg_eos_idx

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.trg_ivocab = trg_ivocab
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.src_eos_idx  # self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(beam_size=self.config["beam_size"],
                                      samples=samples)
        self.multibleu_cmd = ["perl", self.config["bleu_script"],
                              self.config["val_set_grndtruth"], "<"]

        # Create saving directory if it does not exist
        if not os.path.exists(self.config["saveto"]):
            os.makedirs(self.config["saveto"])

        if self.config["reload"]:
            try:
                bleu_score = numpy.load(
                    os.path.join(self.config["saveto"],
                                 "val_bleu_scores.npz"))
                self.val_bleu_curve = bleu_score["bleu_scores"].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logger.info("BleuScores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file is not
                # fatal, but KeyboardInterrupt / SystemExit must propagate.
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Sync parameters, evaluate and save the model if it qualifies."""
        # Track validation burn in
        if self.main_loop.status["iterations_done"] <= \
                self.config["val_burn_in"]:
            return

        # Get current model parameters
        self.model.set_param_values(self.main_loop.model.get_param_values())

        # Evaluate and save if necessary
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Translate the validation stream and return its BLEU score."""
        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        # Get target vocabulary
        if not self.trg_ivocab:
            sources = self._get_attr_rec(self.main_loop, "data_stream")
            trg_vocab = sources.data_streams[1].dataset.dictionary
            self.trg_ivocab = {v: k for k, v in trg_vocab.items()}

        ftrans = None
        try:
            if self.verbose:
                ftrans = open(self.config["val_set_out"], "w")

            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file.
                # The last source symbol is overwritten with the source
                # end-of-sentence id.
                line[0][-1] = self.src_eos_idx
                seq = self._oov_to_unk(line[0])
                input_ = numpy.tile(seq, (self.config["beam_size"], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = self.beam_search.search(
                    input_values={self.source_sentence: input_},
                    max_length=3 * len(seq),
                    eol_symbol=self.trg_eos_idx,
                    ignore_first_eol=True,
                )

                nbest_idx = numpy.argsort(costs)[: self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # convert idx to words (the final symbol is
                        # dropped before the lookup)
                        trans_out = self._idx_to_word(trans_out[:-1],
                                                      self.trg_ivocab)
                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i + 1))
                        trans_out = "<UNK>"

                    # Only the single best hypothesis is written out.
                    if j == 0:
                        # Write to subprocess and file if it exists
                        mb_subprocess.stdin.write(trans_out + "\n")
                        if ftrans is not None:
                            ftrans.write(trans_out + "\n")

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()

            logger.info(
                "Total cost of the validation: {}".format(total_cost))
            self.data_stream.reset()

            # send end of file, read output.
            mb_subprocess.stdin.close()
            stdout = mb_subprocess.stdout.readline()
            logger.info("output {}".format(stdout))
            out_parse = re.match(r"BLEU = [-.0-9]+", stdout)
            logger.info("Validation Took: {} minutes".format(
                float(time.time() - val_start_time) / 60.0))
            assert out_parse is not None

            # extract the score (skip the leading "BLEU =")
            bleu_score = float(out_parse.group()[6:])
            self.val_bleu_curve.append(bleu_score)
            logger.info(bleu_score)
        finally:
            # Release the output file and the scorer even when translation
            # or score parsing fails.
            if ftrans is not None:
                ftrans.close()
            mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Return True if the score beats the worst tracked model."""
        if not self.best_models:
            return True
        worst = min(self.best_models, key=operator.attrgetter("bleu_score"))
        return worst.bleu_score < bleu_score

    def _save_model(self, bleu_score):
        """Save the parameters and the BLEU curve if the score qualifies."""
        if not self._is_valid_to_save(bleu_score):
            return

        model = ModelInfo(bleu_score, self.config["saveto"])

        # Manage n-best model list first; the list is kept sorted by score,
        # so the first entry is the worst tracked model.
        if len(self.best_models) >= self.track_n_models:
            old_model = self.best_models[0]
            if old_model.path and os.path.isfile(old_model.path):
                logger.info("Deleting old model %s" % old_model.path)
                os.remove(old_model.path)
            self.best_models.remove(old_model)

        self.best_models.append(model)
        self.best_models.sort(key=operator.attrgetter("bleu_score"))

        # Save the model here; SIGINT is ignored while writing and the
        # previous handler is restored even if saving fails.
        s = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            logger.info("Saving new model {}".format(model.path))
            numpy.savez(model.path,
                        **self.main_loop.model.get_param_values())
            numpy.savez(
                os.path.join(self.config["saveto"], "val_bleu_scores.npz"),
                bleu_scores=self.val_bleu_curve)
        finally:
            signal.signal(signal.SIGINT, s)
def test_beam_search_smallest():
    """Check ``BeamSearch._smallest`` on a small 2x3 cost matrix."""
    matrix = numpy.array([[3, 6, 4],
                          [1, 2, 7]])
    expected_indexes = numpy.array([[1, 1], [0, 1]])
    expected_minima = [1, 2]

    indexes, minima = BeamSearch._smallest(matrix, 2)

    # The two smallest entries (1 and 2) both sit in row 1, columns 0 and 1.
    assert numpy.all(numpy.array(indexes) == expected_indexes)
    assert numpy.all(minima == expected_minima)
class BleuValidator(SimpleExtension, SamplingBase):
    # TODO: a lot has been changed in NMT, sync respectively
    """Implements early stopping based on BLEU score."""

    def __init__(self, source_sentence, samples, model, data_stream,
                 config, n_best=1, track_n_models=1,
                 normalize=True, **kwargs):
        """Store the settings, build the beam search, reload old scores.

        ``config`` is read for ``'val_set_out'`` (optional),
        ``'bleu_script'``, ``'val_set_grndtruth'``, ``'saveto'`` and
        ``'reload'``. Remaining keyword arguments are forwarded to the
        parent constructor.
        """
        # TODO: change config structure
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              self.config['val_set_grndtruth'], '<']

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                        'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                logger.info("BleuScores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file is not
                # fatal, but KeyboardInterrupt / SystemExit must propagate.
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Evaluate the model and save it if the score qualifies."""
        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return

        # Evaluate and save if necessary
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Translate the validation stream and return its BLEU score."""
        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        # Get target vocabulary
        sources = self._get_attr_rec(self.main_loop, 'data_stream')
        trg_vocab = sources.data_streams[1].dataset.dictionary
        self.trg_ivocab = {v: k for k, v in trg_vocab.items()}
        trg_eos_sym = sources.data_streams[1].dataset.eos_token
        self.trg_eos_idx = trg_vocab[trg_eos_sym]

        ftrans = None
        try:
            if self.verbose:
                ftrans = open(self.config['val_set_out'], 'w')

            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file.
                seq = self._oov_to_unk(
                    line[0], self.config['src_vocab_size'], self.unk_idx)
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = \
                    self.beam_search.search(
                        input_values={self.source_sentence: input_},
                        max_length=3*len(seq), eol_symbol=self.trg_eos_idx,
                        ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # convert idx to words
                        trans_out = self._idx_to_word(trans_out,
                                                      self.trg_ivocab)
                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i+1))
                        trans_out = '<UNK>'

                    # Only the single best hypothesis is written out.
                    if j == 0:
                        # Write to subprocess and file if it exists
                        print(trans_out, file=mb_subprocess.stdin)
                        if ftrans is not None:
                            print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()

            logger.info(
                "Total cost of the validation: {}".format(total_cost))
            self.data_stream.reset()

            # send end of file, read output.
            mb_subprocess.stdin.close()
            stdout = mb_subprocess.stdout.readline()
            logger.info(stdout)
            out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
            logger.info("Validation Took: {} minutes".format(
                float(time.time() - val_start_time) / 60.))
            assert out_parse is not None

            # extract the score (skip the leading "BLEU =")
            bleu_score = float(out_parse.group()[6:])
            self.val_bleu_curve.append(bleu_score)
            logger.info(bleu_score)
        finally:
            # Release the output file and the scorer even when translation
            # or score parsing fails.
            if ftrans is not None:
                ftrans.close()
            mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Return True if the score beats the worst tracked model."""
        if not self.best_models:
            return True
        worst = min(self.best_models, key=operator.attrgetter('bleu_score'))
        return worst.bleu_score < bleu_score

    def _save_model(self, bleu_score):
        """Save the parameters and the BLEU curve if the score qualifies."""
        if not self._is_valid_to_save(bleu_score):
            return

        model = ModelInfo(bleu_score, self.config['saveto'])

        # Manage n-best model list first; the list is kept sorted by score,
        # so the first entry is the worst tracked model.
        if len(self.best_models) >= self.track_n_models:
            old_model = self.best_models[0]
            if old_model.path and os.path.isfile(old_model.path):
                logger.info("Deleting old model %s" % old_model.path)
                os.remove(old_model.path)
            self.best_models.remove(old_model)

        self.best_models.append(model)
        self.best_models.sort(key=operator.attrgetter('bleu_score'))

        # Save the model here; SIGINT is ignored while writing and the
        # previous handler is restored even if saving fails.
        s = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            logger.info("Saving new model {}".format(model.path))
            numpy.savez(
                model.path, **self.main_loop.model.get_parameter_dict())
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_bleu_scores.npz'),
                bleu_scores=self.val_bleu_curve)
        finally:
            signal.signal(signal.SIGINT, s)
def set_up_decoder(self, nmt_model_path):
    """Build the NMT network and prepare this object for beam search.

    This method uses the NMT configuration in ``self.config`` to
    initialize the NMT model. It basically corresponds to
    ``blocks.machine_translation.main``, minus the training loop.

    On return the following attributes are set on ``self``:
    ``source_sentence``, ``samples``, ``model``, ``normalize``,
    ``verbose``, ``best_models``, ``val_bleu_curve`` and ``beam_search``.

    Args:
        nmt_model_path (string): Path to the NMT model file (.npz)
    """
    def _param_list(brick):
        # ``dict.values()`` is a view on Python 3 and cannot be
        # concatenated; materialize it so this works on Python 2 and 3.
        return list(Selector(brick).get_params().values())

    # Create Theano variables
    logging.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logging.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(self.config['src_vocab_size'],
                                   self.config['enc_embed'],
                                   self.config['enc_nhids'])
    decoder = Decoder(self.config['trg_vocab_size'],
                      self.config['dec_embed'],
                      self.config['dec_nhids'],
                      self.config['enc_nhids'] * 2)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logging.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model (TODO: do i really need this?)
    logging.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        self.config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # The recurrent transitions get orthogonal weights instead of the
    # Gaussian default pushed above.
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization (TODO: remove?)
    if self.config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logging.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, self.config['dropout'])

    # Apply weight noise for regularization (TODO: remove?)
    if self.config['weight_noise_ff'] > 0.0:
        logging.info('Applying weight noise to ff layers')
        enc_params = _param_list(encoder.lookup)
        enc_params += _param_list(encoder.fwd_fork)
        enc_params += _param_list(encoder.back_fork)
        dec_params = _param_list(decoder.sequence_generator.readout)
        dec_params += _param_list(decoder.sequence_generator.fork)
        dec_params += _param_list(decoder.state_init)
        cg = apply_noise(cg, enc_params + dec_params,
                         self.config['weight_noise_ff'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logging.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logging.info('    {:15}: {}'.format(shape, count))
    logging.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logging.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logging.info('    {:15}: {}'.format(value.get_value().shape, name))
    logging.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logging.info("Building model")

    # Set extensions
    logging.info("Initializing extensions")

    # Set up beam search and sampling computation graphs
    logging.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    # generated[1] is next_outputs
    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))

    # Compare with blocks.machine_translation.BleuValidator.__init__
    self.source_sentence = sampling_input
    self.samples = samples
    self.model = search_model
    self.normalize = True
    self.verbose = self.config.get('val_set_out', None)

    # Reload model if necessary
    if self.config['reload']:
        loader = LoadNMT(nmt_model_path, self.config['saveto'],
                         search_model)
        loader.load_weights()

    self.best_models = []
    self.val_bleu_curve = []
    self.beam_search = BeamSearch(samples=samples)
decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() logger.info("Building sampling model") sampling_representation = encoder.apply(sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) params = search_model.get_parameter_dict() param_values = SaveLoadUtils().load_parameter_values(os.path.join(config['saveto'], 'params.npz')) for k in params: params[k].set_value(param_values[k]) _, samples = VariableFilter(bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(generated[1])) beam_search = BeamSearch(samples=samples) # Read from standard input stream = get_stdin_stream(**config) vocab = get_vocab(config['trg_vocab'], config['trg_vocab_size'], config['unk_id'], config['eos_id'], config['bos_id']) inv_vocab = {v: k for k, v in vocab.iteritems()} unk_id = config['unk_id'] eos_id = config['eos_id'] for sample in stream.get_epoch_iterator(): seq = sample[0] input_ = np.tile(seq, (config['beam_size'], 1)) trans, costs = beam_search.search(
def main(config, tr_stream, dev_stream, use_bokeh=False,
         slim_iteration_state=False, switch_controller=None,
         reset_epoch=False):
    """Train an NMT model with the Blocks main loop.

    Largely mirrors the ``main`` method of the original Blocks
    implementation in blocks-examples, from which most of the code is
    taken. Differences to the original:

    - Word embeddings can be kept fixed during training
    - Dropout fix https://github.com/mila-udem/blocks-examples/issues/46
    - The exp3s extension (switch controller) is added if requested

    Args:
        config (dict): NMT config
        tr_stream (DataStream): Training data stream
        dev_stream (DataStream): Validation data stream
        use_bokeh (bool): Whether to use bokeh for plotting
        slim_iteration_state (bool): Whether to store the full iteration
                                     state or only the epoch iterator
                                     without data stream state
        switch_controller (SourceSwitchController): Controlling strategy
                                                    if monolingual data
                                                    is used as well
        reset_epoch (bool): Set epoch_started in main loop status to
                            false. Sometimes required if you change
                            training parameters such as
                            mono_data_integration
    """
    nmt_model = NMTModel(config)
    nmt_model.set_up()

    # Collect the main loop extensions one by one
    logging.info("Initializing extensions")
    extensions = []
    extensions.append(FinishAfter(after_n_batches=config['finish_after']))
    extensions.append(
        TrainingDataMonitoring([nmt_model.cost], after_batch=True))
    extensions.append(Printing(after_batch=True))
    extensions.append(
        CheckpointNMT(config['saveto'],
                      slim_iteration_state,
                      every_n_batches=config['save_freq']))

    # Early stopping based on BLEU
    if config['bleu_script'] is not None:
        logging.info("Building bleu validator")
        validator = BleuValidator(
            nmt_model.sampling_input,
            samples=nmt_model.samples,
            config=config,
            model=nmt_model.search_model,
            data_stream=dev_stream,
            normalize=config['normalized_bleu'],
            store_full_main_loop=config['store_full_main_loop'],
            every_n_batches=config['bleu_val_freq'])
        extensions.append(validator)

    # The switch controller needs its own beam search over the samples
    if switch_controller:
        switch_controller.beam_search = BeamSearch(
            samples=nmt_model.samples)
        switch_controller.src_sentence = nmt_model.sampling_input
        extensions.append(switch_controller)

    # Resume from a checkpoint if requested
    if config['reload']:
        extensions.append(
            LoadNMT(config['saveto'], slim_iteration_state, reset_epoch))

    # Optional live plot of the cost
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Decoding cost',
                 channels=[['decoder_cost_cost']],
                 after_batch=True))

    # Correct handling of SIGTERM and SIGINT
    extensions.append(AlwaysEpochInterrupt(every_n_batches=1))

    # Set up training algorithm
    logging.info("Initializing training algorithm")
    # https://github.com/mila-udem/blocks-examples/issues/46
    if config['fix_embeddings']:
        # fs439: every parameter carrying one of these annotations is
        # excluded from training.
        frozen_annotations = [
            'softmax1', 'softmax0', 'maxout_bias', 'embeddings',
            'lookuptable', 'transform_feedback'
        ]
        train_params = []
        for param in nmt_model.cg.parameters:
            for annotation in param.tag.annotations:
                if annotation.name in frozen_annotations:
                    logging.info("Do not train %s due to annotation %s" %
                                 (param, annotation))
                    break
            else:
                # No freezing annotation was found on this parameter
                train_params.append(param)
    else:
        train_params = nmt_model.cg.parameters

    # With dropout enabled the cost must be taken from the modified graph
    if config['dropout'] < 1.0:
        training_cost = nmt_model.cg.outputs[0]
    else:
        training_cost = nmt_model.cost
    # NOTE: ``step_rule`` is evaluated as Python code taken from the
    # configuration; the config must come from a trusted source.
    step_rule = CompositeRule([
        StepClipping(config['step_clipping']),
        eval(config['step_rule'])()
    ])
    algorithm = GradientDescent(cost=training_cost,
                                parameters=train_params,
                                step_rule=step_rule)

    # Initialize main loop
    logging.info("Initializing main loop")
    main_loop = MainLoop(model=nmt_model.training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    if reset_epoch:
        main_loop.status['epoch_started'] = False

    # Train!
    main_loop.run()
def main(mode, config, use_bokeh=False):
    """Train an NMT model or translate a test set with a trained one.

    Args:
        mode (string): ``"train"`` runs the training main loop,
                       ``'translate'`` decodes ``config['test_set']``
                       with beam search. Any other value only builds
                       the encoder and decoder and returns.
        config (dict): NMT configuration
        use_bokeh (bool): Whether to plot the cost with bokeh (training
                          only, and only if bokeh is available)
    """
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)

    if mode == "train":
        _run_training(config, encoder, decoder, use_bokeh)
    elif mode == 'translate':
        _run_translation(config, encoder, decoder)


def _run_training(config, encoder, decoder, use_bokeh):
    """Build the training graph and extensions, then run the main loop."""
    def _param_list(brick):
        # ``dict.values()`` is a view on Python 3 and cannot be
        # concatenated; materialize it so this works on Python 2 and 3.
        return list(Selector(brick).get_params().values())

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Get training and development set streams
    tr_stream = get_tr_stream(**config)
    dev_stream = get_dev_stream(**config)

    # Get cost of the model
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = _param_list(encoder.lookup)
        enc_params += _param_list(encoder.fwd_fork)
        enc_params += _param_list(encoder.back_fork)
        dec_params = _param_list(decoder.sequence_generator.readout)
        dec_params += _param_list(decoder.sequence_generator.fork)
        dec_params += _param_list(decoder.state_init)
        cg = apply_noise(
            cg, enc_params + dec_params, config['weight_noise_ff'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(
            sampling_input, sampling_representation)
        search_model = Model(generated)
        # generated[1] is next_outputs
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Cs-En', channels=[['decoder_cost_cost']],
                 after_batch=True))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # NOTE: ``step_rule`` is evaluated as Python code taken from the
    # configuration; the config must come from a trusted source.
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train!
    main_loop.run()


def _run_translation(config, encoder, decoder):
    """Beam-search decode ``config['test_set']`` into ``<test_set>.trans.out``."""
    # Create Theano variables
    logger.info('Creating theano variables')
    sampling_input = tensor.lmatrix('source')

    # Get test set stream
    test_stream = get_dev_stream(
        config['test_set'], config['src_vocab'],
        config['src_vocab_size'], config['unk_id'])

    # Helper utilities
    sutils = SamplingBase()
    unk_idx = config['unk_id']
    trg_eos_idx = config['trg_vocab_size'] - 1

    # Get beam search
    logger.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    # generated[1] is next_outputs
    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))
    beam_search = BeamSearch(samples=samples)

    logger.info("Loading the model..")
    model = Model(generated)
    loader = LoadNMT(config['saveto'])
    loader.set_model_parameters(model, loader.load_parameters())

    # Get target vocabulary
    # NOTE: pickle executes arbitrary code on load; the vocabulary file
    # must come from a trusted source.
    with open(config['trg_vocab'], 'rb') as vocab_file:
        raw_trg_vocab = pickle.load(vocab_file)
    trg_vocab = _ensure_special_tokens(
        raw_trg_vocab, bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

    logger.info("Started translation: ")
    total_cost = 0.0

    # ``with`` guarantees the output file is closed even if decoding fails
    with open(config['test_set'] + '.trans.out', 'w') as ftrans:
        for i, line in enumerate(test_stream.get_epoch_iterator()):

            seq = sutils._oov_to_unk(
                line[0], config['src_vocab_size'], unk_idx)
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string
            # back. The search emits *target* symbols, so hypotheses must
            # be terminated on the target end-of-sentence index.
            trans, costs = \
                beam_search.search(
                    input_values={sampling_input: input_},
                    max_length=3*len(seq), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]

                # convert idx to words
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i+1))
                trans_out = '<UNK>'

            print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of test set...".format(i))

    logger.info("Total cost of the test: {}".format(total_cost))
class BleuValidator(SimpleExtension, SamplingBase):
    """Implements early stopping based on BLEU score.

    Each time the extension fires (after the burn-in period) the
    validation stream is translated with beam search, the output is
    scored by the external BLEU script, and the model parameters are
    saved if the score beats the worst of the tracked best models.
    """

    def __init__(self, source_sentence, target_prefix, samples, model,
                 data_stream, config, src_vocab=None, trg_vocab=None,
                 n_best=1, track_n_models=1, normalize=True, **kwargs):
        """Store the configuration and reload earlier scores if asked to.

        ``config`` is read for the keys ``bleu_script``,
        ``val_set_grndtruth``, ``saveto``, ``reload`` and optionally
        ``val_set_out``; remaining keyword arguments are forwarded to the
        extension base class.
        """
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.target_prefix = target_prefix
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        # Truthy (the output path) when translations should also be
        # written to a file.
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = [
            'perl', self.config['bleu_script'],
            self.config['val_set_grndtruth'], '<'
        ]

        # Create save directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            self._reload_bleu_scores()

    def _reload_bleu_scores(self):
        """Best-effort reload of the BLEU curve saved by a previous run."""
        try:
            bleu_score = numpy.load(
                os.path.join(self.config['saveto'], 'val_bleu_scores.npz'))
            bleu_curve = bleu_score['bleu_scores'].tolist()
            # Track n best previous bleu scores
            n_tracked = max(self.track_n_models, 0)
            top_scores = sorted(bleu_curve, reverse=True)[:n_tracked]
            # Keep ``best_models`` in ascending score order, the order
            # ``_save_model`` maintains, so the worst model is evicted.
            reloaded = [ModelInfo(bleu, key='BLEU')
                        for bleu in reversed(top_scores)]
        except Exception:
            # Missing or unreadable score file: start from scratch, but do
            # not swallow KeyboardInterrupt/SystemExit like a bare except.
            logger.info("BleuScores not Found")
        else:
            self.val_bleu_curve = bleu_curve
            self.best_models.extend(reloaded)
            logger.info("BleuScores Reloaded")

    def do(self, which_callback, *args):
        """Validate, log the BLEU score and save the model if it improved."""
        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= self.config[
                'val_burn_in']:
            return

        # Evaluate the model
        bleu_score = self._evaluate_model()
        # add an entry to the log
        self.main_loop.log.current_row[
            'validation_set_bleu_score'] = bleu_score
        # save if necessary
        self._save_model(bleu_score)

    def _evaluate_model(self):
        """Translate the validation stream and return its BLEU score.

        Raises:
            AssertionError: if the first line printed by the BLEU script
                does not start with ``BLEU = <number>``.
        """
        # Set in the superclass -- SamplingBase
        if not hasattr(self, 'target_dataset'):
            self._initialize_dataset_info()

        self.unk_sym = '<UNK>'
        self.eos_sym = '</S>'
        self.unk_idx = self.trg_vocab[self.unk_sym]
        self.eos_idx = self.trg_vocab[self.eos_sym]

        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        try:
            ftrans = None
            if self.verbose:
                ftrans = open(self.config['val_set_out'], 'w')
            try:
                total_cost = self._translate_validation_set(
                    mb_subprocess.stdin, ftrans)
                logger.info(
                    "Total cost of the validation: {}".format(total_cost))
                self.data_stream.reset()
            finally:
                # Close the translation file even if decoding failed
                if ftrans is not None:
                    ftrans.close()

            # send end of file, read output.
            mb_subprocess.stdin.close()
            stdout = mb_subprocess.stdout.readline()
            logger.info(stdout)
            out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
            logger.info("Validation Took: {} minutes".format(
                float(time.time() - val_start_time) / 60.))
            if out_parse is None:
                # Raised explicitly so the check survives ``python -O``
                raise AssertionError(
                    "Unexpected output from the BLEU script: "
                    "{!r}".format(stdout))

            # extract the score
            bleu_score = float(out_parse.group()[6:])
            self.val_bleu_curve.append(bleu_score)
            logger.info(bleu_score)
            return bleu_score
        finally:
            # Never leave the scoring process behind, even on failure
            mb_subprocess.terminate()

    def _translate_validation_set(self, bleu_stdin, ftrans):
        """Beam-search every validation line and return the summed cost.

        The best hypothesis of each line is written to ``bleu_stdin`` and,
        if ``ftrans`` is not ``None``, to that file as well.
        """
        total_cost = 0.0
        print('LENGTH OF DEV STREAM: {}'.format(
            len(list(self.data_stream.get_epoch_iterator()))))
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            # Load the sentence, retrieve the sample, write to file.
            # Note that the indices of source and target in the datastream
            # are hard-coded; currently our datastream is
            # (source, target, prefix, suffix)
            seq = self._oov_to_unk(line[0], self.config['src_vocab_size'],
                                   self.unk_idx)
            target_prefix = line[2]

            input_ = numpy.tile(seq, (self.config['beam_size'], 1))
            prefix_input_ = numpy.tile(target_prefix,
                                       (self.config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string
            # back
            # NOTE(review): the original author was unsure where the beam
            # search input names come from -- confirm against BeamSearch.
            trans, costs = self.beam_search.search(
                input_values={
                    self.source_sentence: input_,
                    self.target_prefix: prefix_input_
                },
                max_length=3 * len(seq),
                eol_symbol=self.eos_idx,
                ignore_first_eol=False)

            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    trans_out = self._idx_to_word(trans_out,
                                                  self.trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(
                            i + 1))
                    trans_out = '<UNK>'

                if j == 0:
                    # Write to subprocess and file if it exists
                    print(trans_out, file=bleu_stdin)
                    if ftrans is not None:
                        print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of validation set...".format(i))

            bleu_stdin.flush()
        return total_cost

    def _is_valid_to_save(self, bleu_score):
        """Return True if ``bleu_score`` beats the worst tracked model."""
        if not self.best_models:
            return True
        worst = min(self.best_models, key=operator.attrgetter('score'))
        return worst.score < bleu_score

    def _save_model(self, bleu_score):
        """Save the current parameters if ``bleu_score`` is good enough.

        Also evicts (and deletes from disk) the worst tracked model once
        ``track_n_models`` models are kept, and rewrites the BLEU curve.
        """
        if not self._is_valid_to_save(bleu_score):
            return

        score_of = operator.attrgetter('score')
        model = ModelInfo(bleu_score, self.config['saveto'], key='BLEU')

        # Manage n-best model list first
        if self.best_models and \
                len(self.best_models) >= self.track_n_models:
            # Pick the worst model explicitly instead of relying on the
            # list already being sorted.
            old_model = min(self.best_models, key=score_of)
            if old_model.path and os.path.isfile(old_model.path):
                logger.info("Deleting old model %s" % old_model.path)
                os.remove(old_model.path)
            self.best_models.remove(old_model)

        self.best_models.append(model)
        self.best_models.sort(key=score_of)

        # Save the model here; ignore Ctrl-C while writing so the files
        # are not left half-written.
        s = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            logger.info("Saving new model {}".format(model.path))
            SaveLoadUtils.save_parameter_values(
                self.main_loop.model.get_parameter_values(), model.path)
            numpy.savez(os.path.join(self.config['saveto'],
                                     'val_bleu_scores.npz'),
                        bleu_scores=self.val_bleu_curve)
        finally:
            # Always restore the previous handler, even if saving failed
            signal.signal(signal.SIGINT, s)
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config.

    """

    def __init__(
            self,
            recordings_source,
            labels_source,
            eos_label,
            num_features,
            num_phonemes,
            dim_dec,
            dims_bidir,
            dims_bottom,
            enc_transition,
            dec_transition,
            use_states_for_readout,
            attention_type,
            lm=None,
            character_map=None,
            subsample=None,
            dims_top=None,
            prior=None,
            conv_n=None,
            bottom_activation=None,
            post_merge_activation=None,
            post_merge_dims=None,
            dim_matcher=None,
            embed_outputs=True,
            dec_stack=1,
            conv_num_filters=1,
            data_prepend_eos=True,
            # softmax is the default set in SequenceContentAndConvAttention
            energy_normalizer=None,
            **kwargs):
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        # Truthiness test (not ``len``) so that ``dims_bottom=None`` is
        # handled the same way as in the branch above.
        encoder = Encoder(
            self.enc_transition, dims_bidir,
            dims_bottom[-1] if dims_bottom else num_features,
            subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            # NOTE(review): a single activation is given for
            # ``len(dims_top) + 2`` dims -- confirm this matches what MLP
            # expects.
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition")
        else:
            transitions = [
                self.dec_transition(
                    dim=dim_dec,
                    activation=Tanh(),
                    name="transition_{}".format(trans_level))
                for trans_level in range(dec_stack)
            ]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)

        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))

        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)

        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")

        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states
                          if use_states_for_readout else []) +
            [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was designed to support Maxout is activation
                        # (because Maxout in a way is not one). However
                        # a single layer Maxout network works with the
                        # trick below. For deeper Maxout network one has
                        # to use the Sequence brick.
                        [
                            d // getattr(post_merge_activation,
                                         'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            # Work on a copy so the caller's configuration dict is not
            # emptied by the ``pop`` calls below.
            lm = dict(lm)
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if (normalize_am_weights + normalize_lm_weights +
                    normalize_tot_weights) < 1:
                logger.warning(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      language_model=language_model,
                                      name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source +
                                             "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        # The four batch variables; the second entry is the recordings
        # mask variable (it used to be the source *name* string).
        self.batch_inputs = [
            self.recordings, self.recordings_mask, self.labels,
            self.labels_mask
        ]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)

    def push_initialization_config(self):
        """Push the init config, with overrides for recurrent bricks.

        ``rec_weights_init`` and ``initial_states_init`` are only pushed
        when they have been set to something truthy.
        """
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {
                'weights_init': self.rec_weights_init,
                'recurrent_weights_init': self.rec_weights_init
            }
            global_push_initialization_config(self, rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(
                self, {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, recordings, recordings_mask, labels, labels_mask):
        """Return the generator's cost matrix for the given batch."""
        bottom_processed = self.bottom.apply(recordings)
        encoded, encoded_mask = self.encoder.apply(input_=bottom_processed,
                                                   mask=recordings_mask)
        encoded = self.top.apply(encoded)
        return self.generator.cost_matrix(labels,
                                          labels_mask,
                                          attended=encoded,
                                          attended_mask=encoded_mask)

    @application
    def generate(self, recordings):
        """Generate output sequences for ``recordings`` (no input mask).

        The number of steps is ``recordings.shape[0]`` and the batch size
        ``recordings.shape[1]``; the result is returned as a dict.
        """
        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(recordings))
        encoded = self.top.apply(encoded)
        return self.generator.generate(n_steps=recordings.shape[0],
                                       batch_size=recordings.shape[1],
                                       attended=encoded,
                                       attended_mask=encoded_mask,
                                       as_dict=True)

    def load_params(self, path):
        """Load parameter values from ``path`` into this recognizer."""
        generated = self.get_generate_graph()
        param_values = load_parameter_values(path)
        SpeechModel(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self):
        """Return the generation graph built on ``self.recordings``."""
        return self.generate(self.recordings)

    def get_cost_graph(self, batch=True):
        """Return the cost graph for a batch or for a single example.

        With ``batch=False`` the graph is built on ``single_recording``
        and ``single_transcription`` with a batch axis of size one
        inserted, an all-ones recordings mask and no labels mask.
        """
        if batch:
            return self.cost(self.recordings, self.recordings_mask,
                             self.labels, self.labels_mask)
        recordings = self.single_recording[:, None, :]
        labels = self.single_transcription[:, None]
        return self.cost(recordings,
                         tensor.ones_like(recordings[:, :, 0]), labels, None)

    def analyze(self, recording, transcription):
        """Compute cost and alignment for a recording/transcription pair.

        The Theano function is compiled on first use and cached in
        ``self._analyze``.
        """
        if not hasattr(self, "_analyze"):
            cost = self.get_cost_graph(batch=False)
            cg = ComputationGraph(cost)
            energies = VariableFilter(bricks=[self.generator],
                                      name="energies")(cg)
            # Fall back to zeros when the graph has no "energies" variable
            energies_output = [
                energies[0][:, 0, :] if energies else tensor.zeros(
                    (self.single_transcription.shape[0],
                     self.single_recording.shape[0]))
            ]
            # Only needed by the disabled CTC output below; the unpacking
            # still requires exactly one matching variable.
            states, = VariableFilter(applications=[self.encoder.apply],
                                     roles=[OUTPUT],
                                     name="encoded")(cg)
            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]
            weights, = VariableFilter(bricks=[self.generator],
                                      name="weights")(cg)
            self._analyze = theano.function(
                [self.single_recording, self.single_transcription],
                [cost[:, 0], weights[:, 0, :]] + energies_output +
                ctc_matrix_output)
        return self._analyze(recording, transcription)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        self.beam_size = beam_size
        generated = self.get_generate_graph()
        samples, = VariableFilter(
            applications=[self.generator.generate],
            name="outputs")(ComputationGraph(generated['outputs']))
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, recording, char_discount=0.0):
        """Decode a single recording and return (outputs, search costs).

        The search is limited to a third of the number of input frames.
        """
        if not hasattr(self, '_beam_search'):
            self.init_beam_search(self.beam_size)
        input_ = recording[:, numpy.newaxis, :]
        # Floor division keeps the length limit an integer on Python 3
        outputs, search_costs = self._beam_search.search(
            {self.recordings: input_},
            self.eos_label,
            input_.shape[0] // 3,
            ignore_first_eol=self.data_prepend_eos,
            char_discount=char_discount)
        return outputs, search_costs

    def __getstate__(self):
        """Return the instance dict without the compiled Theano objects."""
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        """Restore the instance dict and drop the emitter's Theano RNG."""
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            # Nothing to drop: no emitter, or it has no cached RNG
            pass
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config.

    Notes
    -----
    The constructor pops ``'bottom_class'`` from `bottom` and several
    weighting options from `lm`, i.e. it mutates the dictionaries it
    is given.

    """
    def __init__(self, input_dims, input_num_chars, eos_label,
                 num_phonemes, dim_dec, dims_bidir,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 criterion,
                 bottom,
                 lm=None, character_map=None,
                 bidir=True,
                 subsample=None,
                 dims_top=None,
                 prior=None, conv_n=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dim_output_embedding=None,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in SequenceContentAndConvAttention
                 energy_normalizer=None,
                 # for speech this is the approximate phoneme duration in frames
                 max_decoded_length_scale=1,
                 **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(
            input_dims=input_dims, input_num_chars=input_num_chars,
            name='bottom',
            **bottom)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample, bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded], name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            transitions = [self.dec_transition(dim=dim_dec,
                                               activation=Tanh(),
                                               name="transition_{}".format(trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1,
                                      dim_dec if
                                      dim_output_embedding is None
                                      else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes, name="emitter")
            if lm:
                # In case we use LM it is Readout that is responsible
                # for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(
                criterion['name'], eos_label, num_phonemes,
                criterion.get('min_reward', -1.0),
                name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states if use_states_for_readout else [])
                         + [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                    # MLP was designed to support Maxout is activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick below.
                    # For deeper Maxout network one has to use the
                    # Sequence brick.
                    [d//getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ], name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warning(
                    "Beam search is prone to fail with no log-prob normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(lm_costs_name='lm_add',
                                           lm_weight=lm_weight,
                                           normalize_am_weights=normalize_am_weights,
                                           normalize_lm_weights=normalize_lm_weights,
                                           normalize_tot_weights=normalize_tot_weights,
                                           am_beta=am_beta,
                                           **readout_config)

        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')

    def push_initialization_config(self):
        """Push the optional recurrent/initial-state initializers down."""
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {'weights_init': self.rec_weights_init,
                                  'recurrent_weights_init': self.rec_weights_init}
            global_push_initialization_config(self,
                                              rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(self,
                                              {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, **kwargs):
        """Build the generator cost matrix.

        Expects ``inputs_mask``, ``labels`` and ``labels_mask`` keyword
        arguments; every remaining keyword is forwarded to the bottom
        brick.
        """
        # pop inputs we know about
        inputs_mask = kwargs.pop('inputs_mask')
        labels = kwargs.pop('labels')
        labels_mask = kwargs.pop('labels_mask')

        # the rest is for bottom
        bottom_processed = self.bottom.apply(**kwargs)
        encoded, encoded_mask = self.encoder.apply(
            input_=bottom_processed,
            mask=inputs_mask)
        encoded = self.top.apply(encoded)
        return self.generator.cost_matrix(
            labels, labels_mask,
            attended=encoded, attended_mask=encoded_mask)

    @application
    def generate(self, **kwargs):
        """Build the generation graph and return its outputs as a dict.

        Expects ``inputs_mask`` and ``n_steps`` keyword arguments; the
        remaining keywords go to the bottom brick. A ``None`` value of
        ``n_steps`` falls back to the symbolic ``self.n_steps``.
        """
        inputs_mask = kwargs.pop('inputs_mask')
        n_steps = kwargs.pop('n_steps')

        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(**kwargs),
            mask=inputs_mask)
        encoded = self.top.apply(encoded)
        return self.generator.generate(
            n_steps=n_steps if n_steps is not None else self.n_steps,
            batch_size=encoded.shape[1],
            attended=encoded,
            attended_mask=encoded_mask,
            as_dict=True)

    def load_params(self, path):
        """Load parameter values from `path` into the generation model."""
        generated = self.get_generate_graph()
        with open(path, 'r') as src:
            param_values = load_parameters(src)
        Model(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self, use_mask=True, n_steps=None):
        """Apply `generate` to the batch input variables."""
        inputs_mask = None
        if use_mask:
            inputs_mask = self.inputs_mask
        bottom_inputs = self.inputs
        return self.generate(n_steps=n_steps,
                             inputs_mask=inputs_mask,
                             **bottom_inputs)

    def get_cost_graph(self, batch=True,
                       prediction=None, prediction_mask=None):
        """Return the computation graph of the cost.

        With ``batch=False`` the single-example input variables are
        expanded to a batch and no label mask is used. When
        `prediction` (or `prediction_mask`) is not given, the
        groundtruth labels (or their mask) are used instead.
        """
        if batch:
            inputs = self.inputs
            inputs_mask = self.inputs_mask
            groundtruth = self.labels
            groundtruth_mask = self.labels_mask
        else:
            inputs, inputs_mask = self.bottom.single_to_batch_inputs(
                self.single_inputs)
            groundtruth = self.single_labels[:, None]
            groundtruth_mask = None

        # Explicit None checks: the arguments are symbolic variables and
        # should not be tested for truthiness.
        if prediction is None:
            prediction = groundtruth
        if prediction_mask is None:
            prediction_mask = groundtruth_mask
        cost = self.cost(inputs_mask=inputs_mask,
                         labels=prediction,
                         labels_mask=prediction_mask,
                         **inputs)
        cost_cg = ComputationGraph(cost)
        if self.criterion['name'].startswith("mse"):
            placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
            cost_cg = cost_cg.replace({placeholder: groundtruth})
        return cost_cg

    def analyze(self, inputs, groundtruth, prediction=None):
        """Compute cost and aligment.

        NOTE(review): the compiled function is cached on first use, so
        whether it accepts a ``prediction`` input is fixed by the first
        call; mixing calls with and without `prediction` on the same
        object looks unsupported -- confirm against callers.
        """
        input_values_dict = dict(inputs)
        input_values_dict['groundtruth'] = groundtruth
        if prediction is not None:
            input_values_dict['prediction'] = prediction
        if not hasattr(self, "_analyze"):
            input_variables = list(self.single_inputs.values())
            input_variables.append(self.single_labels.copy(name='groundtruth'))

            prediction_variable = tensor.lvector('prediction')
            if prediction is not None:
                input_variables.append(prediction_variable)
                cg = self.get_cost_graph(
                    batch=False, prediction=prediction_variable[:, None])
            else:
                cg = self.get_cost_graph(batch=False)
            cost = cg.outputs[0]

            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)

            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            energies_output = [energies[0][:, 0, :] if energies
                               else tensor.zeros_like(weights)]

            # Only referenced by the disabled code below; the unpacking
            # still requires exactly one matching variable.
            states, = VariableFilter(
                applications=[self.encoder.apply], roles=[OUTPUT],
                name="encoded")(cg)

            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]

            self._analyze = theano.function(
                input_variables,
                [cost[:, 0], weights[:, 0, :]] + energies_output + ctc_matrix_output,
                on_unused_input='warn')
        return self._analyze(**input_values_dict)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        if hasattr(self, '_beam_search') and self.beam_size == beam_size:
            # Only recompile if the user wants a different beam size
            return
        self.beam_size = beam_size
        generated = self.get_generate_graph(use_mask=False, n_steps=3)
        cg = ComputationGraph(generated.values())
        samples, = VariableFilter(
            applications=[self.generator.generate], name="outputs")(cg)
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, inputs, **kwargs):
        """Run beam search for one example given as a name -> value mapping.

        Extra keyword arguments are forwarded to the search call.
        Returns ``(outputs, search_costs)``.
        """
        # When a recognizer is unpickled, self.beam_size is available
        # but beam search has to be recompiled.
        self.init_beam_search(self.beam_size)
        inputs = dict(inputs)
        max_length = int(self.bottom.num_time_steps(**inputs) /
                         self.max_decoded_length_scale)
        search_inputs = {}
        for var in self.inputs.values():
            search_inputs[var] = inputs.pop(var.name)[:, numpy.newaxis, ...]
        if inputs:
            raise Exception(
                'Unknown inputs passed to beam search: {}'.format(
                    inputs.keys()))
        outputs, search_costs = self._beam_search.search(
            search_inputs, self.eos_label,
            max_length,
            ignore_first_eol=self.data_prepend_eos,
            **kwargs)
        return outputs, search_costs

    def init_generate(self):
        """Compile the unmasked generation function."""
        generated = self.get_generate_graph(use_mask=False)
        cg = ComputationGraph(generated['outputs'])
        self._do_generate = cg.get_theano_function()

    def sample(self, inputs, n_steps=None):
        """Generate outputs for a single example.

        When `n_steps` is None it is derived from the number of input
        time steps divided by ``max_decoded_length_scale``.
        """
        if not hasattr(self, '_do_generate'):
            self.init_generate()
        batch, unused_mask = self.bottom.single_to_batch_inputs(inputs)
        if n_steps is None:
            n_steps = int(self.bottom.num_time_steps(**batch) /
                          self.max_decoded_length_scale)
        batch['n_steps'] = n_steps
        return self._do_generate(**batch)[0]

    def __getstate__(self):
        state = dict(self.__dict__)
        # Compiled Theano functions are rebuilt lazily after unpickling,
        # so none of the cached ones is stored.
        for attr in ['_analyze', '_beam_search', '_do_generate']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            # Nothing to drop: no emitter or no cached random stream.
            pass
def __init__(self, source_sentence, samples, model, data_stream,
             config, n_best=1, track_n_models=1,
             normalize=True, store_full_main_loop=False, **kwargs):
    """Creates a new extension which adds model selection based on
    the BLEU score to the training main loop.

    Args:
        source_sentence (Variable): Input variable to the sampling
                                    computation graph
        samples (Variable): Samples variable of the CG
        model (NMTModel): See the model module
        data_stream (DataStream): Data stream to the development
                                  set
        config (dict): NMT configuration
        n_best (int): beam size
        track_n_models (int): Number of n-best models for which to
                              create checkpoints.
        normalize (boolean): Enables length normalization
        store_full_main_loop (boolean): Stores the iteration state
                                        in the old style of
                                        Blocks 0.1. Not recommended
    """
    super(BleuValidator, self).__init__(**kwargs)
    self.store_full_main_loop = store_full_main_loop
    self.source_sentence = source_sentence
    self.samples = samples
    self.model = model
    self.data_stream = data_stream
    self.config = config
    self.n_best = n_best
    self.track_n_models = track_n_models
    self.normalize = normalize
    self.best_models = []
    self.val_bleu_curve = []
    # 'bleu_script' is a %-format template taking the reference path
    self.multibleu_cmd = (self.config['bleu_script'] %
                          self.config['val_set_grndtruth']).split()
    logging.debug("BLEU command: %s" % self.multibleu_cmd)

    self.src_sparse_feat_map = config['src_sparse_feat_map'] \
        if config['src_sparse_feat_map'] else FlatSparseFeatMap()
    if config['trg_sparse_feat_map']:
        self.trg_sparse_feat_map = config['trg_sparse_feat_map']
        self.beam_search = SparseBeamSearch(
            samples=samples,
            trg_sparse_feat_map=self.trg_sparse_feat_map)
    else:
        self.trg_sparse_feat_map = FlatSparseFeatMap()
        self.beam_search = BeamSearch(samples=samples)

    # Create saving directory if it does not exist
    if not os.path.exists(self.config['saveto']):
        os.makedirs(self.config['saveto'])

    if self.config['reload']:
        try:
            bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                                 'val_bleu_scores.npz'))
            self.val_bleu_curve = bleu_score['bleu_scores'].tolist()
            # Track n best previous bleu scores
            top_scores = [
                bleu for i, bleu in enumerate(
                    sorted(self.val_bleu_curve, reverse=True))
                if i < self.track_n_models]
            # Keep the list worst-first: the saving code treats
            # best_models[0] as the entry to evict.
            for bleu in reversed(top_scores):
                self.best_models.append(ModelInfo(bleu))
            logging.info("BleuScores Reloaded")
        except Exception:
            # Best effort: a missing or unreadable score file only
            # means we start without history. KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            logging.info("BleuScores not Found")
class BeamSearchEvaluator(object):
    """Decode a data stream with beam search and report the error rate.

    The evaluator wraps a compiled ``BeamSearch`` over `samples` and
    compares the best hypothesis for every example with its target
    using ``Evaluation.wer``.

    Parameters
    ----------
    eol_symbol
        End-of-sequence symbol handed to the beam search.
    beam_size : int
        Beam size; every input is tiled this many times.
    x, x_mask
        Input variables the beam search is fed through.
    samples
        Samples variable the beam search is built on.
    phoneme_dict
        Lookup from output index to phoneme; required by `evaluate`
        although it defaults to None.
    black_list : list, optional
        Phonemes dropped from both predictions and targets.

    """
    def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
                 phoneme_dict=None, black_list=None):
        self.black_list = [] if black_list is None else black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict

    def _to_phonemes(self, indices):
        """Map indices to phonemes, drop black-listed ones and both ends."""
        phonemes = [self.phoneme_dict[phone_ind] for phone_ind in indices
                    if self.phoneme_dict[phone_ind] not in self.black_list]
        # Collapse runs of identical phonemes after trimming the ends
        return [x[0] for x in groupby(phonemes[1:-1])]

    def evaluate(self, data_stream, train=False, file_pred=None,
                 file_targets=None, max_length=100):
        """Decode `data_stream` and return ``{'per': error_rate}``.

        Parameters
        ----------
        data_stream
            Stream providing the sources ``features``,
            ``features_mask``, ``phonemes`` and ``phonemes_mask``.
        train : bool
            If True, only the first batch is evaluated.
        file_pred, file_targets : file-like, optional
            If given, predictions / targets are written one per line
            followed by a running example number.
        max_length : int
            Maximum decoded length given to the beam search.

        Raises
        ------
        ValueError
            If the stream yields no examples.

        """
        loss = 0.
        num_examples = 0
        iterator = data_stream.get_epoch_iterator()
        if train:
            print('Train evaluation started')
        i = 0
        for inputs in iterator:
            inputs = dict(zip(data_stream.sources, inputs))
            x_mask_val = inputs['features_mask']
            x_val = inputs['features']
            y_val = inputs['phonemes']
            y_mask_val = inputs['phonemes_mask']
            for batch_ind in range(x_val.shape[1]):
                # Repeat the single example along the batch axis so it
                # fills the whole beam.
                if x_val.ndim == 2:
                    input_beam = numpy.tile(x_val[:, batch_ind][:, None],
                                            (1, self.beam_size))
                else:
                    input_beam = numpy.tile(x_val[:, batch_ind, :][:, None, :],
                                            (1, self.beam_size, 1))
                input_mask_beam = numpy.tile(x_mask_val[:, batch_ind][:, None],
                                             (1, self.beam_size))
                predictions, _ = self.beam_search.search(
                    {self.x: input_beam, self.x_mask: input_mask_beam},
                    self.eol_symbol, max_length)
                predictions = self._to_phonemes(predictions[0])

                # The mask is floating point; a slice bound must be int
                target_length = int(y_mask_val[:, batch_ind].sum())
                targets = self._to_phonemes(
                    y_val[:target_length, batch_ind])

                i += 1
                if file_pred:
                    file_pred.write(' '.join(predictions) + '(%d)\n' % i)
                if file_targets:
                    file_targets.write(' '.join(targets) + '(%d)\n' % i)

                loss += Evaluation.wer([predictions], [targets])
                num_examples += 1

                # Single-argument print behaves the same on Python 2 and 3
                print('.. found sequence example: ' + ' '.join(predictions))
                print('.. real output was:  ' + ' '.join(targets))
            # NOTE(review): nesting of this break was reconstructed from
            # collapsed source; it is assumed to stop after the first
            # batch -- confirm against the original layout.
            if train:
                break
        if train:
            print('Train evaluation finished')
        if num_examples == 0:
            raise ValueError('data stream yielded no examples to evaluate')
        per = numpy.sum(loss) / num_examples
        return {'per': per}
class BleuValidator(SimpleExtension):
    """Implements early stopping based on BLEU score. This class is
    still very similar to the ``BleuValidator`` in the NMT Blocks
    example.

    TODO: Refactor, make this more similar to the rest of SGNMT, use
    vanilla_decoder.py
    """

    def __init__(self, source_sentence, samples, model, data_stream,
                 config, n_best=1, track_n_models=1,
                 normalize=True, store_full_main_loop=False, **kwargs):
        """Creates a new extension which adds model selection based on
        the BLEU score to the training main loop.

        Args:
            source_sentence (Variable): Input variable to the sampling
                                        computation graph
            samples (Variable): Samples variable of the CG
            model (NMTModel): See the model module
            data_stream (DataStream): Data stream to the development
                                      set
            config (dict): NMT configuration
            n_best (int): beam size
            track_n_models (int): Number of n-best models for which to
                                  create checkpoints.
            normalize (boolean): Enables length normalization
            store_full_main_loop (boolean): Stores the iteration state
                                            in the old style of
                                            Blocks 0.1. Not recommended
        """
        super(BleuValidator, self).__init__(**kwargs)
        self.store_full_main_loop = store_full_main_loop
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.best_models = []
        self.val_bleu_curve = []
        # 'bleu_script' is a %-format template taking the reference path
        self.multibleu_cmd = (self.config['bleu_script'] %
                              self.config['val_set_grndtruth']).split()
        logging.debug("BLEU command: %s" % self.multibleu_cmd)

        self.src_sparse_feat_map = config['src_sparse_feat_map'] \
            if config['src_sparse_feat_map'] else FlatSparseFeatMap()
        if config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=samples)

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                                     'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()
                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                # _save_model evicts best_models[0], so keep worst first
                self.best_models.sort(key=operator.attrgetter('bleu_score'))
                logging.info("BleuScores Reloaded")
            except Exception:
                # Best effort: without a readable score file we simply
                # start without history.
                logging.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Decodes the dev set and stores checkpoints in case the BLEU
        score has improved.
        """
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Evaluate model and store checkpoints.

        Translates every line of the data stream with beam search,
        pipes the best hypothesis to the BLEU script and returns the
        parsed BLEU score (also appended to ``val_bleu_curve``).
        """
        logging.info("Started Validation: ")
        val_start_time = time.time()
        # Text mode so that print() below and the str regex on the
        # script output work on Python 3 as well.
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE,
                              universal_newlines=True)
        total_cost = 0.0

        with open(self.config['saveto'] + '/validation_out.txt',
                  'w') as ftrans:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                    line[0], self.config['src_vocab_size']))
                if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                    input_ = numpy.transpose(
                        numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                        (2, 0, 1))
                else:  # word ids on the source side
                    input_ = numpy.tile(seq, (self.config['beam_size'], 1))
                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = \
                    self.beam_search.search(
                        input_values={self.source_sentence: input_},
                        max_length=3*len(seq), eol_symbol=utils.EOS_ID,
                        ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        # Separate name: `trans` must stay the full
                        # hypothesis list for the next n-best entry.
                        hypo = trans[best]
                        if hypo and hypo[-1] == utils.EOS_ID:
                            hypo = hypo[:-1]
                        trans_out = ' '.join([str(w) for w in hypo])
                    except ValueError:
                        logging.info(
                            "Can NOT find a translation for line: {}".format(
                                i+1))
                        trans_out = '<UNK>'
                    if j == 0:
                        # Write to subprocess and file if it exists
                        print(trans_out, file=mb_subprocess.stdin)
                        print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logging.info(
                        "Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()

            logging.info("Total cost of the validation: {}".format(
                total_cost))
            self.data_stream.reset()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logging.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logging.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None, \
            "BLEU script produced no parsable score"

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logging.info(bleu_score)
        mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """True if nothing is tracked yet or the score beats the worst
        tracked model."""
        if not self.best_models:
            return True
        worst = min(self.best_models, key=operator.attrgetter('bleu_score'))
        return worst.bleu_score < bleu_score

    def save_parameter_values(self, param_values, path):
        ''' This method is copied from blocks.machine_translation.checkpoint '''
        param_values = {name.replace("/", "-"): param
                        for name, param in param_values.items()}
        numpy.savez(path, **param_values)

    def _save_model(self, bleu_score):
        """Checkpoint the current model if `bleu_score` qualifies.

        Evicts the worst tracked model once ``track_n_models`` are
        tracked, writes the parameters and the BLEU curve. SIGINT is
        ignored while writing.
        """
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config['saveto'])

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = min(self.best_models,
                                key=operator.attrgetter('bleu_score'))
                if old_model.path and os.path.isfile(old_model.path):
                    logging.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('bleu_score'))

            # Save the model here
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            try:
                # fs439: introduce store_full_main_loop and
                # storing best_bleu_params_* files
                if self.store_full_main_loop:
                    logging.info("Saving full main loop model {}".format(
                        model.path))
                    numpy.savez(model.path,
                                **self.main_loop.model.get_parameter_dict())
                else:
                    logging.info("Saving model parameters {}".format(
                        model.path))
                    params_to_save = \
                        self.main_loop.model.get_parameter_values()
                    self.save_parameter_values(params_to_save, model.path)
                numpy.savez(
                    os.path.join(self.config['saveto'],
                                 'val_bleu_scores.npz'),
                    bleu_scores=self.val_bleu_curve)
            finally:
                # Always give Ctrl-C back, even if saving failed
                signal.signal(signal.SIGINT, s)
class BlocksNMTVanillaDecoder(Decoder):
    """Adaptor class for blocks.search.BeamSearch. We implement the
    ``Decoder`` class but ignore functionality for predictors or
    heuristics. Instead, we pass through decoding directly to the
    blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system
    NMT decoding. Note that this decoder supports sparse feat maps
    on both source and target side.
    """

    def __init__(self, nmt_model_path, config, decoder_args):
        """Set up the NMT model used by the decoder.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
            config (dict): NMT configuration
            decoder_args (object): Decoder configuration passed through
                                   from configuration API.
        """
        super(BlocksNMTVanillaDecoder, self).__init__(decoder_args)
        self.config = config
        self.set_up_decoder(nmt_model_path)
        self.src_eos = self.src_sparse_feat_map.word2dense(utils.EOS_ID)

    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to
        ``blocks.machine_translation.main``.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        self.nmt_model = NMTModel(self.config)
        self.nmt_model.set_up()
        loader = LoadNMTUtils(nmt_model_path,
                              self.config['saveto'],
                              self.nmt_model.search_model)
        loader.load_weights()

        self.src_sparse_feat_map = self.config['src_sparse_feat_map'] \
            if self.config['src_sparse_feat_map'] else FlatSparseFeatMap()
        if self.config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=self.nmt_model.samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=self.nmt_model.samples)

    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the
        sentence level, not on the word level. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.

        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence

        Returns:
            list. A list of ``Hypothesis`` instances ordered by their
            score.
        """
        seq = self.src_sparse_feat_map.words2dense(
            utils.oov_to_unk(src_sentence,
                             self.config['src_vocab_size'])) + [self.src_eos]
        if self.src_sparse_feat_map.dim > 1:  # sparse src feats
            input_ = np.transpose(
                np.tile(seq, (self.config['beam_size'], 1, 1)),
                (2, 0, 1))
        else:  # word ids on the source side
            input_ = np.tile(seq, (self.config['beam_size'], 1))
        trans, costs = self.beam_search.search(
            input_values={self.nmt_model.sampling_input: input_},
            max_length=3 * len(src_sentence),
            eol_symbol=utils.EOS_ID,
            ignore_first_eol=True)
        hypos = []
        max_len = 0
        for hypo_trans, hypo_cost in zip(trans, costs):
            n_words = len(hypo_trans)
            max_len = max(max_len, n_words)
            hypo = Hypothesis(hypo_trans, -hypo_cost)
            # One distinct list per position; list multiplication would
            # make every position alias the same inner list.
            hypo.score_breakdown = [[(0.0, 1.0)] for _ in range(n_words)]
            # The whole sentence score is attributed to the first
            # position; an empty hypothesis has no position to hold it.
            if hypo.score_breakdown:
                hypo.score_breakdown[0] = [(-hypo_cost, 1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.config['beam_size']
        return hypos

    def has_predictors(self):
        """Always returns true. """
        return True
class BeamSearchEvaluator(object):
    """Decode a data stream with beam search and report the error rate.

    The evaluator wraps a compiled beam search over `samples` and
    compares the best hypothesis for every example with its target
    using ``Evaluation.wer``.

    Parameters
    ----------
    eol_symbol
        End-of-sequence symbol handed to the beam search.
    beam_size : int
        Beam size; every input is tiled this many times.
    x, x_mask
        Input variables the beam search is fed through.
    samples
        Samples variable the beam search is built on.
    phoneme_dict
        Lookup from output index to phoneme; required by `evaluate`
        although it defaults to None.
    black_list : list, optional
        Phonemes dropped from both predictions and targets.
    language_model : bool
        If True, a ``BeamSearchLM`` driven by a
        ``TrigramLanguageModel`` is used instead of plain
        ``BeamSearch``.

    """
    def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
                 phoneme_dict=None, black_list=None, language_model=False):
        self.black_list = [] if black_list is None else black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        if language_model:
            lm = TrigramLanguageModel()
            ind_to_word = dict(enumerate(lm.unigrams))
            self.beam_search = BeamSearchLM(
                lm, 1., ind_to_word, beam_size, samples)
        else:
            self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict

    def _to_phonemes(self, indices):
        """Map indices to phonemes, drop black-listed ones and both ends."""
        phonemes = [self.phoneme_dict[phone_ind] for phone_ind in indices
                    if self.phoneme_dict[phone_ind] not in self.black_list]
        # Collapse runs of identical phonemes after trimming the ends
        return [x[0] for x in groupby(phonemes[1:-1])]

    def evaluate(self, data_stream, train=False, file_pred=None,
                 file_targets=None, max_length=100):
        """Decode `data_stream` and return ``{'per': error_rate}``.

        Parameters
        ----------
        data_stream
            Stream providing the sources ``features``,
            ``features_mask``, ``phonemes`` and ``phonemes_mask``.
        train : bool
            If True, only the first batch is evaluated.
        file_pred, file_targets : file-like, optional
            If given, predictions / targets are written one per line
            followed by a running example number.
        max_length : int
            Maximum decoded length given to the beam search.

        Raises
        ------
        ValueError
            If the stream yields no examples.

        """
        loss = 0.
        num_examples = 0
        iterator = data_stream.get_epoch_iterator()
        if train:
            print('Train evaluation started')
        i = 0
        for inputs in iterator:
            inputs = dict(zip(data_stream.sources, inputs))
            x_mask_val = inputs['features_mask']
            x_val = inputs['features']
            y_val = inputs['phonemes']
            y_mask_val = inputs['phonemes_mask']
            for batch_ind in range(x_val.shape[1]):
                # Repeat the single example along the batch axis so it
                # fills the whole beam.
                if x_val.ndim == 2:
                    input_beam = numpy.tile(x_val[:, batch_ind][:, None],
                                            (1, self.beam_size))
                else:
                    input_beam = numpy.tile(x_val[:, batch_ind, :][:, None, :],
                                            (1, self.beam_size, 1))
                input_mask_beam = numpy.tile(x_mask_val[:, batch_ind][:, None],
                                             (1, self.beam_size))
                predictions, _ = self.beam_search.search(
                    {
                        self.x: input_beam,
                        self.x_mask: input_mask_beam
                    }, self.eol_symbol, max_length)
                predictions = self._to_phonemes(predictions[0])

                # The mask is floating point; a slice bound must be int
                target_length = int(y_mask_val[:, batch_ind].sum())
                targets = self._to_phonemes(
                    y_val[:target_length, batch_ind])

                i += 1
                if file_pred:
                    file_pred.write(' '.join(predictions) + '(%d)\n' % i)
                if file_targets:
                    file_targets.write(' '.join(targets) + '(%d)\n' % i)

                loss += Evaluation.wer([predictions], [targets])
                num_examples += 1

                # Single-argument print behaves the same on Python 2 and 3
                print('.. found sequence example: ' + ' '.join(predictions))
                print('.. real output was:  ' + ' '.join(targets))
            # NOTE(review): nesting of this break was reconstructed from
            # collapsed source; it is assumed to stop after the first
            # batch -- confirm against the original layout.
            if train:
                break
        if train:
            print('Train evaluation finished')
        if num_examples == 0:
            raise ValueError('data stream yielded no examples to evaluate')
        per = numpy.sum(loss) / num_examples
        return {'per': per}
class BlocksVanillaDecoder(cam.sgnmt.decoding.core.Decoder):
    """Adaptor class for blocks.search.BeamSearch.

    We implement the ``Decoder`` class but ignore functionality for
    predictors or heuristics. Instead, we pass through decoding directly
    to the blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system NMT
    decoding.
    """

    def __init__(self, nmt_model_path, config):
        """Set up the NMT model used by the decoder.

        Args:
            nmt_model_path (string): Path to the NMT model file (.npz)
            config (dict): NMT configuration
        """
        super(BlocksVanillaDecoder, self).__init__()
        self.config = config
        self.set_up_decoder(nmt_model_path)

    def set_up_decoder(self, nmt_model_path):
        """Build the NMT model described by ``self.config`` and create the
        beam search object used by ``decode``.

        This method basically corresponds to
        ``blocks.machine_translation.main``.

        Args:
            nmt_model_path (string): Path to the NMT model file (.npz)
        """
        # Create Theano variables
        logging.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Construct model
        logging.info('Building RNN encoder-decoder')
        encoder = BidirectionalEncoder(self.config['src_vocab_size'],
                                       self.config['enc_embed'],
                                       self.config['enc_nhids'])
        decoder = Decoder(self.config['trg_vocab_size'],
                          self.config['dec_embed'],
                          self.config['dec_nhids'],
                          self.config['enc_nhids'] * 2)
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logging.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model (TODO: do i really need this?)
        logging.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            self.config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization (TODO: remove?)
        if self.config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logging.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, self.config['dropout'])

        # Apply weight noise for regularization (TODO: remove?)
        if self.config['weight_noise_ff'] > 0.0:
            logging.info('Applying weight noise to ff layers')
            # NOTE(review): this branch calls ``get_params`` while the
            # logging code below calls ``get_parameters`` -- confirm both
            # exist in the Blocks version in use.
            # ``list(...)`` keeps the concatenation valid when ``values()``
            # returns a view rather than a list.
            enc_params = list(
                Selector(encoder.lookup).get_params().values())
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = list(Selector(
                decoder.sequence_generator.readout).get_params().values())
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             self.config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logging.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logging.info(' {:15}: {}'.format(shape, count))
        logging.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logging.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logging.info(' {:15}: {}'.format(value.get_value().shape, name))
        logging.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logging.info("Building model")

        # Set extensions
        logging.info("Initializing extensions")

        # Set up beam search and sampling computation graphs if necessary
        logging.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

        # Compare with blocks.machine_translation.BleuValidator.__init__
        self.source_sentence = sampling_input
        self.samples = samples
        self.model = search_model
        self.normalize = True
        self.verbose = self.config.get('val_set_out', None)

        # Reload model if necessary
        if self.config['reload']:
            loader = LoadNMT(nmt_model_path,
                             self.config['saveto'],
                             search_model)
            loader.load_weights()

        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)

    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the
        sentence level, not on the word level. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.

        The whole (negated) cost of a translation is stored at the first
        position of its score breakdown; every other position carries a
        zero score. A translation without tokens gets an empty breakdown.

        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence

        Returns:
            list. A list of ``Hypothesis`` instances, one per translation
            returned by the beam search, in the order it returned them.
        """
        seq = self._oov_to_unk(src_sentence,
                               self.config['src_vocab_size'],
                               utils.UNK_ID) + [utils.EOS_ID]
        input_ = np.tile(seq, (self.config['beam_size'], 1))
        trans, costs = self.beam_search.search(
            input_values={self.source_sentence: input_},
            max_length=3 * len(src_sentence),
            eol_symbol=utils.EOS_ID,
            ignore_first_eol=True)
        hypos = []
        max_len = 0
        for translation, cost in zip(trans, costs):
            max_len = max(max_len, len(translation))
            hypo = Hypothesis(translation, -cost)
            # One independent entry per token; list multiplication would
            # make every position share the same inner list object.
            breakdown = [[(0.0, 1.0)] for _ in translation]
            if breakdown:
                breakdown[0] = [(-cost, 1.0)]
            hypo.score_breakdown = breakdown
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.config['beam_size']
        return hypos

    def _oov_to_unk(self, seq, vocab_size, unk_idx):
        """Return ``seq`` with every id ``>= vocab_size`` replaced by
        ``unk_idx``."""
        return [x if x < vocab_size else unk_idx for x in seq]

    def has_predictors(self):
        """Always returns true, although this decoder does not use
        predictors."""
        return True
class BleuEvaluator(SimpleExtension, SamplingBase):
    """Extension that translates a data stream with beam search and scores
    the result with an external BLEU script.

    Args:
        source_sentence: Input variable used as key of the values fed to
            the beam search.
        samples: Passed to the ``BeamSearch`` constructor.
        model: Stored on the instance; not used by the methods shown here.
        data_stream: Stream of examples to translate; element 0 of every
            item is used as the source sequence.
        ground_truth: Reference file handed to the BLEU script.
        config (dict): Must provide ``bleu_script``, ``unk_id``, ``eos_id``,
            ``src_vocab_size``, ``beam_size``, ``val_burn_in``,
            ``remove_eos`` and ``remove_unk``.
        val_out: Optional path; the best translation of every line is
            written there on each evaluation.
        val_best_out: Optional path; receives a copy of ``val_out`` whenever
            the new score equals the best score so far. Ignored when
            ``val_out`` is not set.
        n_best: Number of lowest-cost hypotheses considered per line; only
            the first one is written out.
        normalize: Divide each cost by the length of its translation before
            ranking.
        **kwargs: Forwarded to the parent constructor.
    """

    def __init__(self, source_sentence, samples, model, data_stream,
                 ground_truth, config, val_out=None, val_best_out=None,
                 n_best=1, normalize=True, **kwargs):
        # TODO: change config structure
        super(BleuEvaluator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.normalize = normalize
        self.val_out = val_out
        self.val_best_out = val_out and val_best_out
        self.bleu_scores = []

        self.trg_ivocab = None
        self.unk_id = config['unk_id']
        self.eos_id = config['eos_id']
        self.beam_search = BeamSearch(samples=samples)
        # NOTE(review): the command is run without a shell, so '<' reaches
        # the script as a literal argument; the hypotheses themselves are
        # sent through the stdin pipe.
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              ground_truth, '<']

    def do(self, which_callback, *args):
        """Run an evaluation unless training is still within the burn-in
        period (``iterations_done <= config['val_burn_in']``)."""
        # Track validation burn in
        if (self.main_loop.status['iterations_done'] <=
                self.config['val_burn_in']):
            return

        self._evaluate_model()

    def _evaluate_model(self):
        """Translate ``self.data_stream``, pipe the output to the BLEU
        script and record the score.

        Returns:
            float: The BLEU score parsed from the first output line of the
            script. It is also appended to ``self.bleu_scores``.

        Raises:
            AssertionError: If the first output line of the script does not
                start with ``BLEU = <number>``.
        """
        logger.info("Started Validation: ")
        val_start_time = time.time()

        # Resolve the target vocabulary before any resource is acquired.
        if self.trg_ivocab is None:
            sources = self._get_attr_rec(self.main_loop, 'data_stream')
            trg_vocab = sources.data_streams[1].dataset.dictionary
            self.trg_ivocab = {v: k for k, v in trg_vocab.items()}

        # Text-mode pipes: translations are printed as text and the reply
        # is matched against a str pattern.
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE,
                              universal_newlines=True)
        total_cost = 0.0
        output_file = None
        try:
            if self.val_out:
                output_file = open(self.val_out, 'w')

            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file
                seq = self._oov_to_unk(
                    line[0], self.config['src_vocab_size'], self.unk_id)
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = self.beam_search.search(
                    input_values={self.source_sentence: input_},
                    max_length=3 * len(seq),
                    eol_symbol=self.eos_id,
                    ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # keeping eos tokens reduces BLEU score
                        if self.config['remove_eos']:
                            trans_out = [idx for idx in trans_out
                                         if idx != self.eos_id]

                        # however keeping unk tokens might be a good idea
                        # (avoids brevity penalty)
                        if self.config['remove_unk']:
                            trans_out = [idx for idx in trans_out
                                         if idx != self.unk_id]

                        # convert idx to words
                        trans_out = self._idx_to_word(
                            trans_out, self.trg_ivocab)
                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i + 1))
                        trans_out = '<UNK>'

                    if j == 0:
                        # Write to subprocess and file if it exists
                        print(trans_out, file=mb_subprocess.stdin)
                        if output_file is not None:
                            print(trans_out, file=output_file)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()

            logger.info("Total cost of the validation: {}".format(total_cost))
            self.data_stream.reset()

            # send end of file, read output.
            mb_subprocess.stdin.close()
            stdout = mb_subprocess.stdout.readline()
        finally:
            # Release the file and the child process on every path so that
            # a failure mid-validation does not leak them.
            if output_file is not None:
                output_file.close()
            mb_subprocess.terminate()
            mb_subprocess.wait()
            mb_subprocess.stdout.close()

        logger.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logger.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        if out_parse is None:
            # Raised explicitly so the check also runs under ``python -O``;
            # the exception type matches the former ``assert``.
            raise AssertionError(
                "unexpected output from BLEU script: {!r}".format(stdout))

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        logger.info(bleu_score)

        self.bleu_scores.append(bleu_score)
        if self.val_best_out and bleu_score == max(self.bleu_scores):
            shutil.copy(self.val_out, self.val_best_out)
        return bleu_score