def _evaluate_model(self, no_epochs_done):
    logger.info("Started Validation.")
    val_start_time = time.time()
    error_count = 0
    total_count = 0

    # Get target vocabulary.
    if not self.trg_ivocab:
        # Load dictionaries and ensure special tokens exist.
        self.src_vocab = _ensure_special_tokens(
            cPickle.load(open(self.config['src_vocab'])),
            bos_idx=0, eos_idx=2, unk_idx=1)
        self.trg_vocab = _ensure_special_tokens(
            cPickle.load(open(self.config['trg_vocab'])),
            bos_idx=0, eos_idx=2, unk_idx=1)
        self.src_ivocab = {v: k for k, v in self.src_vocab.items()}
        self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}
        self.unk_idx = self.src_vocab[self.unk_sym]
        self.eos_idx = self.src_vocab[self.eos_sym]

    for i, line in enumerate(self.data_stream.get_epoch_iterator()):
        # Load the sentence, retrieve the sample, write to file.
        seq = line[0]
        input_ = numpy.tile(seq, (1, 1))
        seq2 = line[2]
        target_ = numpy.tile(seq2, (1, 1))
        batch_size = input_.shape[0]

        for j in range(batch_size):
            input_length = get_true_length(input_[j], self.src_vocab)
            target_length = get_true_length(target_[j], self.trg_vocab)

            inp = input_[j, :input_length]
            _1, outputs, _2, _3, costs, _4 = self.sampling_fn(inp[None, :])
            outputs = outputs.flatten()
            sample_length = get_true_length(outputs, self.trg_vocab)

            input_word = _idx_to_word(input_[j][:input_length],
                                      self.src_ivocab)
            target_word = _idx_to_word(target_[j][:target_length],
                                       self.trg_ivocab)
            predicted_word = _idx_to_word(outputs[:sample_length],
                                          self.trg_ivocab)

            if target_word != predicted_word:
                error_count += 1
            total_count += 1

    new_accuracy = (total_count - error_count) * 1.0 / total_count
    self._save_model(new_accuracy)

    res_file = open(self.validation_results_file, 'a')
    res_file.write(str(no_epochs_done) + '\t' + str(new_accuracy) + '\t' +
                   str(error_count) + '\t' + str(total_count) + '\n')
    res_file.close()
    logger.info("Validation finished. Current accuracy on dev set: " +
                str(new_accuracy))
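# A minimal sketch of the helpers the validation loop above assumes
# (get_true_length, _idx_to_word). The bodies are assumptions, not the
# repository's definitions; only the names, signatures and the </S>
# end-of-sequence symbol come from the surrounding code.
def get_true_length(seq, vocab):
    # Length of the sequence up to and including the first </S>,
    # or the full length if no </S> is found.
    try:
        return list(seq).index(vocab['</S>']) + 1
    except ValueError:
        return len(seq)


def _idx_to_word(seq, ivocab):
    # Map a sequence of indices back to a space-separated token string,
    # falling back to <UNK> for unknown indices.
    return u' '.join(ivocab.get(int(idx), u'<UNK>') for idx in seq)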
def predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble,
                  lang, et_version, config, monitor_attention=False,
                  the_track=None):
    resulting_predictions = {}
    pred_with_prob = {}  # for the bootstrapping

    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')

    bin_model = main_loop.model
    sources = [inp.name for inp in bin_model.inputs]  # = 'input'
    theinputs = [inp for inp in bin_model.inputs]
    decoder_output = [v for v in bin_model.variables
                      if v.name == 'decoder_generate_output_0'][0]
    # The function to call for prediction.
    sampling_fn = bin_model.get_theano_function()

    # Load vocabularies and invert if necessary.
    # WARNING: Source and target indices from the data stream can differ.
    print('*')
    print(config['src_vocab'])

    # Load dictionaries and ensure special tokens exist.
    src_vocab = _ensure_special_tokens(
        cPickle.load(open(config['src_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    trg_vocab = _ensure_special_tokens(
        cPickle.load(open(config['trg_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    src_ivocab = {v: k for k, v in src_vocab.items()}
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

    epoch_iter = data_to_predict_stream.get_epoch_iterator()
    counter = 0
    for anIter in epoch_iter:
        input_ = anIter[0]
        if counter % 500 == 0:
            print(counter)
        counter += 1
        inp = input_
        the_one = False

        _1, outputs, _2, glimpses, costs, output_probs = sampling_fn([inp])

        # Combine the probabilities of the single letters into a total
        # word probability.
        prob_word = reduce(mul, output_probs, 1)[0]
        outputs = outputs.flatten()
        sample_length = get_true_length(outputs, trg_vocab)

        input_word = _idx_to_word(input_, src_ivocab)
        predicted_word = _idx_to_word(outputs[:sample_length], trg_ivocab)

        help_word_input = u''
        help_word_result = u''
        help_tag = u''

        resulting_predictions[
            u' '.join(input_word.split(' ')[1:len(input_word.split(' ')) - 1])] = \
            u''.join(predicted_word.split(' ')[1:len(predicted_word.split(' ')) - 1])
        pred_with_prob[
            (u' '.join(input_word.split(' ')[1:len(input_word.split(' ')) - 1]),
             u''.join(predicted_word.split(' ')[1:len(predicted_word.split(' ')) - 1]))] = \
            prob_word

    results_path = 'results/__'
    if 'paper' in config['saveto']:
        results_path += 'paper_version/'
    else:
        results_path += '__'.join(config['saveto'].split('task')[0].split('/')) \
            + '/track' + str(the_track) + '/'

    if config['the_task'] == 3:
        if et_version:
            cPickle.dump(resulting_predictions,
                         open('ETS_' + lang + '_task3/Ens_' + str(use_ensemble)
                              + '_intermediate_results.pkl', 'wb'))
        else:
            cPickle.dump(resulting_predictions,
                         open(results_path + 'task' + str(config['the_task'])
                              + '/' + lang + '/Ens_' + str(use_ensemble)
                              + '_intermediate_results.pkl', 'wb'))
    else:
        cPickle.dump(resulting_predictions,
                     open(results_path + 'task' + str(config['the_task'])
                          + '/' + lang + '/Ens_' + str(use_ensemble)
                          + '_intermediate_results.pkl', 'wb'))

    print('prediction done')
    print('INFO: storing intermediate results to: ' + results_path + 'task'
          + str(config['the_task']) + '/' + lang + '/Ens_' + str(use_ensemble)
          + '_intermediate_results.pkl')
    print('Storing the dictionary with the most likely solutions to '
          + config['saveto'] + 'test_sol.pkl')
    cPickle.dump(pred_with_prob, open(config['saveto'] + '/test_sol.pkl', 'wb'))
    print(pred_with_prob)
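# prob_word above is the product of the per-symbol output probabilities.
# A small self-contained illustration of that step; under Python 3 the
# reduce/mul pair needs explicit imports, and the probability values here
# are made up.
from functools import reduce
from operator import mul

output_probs_example = [0.9, 0.8, 0.95]          # per-letter probabilities
prob_word_example = reduce(mul, output_probs_example, 1)
print(prob_word_example)                         # 0.684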
def predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble,
                  lang, et_version, config, monitor_attention=False,
                  the_track=None):
    resulting_predictions = {}
    pred_with_prob = {}  # for the bootstrapping

    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')

    bin_model = main_loop.model
    sources = [inp.name for inp in bin_model.inputs]  # = 'input'
    theinputs = [inp for inp in bin_model.inputs]
    decoder_output = [v for v in bin_model.variables
                      if v.name == 'decoder_generate_output_0'][0]
    # The function to call for prediction.
    sampling_fn = bin_model.get_theano_function()

    # Load vocabularies and invert if necessary.
    # WARNING: Source and target indices from the data stream can differ.
    print('*')
    print(config['src_vocab'])

    # Load dictionaries and ensure special tokens exist.
    src_vocab = _ensure_special_tokens(
        cPickle.load(open(config['src_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    trg_vocab = _ensure_special_tokens(
        cPickle.load(open(config['trg_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    src_ivocab = {v: k for k, v in src_vocab.items()}
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

    epoch_iter = data_to_predict_stream.get_epoch_iterator()
    counter = 0
    right_results = 0
    total = 0
    edit_distance = 0
    for anIter in epoch_iter:
        input_ = anIter[0]
        gold_output = anIter[1]
        if counter % 500 == 0:
            print(counter)
        counter += 1
        inp = input_
        orig_input_ = input_
        the_one = False

        # Convert the Spanish lemma to Portuguese. Take <UNK> tags out.
        inp_word = _idx_to_word(input_, src_ivocab)
        tag_for_infl = [u'LANG_IN=portuguese', u'LANG_OUT=portuguese']
        tag_for_back = [u'LANG_IN=portuguese', u'LANG_OUT=spanish']
        new_inp_word = []
        for sth in inp_word.split(u' '):
            if u'LANG_OUT=spanish' == sth:
                new_inp_word.append(u'LANG_OUT=portuguese')
            else:
                if sth != u'<UNK>':
                    if (u'LANG_IN' not in sth and len(sth) > 1
                            and u'</S>' not in sth and u'<S>' not in sth):
                        tag_for_infl.append(sth)
                        #tag_for_back.append(sth)
                    if u'=' not in sth or u'LANG_IN' in sth:
                        new_inp_word.append(sth)
        inp_word = u' '.join(new_inp_word)
        print('Input 1: ' + inp_word)
        #print(src_vocab)
        #exit()
        inp = _word_to_idx(inp_word.split(u' '), src_vocab)
        input_ = inp

        # Converting Spanish to Portuguese.
        _1, outputs, _2, glimpses, costs, output_probs = sampling_fn([inp])
        # Combine the probabilities of the single letters into a total
        # word probability.
        prob_word = reduce(mul, output_probs, 1)[0]
        outputs = outputs.flatten()
        sample_length = get_true_length(outputs, trg_vocab)

        # Convert the output to a Portuguese inflection sample.
        inp_word = _idx_to_word(outputs[:sample_length], trg_ivocab)
        inp_word = (inp_word.split(u' ')[0] + u' ' + u' '.join(tag_for_infl)
                    + u' ' + u' '.join(inp_word.split(u' ')[1:]))
        inp = _word_to_idx(inp_word.split(u' '), src_vocab)
        input_ = inp
        print('Input 2: ' + inp_word)

        # Inflecting the Portuguese word.
        _1, outputs, _2, glimpses, costs, output_probs = sampling_fn([inp])
        # Combine the probabilities of the single letters into a total
        # word probability.
        prob_word = reduce(mul, output_probs, 1)[0]
        outputs = outputs.flatten()
        sample_length = get_true_length(outputs, trg_vocab)

        # Convert the output to Spanish.
        inp_word = _idx_to_word(outputs[:sample_length], trg_ivocab)
        inp_word = (inp_word.split(u' ')[0] + u' ' + u' '.join(tag_for_back)
                    + u' ' + u' '.join(inp_word.split(u' ')[1:]))
        inp = _word_to_idx(inp_word.split(u' '), src_vocab)
        input_ = inp
        print('Input 3: ' + inp_word)

        # Converting to Spanish.
        _1, outputs, _2, glimpses, costs, output_probs = sampling_fn([inp])
        # Combine the probabilities of the single letters into a total
        # word probability.
        prob_word = reduce(mul, output_probs, 1)[0]
        outputs = outputs.flatten()
        sample_length = get_true_length(outputs, trg_vocab)

        input_word = _idx_to_word(orig_input_, src_ivocab)
        #print(input_word)
        #exit()
        predicted_word = _idx_to_word(outputs[:sample_length], trg_ivocab)
        gold_word = _idx_to_word(gold_output, trg_ivocab)

        if predicted_word == gold_word:
            right_results += 1
        else:
            edit_distance += editdistance.eval(predicted_word, gold_word)
        if u'OUT=V' in input_word:
            total += 1
            print(input_word)
            print(predicted_word)
            print(gold_word)
            print('')
            #exit()

        #help_word_input = u''
        #help_word_result = u''
        #help_tag = u''
        #resulting_predictions[u' '.join(input_word.split(' ')[1:len(input_word.split(' ')) - 1])] = \
        #    u''.join(predicted_word.split(' ')[1:len(predicted_word.split(' ')) - 1])
        #pred_with_prob[(u' '.join(input_word.split(' ')[1:len(input_word.split(' ')) - 1]),
        #                u''.join(predicted_word.split(' ')[1:len(predicted_word.split(' ')) - 1]))] = prob_word

    results_path = 'results/__'
    if False:  # if 'paper' in config['saveto']:
        results_path += 'paper_version/'
    else:
        results_path += '__'.join(config['saveto'].split('task')[0].split('/')) \
            + '/track' + str(the_track) + '/'

    res_ed = edit_distance * 1.0 / total
    res_accuracy = 1.0 * right_results / total

    if config['the_task'] == 3:
        if et_version:
            cPickle.dump(res_accuracy,
                         open('ETS_' + lang + '_task3/Ens_' + str(use_ensemble)
                              + '_intermediate_results.pkl', 'wb'))
        else:
            cPickle.dump(res_accuracy,
                         open(results_path + 'task' + str(config['the_task'])
                              + '/' + lang + '/Ens_' + str(use_ensemble)
                              + '_intermediate_results.pkl', 'wb'))
    else:
        cPickle.dump(res_accuracy,
                     open(results_path + 'task' + str(config['the_task'])
                          + '/' + lang + '/Ens_' + str(use_ensemble)
                          + '_intermediate_results.pkl', 'wb'))
    cPickle.dump(res_accuracy,
                 open(results_path + 'task' + str(config['the_task'])
                      + '/' + lang + '/Ens_' + str(use_ensemble)
                      + '_intermediate_results.pkl', 'ab'))

    print('prediction done')
    print('INFO: storing results to: ' + results_path + 'task'
          + str(config['the_task']) + '/' + lang + '/Ens_' + str(use_ensemble)
          + '_intermediate_results.pkl')
    #print('Storing the dictionary with the most likely solutions to '
    #      + config['saveto'] + 'test_sol.pkl')
    #cPickle.dump(pred_with_prob, open(config['saveto'] + '/test_sol.pkl', 'wb'))
    print(str(res_accuracy))
    print(str(res_ed))
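# The round-trip above re-encodes its own outputs with _word_to_idx, the
# inverse of _idx_to_word. A hedged sketch of the assumed behaviour; the body
# is an assumption, only the name and call signature come from the code above.
import numpy

def _word_to_idx(tokens, vocab):
    # Map each token to its vocabulary index, falling back to <UNK>
    # for unseen symbols.
    return numpy.array([vocab.get(tok, vocab[u'<UNK>']) for tok in tokens],
                       dtype='int64')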
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['topical_embedding_dim'])
    topical_transformer = topicalq_transformer(config['topical_vocab_size'],
                                               config['topical_embedding_dim'],
                                               config['enc_nhids'],
                                               config['topical_word_num'],
                                               config['batch_size'])

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topicalq(**config)
        dev_stream = get_dev_stream_with_topicalq(**config)
        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get cost of the model
        representation = encoder.apply(source_sentence, source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[0, :, (representation.shape[2] / 2):]
        cost = decoder.cost(representation, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            topic_embedding, content_embedding)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        topical_transformer.push_allocation_config()  # don't know whether the initialize is for
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        topical_transformer.look_up.W.tag.role = []

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        '''
        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

            # Add sampling
            if config['hook_samples'] >= 1:
                logger.info("Building sampler")
                extensions.append(
                    Sampler(model=search_model, data_stream=tr_stream,
                            hook_samples=config['hook_samples'],
                            every_n_batches=config['sampling_freq'],
                            src_vocab_size=config['src_vocab_size']))

            # Add early stopping based on bleu
            if config['bleu_script'] is not None:
                logger.info("Building bleu validator")
                extensions.append(
                    BleuValidator(sampling_input, samples=samples,
                                  config=config, model=search_model,
                                  data_stream=dev_stream,
                                  normalize=config['normalized_bleu'],
                                  every_n_batches=config['bleu_val_freq']))
        '''

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    on_unused_sources='warn',
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')

        # Get test set stream
        test_stream = get_dev_stream_with_topicalq(
            config['test_set'], config['src_vocab'], config['src_vocab_size'],
            config['topical_test_set'], config['topical_vocab'],
            config['topical_vocab_size'], config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        topic_embedding = topical_transformer.apply(source_topical_word)
        representation = encoder.apply(source_sentence,
                                       tensor.ones(source_sentence.shape))
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[0, :, (representation.shape[2] / 2):]
        generated = decoder.generate(source_sentence, representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(
            pickle.load(open(config['trg_vocab'], 'rb')),
            bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):
            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            seq2 = line[1]
            input_ = numpy.tile(seq, (config['beam_size'], 1))
            input_topical = numpy.tile(seq2, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                beam_search.search(
                    input_values={source_sentence: input_,
                                  source_topical_word: input_topical},
                    max_length=10 * len(seq), eol_symbol=src_eos_idx,
                    ignore_first_eol=True)
            '''
            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths
            '''
            #best = numpy.argsort(costs)[0]
            best = numpy.argsort(costs)[0:config['beam_size']]
            for b in best:
                try:
                    total_cost += costs[b]
                    trans_out = trans[b]
                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(i + 1))
                    trans_out = '<UNK>'
                print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info("Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()

    elif mode == 'rerank':

        # Create Theano variables
        ftrans = open(config['val_set'] + '.scores.out', 'w')
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        config['src_data'] = config['val_set']
        config['trg_data'] = config['val_set_grndtruth']
        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        test_stream = get_tr_stream_unsorted(**config)

        logger.info("Building sampling model")
        representations = encoder.apply(source_sentence, source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask)

        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        costs_computer = function([
            source_sentence, source_sentence_mask, target_sentence,
            target_sentence_mask
        ], costs)

        iterator = test_stream.get_epoch_iterator()
        scores = []
        for i, (src, src_mask, trg, trg_mask) in enumerate(iterator):
            costs = costs_computer(*[src, src_mask, trg, trg_mask])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)
            ftrans.write(str(cost) + "\n")
        ftrans.close()
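# Translate mode above maps out-of-vocabulary source indices to <UNK> via
# SamplingBase._oov_to_unk before tiling the input for beam search. A minimal
# sketch of the assumed behaviour, written as a standalone function; the body
# is an assumption, not the library's definition.
def _oov_to_unk(seq, vocab_size, unk_idx):
    # Replace any index outside the source vocabulary with the <UNK> index,
    # e.g. _oov_to_unk([5, 42, 31007], 30000, 1) -> [5, 42, 1].
    return [idx if idx < vocab_size else unk_idx for idx in seq]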
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'],
        name='word_encoder')
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # Here the attended dim (representation_dim) of the decoder is
    # 2 * enc_nhids because the context given by the encoder is a
    # bidirectional context.

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences = []
        context_sentence_masks = []
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)))
            context_sentence_masks.append(
                tensor.matrix('context_' + str(i) + '_mask'))
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        dev_source = tensor.lmatrix('dev_source')
        dev_target = tensor.lmatrix('dev_target')

        # Get training and development set streams
        tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_with_grdTruth(**config)

        # Get cost of the model
        sentence_representations_list = encoder.apply(source_sentence,
                                                      source_sentence_mask)
        sentence_representations_list = \
            sentence_representations_list.dimshuffle(['x', 0, 1, 2])
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1])
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i],
                                    context_sentence_masks[i])
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2])
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0)
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])],
                axis=0)

        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list, target_sentence,
                            target_sentence_mask)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'
        costs_computer = function(
            context_sentences + context_sentence_masks +
            [target_sentence, target_sentence_mask, source_sentence,
             source_sentence_mask], (perplexity))
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'], config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(sampling_input,
                                         sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

            # Add sampling
            if config['hook_samples'] >= 1:
                logger.info("Building sampler")
                extensions.append(
                    Sampler(model=search_model, data_stream=tr_stream,
                            model_name=config['model_name'],
                            hook_samples=config['hook_samples'],
                            every_n_batches=config['sampling_freq'],
                            src_vocab_size=config['src_vocab_size']))

            # Add early stopping based on bleu
            if False:
                logger.info("Building bleu validator")
                extensions.append(
                    BleuValidator(sampling_input, samples=samples,
                                  config=config, model=search_model,
                                  data_stream=dev_stream,
                                  normalize=config['normalized_bleu'],
                                  every_n_batches=config['bleu_val_freq'],
                                  n_best=3,
                                  track_n_models=6))

            logger.info("Building perplexity validator")
            extensions.append(
                pplValidation(dev_source, dev_target, config=config,
                              model=costs_computer, data_stream=dev_stream,
                              model_name=config['model_name'],
                              every_n_batches=config['sampling_freq']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([Scale(initial_learning_rate),
                                     StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]))

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'ppl':

        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences = []
        context_sentence_masks = []
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)))
            context_sentence_masks.append(
                tensor.matrix('context_' + str(i) + '_mask'))
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        # Get training and development set streams
        #tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_withContext_grdTruth(**config)

        # Get cost of the model
        sentence_representations_list = encoder.apply(source_sentence,
                                                      source_sentence_mask)
        sentence_representations_list = \
            sentence_representations_list.dimshuffle(['x', 0, 1, 2])
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1])
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i],
                                    context_sentence_masks[i])
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2])
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0)
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])],
                axis=0)

        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list, target_sentence,
                            target_sentence_mask)

        logger.info('Creating computational graph')
        costs_computer = function(
            context_sentences + context_sentence_masks +
            [target_sentence, target_sentence_mask, source_sentence,
             source_sentence_mask], (cost))

        logger.info("Loading the model..")
        model = Model(cost)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started Validation: ")
        ts = dev_stream.get_epoch_iterator()
        total_cost = 0.0
        total_tokens = 0.0
        #pbar = ProgressBar(max_value=len(ts)).start()  # modified
        pbar = ProgressBar(max_value=10000).start()
        for i, (ctx_0, ctx_0_mask, ctx_1, ctx_1_mask, ctx_2, ctx_2_mask,
                src, src_mask, trg, trg_mask) in enumerate(ts):
            costs = costs_computer(*[ctx_0, ctx_1, ctx_2,
                                     ctx_0_mask, ctx_1_mask, ctx_2_mask,
                                     trg, trg_mask, src, src_mask])
            cost = costs.sum()
            total_cost += cost
            total_tokens += trg_mask.sum()
            pbar.update(i + 1)
        total_cost /= total_tokens
        pbar.finish()
        #dev_stream.reset()

        # run afterprocess
        # self.ap.main()
        total_cost = 2 ** total_cost
        print("Average validation cost: " + str(total_cost))

    elif mode == 'translate':

        logger.info('Creating theano variables')
        context_sentences = []
        context_sentence_masks = []
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)))
            context_sentence_masks.append(
                tensor.matrix('context_' + str(i) + '_mask'))
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(
            cPickle.load(open(config['trg_vocab'], 'rb')),
            bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}
        config['batch_size'] = 1

        sentence_representations_list = encoder.apply(source_sentence,
                                                      source_sentence_mask)
        sentence_representations_list = \
            sentence_representations_list.dimshuffle(['x', 0, 1, 2])
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1])
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i],
                                    context_sentence_masks[i])
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2])
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0)
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])],
                axis=0)

        generated = decoder.generate(sentence_representations_list,
                                     sentence_masks_list)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_withContext(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw[3].split()
            seqs = []
            input_ = []
            input_mask = []
            for j in range(config['ctx_num'] + 1):
                seqs.append(sutils._oov_to_unk(
                    line[2 * j][0], config['src_vocab_size'], unk_idx))
                input_mask.append(numpy.tile(line[2 * j + 1][0],
                                             (config['beam_size'], 1)))
                input_.append(numpy.tile(seqs[j], (config['beam_size'], 1)))
            #v = costs_computer(input_[0])

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={source_sentence: input_[3],
                                  source_sentence_mask: input_mask[3],
                                  context_sentences[0]: input_[0],
                                  context_sentence_masks[0]: input_mask[0],
                                  context_sentences[1]: input_[1],
                                  context_sentence_masks[1]: input_mask[1],
                                  context_sentences[2]: input_[2],
                                  context_sentence_masks[2]: input_mask[2]},
                    max_length=3 * len(seqs[2]), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            b = numpy.argsort(costs)[0]
            #best = numpy.argsort(costs)[0:config['beam_size']]
            #for b in best:
            try:
                total_cost += costs[b]
                trans_out = trans[b]
                totalLen = 4 * len(line[0][0])
                #weight = weights[b][:, :totalLen]
                weight = weights
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'

            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)

        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights, open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        ap = afterprocesser(config)
        ap.main()
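# The context model above stacks the encodings of the source sentence and of
# ctx_num context sentences along a new leading axis via dimshuffle and
# tensor.concatenate. A tiny numpy check of the resulting shapes; the sizes
# are made up and the real tensors are symbolic Theano variables of equal
# sequence length.
import numpy

src_len, batch, enc_nhids, ctx_num = 5, 2, 3, 3
stacked = numpy.zeros((src_len, batch, 2 * enc_nhids))[None, ...]
for _ in range(ctx_num):
    ctx_rep = numpy.zeros((src_len, batch, 2 * enc_nhids))
    stacked = numpy.concatenate([stacked, ctx_rep[None, ...]], axis=0)
print(stacked.shape)  # (ctx_num + 1, src_len, batch, 2 * enc_nhids) == (4, 5, 2, 6)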
def _evaluate_model(self, no_epochs_done):
    logger.info("Started Validation.")
    val_start_time = time.time()
    error_count = 0
    total_count = 0

    # Get target vocabulary
    if not self.trg_ivocab:
        # Load dictionaries and ensure special tokens exist
        self.src_vocab = _ensure_special_tokens(
            cPickle.load(open(self.config['src_vocab'])),
            bos_idx=0, eos_idx=2, unk_idx=1)
        self.trg_vocab = _ensure_special_tokens(
            cPickle.load(open(self.config['trg_vocab'])),
            bos_idx=0, eos_idx=2, unk_idx=1)
        self.src_ivocab = {v: k for k, v in self.src_vocab.items()}
        self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}
        self.unk_idx = self.src_vocab[self.unk_sym]
        self.eos_idx = self.src_vocab[self.eos_sym]

    '''
    print('length data_stream:')
    print(len(self.data_stream))
    sys.exit(0)
    '''
    # This should always contain the last results.
    ana_file = io.open(self.validation_results_file + '_det', 'w',
                       encoding='utf-8')

    for i, line in enumerate(self.data_stream.get_epoch_iterator()):
        # Load the sentence, retrieve the sample, write to file.
        #print(line)
        #sys.exit(0)
        seq = line[0]  #self._oov_to_unk(line[0], self.config['src_vocab_size'], self.unk_idx)
        input_ = numpy.tile(seq, (1, 1))
        seq2 = line[2]
        target_ = numpy.tile(seq2, (1, 1))
        #print('target')
        #print(target_)
        #print('input')
        #print(input_)
        batch_size = input_.shape[0]
        #print('batch_size:')
        #print(batch_size)
        #sys.exit(0)
        #src_batch = anIter[main_loop.data_stream.mask_sources[0]]
        #input_ = src_batch[:, :]

        for j in range(batch_size):
            input_length = get_true_length(input_[j], self.src_vocab)
            target_length = get_true_length(target_[j], self.trg_vocab)

            inp = input_[j, :input_length]
            _1, outputs, _2, _3, costs, _4 = self.sampling_fn(inp[None, :])
            outputs = outputs.flatten()
            sample_length = get_true_length(outputs, self.trg_vocab)

            # NOTE: There are two types of lost words. One is because the
            # label has not been seen in the training data (rare). The other
            # reason is unknown. However, the word is lost at this point
            # already.
            input_word = _idx_to_word(input_[j][:input_length],
                                      self.src_ivocab)
            #print(input_word)
            target_word = _idx_to_word(target_[j][:target_length],
                                       self.trg_ivocab)
            #print(target_word)
            predicted_word = _idx_to_word(outputs[:sample_length],
                                          self.trg_ivocab)
            #print(predicted_word)
            #sys.exit(0)

            # TODO: get target word
            ana_file.write(input_word + ' / ' + target_word + '\n')
            ana_file.write(predicted_word + '\n\n')
            if target_word != predicted_word:
                error_count += 1
            total_count += 1

    new_accuracy = (total_count - error_count) * 1.0 / total_count
    self._save_model(new_accuracy, no_epochs_done)

    res_file = open(self.validation_results_file, 'a')
    res_file.write(str(no_epochs_done) + '\t' + str(new_accuracy) + '\t' +
                   str(error_count) + '\t' + str(total_count) +
                   '\t(Best: ' + str(self.best_epoch) + ')\n')
    res_file.close()
    ana_file.close()
    logger.info("Validation finished. Current accuracy on dev set: " +
                str(new_accuracy))
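# Every vocabulary load above goes through _ensure_special_tokens. A hedged
# sketch of what that helper is assumed to guarantee; the actual implementation
# lives in the repository's stream utilities, so the body below is an
# assumption.
def _ensure_special_tokens(vocab, bos_idx=0, eos_idx=0, unk_idx=1):
    # Force the special symbols onto fixed indices so that data streams,
    # sampling and evaluation agree on <S>, </S> and <UNK>.
    vocab['<S>'] = bos_idx
    vocab['</S>'] = eos_idx
    vocab['<UNK>'] = unk_idx
    return vocab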
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    topical_transformer = topicalq_transformer(
        config['source_topic_vocab_size'], config['topical_embedding_dim'],
        config['enc_nhids'], config['topical_word_num'], config['batch_size'])
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      topicWord_size=config['trg_topic_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      topical_dim=config['topical_embedding_dim'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # Here the attended dim (representation_dim) of the decoder is
    # 2 * enc_nhids because the context given by the encoder is a
    # bidirectional context.

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        #target_topic_sentence_mask = tensor.lmatrix('target_topic_mask')
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')
        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topic_target(**config)
        #dev_stream = get_dev_tr_stream_with_topic_target(**config)

        # Get cost of the model
        representations = encoder.apply(source_sentence, source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representations[0, :, (representations.shape[2] / 2):]
        cost = decoder.cost(representations, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            target_topic_sentence,
                            target_topic_binary_sentence,
                            topic_embedding, content_embedding)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'
        cg = ComputationGraph(cost)
        costs_computer = function([
            target_sentence, target_sentence_mask, source_sentence,
            source_sentence_mask, source_topical_word, target_topic_sentence,
            target_topic_binary_sentence
        ], (perplexity), on_unused_input='ignore')

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        topical_transformer.push_allocation_config()  # don't know whether the initialize is for
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        topical_transformer.look_up.W.tag.role = []

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'], config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # # Set up beam search and sampling computation graphs if necessary
        # if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        #     logger.info("Building sampling model")
        #     sampling_representation = encoder.apply(
        #         sampling_input, tensor.ones(sampling_input.shape))
        #     generated = decoder.generate(
        #         sampling_input, sampling_representation)
        #     search_model = Model(generated)
        #     _, samples = VariableFilter(
        #         bricks=[decoder.sequence_generator], name="outputs")(
        #             ComputationGraph(generated[1]))
        #
        #     # Add sampling
        #     if config['hook_samples'] >= 1:
        #         logger.info("Building sampler")
        #         extensions.append(
        #             Sampler(model=search_model, data_stream=tr_stream,
        #                     model_name=config['model_name'],
        #                     hook_samples=config['hook_samples'],
        #                     every_n_batches=config['sampling_freq'],
        #                     src_vocab_size=config['src_vocab_size']))
        #
        #     # Add early stopping based on bleu
        #     if False:
        #         logger.info("Building bleu validator")
        #         extensions.append(
        #             BleuValidator(sampling_input, samples=samples,
        #                           config=config, model=search_model,
        #                           data_stream=dev_stream,
        #                           normalize=config['normalized_bleu'],
        #                           every_n_batches=config['bleu_val_freq'],
        #                           n_best=3,
        #                           track_n_models=6))
        #
        #     logger.info("Building perplexity validator")
        #     extensions.append(
        #         pplValidation(config=config,
        #                       model=costs_computer, data_stream=dev_stream,
        #                       model_name=config['model_name'],
        #                       every_n_batches=config['sampling_freq']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([
                Scale(initial_learning_rate),
                StepClipping(config['step_clipping']),
                eval(config['step_rule'])()
            ]),
            on_unused_sources='ignore')

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')
        tw_vocab_overlap = tensor.lmatrix('tw_vocab_overlap')
        tw_vocab_overlap_matrix = cPickle.load(
            open(config['tw_vocab_overlap'], 'rb'))
        tw_vocab_overlap_matrix = numpy.array(tw_vocab_overlap_matrix,
                                              dtype='int32')
        #tw_vocab_overlap = shared(tw_vocab_overlap_matrix)
        topic_embedding = topical_transformer.apply(source_topical_word)

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(
            cPickle.load(open(config['trg_vocab'], 'rb')),
            bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        topic_embedding = topical_transformer.apply(source_topical_word)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = sampling_representation[0, :, (
            sampling_representation.shape[2] / 2):]
        generated = decoder.generate(sampling_input, sampling_representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_with_topicalq(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw.split()
            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            seq1 = line[1]
            input_topical = numpy.tile(seq1, (config['beam_size'], 1))
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={sampling_input: input_,
                                  source_topical_word: input_topical,
                                  tw_vocab_overlap: tw_vocab_overlap_matrix},
                    tw_vocab_overlap=tw_vocab_overlap_matrix,
                    max_length=3 * len(seq), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]
                weight = weights[best][:, :len(trans_in)]
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'

            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)

        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights, open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        # ap = afterprocesser(config)
        # ap.main()

    elif mode == 'score':

        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        source_topical_word = tensor.lmatrix('source_topical')
        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get cost of the model
        representations = encoder.apply(source_sentence, source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask,
                             target_topic_sentence,
                             target_topic_binary_sentence, topic_embedding)

        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        # Get test set stream
        test_stream = get_tr_stream_with_topic_target(**config)

        logger.info("Building sampling model")
        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        costs_computer = function([
            target_sentence, target_sentence_mask, source_sentence,
            source_sentence_mask, source_topical_word, target_topic_sentence,
            target_topic_binary_sentence
        ], (costs), on_unused_input='ignore')

        iterator = test_stream.get_epoch_iterator()
        scores = []
        att_weights = []
        for i, (src, src_mask, trg, trg_mask, te, te_mask, tt, tt_mask,
                tb, tb_mask) in enumerate(iterator):
            costs = costs_computer(*[trg, trg_mask, src, src_mask, te, tt, tb])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)

        print(sum(scores) / 10007)
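# Both training setups above build their optimizer with
# eval(config['step_rule'])(), i.e. the config stores the class name of a
# Blocks step rule as a string. An assumed config fragment for illustration;
# the values are not the repository's defaults.
config_example = {
    'step_rule': 'AdaDelta',       # eval('AdaDelta')() -> blocks.algorithms.AdaDelta()
    'step_clipping': 1.0,          # gradient norm threshold for StepClipping
    'initial_learning_rate': 1.0,  # used by the Scale component
    'learning_rate_decay': True,   # enables the LearningRateHalver extension
}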
def main(mode, config, use_bokeh=False): # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2,config['topical_embedding_dim']) topical_transformer=topicalq_transformer(config['topical_vocab_size'],config['topical_embedding_dim'], config['enc_nhids'],config['topical_word_num'],config['batch_size']); if mode == "train": # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') source_topical_word=tensor.lmatrix('source_topical') source_topical_mask=tensor.matrix('source_topical_mask') # Get training and development set streams tr_stream = get_tr_stream_with_topicalq(**config) dev_stream = get_dev_stream_with_topicalq(**config) topic_embedding=topical_transformer.apply(source_topical_word); # Get cost of the model representation=encoder.apply(source_sentence, source_sentence_mask); tw_representation=topical_transformer.look_up.apply(source_topical_word.T); content_embedding=representation[0,:,(representation.shape[2]/2):]; cost = decoder.cost( representation,source_sentence_mask,tw_representation, source_topical_mask, target_sentence, target_sentence_mask,topic_embedding,content_embedding); logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() topical_transformer.weights_init=IsotropicGaussian( config['weight_scale']); topical_transformer.biases_init=Constant(0); topical_transformer.push_allocation_config();#don't know whether the initialize is for topical_transformer.look_up.weights_init=Orthogonal(); topical_transformer.transformer.weights_init=Orthogonal(); topical_transformer.initialize(); word_topical_embedding=cPickle.load(open(config['topical_embeddings'], 'rb')); np_word_topical_embedding=numpy.array(word_topical_embedding,dtype='float32'); topical_transformer.look_up.W.set_value(np_word_topical_embedding); topical_transformer.look_up.W.tag.role=[]; # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise( cg, 
enc_params+dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] ''' # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate( sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) ''' # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True)) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters,on_unused_sources='warn', step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! 
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')

        # Get test set stream
        test_stream = get_dev_stream_with_topicalq(
            config['test_set'], config['src_vocab'], config['src_vocab_size'],
            config['topical_test_set'], config['topical_vocab'],
            config['topical_vocab_size'], config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        topic_embedding = topical_transformer.apply(source_topical_word)
        representation = encoder.apply(
            source_sentence, tensor.ones(source_sentence.shape))
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[0, :, (representation.shape[2] / 2):]
        generated = decoder.generate(
            source_sentence, representation, tw_representation,
            topical_embedding=topic_embedding,
            content_embedding=content_embedding)

        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(
            pickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0,
            eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):

            seq = sutils._oov_to_unk(
                line[0], config['src_vocab_size'], unk_idx)
            seq2 = line[1]
            input_ = numpy.tile(seq, (config['beam_size'], 1))
            input_topical = numpy.tile(seq2, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                beam_search.search(
                    input_values={source_sentence: input_,
                                  source_topical_word: input_topical},
                    max_length=10 * len(seq), eol_symbol=src_eos_idx,
                    ignore_first_eol=True)
            '''
            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths
            '''
            # best = numpy.argsort(costs)[0]
            best = numpy.argsort(costs)[0:config['beam_size']]
            for b in best:
                try:
                    total_cost += costs[b]
                    trans_out = trans[b]
                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(i + 1))
                    trans_out = '<UNK>'
                print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()

    elif mode == 'rerank':

        # Create Theano variables
        ftrans = open(config['val_set'] + '.scores.out', 'w')
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        config['src_data'] = config['val_set']
        config['trg_data'] = config['val_set_grndtruth']
        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        test_stream = get_tr_stream_unsorted(**config)

        logger.info("Building sampling model")
        representations = encoder.apply(
            source_sentence, source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask)

        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        costs_computer = function([source_sentence, source_sentence_mask,
                                   target_sentence, target_sentence_mask],
                                  costs)
        iterator = test_stream.get_epoch_iterator()

        scores = []
        for i, (src, src_mask, trg, trg_mask) in enumerate(iterator):
            costs = costs_computer(*[src, src_mask, trg, trg_mask])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)
            ftrans.write(str(cost) + "\n")
        ftrans.close()
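
# A minimal usage sketch for `main` (hypothetical: the actual entry point and
# config factory live elsewhere in this project). The config dict must provide
# at least the keys referenced above ('src_vocab_size', 'enc_embed',
# 'topical_embedding_dim', 'saveto', ...):
#
#     config = get_config()                  # hypothetical config factory
#     main('train', config, use_bokeh=False)
#     main('translate', config)              # writes <test_set>.trans.out
#     main('rerank', config)                 # writes <val_set>.scores.out
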
def predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble,
                  lang, the_task, et_version, config,
                  monitor_attention=False):

    resulting_predictions = {}

    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')
    bin_model = main_loop.model
    sources = [inp.name for inp in bin_model.inputs]
    theinputs = [inp for inp in bin_model.inputs]
    decoder_output = [v for v in bin_model.variables
                      if v.name == 'decoder_generate_output_0'][0]

    # The function to call for prediction.
    sampling_fn = bin_model.get_theano_function()

    # Load vocabularies and invert if necessary
    # WARNING: Source and target indices from data stream can be different

    # Load dictionaries and ensure special tokens exist.
    src_vocab = _ensure_special_tokens(
        cPickle.load(open(config['src_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    trg_vocab = _ensure_special_tokens(
        cPickle.load(open(config['trg_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    src_ivocab = {v: k for k, v in src_vocab.items()}
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

    epoch_iter = data_to_predict_stream.get_epoch_iterator()
    counter = 0
    for anIter in epoch_iter:
        input_ = anIter[0]

        if counter % 500 == 0:
            logger.info(str(counter) + ' samples predicted')
        counter += 1

        inp = input_
        the_one = False
        _1, outputs, _2, glimpses, costs, output_probs = (sampling_fn([inp]))
        # Combining the probabilities for the single letters to get a total
        # probability.
        prob_word = reduce(mul, output_probs, 1)[0]
        outputs = outputs.flatten()
        sample_length = get_true_length(outputs, trg_vocab)

        input_word = _idx_to_word(input_, src_ivocab)
        predicted_word = _idx_to_word(outputs[:sample_length], trg_ivocab)

        help_word_input = u''
        help_word_result = u''
        help_tag = u''

        # Drop the first and last tokens (presumably the sentence boundary
        # markers) before storing the prediction.
        resulting_predictions[
            u' '.join(input_word.split(' ')[1:len(input_word.split(' ')) - 1])] = \
            u''.join(predicted_word.split(' ')[1:len(predicted_word.split(' ')) - 1])

    results_path = ('results/' + config['saveto'].split('task')[0] + 'task'
                    + str(the_task) + '/' + lang + '/')
    cPickle.dump(resulting_predictions,
                 open(results_path + 'Ens_' + str(use_ensemble)
                      + '_intermediate_results.pkl', 'wb'))
    logger.info('Prediction done. Storing intermediate results to: '
                + results_path + 'Ens_' + str(use_ensemble)
                + '_intermediate_results.pkl')
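
# Rough calling sketch for predictByHand (hypothetical wiring; how the main
# loop and the prediction stream are built depends on the rest of this
# project):
#
#     main_loop = load_trained_main_loop(config)          # hypothetical helper
#     pred_stream = get_dev_stream_with_topicalq(**config)
#     predictByHand(main_loop, decoder, pred_stream, use_ensemble=1,
#                   lang='german', the_task=1, et_version=False, config=config)
#
# Predictions are pickled to
# results/<saveto prefix>task<the_task>/<lang>/Ens_<use_ensemble>_intermediate_results.pkl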