Code Example #1
    def _evaluate_model(self, no_epochs_done):
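        # Sample a prediction for every item in the data stream, count
        # exact-match errors against the targets, then save the model and
        # append the accuracy to the validation results file.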
        logger.info("Started Validation.")
        val_start_time = time.time()
        error_count = 0
        total_count = 0

        # Get target vocabulary.
        if not self.trg_ivocab:
            # Load dictionaries and ensure special tokens exist.
            self.src_vocab = _ensure_special_tokens(
                cPickle.load(open(self.config['src_vocab'])),
                bos_idx=0, eos_idx=2, unk_idx=1)
            self.trg_vocab = _ensure_special_tokens(
                cPickle.load(open(self.config['trg_vocab'])),
                bos_idx=0, eos_idx=2, unk_idx=1)
            self.src_ivocab = {v: k for k, v in self.src_vocab.items()}
            self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}
            self.unk_idx = self.src_vocab[self.unk_sym]
            self.eos_idx = self.src_vocab[self.eos_sym]
      
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            """
            Load the sentence, retrieve the sample, write to file.
            """
            seq = line[0]
            input_ = numpy.tile(seq, (1, 1))
            
            seq2 = line[2]
            target_ = numpy.tile(seq2, (1, 1))

            batch_size = input_.shape[0]

            for j in range(batch_size):

                input_length = get_true_length(input_[j], self.src_vocab)
                target_length = get_true_length(target_[j], self.trg_vocab)

                inp = input_[j, :input_length]
                _1, outputs, _2, _3, costs, _4 = self.sampling_fn(inp[None, :])
                outputs = outputs.flatten()
                sample_length = get_true_length(outputs, self.trg_vocab)

                input_word = _idx_to_word(input_[j][:input_length], self.src_ivocab)
                target_word = _idx_to_word(target_[j][:target_length], self.trg_ivocab)
                predicted_word = _idx_to_word(outputs[:sample_length], self.trg_ivocab)
                if target_word != predicted_word:
                    error_count += 1
                total_count += 1
        
        new_accuracy = (total_count - error_count)*1.0 / total_count
        self._save_model(new_accuracy)

        res_file = open(self.validation_results_file, 'a')
        res_file.write(str(no_epochs_done) + '\t' + str(new_accuracy) + '\t' + str(error_count) + '\t' + str(total_count) + '\n')
        res_file.close()
        logger.info("Validation finished. Current accuracy on dev set: " + str(new_accuracy))
Code Example #2
def predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble, lang, et_version, config, monitor_attention=False, the_track=None):
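    # Sample a prediction for every input in the stream and collect the results
    # in resulting_predictions / pred_with_prob, which are pickled at the end.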
    resulting_predictions = {}
    pred_with_prob = {} # for the bootstrapping
    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')
    bin_model = main_loop.model
    sources = [inp.name for inp in bin_model.inputs] # = 'input'
    theinputs = [inp for inp in bin_model.inputs]
    decoder_output = [v for v in bin_model.variables if v.name == 'decoder_generate_output_0'][0]
    
    sampling_fn = bin_model.get_theano_function() # the function to call for prediction

    # Load vocabularies and invert if necessary
    # WARNING: Source and target indices from data stream
    #  can be different
    print('*')
    print(config['src_vocab'])
    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        cPickle.load(open(config['src_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    trg_vocab = _ensure_special_tokens(
        cPickle.load(open(config['trg_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    src_ivocab = {v: k for k, v in src_vocab.items()}
    trg_ivocab = {v: k for k, v in trg_vocab.items()}
    
    epoch_iter = data_to_predict_stream.get_epoch_iterator()

    counter = 0
    
    for anIter in epoch_iter:
      
      input_=anIter[0]
      
      if counter%500 == 0:
        print(counter)
      counter += 1
      
      inp = input_

      the_one = False

      _1, outputs, _2, glimpses, costs, output_probs = (sampling_fn([inp]))
     
      # Combine the per-letter probabilities into a total probability for the word.
      prob_word = reduce(mul, output_probs, 1)[0]
      
      outputs = outputs.flatten()
      sample_length = get_true_length(outputs, trg_vocab)
       
      input_word = _idx_to_word(input_, src_ivocab)
      predicted_word = _idx_to_word(outputs[:sample_length], trg_ivocab)
 
      help_word_input = u''
      help_word_result = u''
      help_tag = u''

      resulting_predictions[u' '.join(input_word.split(' ')[1:len(input_word.split(' '))-1])] = u''.join(predicted_word.split(' ')[1:len(predicted_word.split(' '))-1])
      pred_with_prob[(u' '.join(input_word.split(' ')[1:len(input_word.split(' '))-1]), u''.join(predicted_word.split(' ')[1:len(predicted_word.split(' '))-1]))] = prob_word

    results_path = 'results/__'
    if 'paper' in config['saveto']:
      results_path += 'paper_version/'
    else:
      results_path += '__'.join(config['saveto'].split('task')[0].split('/')) + '/track' + str(the_track) + '/'

    if config['the_task'] == 3:
      if et_version:
        cPickle.dump(resulting_predictions, open('ETS_' + lang + '_task3/Ens_' + str(use_ensemble) + '_intermediate_results.pkl', 'wb'))
      else:
        cPickle.dump(resulting_predictions, open(results_path + 'task' + str(config['the_task']) + '/' + lang + '/Ens_' + str(use_ensemble) + '_intermediate_results.pkl', 'wb'))
    else:
      cPickle.dump(resulting_predictions, open(results_path + 'task' + str(config['the_task']) + '/' + lang + '/Ens_' + str(use_ensemble) + '_intermediate_results.pkl', 'wb'))
    print('prediction done')
    print('INFO: storing intermediate results to: ' + results_path + 'task' + str(config['the_task']) + '/' + lang + '/Ens_' + str(use_ensemble) + '_intermediate_results.pkl')

    print('Storing the dictionary with the most likely solutions to ' + config['saveto'] + '/test_sol.pkl')
    cPickle.dump(pred_with_prob, open(config['saveto'] + '/test_sol.pkl', 'wb'))
    print(pred_with_prob)
Code Example #3
File: __init__.py Project: oncebasun/seq2seq-theano
def predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble, lang, et_version, config, monitor_attention=False, the_track=None):
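    # Cross-lingual variant: each Spanish input is first mapped to a Portuguese
    # lemma, inflected in Portuguese, mapped back to Spanish, and the result is
    # compared against the gold output (accuracy and edit distance).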
    resulting_predictions = {}
    pred_with_prob = {} # for the bootstrapping
    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')
    bin_model = main_loop.model
    sources = [inp.name for inp in bin_model.inputs] # = 'input'
    theinputs = [inp for inp in bin_model.inputs]
    decoder_output = [v for v in bin_model.variables if v.name == 'decoder_generate_output_0'][0]
    
    sampling_fn = bin_model.get_theano_function() # the function to call for prediction

    # Load vocabularies and invert if necessary
    # WARNING: Source and target indices from data stream
    #  can be different
    print('*')
    print(config['src_vocab'])
    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        cPickle.load(open(config['src_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    trg_vocab = _ensure_special_tokens(
        cPickle.load(open(config['trg_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    src_ivocab = {v: k for k, v in src_vocab.items()}
    trg_ivocab = {v: k for k, v in trg_vocab.items()}
    
    epoch_iter = data_to_predict_stream.get_epoch_iterator()

    counter = 0
    right_results = 0
    total = 0
    edit_distance = 0

    for anIter in epoch_iter:
      
      input_=anIter[0]
      gold_output = anIter[1]
      
      if counter%500 == 0:
        print(counter)
      counter += 1
      
      inp = input_
      orig_input_ = input_

      the_one = False

      # Convert the Spanish lemma to Portuguese. Take <UNK> tags out.
      inp_word = _idx_to_word(input_, src_ivocab) 
      tag_for_infl = [u'LANG_IN=portuguese', u'LANG_OUT=portuguese']
      tag_for_back = [u'LANG_IN=portuguese', u'LANG_OUT=spanish']
      new_inp_word = []
      for sth in inp_word.split(u' '):
        if u'LANG_OUT=spanish'==sth:
          new_inp_word.append(u'LANG_OUT=portuguese')
        else:
          if sth != u'<UNK>':
            if u'LANG_IN' not in sth and len(sth) > 1 and u'</S>' not in sth and u'<S>' not in sth:
              tag_for_infl.append(sth)
              #tag_for_back.append(sth)
            if u'=' not in sth or u'LANG_IN' in sth:
              new_inp_word.append(sth)
      inp_word = u' '.join(new_inp_word)
      print('Input 1: ' + inp_word)
      #print(src_vocab)
      #exit()
      inp = _word_to_idx(inp_word.split(u' '), src_vocab)
      input_ = inp    

      _1, outputs, _2, glimpses, costs, output_probs = (sampling_fn([inp])) # converting Spanish to Portuguese     

      # Combine the per-letter probabilities into a total probability for the word.
      prob_word = reduce(mul, output_probs, 1)[0]
      
      outputs = outputs.flatten()
      sample_length = get_true_length(outputs, trg_vocab)
       
      # Convert the output to a Portuguese inflection sample.
      inp_word = _idx_to_word(outputs[:sample_length], trg_ivocab) 
      inp_word = inp_word.split(u' ')[0] + u' ' + u' '.join(tag_for_infl) + u' ' + u' '.join(inp_word.split(u' ')[1:])
      inp = _word_to_idx(inp_word.split(u' '), src_vocab)
      input_ = inp    

      print('Input 2: ' + inp_word)
      _1, outputs, _2, glimpses, costs, output_probs = (sampling_fn([inp]))  # inflecting the Portuguese word 

      # Combine the per-letter probabilities into a total probability for the word.
      prob_word = reduce(mul, output_probs, 1)[0]
      
      outputs = outputs.flatten()
      sample_length = get_true_length(outputs, trg_vocab)

      # Convert the output to Spanish.
      inp_word = _idx_to_word(outputs[:sample_length], trg_ivocab) 
      inp_word = inp_word.split(u' ')[0] + u' ' + u' '.join(tag_for_back) + u' ' + u' '.join(inp_word.split(u' ')[1:])
      inp = _word_to_idx(inp_word.split(u' '), src_vocab)
      input_ = inp    
      print('Input 3: ' + inp_word)
      _1, outputs, _2, glimpses, costs, output_probs = (sampling_fn([inp]))  # converting to Spanish

      # Combine the per-letter probabilities into a total probability for the word.
      prob_word = reduce(mul, output_probs, 1)[0]
      
      outputs = outputs.flatten()
      sample_length = get_true_length(outputs, trg_vocab)

      input_word = _idx_to_word(orig_input_, src_ivocab)
      #print(input_word)
      #exit()
      predicted_word = _idx_to_word(outputs[:sample_length], trg_ivocab)
      gold_word = _idx_to_word(gold_output, trg_ivocab)

      if predicted_word == gold_word:
        right_results += 1
      else:
        edit_distance +=  editdistance.eval(predicted_word, gold_word)
      if u'OUT=V' in input_word:
        total += 1
      print(input_word)
      print(predicted_word)
      print(gold_word)
      print('')
      #exit()
      #help_word_input = u''
      #help_word_result = u''
      #help_tag = u''

      #resulting_predictions[u' '.join(input_word.split(' ')[1:len(input_word.split(' '))-1])] = u''.join(predicted_word.split(' ')[1:len(predicted_word.split(' '))-1])
      #pred_with_prob[(u' '.join(input_word.split(' ')[1:len(input_word.split(' '))-1]), u''.join(predicted_word.split(' ')[1:len(predicted_word.split(' '))-1]))] = prob_word

    results_path = 'results/__'
    if False:
    #if 'paper' in config['saveto']:
      results_path += 'paper_version/'
    else:
      results_path += '__'.join(config['saveto'].split('task')[0].split('/')) + '/track' + str(the_track) + '/'

    res_ed = edit_distance*1.0/total
    res_accuracy = 1.0*right_results/total
    if config['the_task'] == 3:
      if et_version:
        cPickle.dump(res_accuracy, open('ETS_' + lang + '_task3/Ens_' + str(use_ensemble) + '_intermediate_results.pkl', 'wb'))
      else:
        cPickle.dump(res_accuracy, open(results_path + 'task' + str(config['the_task']) + '/' + lang + '/Ens_' + str(use_ensemble) + '_intermediate_results.pkl', 'wb'))
    else:
      cPickle.dump(res_accuracy, open(results_path + 'task' + str(config['the_task']) + '/' + lang + '/Ens_' + str(use_ensemble) + '_intermediate_results.pkl', 'wb'))
      cPickle.dump(res_accuracy, open(results_path + 'task' + str(config['the_task']) + '/' + lang + '/Ens_' + str(use_ensemble) + '_intermediate_results.pkl', 'ab'))
    print('prediction done')
    print('INFO: storing results to: ' + results_path + 'task' + str(config['the_task']) + '/' + lang + '/Ens_' + str(use_ensemble) + '_intermediate_results.pkl')

    #print('Storing the dictionary with the most likely solutions to ' + config['saveto'] + 'test_sol.pkl')
    #cPickle.dump(pred_with_prob, open(config['saveto'] + '/test_sol.pkl', 'wb'))
    print(str(res_accuracy))
    print(str(res_ed))
Code Example #4
def main(mode, config, use_bokeh=False):
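    # mode is expected to be 'train', 'translate' or 'rerank'; the encoder,
    # decoder and topical transformer built below are shared by all branches.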

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['topical_embedding_dim'])
    topical_transformer = topicalq_transformer(config['topical_vocab_size'],
                                               config['topical_embedding_dim'],
                                               config['enc_nhids'],
                                               config['topical_word_num'],
                                               config['batch_size'])

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topicalq(**config)
        dev_stream = get_dev_stream_with_topicalq(**config)
        topic_embedding = topical_transformer.apply(source_topical_word)
        # Get cost of the model
        representation = encoder.apply(source_sentence, source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[0, :,
                                           (representation.shape[2] / 2):]

        cost = decoder.cost(representation, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            topic_embedding, content_embedding)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()
        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        topical_transformer.push_allocation_config()
        # NOTE: it is unclear whether this explicit initialization is required here
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        topical_transformer.look_up.W.tag.role = []

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]
        '''
        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))
        '''

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En',
                     channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    on_unused_sources='warn',
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')

        # Get test set stream
        test_stream = get_dev_stream_with_topicalq(
            config['test_set'], config['src_vocab'], config['src_vocab_size'],
            config['topical_test_set'], config['topical_vocab'],
            config['topical_vocab_size'], config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        topic_embedding = topical_transformer.apply(source_topical_word)
        representation = encoder.apply(source_sentence,
                                       tensor.ones(source_sentence.shape))
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[0, :,
                                           (representation.shape[2] / 2):]
        generated = decoder.generate(source_sentence,
                                     representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)

        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(pickle.load(
            open(config['trg_vocab'], 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_eos_idx,
                                           unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):

            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            seq2 = line[1]
            input_ = numpy.tile(seq, (config['beam_size'], 1))
            input_topical = numpy.tile(seq2, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                beam_search.search(
                    input_values={source_sentence: input_,source_topical_word:input_topical},
                    max_length=10*len(seq), eol_symbol=src_eos_idx,
                    ignore_first_eol=True)
            '''
            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths
            '''
            #best = numpy.argsort(costs)[0]
            best = numpy.argsort(costs)[0:config['beam_size']]
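            # Write out every beam hypothesis (sorted by cost), not only the best one.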
            for b in best:
                try:
                    total_cost += costs[b]
                    trans_out = trans[b]

                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(i +
                                                                         1))
                    trans_out = '<UNK>'

                print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info("Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()
    elif mode == 'rerank':
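        # Score each (source, target) pair of the validation set with the trained
        # model and write the summed per-sentence cost to a .scores.out file.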
        # Create Theano variables
        ftrans = open(config['val_set'] + '.scores.out', 'w')
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        config['src_data'] = config['val_set']
        config['trg_data'] = config['val_set_grndtruth']
        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        test_stream = get_tr_stream_unsorted(**config)
        logger.info("Building sampling model")
        representations = encoder.apply(source_sentence, source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask)
        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        costs_computer = function([
            source_sentence, source_sentence_mask, target_sentence,
            target_sentence_mask
        ], costs)
        iterator = test_stream.get_epoch_iterator()

        scores = []
        for i, (src, src_mask, trg, trg_mask) in enumerate(iterator):
            costs = costs_computer(*[src, src_mask, trg, trg_mask])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)
            ftrans.write(str(cost) + "\n")
        ftrans.close()
Code Example #5
File: train.py Project: zlpmichelle/HRAN
def main(mode, config, use_bokeh=False):
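    # mode is expected to be 'train', 'ppl' or 'translate'; the same encoder and
    # decoder, which also attend over ctx_num context sentences, are reused in
    # every branch.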

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'],name='word_encoder')
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # Here the attended dim (representation_dim) of the decoder is 2*enc_nhids
    # because the context given by the encoder is a bidirectional context.

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences=[];
        context_sentence_masks=[];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_'+str(i)));
            context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        dev_source = tensor.lmatrix('dev_source')
        dev_target=tensor.lmatrix('dev_target')

        # Get training and development set streams
        tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_with_grdTruth(**config)

        # Get cost of the model
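        # Encode the source and every context sentence, then stack the
        # per-sentence representations (and masks) along a new leading axis.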
        sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]);
        sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]);
        for i in range(config['ctx_num']):
            tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]);
            tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]);
            sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0);
            sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0);


        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list,
                            target_sentence,
                            target_sentence_mask)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'
        costs_computer = function(context_sentences+context_sentence_masks+[target_sentence,
                                   target_sentence_mask,
                                   source_sentence,
                                   source_sentence_mask], (perplexity))
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init =decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init =decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(
                cg, enc_params+dec_params, config['weight_noise_ff'])


        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))


        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'],
                          config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        model_name=config['model_name'],
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if False:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq'],
                              n_best=3,
                              track_n_models=6))

        logger.info("Building perplexity validator")
        extensions.append(
                pplValidation(dev_source,dev_target, config=config,
                        model=costs_computer, data_stream=dev_stream,
                        model_name=config['model_name'],
                        every_n_batches=config['sampling_freq']))


        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([Scale(initial_learning_rate),
                                     StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]))

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'ppl':
        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences=[];
        context_sentence_masks=[];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_'+str(i)));
            context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        # Get training and development set streams
        #tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_withContext_grdTruth(**config)

        # Get cost of the model
        sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]);
        sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]);
        for i in range(config['ctx_num']):
            tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]);
            tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]);
            sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0);
            sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0);


        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list,
                            target_sentence,
                            target_sentence_mask)

        logger.info('Creating computational graph')
        costs_computer = function(context_sentences+context_sentence_masks+[target_sentence,
                                   target_sentence_mask,
                                   source_sentence,
                                   source_sentence_mask], (cost))


        logger.info("Loading the model..")
        model = Model(cost)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load']);
        loader.set_model_parameters(model, loader.load_parameters_default())
        logger.info("Started Validation: ")

        ts = dev_stream.get_epoch_iterator()
        total_cost = 0.0
        total_tokens=0.0
        #pbar = ProgressBar(max_value=len(ts)).start()#modified
        pbar = ProgressBar(max_value=10000).start();
        for i, (ctx_0,ctx_0_mask,ctx_1,ctx_1_mask,ctx_2,ctx_2_mask,src, src_mask, trg, trg_mask) in enumerate(ts):
            costs  = costs_computer(*[ctx_0,ctx_1,ctx_2,ctx_0_mask,ctx_1_mask,ctx_2_mask,trg, trg_mask,src, src_mask])
            cost = costs.sum()
            total_cost+=cost
            total_tokens+=trg_mask.sum()
            pbar.update(i + 1)
        total_cost /= total_tokens
        pbar.finish()
        #dev_stream.reset()

        # run afterprocess
        # self.ap.main()
        # Report 2 ** (average per-token cost).
        total_cost = 2 ** total_cost
        print("Average validation cost: " + str(total_cost))
    elif mode == 'translate':

        logger.info('Creating theano variables')
        context_sentences=[];
        context_sentence_masks=[];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_'+str(i)));
            context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(
            cPickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0,
            eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}
        config['batch_size'] = 1

        sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]);
        sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]);
        for i in range(config['ctx_num']):
            tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]);
            tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]);
            sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0);
            sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0);
        generated = decoder.generate(sentence_representations_list,sentence_masks_list)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load']);
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_withContext(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw[3].split()
            seqs=[];
            input_=[];
            input_mask=[];
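            # Tile the source and each context sentence (plus masks) beam_size
            # times so that a single beam-search call scores the whole beam.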
            for j in range(config['ctx_num']+1):
                seqs.append(sutils._oov_to_unk(
                    line[2*j][0], config['src_vocab_size'], unk_idx))
                input_mask.append(numpy.tile(line[2*j+1][0],(config['beam_size'], 1)))
                input_.append(numpy.tile(seqs[j], (config['beam_size'], 1)))
            #v=costs_computer(input_[0]);
            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={source_sentence: input_[3],source_sentence_mask:input_mask[3],
                                  context_sentences[0]: input_[0],context_sentence_masks[0]:input_mask[0],
                                  context_sentences[1]: input_[1],context_sentence_masks[1]:input_mask[1],
                                  context_sentences[2]: input_[2],context_sentence_masks[2]:input_mask[2]},
                    max_length=3*len(seqs[2]), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            b = numpy.argsort(costs)[0]
            #best=numpy.argsort(costs)[0:config['beam_size']];
            #for b in best:
            try:
                total_cost += costs[b]
                trans_out = trans[b]
                totalLen=4*len(line[0][0]);
                #weight = weights[b][:, :totalLen]
                weight=weights
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i+1))
                trans_out = '<UNK>'
            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)

        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights, open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        ap = afterprocesser(config)
        ap.main()
Code Example #6
File: sampling.py Project: oncebasun/seq2seq-theano
    def _evaluate_model(self, no_epochs_done):
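        # Same validation loop as the shorter variant above, but every
        # input / target / prediction triple is also written to a '_det' file.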
        logger.info("Started Validation.")
        val_start_time = time.time()
        error_count = 0
        total_count = 0

        # Get target vocabulary
        if not self.trg_ivocab:
            # Load dictionaries and ensure special tokens exist
            self.src_vocab = _ensure_special_tokens(cPickle.load(
                open(self.config['src_vocab'])),
                                                    bos_idx=0,
                                                    eos_idx=2,
                                                    unk_idx=1)
            self.trg_vocab = _ensure_special_tokens(cPickle.load(
                open(self.config['trg_vocab'])),
                                                    bos_idx=0,
                                                    eos_idx=2,
                                                    unk_idx=1)
            self.src_ivocab = {v: k for k, v in self.src_vocab.items()}
            self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}
            self.unk_idx = self.src_vocab[self.unk_sym]
            self.eos_idx = self.src_vocab[self.eos_sym]
        '''
        print('length data_stream:')
        print(len(self.data_stream))
        sys.exit(0)
        '''
        ana_file = io.open(
            self.validation_results_file + '_det', 'w',
            encoding='utf-8')  # opened with 'w', so it only ever holds the most recent validation's details
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            """
            Load the sentence, retrieve the sample, write to file
            """
            #print(line)
            #sys.exit(0)
            seq = line[0]  # self._oov_to_unk(line[0], self.config['src_vocab_size'], self.unk_idx)
            input_ = numpy.tile(seq, (1, 1))

            seq2 = line[2]
            target_ = numpy.tile(seq2, (1, 1))

            #print('target')
            #print(target_)

            #print('input')
            #print(input_)

            batch_size = input_.shape[0]
            #print('batch_size:')
            #print(batch_size)
            #sys.exit(0)

            #src_batch = anIter[main_loop.data_stream.mask_sources[0]]
            #input_ = src_batch[:, :]

            for j in range(batch_size):

                input_length = get_true_length(input_[j], self.src_vocab)
                target_length = get_true_length(target_[j], self.trg_vocab)

                inp = input_[j, :input_length]
                _1, outputs, _2, _3, costs, _4 = (self.sampling_fn(
                    inp[None, :]))
                outputs = outputs.flatten()
                sample_length = get_true_length(outputs, self.trg_vocab)

                # NOTE: There are two types of lost words. One is because the label has not been seen in the training data (rare).
                # The other reason is unknown. However, the word is lost at this point already.
                input_word = _idx_to_word(input_[j][:input_length],
                                          self.src_ivocab)
                #print(input_word)
                target_word = _idx_to_word(target_[j][:target_length],
                                           self.trg_ivocab)
                #print(target_word)
                predicted_word = _idx_to_word(outputs[:sample_length],
                                              self.trg_ivocab)
                #print(predicted_word)
                #sys.exit(0)
                # TODO: get target word
                ana_file.write(input_word + ' / ' + target_word + '\n')
                ana_file.write(predicted_word + '\n\n')
                if target_word != predicted_word:
                    error_count += 1
                total_count += 1

        new_accuracy = (total_count - error_count) * 1.0 / total_count

        self._save_model(new_accuracy, no_epochs_done)

        res_file = open(self.validation_results_file, 'a')
        res_file.write(
            str(no_epochs_done) + '\t' + str(new_accuracy) + '\t' +
            str(error_count) + '\t' + str(total_count) + '\t(Best: ' +
            str(self.best_epoch) + ')\n')
        res_file.close()

        ana_file.close()
        logger.info("Validation finished. Current accuracy on dev set: " +
                    str(new_accuracy))
Code Example #7
def main(mode, config, use_bokeh=False):
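    # Topic-aware variant of the training script: a topicalq_transformer feeds
    # topic embeddings and topical-word representations into the decoder cost.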

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    topical_transformer = topicalq_transformer(
        config['source_topic_vocab_size'], config['topical_embedding_dim'],
        config['enc_nhids'], config['topical_word_num'], config['batch_size'])
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      topicWord_size=config['trg_topic_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      topical_dim=config['topical_embedding_dim'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # Here the attended dim (representation_dim) of the decoder is 2*enc_nhids
    # because the context given by the encoder is a bidirectional context.

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        #target_topic_sentence_mask=tensor.lmatrix('target_topic_mask');
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topic_target(**config)
        #dev_stream = get_dev_tr_stream_with_topic_target(**config)

        # Get cost of the model
        representations = encoder.apply(source_sentence, source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representations[0, :,
                                            (representations.shape[2] / 2):]
        cost = decoder.cost(representations, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            target_topic_sentence,
                            target_topic_binary_sentence, topic_embedding,
                            content_embedding)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'

        cg = ComputationGraph(cost)
        costs_computer = function([
            target_sentence, target_sentence_mask, source_sentence,
            source_sentence_mask, source_topical_word, target_topic_sentence,
            target_topic_binary_sentence
        ], (perplexity),
                                  on_unused_input='ignore')

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        topical_transformer.push_allocation_config()
        # NOTE: it is unclear whether this explicit initialization is required here
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        topical_transformer.look_up.W.tag.role = []

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'],
                          config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # # Set up beam search and sampling computation graphs if necessary
        # if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        #     logger.info("Building sampling model")
        #     sampling_representation = encoder.apply(
        #         sampling_input, tensor.ones(sampling_input.shape))
        #     generated = decoder.generate(
        #         sampling_input, sampling_representation)
        #     search_model = Model(generated)
        #     _, samples = VariableFilter(
        #         bricks=[decoder.sequence_generator], name="outputs")(
        #             ComputationGraph(generated[1]))
        #
        # # Add sampling
        # if config['hook_samples'] >= 1:
        #     logger.info("Building sampler")
        #     extensions.append(
        #         Sampler(model=search_model, data_stream=tr_stream,
        #                 model_name=config['model_name'],
        #                 hook_samples=config['hook_samples'],
        #                 every_n_batches=config['sampling_freq'],
        #                 src_vocab_size=config['src_vocab_size']))
        #
        # # Add early stopping based on bleu
        # if False:
        #     logger.info("Building bleu validator")
        #     extensions.append(
        #         BleuValidator(sampling_input, samples=samples, config=config,
        #                       model=search_model, data_stream=dev_stream,
        #                       normalize=config['normalized_bleu'],
        #                       every_n_batches=config['bleu_val_freq'],
        #                       n_best=3,
        #                       track_n_models=6))
        #
        # logger.info("Building perplexity validator")
        # extensions.append(
        #         pplValidation( config=config,
        #                 model=costs_computer, data_stream=dev_stream,
        #                 model_name=config['model_name'],
        #                 every_n_batches=config['sampling_freq']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En',
                     channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        Scale(initial_learning_rate),
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]),
                                    on_unused_sources='ignore')

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')
        tw_vocab_overlap = tensor.lmatrix('tw_vocab_overlap')
        tw_vocab_overlap_matrix = cPickle.load(
            open(config['tw_vocab_overlap'], 'rb'))
        tw_vocab_overlap_matrix = numpy.array(tw_vocab_overlap_matrix,
                                              dtype='int32')
        #tw_vocab_overlap=shared(tw_vocab_overlap_matrix);

        topic_embedding = topical_transformer.apply(source_topical_word)

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(cPickle.load(
            open(config['trg_vocab'], 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_eos_idx,
                                           unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        topic_embedding = topical_transformer.apply(source_topical_word)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = sampling_representation[0, :, (
            sampling_representation.shape[2] // 2):]
        generated = decoder.generate(sampling_input,
                                     sampling_representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)

        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_with_topicalq(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw.split()
            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            seq1 = line[1]
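            # Tile the source sentence and its topic words beam_size times so
            # that every beam hypothesis is decoded against the same input.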
            input_topical = numpy.tile(seq1, (config['beam_size'], 1))
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={sampling_input: input_,
                                  source_topical_word: input_topical,
                                  tw_vocab_overlap: tw_vocab_overlap_matrix},
                    tw_vocab_overlap=tw_vocab_overlap_matrix,
                    max_length=3 * len(seq), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]
                weight = weights[best][:, :len(trans_in)]
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'
                # Fall back to an empty attention matrix so saved_weights
                # stays aligned with the output lines.
                weight = numpy.zeros((0, len(trans_in)), dtype='float32')

            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)

        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights, open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        # ap = afterprocesser(config)
        # ap.main()

    elif mode == 'score':
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        source_topical_word = tensor.lmatrix('source_topical')

        topic_embedding = topical_transformer.apply(source_topical_word)
        # Get cost of the model
        representations = encoder.apply(source_sentence, source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask,
                             target_topic_sentence,
                             target_topic_binary_sentence, topic_embedding)

        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        # Get test set stream
        test_stream = get_tr_stream_with_topic_target(**config)

        logger.info("Building sampling model")

        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        costs_computer = function(
            [target_sentence, target_sentence_mask, source_sentence,
             source_sentence_mask, source_topical_word, target_topic_sentence,
             target_topic_binary_sentence],
            costs,
            on_unused_input='ignore')

        iterator = test_stream.get_epoch_iterator()

        scores = []
        att_weights = []
        for i, (src, src_mask, trg, trg_mask, te, te_mask, tt, tt_mask, tb,
                tb_mask) in enumerate(iterator):
            costs = costs_computer(*[trg, trg_mask, src, src_mask, te, tt, tb])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)

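        # NOTE: 10007 appears to be the hard-coded number of sentences in the
        # scored data set.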
        print(sum(scores) / 10007)
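
(For orientation: a minimal sketch of how an entry point like the main(mode, config, use_bokeh=False) function shown in the next example might be driven from the command line. The config loader get_config_cs2en is a hypothetical name used only for illustration; any function returning the configuration dictionary with the keys used above would do.)

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Topic-aware NMT driver (sketch).')
    parser.add_argument('--mode', choices=['train', 'translate', 'score'],
                        default='train')
    args = parser.parse_args()

    config = get_config_cs2en()  # hypothetical config loader
    main(args.mode, config, use_bokeh=False)
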
Code example #8
0
File: train.py  Project: LynetteXing1991/TAJA-Seq2Seq
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['topical_embedding_dim'])
    topical_transformer = topicalq_transformer(
        config['topical_vocab_size'], config['topical_embedding_dim'],
        config['enc_nhids'], config['topical_word_num'], config['batch_size'])

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topicalq(**config)
        dev_stream = get_dev_stream_with_topicalq(**config)
        topic_embedding = topical_transformer.apply(source_topical_word)
        # Get cost of the model
        representation = encoder.apply(source_sentence, source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[
            0, :, (representation.shape[2] // 2):]

        cost = decoder.cost(
            representation, source_sentence_mask, tw_representation,
            source_topical_mask, target_sentence, target_sentence_mask,
            topic_embedding, content_embedding)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()
        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        # Unclear whether this explicit allocation step is required here.
        topical_transformer.push_allocation_config()
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        topical_transformer.look_up.W.tag.role = []


        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(
                cg, enc_params+dec_params, config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]
        '''
        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))
        '''

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters, on_unused_sources='warn',
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()])
        )

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')

        # Get test set stream
        test_stream = get_dev_stream_with_topicalq(
            config['test_set'], config['src_vocab'],
            config['src_vocab_size'], config['topical_test_set'],
            config['topical_vocab'], config['topical_vocab_size'],
            config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        topic_embedding = topical_transformer.apply(source_topical_word)
        representation = encoder.apply(source_sentence,
                                       tensor.ones(source_sentence.shape))
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[
            0, :, (representation.shape[2] // 2):]
        generated = decoder.generate(source_sentence, representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)


        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(
            pickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0,
            eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):

            seq = sutils._oov_to_unk(
                line[0], config['src_vocab_size'], unk_idx)
            seq2 = line[1]
            input_ = numpy.tile(seq, (config['beam_size'], 1))
            input_topical = numpy.tile(seq2, (config['beam_size'], 1))


            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                beam_search.search(
                    input_values={source_sentence: input_,
                                  source_topical_word: input_topical},
                    # stop decoding on the target-side end-of-sequence symbol
                    max_length=10 * len(seq), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)
            '''
            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths
            '''
            #best = numpy.argsort(costs)[0]
            best = numpy.argsort(costs)[0:config['beam_size']]
            for b in best:
                try:
                    total_cost += costs[b]
                    trans_out = trans[b]

                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(i+1))
                    trans_out = '<UNK>'

                print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()
    elif mode == 'rerank':
        # Create Theano variables
        ftrans = open(config['val_set'] + '.scores.out', 'w')
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        config['src_data'] = config['val_set']
        config['trg_data'] = config['val_set_grndtruth']
        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        test_stream = get_tr_stream_unsorted(**config)
        logger.info("Building sampling model")
        representations = encoder.apply(
            source_sentence, source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask)
        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        costs_computer = function([source_sentence, source_sentence_mask,
                                   target_sentence, target_sentence_mask],
                                  costs)
        iterator = test_stream.get_epoch_iterator()

        scores = []
        for i, (src, src_mask, trg, trg_mask) in enumerate(iterator):
            costs = costs_computer(*[src, src_mask, trg, trg_mask])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)
            ftrans.write(str(cost) + "\n")
        ftrans.close()
Code example #9
0
def predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble, lang, the_task, et_version, config, monitor_attention=False):
    resulting_predictions = {}
    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')
    bin_model = main_loop.model
    sources = [inp.name for inp in bin_model.inputs]
    theinputs = [inp for inp in bin_model.inputs]
    decoder_output = [v for v in bin_model.variables if v.name == 'decoder_generate_output_0'][0]
    
    # The function to call for prediction.
    sampling_fn = bin_model.get_theano_function()

    # Load vocabularies and invert if necessary
    # WARNING: Source and target indices from data stream can be different
    # Load dictionaries and ensure special tokens exist.
    src_vocab = _ensure_special_tokens(
        cPickle.load(open(config['src_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    trg_vocab = _ensure_special_tokens(
        cPickle.load(open(config['trg_vocab'])),
        bos_idx=0, eos_idx=2, unk_idx=1)
    src_ivocab = {v: k for k, v in src_vocab.items()}
    trg_ivocab = {v: k for k, v in trg_vocab.items()}
    
    epoch_iter = data_to_predict_stream.get_epoch_iterator()

    counter = 0
 
    for anIter in epoch_iter:
      
      input_ = anIter[0]

      if counter % 500 == 0:
        logger.info(str(counter) + ' samples predicted')
      counter += 1

      inp = input_

      the_one = False
   
      _1, outputs, _2, glimpses, costs, output_probs = (sampling_fn([inp]))
     
      # Combining the probabilities for the single letters to get a total probability.
      prob_word = reduce(mul, output_probs, 1)[0]
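      # NOTE: prob_word is computed but not used further in this function.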

      outputs = outputs.flatten()
      sample_length = get_true_length(outputs, trg_vocab)
       
      input_word = _idx_to_word(input_, src_ivocab)
      predicted_word = _idx_to_word(outputs[:sample_length], trg_ivocab)
 
      help_word_input = u''
      help_word_result = u''
      help_tag = u''

      # Strip the first and last tokens (presumably the BOS/EOS markers)
      # before storing the prediction.
      input_key = u' '.join(input_word.split(' ')[1:-1])
      predicted_value = u''.join(predicted_word.split(' ')[1:-1])
      resulting_predictions[input_key] = predicted_value

    results_path = 'results/' + config['saveto'].split('task')[0] + 'task' + str(the_task) + '/' + lang + '/'
              
    results_file = results_path + 'Ens_' + str(use_ensemble) + '_intermediate_results.pkl'
    cPickle.dump(resulting_predictions, open(results_file, 'wb'))
    logger.info('Prediction done. Storing intermediate results to: ' + results_file)
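
(get_true_length and _idx_to_word are project helpers defined outside this excerpt. A rough sketch of the assumed behaviour of get_true_length, i.e. measuring a sequence up to and including its end-of-sequence token, here assumed to correspond to the '</S>' entry of the vocabulary:)

def get_true_length(seq, vocab):
    # Sketch only: assumes `vocab` maps the '</S>' token to the EOS index
    # that terminates the generated index sequences.
    try:
        return list(seq).index(vocab['</S>']) + 1
    except ValueError:
        return len(seq)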