Code example #2
File: stream.py Project: ucam-smt/sgnmt
 def get_data(self, state=None, request=None):
     """Get next data entry from ``active_source``, ignores args."""
     if request is not None:
         raise ValueError
     (s, t) = self.parallel_sources[self.active_idx].next()
     return (self.src_sparse_feat_map.words2dense(
         utils.oov_to_unk(s, self.src_vocab_size)),
             self.trg_sparse_feat_map.words2dense(
                 utils.oov_to_unk(t, self.trg_vocab_size)))
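Every example on this page routes word IDs through utils.oov_to_unk before
the model sees them. As a rough mental model, here is a minimal sketch of
what such a helper does (not sgnmt's actual implementation; the default UNK
ID of 0 is an assumption):

def oov_to_unk(seq, vocab_size, unk_id=0):
    # Replace any word ID outside [0, vocab_size) with the UNK ID
    return [w if 0 <= w < vocab_size else unk_id for w in seq]

# e.g. oov_to_unk([5, 31999, 54321], 32000) -> [5, 31999, 0]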
Code example #3
File: tf_t2t.py Project: strategist922/sgnmt
 def initialize(self, src_sentence):
     self.consumed = []
     self.src_sentence = utils.oov_to_unk(
         src_sentence + [text_encoder.EOS_ID], 
         self.src_vocab_size, self._t2t_unk_id)
     self.src_seg, self.src_pos = self._gen_seg_and_pos(self.src_sentence)
     self.history_sentences = [[]]
Code example #6
 def initialize(self, src_sentence):
     """Runs the encoder network to create the source annotations
     for the source sentence. If the cache is enabled, empty the
     cache.
     
     Args:
         src_sentence (list): List of word ids without <S> and </S>
                              which represent the source sentence.
     """
     self.contexts = None
     self.states = None 
     self.posterior_cache = SimpleTrie()
     self.states_cache = SimpleTrie()
     self.consumed = []
     seq = self.src_sparse_feat_map.words2dense(
                 utils.oov_to_unk(src_sentence,
                                  self.src_vocab_size)) + [self.src_eos]
     if self.src_sparse_feat_map.dim > 1:  # sparse src feats
         input_ = np.transpose(np.tile(seq, (1, 1, 1)), (2, 0, 1))
     else:  # word ids on the source side
         input_ = np.tile(seq, (1, 1))

     input_values = {self.nmt_model.sampling_input: input_}
     self.contexts, self.states, _ = \
         self.search_algorithm.compute_initial_states_and_contexts(
             input_values)
     self.attention_records = (1 + len(src_sentence)) * [0.0]
Code example #7
File: batch_decode.py Project: Jack44Wang/sgnmt
def load_sentences(path, _range, src_vocab_size):
    """Loads the source sentences to decode from the file system.
    
    Args:
        path (string): path to the plain text file with indexed
                       source sentences
        _range (string): Sentence range to decode, either a single
                         1-based index or "from:to" (both inclusive)
        src_vocab_size (int): Source language vocabulary size
    
    Returns:
        list. List of tuples, the first element is the sentence ID and
        the second element is a list of integers representing the
        sentence ending with EOS.
    """
    seqs = []
    seq_id = 1
    with open(path) as f:
        for line in f:
            seq = [int(w) for w in line.strip().split()]
            seqs.append((
                    seq_id,
                    utils.oov_to_unk(seq, src_vocab_size) + [utils.EOS_ID]))
            seq_id += 1
    if _range:
        try:
            if ":" in _range:
                from_idx, to_idx = _range.split(":")
            else:
                from_idx = int(_range)
                to_idx = from_idx
            return seqs[int(from_idx)-1:int(to_idx)]
        except Exception as e:
            logging.fatal("Invalid value for --range: %s" % e)
    return seqs
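A quick usage sketch for load_sentences. The file contents and vocabulary
size are made up, and the values utils.EOS_ID == 2 and UNK ID == 0 are
assumptions about sgnmt's defaults:

# src.txt contains two indexed sentences:
#   5 71 33
#   12 99999 4
seqs = load_sentences("src.txt", "2:2", src_vocab_size=30000)
# The 1-based inclusive range "2:2" keeps only the second sentence;
# 99999 is OOV for a 30k vocabulary and becomes UNK:
# seqs == [(2, [12, 0, 4, 2])]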
Code example #8
File: tf_t2t.py Project: ucam-smt/sgnmt
 def initialize(self, src_sentence):
     """Set src_sentence, compute fertilities for first src word."""
     self.fertility_history = []
     self.n_aligned_words = 0
     self.src_sentence = utils.oov_to_unk(
         src_sentence + [text_encoder.EOS_ID], 
         self.src_vocab_size)
     self._update_scores()
Code example #9
File: tf_t2t.py Project: ucam-smt/sgnmt
 def predict_next(self):
     """Call the T2T model in self.mon_sess."""
     log_probs = self.mon_sess.run(self._log_probs,
         {self._inputs_var: self.src_sentence,
          self._targets_var: utils.oov_to_unk(
              self.consumed + [text_encoder.PAD_ID],
              self.trg_vocab_size,
              self._t2t_unk_id)})
     log_probs[text_encoder.PAD_ID] = utils.NEG_INF
     return log_probs
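Taken together, initialize and predict_next suggest the usual sgnmt
predictor loop: score the next target position, pick a word, feed it back,
repeat. A minimal greedy-decoding sketch under that assumption
(greedy_decode and max_len are invented here; consume is part of sgnmt's
predictor interface, and utils is sgnmt's utils module):

import numpy as np

def greedy_decode(predictor, src_sentence, max_len=100):
    predictor.initialize(src_sentence)
    hypo = []
    for _ in range(max_len):
        log_probs = predictor.predict_next()
        word = int(np.argmax(log_probs))  # best next word
        if word == utils.EOS_ID:
            break
        predictor.consume(word)  # update predictor state
        hypo.append(word)
    return hypo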
Code example #11
File: tf_t2t.py Project: ucam-smt/sgnmt
 def initialize(self, src_sentence):
     """Set src_sentence, reset consumed."""
     if self.initial_trg_sentences is None:
         self.trg_sentence = [text_encoder.EOS_ID]
     else:
         self.trg_sentence = self.initial_trg_sentences[self.current_sen_id]
     self.src_sentence = utils.oov_to_unk(
         src_sentence + [text_encoder.EOS_ID], 
         self.src_vocab_size, self._t2t_unk_id)
     self.cache = SimpleTrie()
     self._update_cur_score()
     logging.debug("Initial score: %f" % self.cur_score)
Code example #13
 def initialize_marg(self):
     """Initialize source tensors, reset consumed."""
     src_tokens = torch.LongTensor(
         [utils.oov_to_unk([utils.EOS_ID], self.src_vocab_size)])
     src_lengths = torch.LongTensor([1])
     if self.use_cuda:
         src_tokens = src_tokens.cuda()
         src_lengths = src_lengths.cuda()
     self.marg_encoder_outs = self.marg_model.forward_encoder({
         'src_tokens': src_tokens,
         'src_lengths': src_lengths
     })
     # Reset incremental states
     for model in self.marg_models:
         self.marg_model.incremental_states[model] = {}
Code example #14
File: vanilla_decoder.py Project: ucam-smt/sgnmt
 def decode(self, src_sentence):
     """Decodes a single source sentence with the original blocks
     beam search decoder. Does not use predictors. Note that the
     score breakdowns in returned hypotheses are only on the 
     sentence level, not on the word level. For finer grained NMT
     scores you need to use the nmt predictor. ``src_sentence`` is a
     list of source word ids representing the source sentence without
     <S> or </S> symbols. As blocks expects to see </S>, this method
     adds it automatically.
     
     Args:
         src_sentence (list): List of source word ids without <S> or
                              </S> which make up the source sentence
     
     Returns:
         list. A list of ``Hypothesis`` instances ordered by their
         score.
     """
     seq = self.src_sparse_feat_map.words2dense(
         utils.oov_to_unk(src_sentence,
                          self.config['src_vocab_size'])) + [self.src_eos]
     if self.src_sparse_feat_map.dim > 1:  # sparse src feats
         input_ = np.transpose(
             np.tile(seq, (self.config['beam_size'], 1, 1)), (2, 0, 1))
     else:  # word ids on the source side
         input_ = np.tile(seq, (self.config['beam_size'], 1))
     trans, costs = self.beam_search.search(
         input_values={self.nmt_model.sampling_input: input_},
         max_length=3 * len(src_sentence),
         eol_symbol=utils.EOS_ID,
         ignore_first_eol=True)
     hypos = []
     max_len = 0
     for idx in xrange(len(trans)):
         max_len = max(max_len, len(trans[idx]))
         hypo = Hypothesis(trans[idx], -costs[idx])
         hypo.score_breakdown = len(trans[idx]) * [[(0.0, 1.0)]]
         hypo.score_breakdown[0] = [(-costs[idx], 1.0)]
         hypos.append(hypo)
     self.apply_predictors_count = max_len * self.config['beam_size']
     return hypos
Code example #16
 def initialize(self, src_sentence):
     """Initialize source tensors, reset consumed."""
     self.consumed = []
     src_tokens = torch.LongTensor([
         utils.oov_to_unk(src_sentence + [utils.EOS_ID],
                          self.src_vocab_size)
     ])
     src_lengths = torch.LongTensor([len(src_sentence) + 1])
     if self.use_cuda:
         src_tokens = src_tokens.cuda()
         src_lengths = src_lengths.cuda()
     self.encoder_outs = self.model.forward_encoder({
         'src_tokens': src_tokens,
         'src_lengths': src_lengths
     })
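     # `or` falls back to EOS when GO_ID is unset (None or 0)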
     self.consumed = [utils.GO_ID or utils.EOS_ID]
     # Reset incremental states
     for model in self.models:
         self.model.incremental_states[model] = {}
Code example #18
File: tf_t2t.py Project: ucam-smt/sgnmt
 def __init__(self,
              src_vocab_size,
              trg_vocab_size,
              model_name,
              problem_name,
              hparams_set_name,
              trg_test_file,
              beam_size,
              t2t_usr_dir,
              checkpoint_dir,
              t2t_unk_id=None,
              n_cpu_threads=-1,
              max_terminal_id=-1,
              pop_id=-1):
     """Creates a new edit T2T predictor. This constructor is
     similar to the constructor of T2TPredictor but creates a
     different computation graph which retrieves scores at each
     target position, not only the last one.
     
     Args:
         src_vocab_size (int): Source vocabulary size.
         trg_vocab_size (int): Target vocabulary size.
         model_name (string): T2T model name.
         problem_name (string): T2T problem name.
         hparams_set_name (string): T2T hparams set name.
         trg_test_file (string): Path to a plain text file with
             initial target sentences. Can be empty.
         beam_size (int): Determines how many substitutions and
             insertions are considered at each position.
         t2t_usr_dir (string): See --t2t_usr_dir in tensor2tensor.
         checkpoint_dir (string): Path to the T2T checkpoint 
                                  directory. The predictor will load
                                  the top most checkpoint in the 
                                  `checkpoints` file.
         t2t_unk_id (int): If set, use this ID to get UNK scores. If
                           None, UNK is always scored with -inf.
         n_cpu_threads (int): Number of TensorFlow CPU threads.
         max_terminal_id (int): If positive, maximum terminal ID. Needs to
             be set for syntax-based T2T models.
         pop_id (int): If positive, ID of the POP or closing bracket symbol.
             Needs to be set for syntax-based T2T models.
     """
     super(EditT2TPredictor, self).__init__(t2t_usr_dir, 
                                            checkpoint_dir, 
                                            src_vocab_size,
                                            trg_vocab_size,
                                            t2t_unk_id, 
                                            n_cpu_threads,
                                            max_terminal_id,
                                            pop_id)
     if not model_name or not problem_name or not hparams_set_name:
         logging.fatal(
             "Please specify t2t_model, t2t_problem, and t2t_hparams_set!")
         raise AttributeError
     if trg_vocab_size >= EditT2TPredictor.POS_FACTOR:
         logging.fatal("Target vocabulary size (%d) must be less than %d!"
                       % (trg_vocab_size, EditT2TPredictor.POS_FACTOR))
         raise AttributeError
     self.beam_size = max(1, beam_size // 10) + 1
     self.batch_size = 2048 # TODO(fstahlberg): Move to config
     self.initial_trg_sentences = None
     if trg_test_file: 
         self.initial_trg_sentences = []
         with open(trg_test_file) as f:
             for line in f:
                 self.initial_trg_sentences.append(utils.oov_to_unk(
                    [int(w) for w in line.strip().split()] + [utils.EOS_ID],
                    self.trg_vocab_size, self._t2t_unk_id))
     predictor_graph = tf.Graph()
     with predictor_graph.as_default() as g:
         hparams = trainer_lib.create_hparams(hparams_set_name)
         self._add_problem_hparams(hparams, problem_name)
         translate_model = registry.model(model_name)(
             hparams, tf.estimator.ModeKeys.EVAL)
         self._inputs_var = tf.placeholder(dtype=tf.int32, shape=[None],
                                           name="sgnmt_inputs")
         self._targets_var = tf.placeholder(dtype=tf.int32, shape=[None, None], 
                                            name="sgnmt_targets")
         shp = tf.shape(self._targets_var)
         bsz = shp[0]
         inputs = tf.tile(tf.expand_dims(self._inputs_var, 0), [bsz, 1])
         features = {"inputs": expand_input_dims_for_t2t(inputs,
                                                         batched=True), 
                     "targets": expand_input_dims_for_t2t(self._targets_var,
                                                          batched=True)}
         translate_model.prepare_features_for_infer(features)
         translate_model._fill_problem_hparams_features(features)
         logits, _ = translate_model(features)
         logits = tf.squeeze(logits, [2, 3])
         self._log_probs = log_prob_from_logits(logits)
         diag_logits = gather_2d(logits, tf.expand_dims(tf.range(bsz), 1))
         self._diag_log_probs = log_prob_from_logits(diag_logits)
         no_pad = tf.cast(tf.not_equal(
             self._targets_var, text_encoder.PAD_ID), tf.float32)
         flat_bsz = shp[0] * shp[1]
         word_scores = gather_2d(
             tf.reshape(self._log_probs, [flat_bsz, -1]),
             tf.reshape(self._targets_var, [flat_bsz, 1]))
         word_scores = tf.reshape(word_scores, (shp[0], shp[1])) * no_pad
         self._sentence_scores = tf.reduce_sum(word_scores, -1)
         self.mon_sess = self.create_session()
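For orientation, constructing the edit predictor might look as follows.
This is a hypothetical instantiation: all paths and sizes are placeholder
values, the model, problem, and hparams names are standard tensor2tensor
registry entries, and the right t2t_unk_id depends on the problem's text
encoder:

predictor = EditT2TPredictor(
    src_vocab_size=32000,
    trg_vocab_size=32000,
    model_name="transformer",
    problem_name="translate_ende_wmt32k",
    hparams_set_name="transformer_base",
    trg_test_file="",          # no initial target sentences
    beam_size=12,              # shrunk internally to max(1, 12 // 10) + 1 = 2
    t2t_usr_dir="",
    checkpoint_dir="/path/to/t2t/checkpoints",
    t2t_unk_id=3)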
Code example #20
    def _evaluate_model(self):
        """Evaluate model and store checkpoints. """
        logging.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0
        if self.verbose:
            ftrans = open(self.config['val_set_out'], 'w')
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            seq = self.src_sparse_feat_map.words2dense(
                utils.oov_to_unk(line[0], self.config['src_vocab_size']))
            if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                input_ = numpy.transpose(
                    numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                    (2, 0, 1))
            else:  # word ids on the source side
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))
            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                self.beam_search.search(
                    input_values={self.source_sentence: input_},
                    max_length=3*len(seq), eol_symbol=utils.EOS_ID,
                    ignore_first_eol=True)
            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    trans = trans[best]
                    if trans and trans[-1] == utils.EOS_ID:
                        trans = trans[:-1]
                    trans_out = ' '.join([str(w) for w in trans])
                except ValueError:
                    logging.info(
                        "Can NOT find a translation for line: {}".format(i +
                                                                         1))
                    trans_out = '<UNK>'
                if j == 0:
                    # Write to subprocess and file if it exists
                    print(trans_out, file=mb_subprocess.stdin)
                    if self.verbose:
                        print(trans_out, file=ftrans)
            if i != 0 and i % 100 == 0:
                logging.info(
                    "Translated {} lines of validation set...".format(i))

            mb_subprocess.stdin.flush()
        logging.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
        if self.verbose:
            ftrans.close()
        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logging.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logging.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None
        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logging.info(bleu_score)
        mb_subprocess.terminate()
        return bleu_score
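The BLEU extraction above relies on multi-bleu.perl printing a line that
starts with "BLEU = <score>". A standalone sketch of just that parsing step
(the sample output line is illustrative, not real model output):

import re

stdout = "BLEU = 27.53, 58.1/33.2/21.5/14.6 (BP=0.98, ratio=0.99)"
out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
assert out_parse is not None
bleu_score = float(out_parse.group()[6:])  # strip the "BLEU =" prefix
# bleu_score == 27.53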
Code example #22
File: sampling.py Project: ucam-smt/sgnmt
    def _evaluate_model(self):
        """Evaluate model and store checkpoints. """
        logging.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0
        ftrans = open(self.config['saveto'] + '/validation_out.txt', 'w')
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                line[0], self.config['src_vocab_size']))
            if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                input_ = numpy.transpose(
                    numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                    (2, 0, 1))
            else:  # word ids on the source side
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))
            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                self.beam_search.search(
                    input_values={self.source_sentence: input_},
                    max_length=3*len(seq), eol_symbol=utils.EOS_ID,
                    ignore_first_eol=True)
            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    trans = trans[best]
                    if trans and trans[-1] == utils.EOS_ID:
                        trans = trans[:-1]
                    trans_out = ' '.join([str(w) for w in trans])
                except ValueError:
                    logging.info(
                        "Can NOT find a translation for line: {}".format(i+1))
                    trans_out = '<UNK>'
                if j == 0:
                    # Write to subprocess and file if it exists
                    print(trans_out, file=mb_subprocess.stdin)
                    print(trans_out, file=ftrans)
            if i != 0 and i % 100 == 0:
                logging.info(
                    "Translated {} lines of validation set...".format(i))

            mb_subprocess.stdin.flush()
        logging.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
        ftrans.close()
        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logging.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logging.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None
        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logging.info(bleu_score)
        mb_subprocess.terminate()
        return bleu_score
Code example #23
File: tf_t2t.py Project: ucam-smt/sgnmt
 def initialize(self, src_sentence):
     """Set src_sentence, reset consumed."""
     self.consumed = []
     self.src_sentence = utils.oov_to_unk(
         src_sentence + [text_encoder.EOS_ID], 
         self.src_vocab_size, self._t2t_unk_id)
Code example #24
File: vanilla_decoder.py Project: ucam-smt/sgnmt
    def decode(self, src_sentence):
        """This is a generalization to NMT ensembles of 
        ``BeamSearch.search``.
        
        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence
        
        Returns:
            list. A list of ``Hypothesis`` instances ordered by their
            score.
        """
        for search in self.beam_searches:
            if not search.compiled:
                search.compile()
        seq = self.src_sparse_feat_map.words2dense(
            utils.oov_to_unk(src_sentence,
                             self.src_vocab_size)) + [self.src_eos]
        if self.src_sparse_feat_map.dim > 1:  # sparse src feats
            input_ = np.transpose(np.tile(seq, (self.beam_size, 1, 1)),
                                  (2, 0, 1))
        else:  # word ids on the source side
            input_ = np.tile(seq, (self.beam_size, 1))

        contexts_and_states = []
        for sys_idx in xrange(self.n_networks):
            contexts, states, _ = \
                self.beam_searches[sys_idx].compute_initial_states_and_contexts(
                            {self.nmt_models[sys_idx].sampling_input: input_})
            contexts_and_states.append((contexts, 
                                        states, 
                                        self.beam_searches[sys_idx]))

        # This array will store all generated outputs, including those from
        # previous step and those from already finished sequences.
        all_outputs = states['outputs'][None, :]
        all_masks = np.ones_like(all_outputs, dtype=config.floatX)
        all_costs = np.zeros_like(all_outputs, dtype=config.floatX)

        for i in range(3 * len(src_sentence)):
            if all_masks[-1].sum() == 0:
                break
            logprobs_lst = []
            for contexts, states, search in contexts_and_states:
                logprobs_lst.append(search.compute_logprobs(contexts, states))
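            # Sum per-model scores; compute_logprobs returns costs
            # (negative log-probabilities), so this is an unweighted
            # log-linear ensemble of the models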
            
            logprobs = np.sum(logprobs_lst, axis=0)
            next_costs = (all_costs[-1, :, None] +
                          logprobs * all_masks[-1, :, None])
            (finished,) = np.where(all_masks[-1] == 0)
            next_costs[finished, :utils.EOS_ID] = np.inf
            next_costs[finished, utils.EOS_ID + 1:] = np.inf

            # The `i == 0` is required because at the first step the beam
            # size is effectively only 1.
            (indexes, outputs), chosen_costs = BeamSearch._smallest(
                next_costs, self.beam_size, only_first_row=i == 0)

            all_outputs = all_outputs[:, indexes]
            all_masks = all_masks[:, indexes]
            all_costs = all_costs[:, indexes]
            
            # Rearrange everything
            for contexts, states, search in contexts_and_states:
                for name in states:
                    states[name] = states[name][indexes]
                states.update(search.compute_next_states(contexts, 
                                                         states, 
                                                         outputs))
            
            all_outputs = np.vstack([all_outputs, outputs[None, :]])
            all_costs = np.vstack([all_costs, chosen_costs[None, :]])
            mask = outputs != utils.EOS_ID
            if i == 0:
                mask[:] = 1
            all_masks = np.vstack([all_masks, mask[None, :]])

        all_outputs = all_outputs[1:]
        all_masks = all_masks[:-1]
        all_costs = all_costs[1:] - all_costs[:-1]
        result = all_outputs, all_masks, all_costs
        trans, costs = BeamSearch.result_to_lists(result)
        hypos = []
        max_len = 0
        for idx in xrange(len(trans)):
            max_len = max(max_len, len(trans[idx]))
            hypo = Hypothesis(trans[idx], -costs[idx])
            hypo.score_breakdown = len(trans[idx]) * [[(0.0, 1.0)]]
            hypo.score_breakdown[0] = [(-costs[idx], 1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.beam_size
        return hypos
Code example #25
 def _evaluate_model(self):
     """Evaluate model and store checkpoints. """
     logging.info("Started Validation: ")
     val_start_time = time.time()
     total_cost = 0.0
     if self.verbose:
         ftrans = codecs.open(self.config['val_set_out'], 'w', 'utf-8')
     for i, line in enumerate(self.data_stream.get_epoch_iterator()):
          seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
              line[0], self.config['src_vocab_size']))
          if self.src_sparse_feat_map.dim > 1:  # sparse src feats
              input_ = numpy.transpose(
                  numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                  (2, 0, 1))
          else:  # word ids on the source side
              input_ = numpy.tile(seq, (self.config['beam_size'], 1))
          # draw sample, checking to ensure we don't get an empty string back
          trans, costs = \
              self.beam_search.search(
                  input_values={self.source_sentence: input_},
                  max_length=3 * len(line[0]), eol_symbol=utils.EOS_ID,
                  ignore_first_eol=True)
          #            if i < 10:
          #                logging.info("ID: {}".format(i))
          #                logging.info("Source: {}".format(line[0]))
          #                for k, tran in enumerate(trans):
          #                    logging.info(u"{}".format(
          #                        utils.apply_trg_wmap(tran, self.trg_wmap)))
          #                    logging.info("{}".format(costs[k]))
          # normalize costs according to the sequence lengths
          if self.normalize:
              lengths = numpy.array([len(s) for s in trans])
              costs = costs / lengths

          nbest_idx = numpy.argsort(costs)[:self.n_best]
          for j, best in enumerate(nbest_idx):
              try:
                  total_cost += costs[best]
                  trans = trans[best]
                  if trans and trans[-1] == utils.EOS_ID:
                      trans = trans[:-1]
                  trans_out = ' '.join([str(w) for w in trans])
              except ValueError:
                  logging.info(
                      "Can NOT find a translation for line: {}".format(i+1))
                  trans_out = '<UNK>'
                  trans = 0
              if j == 0:
                  # Write the best hypothesis to the output file if verbose
                  if self.verbose:
                      print(utils.apply_trg_wmap(trans, self.trg_wmap),
                            file=ftrans)
          if i != 0 and i % 100 == 0:
              logging.info(
                  "Translated {} lines of validation set...".format(i))

      logging.info("Total cost of the validation: {}".format(total_cost))
      self.data_stream.reset()
      if self.verbose:
          ftrans.close()
      logging.info("Validation Took: {} minutes".format(
          float(time.time() - val_start_time) / 60.))
      logging.info("{} {} {} {}".format(
          self.config['bleu_script'], self.config['val_set_out'],
          self.config['val_set_grndtruth'], self.config['results_out']))
      bleu_score = float(subprocess.check_output(
          "python2.7 {} {} {} {}".format(
              self.config['bleu_script'], self.config['val_set_out'],
              self.config['val_set_grndtruth'], self.config['results_out']),
          shell=True).decode("utf-8"))
      self.val_bleu_curve.append(bleu_score)
      logging.info(bleu_score)
      return bleu_score