def get_data(self, state=None, request=None):
    """Return the next sentence pair of the currently active source.

    The pair is read from ``self.parallel_sources[self.active_idx]``. On
    both sides the word IDs are passed through ``utils.oov_to_unk`` with
    the respective vocabulary size and then converted with the
    ``words2dense`` method of the matching sparse feature map.

    Args:
        state: Ignored.
        request: Must be ``None``.

    Returns:
        tuple. ``(source, target)`` after the conversion described above.

    Raises:
        ValueError: If ``request`` is not ``None``.
    """
    if request is not None:
        raise ValueError
    src_words, trg_words = self.parallel_sources[self.active_idx].next()
    # Source side first, then target side (same order as a tuple literal).
    src_ids = utils.oov_to_unk(src_words, self.src_vocab_size)
    src_dense = self.src_sparse_feat_map.words2dense(src_ids)
    trg_ids = utils.oov_to_unk(trg_words, self.trg_vocab_size)
    trg_dense = self.trg_sparse_feat_map.words2dense(trg_ids)
    return (src_dense, trg_dense)
def get_data(self, state=None, request=None):
    """Get the next data entry from the active parallel source.

    Both positional arguments exist for interface compatibility only:
    ``state`` is never read and ``request`` has to be ``None``.

    Returns:
        tuple. Source and target sentence, each passed through
        ``utils.oov_to_unk`` and the ``words2dense`` method of the
        corresponding sparse feature map.

    Raises:
        ValueError: If a ``request`` is given.
    """
    if request is not None:
        raise ValueError
    active_source = self.parallel_sources[self.active_idx]
    pair = active_source.next()
    (src, trg) = pair
    dense_src = self.src_sparse_feat_map.words2dense(
        utils.oov_to_unk(src, self.src_vocab_size))
    dense_trg = self.trg_sparse_feat_map.words2dense(
        utils.oov_to_unk(trg, self.trg_vocab_size))
    return (dense_src, dense_trg)
def initialize(self, src_sentence):
    """Reset the per-sentence state for a new source sentence.

    Clears ``consumed``, stores the source sentence with
    ``text_encoder.EOS_ID`` appended (passed through
    ``utils.oov_to_unk``), derives ``src_seg`` and ``src_pos`` from it via
    ``self._gen_seg_and_pos`` and resets ``history_sentences`` to a list
    holding one empty sentence.

    Args:
        src_sentence (list): Source word IDs; EOS is appended here.
    """
    self.consumed = []
    with_eos = src_sentence + [text_encoder.EOS_ID]
    self.src_sentence = utils.oov_to_unk(
        with_eos, self.src_vocab_size, self._t2t_unk_id)
    seg_and_pos = self._gen_seg_and_pos(self.src_sentence)
    self.src_seg, self.src_pos = seg_and_pos
    self.history_sentences = [[]]
def initialize(self, src_sentence):
    """Store the source sentence and compute the initial scores.

    Resets the fertility history and the aligned-word counter, stores the
    source sentence with ``text_encoder.EOS_ID`` appended (passed through
    ``utils.oov_to_unk``) and finally calls ``self._update_scores()``.

    Args:
        src_sentence (list): Source word IDs; EOS is appended here.
    """
    self.fertility_history = []
    self.n_aligned_words = 0
    with_eos = src_sentence + [text_encoder.EOS_ID]
    self.src_sentence = utils.oov_to_unk(with_eos, self.src_vocab_size)
    self._update_scores()
def initialize(self, src_sentence):
    """Compute the initial states and contexts for a source sentence.

    Resets the predictor state (contexts, states, both tries and the
    consumed history), builds the model input from the source sentence
    and asks ``self.search_algorithm`` for the initial states and
    contexts.

    Args:
        src_sentence (list): List of word ids without <S> and </S> which
            represent the source sentence.
    """
    # Start from a clean slate, including empty caches.
    self.contexts = None
    self.states = None
    self.posterior_cache = SimpleTrie()
    self.states_cache = SimpleTrie()
    self.consumed = []

    unk_mapped = utils.oov_to_unk(src_sentence, self.src_vocab_size)
    seq = self.src_sparse_feat_map.words2dense(unk_mapped) + [self.src_eos]
    if self.src_sparse_feat_map.dim > 1:
        # Sparse source features: tile, then reorder the axes.
        tiled = np.tile(seq, (1, 1, 1))
        input_ = np.transpose(tiled, (2, 0, 1))
    else:
        # Plain word ids on the source side.
        input_ = np.tile(seq, (1, 1))

    input_values = {self.nmt_model.sampling_input: input_}
    initial = self.search_algorithm.compute_initial_states_and_contexts(
        input_values)
    self.contexts, self.states, _ = initial
    # One entry per source word plus one for the appended end symbol.
    self.attention_records = (1 + len(src_sentence)) * [0.0]
def load_sentences(path, _range, src_vocab_size):
    """Loads the source sentences to decode from the file system.

    Each line of the file is parsed as whitespace-separated integers,
    passed through ``utils.oov_to_unk`` and terminated with
    ``utils.EOS_ID``. Sentence IDs are the 1-based line numbers.

    Args:
        path (string): path to the plain text file with indexed source
            sentences
        _range (string): Range argument, either a single 1-based sentence
            ID (``"5"``) or an inclusive ``"from:to"`` span. A falsy value
            selects all sentences.
        src_vocab_size (int): Source language vocabulary size

    Returns:
        list. List of tuples, the first element is the sentence ID and
        the second element is a list of integers representing the
        sentence ending with EOS. If ``_range`` cannot be parsed, the
        problem is logged and all sentences are returned.
    """
    seqs = []
    with open(path) as f:
        for seq_id, line in enumerate(f, 1):
            seq = [int(w) for w in line.strip().split()]
            seqs.append((
                seq_id,
                utils.oov_to_unk(seq, src_vocab_size) + [utils.EOS_ID]))
    if _range:
        # Use the ``_range`` parameter; the previous code read the global
        # ``args.range`` and silently ignored what the caller passed in.
        try:
            if ":" in _range:
                from_idx, to_idx = _range.split(":")
            else:
                from_idx = int(_range)
                to_idx = from_idx
            # IDs are 1-based and the upper bound is inclusive.
            return seqs[int(from_idx) - 1:int(to_idx)]
        except (TypeError, ValueError) as e:
            # Best effort: report the bad value and fall back to all
            # sentences, as before.
            logging.fatal("Invalid value for --range: %s" % e)
    return seqs
def predict_next(self):
    """Run the T2T model in ``self.mon_sess`` for the current history.

    The target fed to the model is the consumed history with
    ``text_encoder.PAD_ID`` appended, passed through
    ``utils.oov_to_unk``.

    Returns:
        The value of ``self._log_probs`` computed by the session, with
        the entry at ``text_encoder.PAD_ID`` overwritten by
        ``utils.NEG_INF``.
    """
    padded_history = self.consumed + [text_encoder.PAD_ID]
    targets = utils.oov_to_unk(
        padded_history, self.trg_vocab_size, self._t2t_unk_id)
    feed = {
        self._inputs_var: self.src_sentence,
        self._targets_var: targets,
    }
    scores = self.mon_sess.run(self._log_probs, feed)
    # Never predict the padding symbol.
    scores[text_encoder.PAD_ID] = utils.NEG_INF
    return scores
def initialize(self, src_sentence):
    """Set up source and initial target sentence for a new sentence.

    The initial target sentence is taken from
    ``self.initial_trg_sentences`` at index ``self.current_sen_id`` if
    such sentences are available, and is ``[EOS]`` otherwise. The source
    sentence gets ``text_encoder.EOS_ID`` appended and is passed through
    ``utils.oov_to_unk``. The cache is replaced by an empty trie and the
    current score is recomputed and logged at debug level.

    Args:
        src_sentence (list): Source word IDs; EOS is appended here.
    """
    if self.initial_trg_sentences is not None:
        self.trg_sentence = self.initial_trg_sentences[self.current_sen_id]
    else:
        self.trg_sentence = [text_encoder.EOS_ID]
    with_eos = src_sentence + [text_encoder.EOS_ID]
    self.src_sentence = utils.oov_to_unk(
        with_eos, self.src_vocab_size, self._t2t_unk_id)
    self.cache = SimpleTrie()
    self._update_cur_score()
    logging.debug("Initial score: %f" % self.cur_score)
def initialize_marg(self):
    """Run the marginal model's encoder on a source of just EOS.

    The encoder input is a single sentence that contains only
    ``utils.EOS_ID`` (passed through ``utils.oov_to_unk``) with length 1.
    The result is stored in ``self.marg_encoder_outs`` and the
    incremental state of every model in ``self.marg_models`` is reset to
    an empty dict.
    """
    eos_only = utils.oov_to_unk([utils.EOS_ID], self.src_vocab_size)
    src_tokens = torch.LongTensor([eos_only])
    src_lengths = torch.LongTensor([1])
    if self.use_cuda:
        src_tokens = src_tokens.cuda()
        src_lengths = src_lengths.cuda()
    encoder_input = {
        'src_tokens': src_tokens,
        'src_lengths': src_lengths,
    }
    self.marg_encoder_outs = self.marg_model.forward_encoder(encoder_input)
    # Reset incremental states
    for model in self.marg_models:
        self.marg_model.incremental_states[model] = {}
def decode(self, src_sentence):
    """Decode one sentence with the original blocks beam search decoder.

    No predictors are used. The score breakdowns of the returned
    hypotheses are only available on the sentence level: the full score
    is attached to the first position and all other positions carry a
    zero score. Use the nmt predictor for finer grained NMT scores.

    ``src_sentence`` holds the source word ids without <S> or </S>. The
    end-of-sentence symbol expected by blocks is appended here.

    Args:
        src_sentence (list): List of source word ids without <S> or </S>
            which make up the source sentence

    Returns:
        list. A list of ``Hypothesis`` instances ordered by their score.
    """
    cfg = self.config
    unk_mapped = utils.oov_to_unk(src_sentence, cfg['src_vocab_size'])
    seq = self.src_sparse_feat_map.words2dense(unk_mapped) + [self.src_eos]
    if self.src_sparse_feat_map.dim > 1:
        # Sparse source features: replicate per beam entry, reorder axes.
        tiled = np.tile(seq, (cfg['beam_size'], 1, 1))
        input_ = np.transpose(tiled, (2, 0, 1))
    else:
        # Word ids on the source side: one copy per beam entry.
        input_ = np.tile(seq, (cfg['beam_size'], 1))
    trans, costs = self.beam_search.search(
        input_values={self.nmt_model.sampling_input: input_},
        max_length=3 * len(src_sentence),
        eol_symbol=utils.EOS_ID,
        ignore_first_eol=True)
    hypos = []
    longest = 0
    for idx, words in enumerate(trans):
        longest = max(longest, len(words))
        hypo = Hypothesis(words, -costs[idx])
        # Sentence-level score only: put it on the first position.
        hypo.score_breakdown = len(words) * [[(0.0, 1.0)]]
        hypo.score_breakdown[0] = [(-costs[idx], 1.0)]
        hypos.append(hypo)
    self.apply_predictors_count = longest * cfg['beam_size']
    return hypos
def decode(self, src_sentence):
    """Run the plain blocks beam search on a single source sentence.

    Predictors are not involved. Each returned hypothesis only has a
    sentence-level score, stored in the first entry of its score
    breakdown; the remaining entries are zero. For word-level NMT scores
    use the nmt predictor instead.

    The source sentence is given without <S> or </S>; the end symbol
    that blocks expects is added automatically.

    Args:
        src_sentence (list): List of source word ids without <S> or </S>
            which make up the source sentence

    Returns:
        list. A list of ``Hypothesis`` instances ordered by their score.
    """
    src_ids = utils.oov_to_unk(src_sentence, self.config['src_vocab_size'])
    seq = self.src_sparse_feat_map.words2dense(src_ids) + [self.src_eos]
    sparse_feats = self.src_sparse_feat_map.dim > 1
    if not sparse_feats:
        # word ids on the source side
        input_ = np.tile(seq, (self.config['beam_size'], 1))
    else:
        # sparse src feats
        input_ = np.transpose(
            np.tile(seq, (self.config['beam_size'], 1, 1)), (2, 0, 1))
    search_result = self.beam_search.search(
        input_values={self.nmt_model.sampling_input: input_},
        max_length=3 * len(src_sentence),
        eol_symbol=utils.EOS_ID,
        ignore_first_eol=True)
    trans, costs = search_result
    hypos = []
    max_len = 0
    for pos in xrange(len(trans)):
        translation = trans[pos]
        n_words = len(translation)
        if n_words > max_len:
            max_len = n_words
        hypo = Hypothesis(translation, -costs[pos])
        breakdown = n_words * [[(0.0, 1.0)]]
        hypo.score_breakdown = breakdown
        hypo.score_breakdown[0] = [(-costs[pos], 1.0)]
        hypos.append(hypo)
    self.apply_predictors_count = max_len * self.config['beam_size']
    return hypos
def initialize(self, src_sentence):
    """Encode the source sentence and reset the decoding history.

    The source sentence gets ``utils.EOS_ID`` appended, is passed through
    ``utils.oov_to_unk`` and fed to ``self.model.forward_encoder`` as a
    batch of size one. Afterwards ``consumed`` is set to the start symbol
    (``utils.GO_ID``, or ``utils.EOS_ID`` if ``GO_ID`` is falsy) and the
    incremental state of every model in ``self.models`` is cleared.

    Args:
        src_sentence (list): Source word IDs; EOS is appended here.
    """
    self.consumed = []
    with_eos = src_sentence + [utils.EOS_ID]
    src_tokens = torch.LongTensor(
        [utils.oov_to_unk(with_eos, self.src_vocab_size)])
    src_lengths = torch.LongTensor([len(src_sentence) + 1])
    if self.use_cuda:
        src_tokens = src_tokens.cuda()
        src_lengths = src_lengths.cuda()
    encoder_input = {
        'src_tokens': src_tokens,
        'src_lengths': src_lengths,
    }
    self.encoder_outs = self.model.forward_encoder(encoder_input)
    self.consumed = [utils.GO_ID or utils.EOS_ID]
    # Reset incremental states
    for model in self.models:
        self.model.incremental_states[model] = {}
def decode(self, src_sentence):
    """This is a generalization to NMT ensembles of ``BeamSearch.search``.

    All networks in ``self.beam_searches`` are advanced in lockstep. In
    each step their outputs of ``compute_logprobs`` are summed, the
    ``self.beam_size`` cheapest continuations are selected and the states
    of every network are reordered accordingly. The search runs for at
    most ``3 * len(src_sentence)`` steps and stops early once the mask of
    every beam entry is zero.

    The score breakdowns of the returned hypotheses carry the complete
    score in the first position only.

    Args:
        src_sentence (list): List of source word ids without <S> or </S>
                             which make up the source sentence

    Returns:
        list. A list of ``Hypothesis`` instances ordered by their score.
    """
    # Make sure every network's search is compiled before use.
    for search in self.beam_searches:
        if not search.compiled:
            search.compile()
    seq = self.src_sparse_feat_map.words2dense(
        utils.oov_to_unk(src_sentence, self.src_vocab_size)) + [self.src_eos]
    # The same input (one copy per beam entry) is fed to all networks.
    if self.src_sparse_feat_map.dim > 1:  # sparse src feats
        input_ = np.transpose(np.tile(seq, (self.beam_size, 1, 1)), (2, 0, 1))
    else:  # word ids on the source side
        input_ = np.tile(seq, (self.beam_size, 1))
    # One (contexts, states, search) triple per network.
    contexts_and_states = []
    for sys_idx in xrange(self.n_networks):
        contexts, states, _ = \
            self.beam_searches[sys_idx].compute_initial_states_and_contexts(
                {self.nmt_models[sys_idx].sampling_input: input_})
        contexts_and_states.append(
            (contexts, states, self.beam_searches[sys_idx]))

    # This array will store all generated outputs, including those from
    # previous step and those from already finished sequences.
    # NOTE: ``states`` is the loop variable left over from the loop above,
    # i.e. the initial states of the last network.
    all_outputs = states['outputs'][None, :]
    all_masks = np.ones_like(all_outputs, dtype=config.floatX)
    all_costs = np.zeros_like(all_outputs, dtype=config.floatX)

    for i in range(3 * len(src_sentence)):
        # Stop as soon as no beam entry is active any more.
        if all_masks[-1].sum() == 0:
            break

        # Ensemble combination: sum the per-network scores.
        # NOTE(review): the values are used as costs (smaller is better)
        # below - presumably negative log-probabilities; confirm against
        # ``compute_logprobs``.
        logprobs_lst = []
        for contexts, states, search in contexts_and_states:
            logprobs_lst.append(search.compute_logprobs(contexts, states))

        logprobs = np.sum(logprobs_lst, axis=0)
        # Finished entries (mask 0) keep their accumulated cost.
        next_costs = (all_costs[-1, :, None] +
                      logprobs * all_masks[-1, :, None])
        # Finished entries may only be continued with EOS: every other
        # symbol gets infinite cost.
        (finished, ) = np.where(all_masks[-1] == 0)
        next_costs[finished, :utils.EOS_ID] = np.inf
        next_costs[finished, utils.EOS_ID + 1:] = np.inf

        # The `i == 0` is required because at the first step the beam
        # size is effectively only 1.
        (indexes, outputs), chosen_costs = BeamSearch._smallest(
            next_costs, self.beam_size, only_first_row=i == 0)

        # Reorder the history so that column k belongs to the k-th
        # selected continuation.
        all_outputs = all_outputs[:, indexes]
        all_masks = all_masks[:, indexes]
        all_costs = all_costs[:, indexes]

        # Rearrange everything
        for contexts, states, search in contexts_and_states:
            for name in states:
                states[name] = states[name][indexes]
            # The states dict is updated in place, so the triples in
            # ``contexts_and_states`` see the new states.
            states.update(
                search.compute_next_states(contexts, states, outputs))

        all_outputs = np.vstack([all_outputs, outputs[None, :]])
        all_costs = np.vstack([all_costs, chosen_costs[None, :]])
        # An entry becomes inactive once it has produced EOS ...
        mask = outputs != utils.EOS_ID
        # ... except in the very first step, where all entries stay active.
        if i == 0:
            mask[:] = 1
        all_masks = np.vstack([all_masks, mask[None, :]])

    # Drop the initial row of outputs and the last row of masks, and turn
    # the accumulated costs into per-step costs.
    all_outputs = all_outputs[1:]
    all_masks = all_masks[:-1]
    all_costs = all_costs[1:] - all_costs[:-1]
    result = all_outputs, all_masks, all_costs
    trans, costs = BeamSearch.result_to_lists(result)
    hypos = []
    max_len = 0
    for idx in xrange(len(trans)):
        max_len = max(max_len, len(trans[idx]))
        hypo = Hypothesis(trans[idx], -costs[idx])
        # Only a sentence-level score is available: store it at the first
        # position and use zero scores everywhere else.
        hypo.score_breakdown = len(trans[idx]) * [[(0.0, 1.0)]]
        hypo.score_breakdown[0] = [(-costs[idx], 1.0)]
        hypos.append(hypo)
    self.apply_predictors_count = max_len * self.beam_size
    return hypos
def __init__(self,
             src_vocab_size,
             trg_vocab_size,
             model_name,
             problem_name,
             hparams_set_name,
             trg_test_file,
             beam_size,
             t2t_usr_dir,
             checkpoint_dir,
             t2t_unk_id=None,
             n_cpu_threads=-1,
             max_terminal_id=-1,
             pop_id=-1):
    """Creates a new edit T2T predictor. This constructor is
    similar to the constructor of T2TPredictor but creates a
    different computation graph which retrieves scores at each
    target position, not only the last one.

    Args:
        src_vocab_size (int): Source vocabulary size.
        trg_vocab_size (int): Target vocabulary size. Must be less
            than ``EditT2TPredictor.POS_FACTOR``.
        model_name (string): T2T model name.
        problem_name (string): T2T problem name.
        hparams_set_name (string): T2T hparams set name.
        trg_test_file (string): Path to a plain text file with
            initial target sentences. Can be empty.
        beam_size (int): Determines how many substitutions and
            insertions are considered at each position.
        t2t_usr_dir (string): See --t2t_usr_dir in tensor2tensor.
        checkpoint_dir (string): Path to the T2T checkpoint
            directory. The predictor will load the top most
            checkpoint in the `checkpoints` file.
        t2t_unk_id (int): If set, use this ID to get UNK scores. If
            None, UNK is always scored with -inf.
        n_cpu_threads (int): Number of TensorFlow CPU threads.
        max_terminal_id (int): If positive, maximum terminal ID. Needs to
            be set for syntax-based T2T models.
        pop_id (int): If positive, ID of the POP or closing bracket symbol.
            Needs to be set for syntax-based T2T models.

    Raises:
        AttributeError: If model, problem or hparams set name is missing,
            or if the target vocabulary is too large.
    """
    super(EditT2TPredictor, self).__init__(t2t_usr_dir,
                                           checkpoint_dir,
                                           src_vocab_size,
                                           trg_vocab_size,
                                           t2t_unk_id,
                                           n_cpu_threads,
                                           max_terminal_id,
                                           pop_id)
    if not model_name or not problem_name or not hparams_set_name:
        logging.fatal(
            "Please specify t2t_model, t2t_problem, and t2t_hparams_set!")
        raise AttributeError
    if trg_vocab_size >= EditT2TPredictor.POS_FACTOR:
        logging.fatal("Target vocabulary size (%d) must be less than %d!"
                      % (trg_vocab_size, EditT2TPredictor.POS_FACTOR))
        raise AttributeError
    # The internal beam is a tenth of the requested size (at least 1),
    # plus one.
    self.beam_size = max(1, beam_size // 10) + 1
    self.batch_size = 2048  # TODO(fstahlberg): Move to config
    # Optional initial target sentences, one per line as word IDs. EOS is
    # appended to each and the result is passed through oov_to_unk.
    self.initial_trg_sentences = None
    if trg_test_file:
        self.initial_trg_sentences = []
        with open(trg_test_file) as f:
            for line in f:
                self.initial_trg_sentences.append(utils.oov_to_unk(
                    [int(w) for w in line.strip().split()] + [utils.EOS_ID],
                    self.trg_vocab_size, self._t2t_unk_id))
    # Build the computation graph in a graph of its own.
    predictor_graph = tf.Graph()
    with predictor_graph.as_default() as g:
        hparams = trainer_lib.create_hparams(hparams_set_name)
        self._add_problem_hparams(hparams, problem_name)
        translate_model = registry.model(model_name)(
            hparams, tf.estimator.ModeKeys.EVAL)
        # A single source sentence (rank 1) is scored against a batch of
        # target sentences (rank 2).
        self._inputs_var = tf.placeholder(dtype=tf.int32,
                                          shape=[None],
                                          name="sgnmt_inputs")
        self._targets_var = tf.placeholder(dtype=tf.int32,
                                           shape=[None, None],
                                           name="sgnmt_targets")
        shp = tf.shape(self._targets_var)
        bsz = shp[0]
        # Repeat the source sentence once per target sentence.
        inputs = tf.tile(tf.expand_dims(self._inputs_var, 0), [bsz, 1])
        features = {"inputs": expand_input_dims_for_t2t(inputs,
                                                        batched=True),
                    "targets": expand_input_dims_for_t2t(self._targets_var,
                                                         batched=True)}
        translate_model.prepare_features_for_infer(features)
        translate_model._fill_problem_hparams_features(features)
        logits, _ = translate_model(features)
        # Remove axes 2 and 3 of the logits.
        logits = tf.squeeze(logits, [2, 3])
        self._log_probs = log_prob_from_logits(logits)
        # NOTE(review): presumably picks position i from batch entry i
        # (the "diagonal") - confirm against gather_2d.
        diag_logits = gather_2d(logits, tf.expand_dims(tf.range(bsz), 1))
        self._diag_log_probs = log_prob_from_logits(diag_logits)
        # 1.0 for real target words, 0.0 for padding.
        no_pad = tf.cast(tf.not_equal(
            self._targets_var, text_encoder.PAD_ID), tf.float32)
        flat_bsz = shp[0] * shp[1]
        # Look up the score of each target word at its own position,
        # zero out padding positions and sum over the last axis to get
        # one score per target sentence.
        word_scores = gather_2d(
            tf.reshape(self._log_probs, [flat_bsz, -1]),
            tf.reshape(self._targets_var, [flat_bsz, 1]))
        word_scores = tf.reshape(word_scores, (shp[0], shp[1])) * no_pad
        self._sentence_scores = tf.reduce_sum(word_scores, -1)
        self.mon_sess = self.create_session()
def initialize(self, src_sentence):
    """Store the source sentence and clear the consumed history.

    The stored source sentence has ``text_encoder.EOS_ID`` appended and
    is passed through ``utils.oov_to_unk``.

    Args:
        src_sentence (list): Source word IDs; EOS is appended here.
    """
    self.consumed = []
    with_eos = src_sentence + [text_encoder.EOS_ID]
    self.src_sentence = utils.oov_to_unk(
        with_eos, self.src_vocab_size, self._t2t_unk_id)
def _evaluate_model(self):
    """Translate the validation set and score it with the BLEU command.

    Every entry of ``self.data_stream`` is decoded with
    ``self.beam_search``. The best translation of each entry is written,
    as space-separated word IDs without a trailing EOS, to the stdin of
    the ``self.multibleu_cmd`` subprocess and, if ``self.verbose`` is
    set, to the file ``self.config['val_set_out']``. The first line
    printed by the subprocess is parsed for the BLEU score, which is
    appended to ``self.val_bleu_curve``.

    Returns:
        float. The BLEU score reported by the subprocess.

    Raises:
        AssertionError: If the subprocess output does not start with
            ``BLEU = <number>``.
    """
    logging.info("Started Validation: ")
    val_start_time = time.time()
    mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
    total_cost = 0.0

    ftrans = None
    if self.verbose:
        ftrans = open(self.config['val_set_out'], 'w')

    # try/finally so the output file is closed even if decoding fails.
    try:
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            seq = self.src_sparse_feat_map.words2dense(
                utils.oov_to_unk(line[0], self.config['src_vocab_size']))
            if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                input_ = numpy.transpose(
                    numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                    (2, 0, 1))
            else:  # word ids on the source side
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty
            # string back
            trans, costs = self.beam_search.search(
                input_values={self.source_sentence: input_},
                max_length=3 * len(seq),
                eol_symbol=utils.EOS_ID,
                ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    # Keep the selected translation in its own variable.
                    # Rebinding ``trans`` here would make the next n-best
                    # iteration index into a single translation instead
                    # of the list of translations.
                    best_trans = trans[best]
                    if best_trans and best_trans[-1] == utils.EOS_ID:
                        best_trans = best_trans[:-1]
                    trans_out = ' '.join([str(w) for w in best_trans])
                except ValueError:
                    logging.info(
                        "Can NOT find a translation for line: {}".format(
                            i + 1))
                    trans_out = '<UNK>'

                if j == 0:
                    # Write to subprocess and file if it exists
                    print(trans_out, file=mb_subprocess.stdin)
                    if ftrans is not None:
                        print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logging.info(
                    "Translated {} lines of validation set...".format(i))

            mb_subprocess.stdin.flush()

        logging.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
    finally:
        if ftrans is not None:
            ftrans.close()

    # send end of file, read output.
    mb_subprocess.stdin.close()
    stdout = mb_subprocess.stdout.readline()
    logging.info(stdout)
    out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
    logging.info("Validation Took: {} minutes".format(
        float(time.time() - val_start_time) / 60.))
    assert out_parse is not None

    # extract the score (everything after the ``BLEU =`` prefix)
    bleu_score = float(out_parse.group()[6:])
    self.val_bleu_curve.append(bleu_score)
    logging.info(bleu_score)
    mb_subprocess.terminate()
    return bleu_score
def _evaluate_model(self):
    """Translate the validation set and score it with the BLEU command.

    Every entry of ``self.data_stream`` is decoded with
    ``self.beam_search``. The best translation of each entry is written,
    as space-separated word IDs without a trailing EOS, both to the
    stdin of the ``self.multibleu_cmd`` subprocess and to
    ``validation_out.txt`` inside ``self.config['saveto']``. The first
    line printed by the subprocess is parsed for the BLEU score, which is
    appended to ``self.val_bleu_curve``.

    Returns:
        float. The BLEU score reported by the subprocess.

    Raises:
        AssertionError: If the subprocess output does not start with
            ``BLEU = <number>``.
    """
    logging.info("Started Validation: ")
    val_start_time = time.time()
    mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
    total_cost = 0.0

    ftrans = open(self.config['saveto'] + '/validation_out.txt', 'w')
    # try/finally so the output file is closed even if decoding fails.
    try:
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                line[0], self.config['src_vocab_size']))
            if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                input_ = numpy.transpose(
                    numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                    (2, 0, 1))
            else:  # word ids on the source side
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty
            # string back
            trans, costs = self.beam_search.search(
                input_values={self.source_sentence: input_},
                max_length=3 * len(seq),
                eol_symbol=utils.EOS_ID,
                ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    # Keep the selected translation in its own variable.
                    # Rebinding ``trans`` here would make the next n-best
                    # iteration index into a single translation instead
                    # of the list of translations.
                    best_trans = trans[best]
                    if best_trans and best_trans[-1] == utils.EOS_ID:
                        best_trans = best_trans[:-1]
                    trans_out = ' '.join([str(w) for w in best_trans])
                except ValueError:
                    logging.info(
                        "Can NOT find a translation for line: {}".format(
                            i + 1))
                    trans_out = '<UNK>'

                if j == 0:
                    # Write to subprocess and file
                    print(trans_out, file=mb_subprocess.stdin)
                    print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logging.info(
                    "Translated {} lines of validation set...".format(i))

            mb_subprocess.stdin.flush()

        logging.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
    finally:
        ftrans.close()

    # send end of file, read output.
    mb_subprocess.stdin.close()
    stdout = mb_subprocess.stdout.readline()
    logging.info(stdout)
    out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
    logging.info("Validation Took: {} minutes".format(
        float(time.time() - val_start_time) / 60.))
    assert out_parse is not None

    # extract the score (everything after the ``BLEU =`` prefix)
    bleu_score = float(out_parse.group()[6:])
    self.val_bleu_curve.append(bleu_score)
    logging.info(bleu_score)
    mb_subprocess.terminate()
    return bleu_score
def decode(self, src_sentence):
    """This is a generalization to NMT ensembles of ``BeamSearch.search``.

    All networks in ``self.beam_searches`` are advanced in lockstep. In
    each step their outputs of ``compute_logprobs`` are summed, the
    ``self.beam_size`` cheapest continuations are selected and the states
    of every network are reordered accordingly. The search runs for at
    most ``3*len(src_sentence)`` steps and stops early once the mask of
    every beam entry is zero.

    The score breakdowns of the returned hypotheses carry the complete
    score in the first position only.

    Args:
        src_sentence (list): List of source word ids without <S> or </S>
                             which make up the source sentence

    Returns:
        list. A list of ``Hypothesis`` instances ordered by their score.
    """
    # Make sure every network's search is compiled before use.
    for search in self.beam_searches:
        if not search.compiled:
            search.compile()
    seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
        src_sentence,
        self.src_vocab_size)) + [self.src_eos]
    # The same input (one copy per beam entry) is fed to all networks.
    if self.src_sparse_feat_map.dim > 1:  # sparse src feats
        input_ = np.transpose(
            np.tile(seq, (self.beam_size, 1, 1)),
            (2,0,1))
    else:  # word ids on the source side
        input_ = np.tile(seq, (self.beam_size, 1))
    # One (contexts, states, search) triple per network.
    contexts_and_states = []
    for sys_idx in xrange(self.n_networks):
        contexts, states, _ = \
            self.beam_searches[sys_idx].compute_initial_states_and_contexts(
                {self.nmt_models[sys_idx].sampling_input: input_})
        contexts_and_states.append((contexts,
                                    states,
                                    self.beam_searches[sys_idx]))

    # This array will store all generated outputs, including those from
    # previous step and those from already finished sequences.
    # NOTE: ``states`` is the loop variable left over from the loop above,
    # i.e. the initial states of the last network.
    all_outputs = states['outputs'][None, :]
    all_masks = np.ones_like(all_outputs, dtype=config.floatX)
    all_costs = np.zeros_like(all_outputs, dtype=config.floatX)

    for i in range(3*len(src_sentence)):
        # Stop as soon as no beam entry is active any more.
        if all_masks[-1].sum() == 0:
            break

        # Ensemble combination: sum the per-network scores.
        # NOTE(review): the values are used as costs (smaller is better)
        # below - presumably negative log-probabilities; confirm against
        # ``compute_logprobs``.
        logprobs_lst = []
        for contexts, states, search in contexts_and_states:
            logprobs_lst.append(search.compute_logprobs(contexts, states))

        logprobs = np.sum(logprobs_lst, axis=0)
        # Finished entries (mask 0) keep their accumulated cost.
        next_costs = (all_costs[-1, :, None] +
                      logprobs * all_masks[-1, :, None])
        # Finished entries may only be continued with EOS: every other
        # symbol gets infinite cost.
        (finished,) = np.where(all_masks[-1] == 0)
        next_costs[finished, :utils.EOS_ID] = np.inf
        next_costs[finished, utils.EOS_ID + 1:] = np.inf

        # The `i == 0` is required because at the first step the beam
        # size is effectively only 1.
        (indexes, outputs), chosen_costs = BeamSearch._smallest(
            next_costs, self.beam_size, only_first_row=i == 0)

        # Reorder the history so that column k belongs to the k-th
        # selected continuation.
        all_outputs = all_outputs[:, indexes]
        all_masks = all_masks[:, indexes]
        all_costs = all_costs[:, indexes]

        # Rearrange everything
        for contexts, states, search in contexts_and_states:
            for name in states:
                states[name] = states[name][indexes]
            # The states dict is updated in place, so the triples in
            # ``contexts_and_states`` see the new states.
            states.update(search.compute_next_states(contexts,
                                                     states,
                                                     outputs))

        all_outputs = np.vstack([all_outputs, outputs[None, :]])
        all_costs = np.vstack([all_costs, chosen_costs[None, :]])
        # An entry becomes inactive once it has produced EOS ...
        mask = outputs != utils.EOS_ID
        # ... except in the very first step, where all entries stay active.
        if i == 0:
            mask[:] = 1
        all_masks = np.vstack([all_masks, mask[None, :]])

    # Drop the initial row of outputs and the last row of masks, and turn
    # the accumulated costs into per-step costs.
    all_outputs = all_outputs[1:]
    all_masks = all_masks[:-1]
    all_costs = all_costs[1:] - all_costs[:-1]
    result = all_outputs, all_masks, all_costs
    trans, costs = BeamSearch.result_to_lists(result)
    hypos = []
    max_len = 0
    for idx in xrange(len(trans)):
        max_len = max(max_len, len(trans[idx]))
        hypo = Hypothesis(trans[idx], -costs[idx])
        # Only a sentence-level score is available: store it at the first
        # position and use zero scores everywhere else.
        hypo.score_breakdown = len(trans[idx]) * [[(0.0,1.0)]]
        hypo.score_breakdown[0] = [(-costs[idx],1.0)]
        hypos.append(hypo)
    self.apply_predictors_count = max_len * self.beam_size
    return hypos
def _evaluate_model(self):
    """Translate the validation set and score it with the BLEU script.

    Every entry of ``self.data_stream`` is decoded with
    ``self.beam_search``. If ``self.verbose`` is set, the best
    translation of each entry (without a trailing EOS) is passed through
    ``utils.apply_trg_wmap`` and written to
    ``self.config['val_set_out']``. Afterwards the script
    ``self.config['bleu_script']`` is run with ``python2.7`` on the
    output file, the ground truth file and the results file; its output
    is parsed as a float and appended to ``self.val_bleu_curve``.

    NOTE(review): the output file is only written when ``self.verbose``
    is set, but the BLEU script is always run on it - confirm that this
    is intended.

    Returns:
        float. The BLEU score printed by the script.
    """
    logging.info("Started Validation: ")
    val_start_time = time.time()
    total_cost = 0.0

    ftrans = None
    if self.verbose:
        ftrans = codecs.open(self.config['val_set_out'], 'w', 'utf-8')

    # try/finally so the output file is closed even if decoding fails.
    try:
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                line[0], self.config['src_vocab_size']))
            if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                input_ = numpy.transpose(
                    numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                    (2, 0, 1))
            else:  # word ids on the source side
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty
            # string back
            trans, costs = self.beam_search.search(
                input_values={self.source_sentence: input_},
                max_length=3 * len(line[0]),
                eol_symbol=utils.EOS_ID,
                ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    # Keep the selected translation in its own variable.
                    # Rebinding ``trans`` here (or setting it to 0 in the
                    # error branch) would break the next n-best
                    # iteration, which has to index the list of
                    # translations again.
                    best_trans = trans[best]
                    if best_trans and best_trans[-1] == utils.EOS_ID:
                        best_trans = best_trans[:-1]
                except ValueError:
                    logging.info(
                        "Can NOT find a translation for line: {}".format(
                            i + 1))
                    best_trans = 0

                # Only the best translation is written out.
                if j == 0 and ftrans is not None:
                    print(utils.apply_trg_wmap(best_trans, self.trg_wmap),
                          file=ftrans)

            if i != 0 and i % 100 == 0:
                logging.info(
                    "Translated {} lines of validation set...".format(i))

        logging.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
    finally:
        if ftrans is not None:
            ftrans.close()

    logging.info("Validation Took: {} minutes".format(
        float(time.time() - val_start_time) / 60.))
    script_args = (self.config['bleu_script'],
                   self.config['val_set_out'],
                   self.config['val_set_grndtruth'],
                   self.config['results_out'])
    # NOTE(review): ``logger`` (not ``logging``) is used here as in the
    # original; it has to be defined at module level.
    logger.info("{} {} {} {}".format(*script_args))
    # NOTE(review): the command is run through the shell, so the config
    # values must be trusted and must not need quoting.
    bleu_score = float(subprocess.check_output(
        "python2.7 {} {} {} {}".format(*script_args),
        shell=True).decode("utf-8"))
    self.val_bleu_curve.append(bleu_score)
    logging.info(bleu_score)
    return bleu_score