from datetime import datetime

import heap_lm_k  # project module: Heap/State used for the beam bins (assumed import)
import _mcr       # project module: k-best path search and backtracking (assumed import)

# The helpers future_cost_and_options, lmize, lm_order_list, cross, merge,
# get_future_cost, array_plus, and array_minus are defined elsewhere in this
# codebase.


def backtrack_k(bins, paths):
    '''Follow each k-best path back through the bins, accumulating the
    per-feature partial scores and collecting the e_phrases in order.'''
    partial_scores = []
    translated_sentences = []
    tags = heap_lm_k.FT.tags  # symbolic indices into a father tuple
    for path in paths:
        # start from the initial (empty-cover) state in bin 0
        partial_score = bins[0].data[0].partial_score
        translated_sentence = []
        father_key = bins[0].data[0].getKey()
        for i in xrange(len(path)):
            bin_id = path[i][0]
            state_id = path[i][1]
            ft_id = path[i][2]
            state = bins[bin_id].data[state_id]
            father_tuple = state.fathers[ft_id]
            # the path must be consistent: each step's father is the previous state
            assert father_key == father_tuple[tags.FATHER_KEY].getKey()
            e_phrase = father_tuple[tags.E_PHRASE]
            ps = father_tuple[tags.PARTIAL_SCORE]
            partial_score = array_plus(partial_score, ps)
            translated_sentence.append(e_phrase)
            father_key = state.getKey()
        partial_scores.append(partial_score)
        translated_sentences.append(translated_sentence)
    return partial_scores, translated_sentences
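
# backtrack_k and decode_k combine per-feature score vectors with array_plus
# and array_minus, whose real implementations live elsewhere in this codebase.
# A minimal sketch of the assumed element-wise behavior (not the originals):
def _array_plus_sketch(a, b):
    # element-wise sum of two equal-length feature vectors
    return [x + y for x, y in zip(a, b)]


def _array_minus_sketch(a, b):
    # element-wise difference of two equal-length feature vectors
    return [x - y for x, y in zip(a, b)]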
def decode_k(f_sentence, phrase_table, seen_words, lm_model, lm_weight,
             d_weight, d_limit, beam_size, num_feature, tempFilePath,
             debug=False, k_best=1):
    '''Generate the k-best translations.
    return partial_scores, translated_sentences
    partial_scores: [[feature_value] * num_feature] * k_best
    translated_sentences: [[e_phrase] * num_phrases] * k_best
    '''
    start = datetime.now()
    # use monotone decoding to generate the future-cost table and the
    # transition options (coverable source spans)
    fcost, options, new_phrase_table = future_cost_and_options(
        f_sentence, phrase_table, seen_words, num_feature)
    new_phrase_table = lmize(lm_model, lm_weight, new_phrase_table)
    if debug:
        for key in new_phrase_table:
            s = new_phrase_table[key][0]
            print key, s
        print sorted(options)
    # beam search: one bin per number of covered source words
    bins = []
    for i in xrange(len(f_sentence) + 1):
        b = heap_lm_k.Heap(beam_size, k_best)
        bins.append(b)
    # the initial state covers nothing and starts the LM context with <s>
    empty = heap_lm_k.State((), 0, '<s>', 0, 0, [0.0] * num_feature)
    bins[0].add(empty)
    for i in xrange(len(f_sentence)):
        b = bins[i]
        for k in xrange(b.size):
            state = b.data[k]
            for option in options:
                cover = state.cover
                j = state.j
                d = option[0] - j  # distortion: jump from the last position
                last_e = state.last_e
                if (not cross(cover, option)) and (abs(d) <= d_limit):
                    # check whether the current f_phrase covers the last
                    # untranslated part of the sentence
                    is_end = False
                    new_cover = merge(cover, option)
                    if len(new_cover) == len(f_sentence):
                        is_end = True
                    # rank the candidate e_phrases by LM score given last_e
                    f_phrase = tuple(f_sentence[option[0]:option[1]])
                    items = new_phrase_table[f_phrase]
                    items = lm_order_list(lm_model, items, lm_weight,
                                          last_e, is_end, limit=10)
                    # add the expanded states into the bins
                    for item in items:
                        t_score = item[0]  # the LM score is already folded in
                        d_score = -abs(d) * d_weight  # note the sign!
                        score = d_score + t_score + state.s
                        partial_score = list(item[2])
                        partial_score.append(-abs(d))
                        new_partial_score = array_plus(partial_score,
                                                       state.partial_score)
                        e_phrase = item[1]
                        new_last_e = e_phrase[-1]
                        new_j = option[1]
                        new_h = get_future_cost(fcost, new_cover,
                                                len(f_sentence))
                        child_state = heap_lm_k.State(
                            new_cover, new_j, new_last_e, score, new_h,
                            new_partial_score)
                        child_state.e_phrase = e_phrase
                        # delta records how this state extends its father, so
                        # backtracking can recover phrases and score deltas
                        delta = (child_state.f, child_state.s - state.s,
                                 child_state.e_phrase,
                                 array_minus(child_state.partial_score,
                                             state.partial_score),
                                 state)
                        child_state.fathers = [delta, ]
                        n_cover = len(new_cover)
                        bins[n_cover].add(child_state)
    if debug:
        for b in bins:
            print b.data
    # _mcr appears to mirror the Python backtrack_k above
    paths = _mcr.get_k_best_paths(bins, k_best, tempFilePath)
    partial_scores, translated_sentences = _mcr.backtrack_k(bins, paths)
    end = datetime.now()
    print end - start
    return partial_scores, translated_sentences
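
# Hedged usage sketch: the resources (phrase_table, seen_words, lm_model) must
# come from the surrounding pipeline, and the sentence, weights, and paths
# below are illustrative placeholders, not values taken from this codebase.
def _demo_decode_k(phrase_table, seen_words, lm_model):
    f_sentence = ['das', 'ist', 'ein', 'haus']  # hypothetical source sentence
    partial_scores, translations = decode_k(
        f_sentence, phrase_table, seen_words, lm_model,
        lm_weight=1.0, d_weight=0.5, d_limit=6,
        beam_size=100, num_feature=5,
        tempFilePath='/tmp/decode_k.tmp', k_best=10)
    # each translation is a list of e_phrases; assuming each e_phrase is a
    # sequence of target words, join them for display
    for score_vec, e_phrases in zip(partial_scores, translations):
        print ' '.join(' '.join(p) for p in e_phrases), score_vec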