def create_cube(self, bidx, eq_classes):
    # eq_classes: (score_im1, y_im1, hi, ai, loc_in_prevb) NEW
    cube = []
    cnt_transed = len(self.translations)
    for whichsubcub, leq_class in eq_classes.iteritems():   # sub cube
        each_subcube_rowsz = len(leq_class)
        score_im1_r0, s_im1_r0, y_im1, y_emb_im1, _ = leq_class[0]
        subcube = []
        subcube_line_cache = []
        _avg_si, _avg_hi, _avg_ai, _avg_scores_i = None, None, None, None
        _cube_krank_scores_i = None
        _avg_sim1 = s_im1_r0
        if self.ifsplit:
            _avg_hi = self.fn_nh(y_emb_im1, _avg_sim1)
            _, _avg_ai = self.fn_na(self.context, self.uh, _avg_hi)
            _avg_si = self.fn_ns(_avg_hi, _avg_ai)
            _avg_moi = self.fn_mo(y_emb_im1, _avg_ai, _avg_si)
            _avg_scores_i = self.fn_pws(_avg_moi, self.ptv)  # the larger the better
            _avg_probs_i = self.fn_ce(_avg_scores_i).flatten()
        else:
            _avg_probs_i, _avg_si = self.fn_next(*[y_im1, self.context, _avg_sim1])
            _avg_probs_i = _avg_probs_i.flatten()

        _next_krank_wids = part_sort(-_avg_probs_i, self.k - cnt_transed)
        _avg_ces_i = -numpy.log(_avg_probs_i[_next_krank_wids])
        _cube_krank_scores_i = _cube_krank_ces_ith = _avg_ces_i
        self.pop_subcube_approx_cache.append(
            (_avg_ai, _avg_si, _cube_krank_ces_ith))
        self.push_subcube_approx_cache.append(None)

        # compare against None explicitly: truth-testing a numpy array raises
        # "The truth value of an array with more than one element is ambiguous"
        for i, tup in enumerate(leq_class):
            if i > 1:
                break
            subcube.append([
                tup + (_avg_sim1,
                       None if _cube_krank_scores_i is None
                       else _cube_krank_scores_i[j],
                       wid, i, j, whichsubcub, each_subcube_rowsz)
                for j, wid in enumerate(_next_krank_wids)
            ])
            subcube_line_cache.append(None)
        cube.append(subcube)
        self.subcube_lines_cache.append(subcube_line_cache)
    return cube
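# `part_sort` is used by every ranking step in this file but defined elsewhere
# in the repo; the sketch below is an assumption about its contract, not the
# repo's actual implementation. Call sites imply it returns the indices of the
# `k` smallest entries of a 1-D array (callers negate the array to get the `k`
# largest), so numpy.argpartition gives the partial top-k cheaply instead of a
# full sort.
def part_sort(vec, k):
    k = min(k, len(vec))                      # guard: k may exceed vector size
    ids = numpy.argpartition(vec, k - 1)[:k]  # k smallest, in arbitrary order
    return ids[numpy.argsort(vec[ids])]       # then sort only the k-sized prefix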
def create_cube_batch(self, bidx, eq_classes):
    # eq_classes: (score_im1, y_im1, hi, ai, loc_in_prevb) NEW
    cube = []
    cnt_transed = len(self.translations)
    batch_y_im1, batch_s_im1 = [], []
    for whichsubcub, leq_class in eq_classes.iteritems():   # sub cube
        each_subcube_rowsz = len(leq_class)
        self.prev_beam_ptrs += each_subcube_rowsz
        score_im1_r0, s_im1_r0, y_im1, y_im2, y_im3, _ = leq_class[0]
        _avg_si, _avg_hi, _avg_ai, _avg_scores_i = None, None, None, None
        _cube_lm_krank_ces_i, _cube_krank_scores_i = None, None
        if each_subcube_rowsz == 1:
            _avg_sim1 = s_im1_r0
            self.onerow_subcube_cnt += 1
        else:
            merged_sim1 = [tup[1] for tup in leq_class[0:1]]
            np_merged_sim1 = numpy.array(merged_sim1)
            # arithmetic mean
            _avg_sim1 = numpy.mean(np_merged_sim1, axis=0)
            #print _avg_sim1
        batch_y_im1.append(y_im1)
        batch_s_im1.append(_avg_sim1)
        self.push_subcube_approx_cache.append(None)

    np_batch_s_im1 = numpy.array(batch_s_im1, dtype='float32')
    subcube_num = len(batch_y_im1)
    ctx = numpy.tile(self.context, [subcube_num, 1])
    if np_batch_s_im1.shape[0] == 1 and 3 == len(np_batch_s_im1.shape):
        np_batch_s_im1 = np_batch_s_im1[0]
    # one batched decoder step scores all subcubes at once
    next_probs, next_states = self.fn_next(*[batch_y_im1, ctx, np_batch_s_im1])
    #print next_probs.shape
    #print next_states.shape
    for which in range(len(eq_classes)):
        _avg_sim1, leq_class, next_prob, _avg_si = \
            batch_s_im1[which], eq_classes[which], \
            next_probs[which], next_states[which]
        each_subcube_rowsz = len(leq_class)
        next_prob_flat = next_prob.flatten()
        _next_krank_wids = part_sort(-next_prob_flat,
                                     self.k - len(self.translations))
        k_avg_loss_flat = -numpy.log(next_prob_flat[_next_krank_wids])
        self.pop_subcube_approx_cache.append(
            (None, _avg_hi, _avg_ai, _avg_si, None, None,
             _next_krank_wids, k_avg_loss_flat))

        # compare against None explicitly: truth-testing a numpy array raises
        # "The truth value of an array with more than one element is ambiguous"
        subcube = []
        subcube_line_cache = []
        for i, tup in enumerate(leq_class):
            subcube.append([
                tup + (_avg_sim1,
                       None if _cube_lm_krank_ces_i is None
                       else _cube_lm_krank_ces_i[j],
                       k_avg_loss_flat[j],
                       wid, i, j, which, each_subcube_rowsz)
                for j, wid in enumerate(_next_krank_wids)
            ])
            subcube_line_cache.append(None)
        #print len(subcube)
        cube.append(subcube)
        self.subcube_lines_cache.append(subcube_line_cache)
    self.printCube(cube)
    return cube
def create_cube(self, bidx, eq_classes):
    # eq_classes: (score_im1, y_im1, hi, ai, loc_in_prevb) NEW
    cube = []
    cnt_transed = len(self.translations)
    for whichsubcub, leq_class in eq_classes.iteritems():   # sub cube
        each_subcube_rowsz = len(leq_class)
        self.prev_beam_ptrs += each_subcube_rowsz
        #print self.prev_beam_ptrs
        #if bidx >= 2 and self.prev_beam_ptrs > self.avg_bp_by_cur_step + 5:
        #    return cube
        score_im1_r0, s_im1_r0, y_im1, y_im2, y_im3, _ = leq_class[0]
        subcube = []
        subcube_line_cache = []
        _avg_si, _avg_hi, _avg_ai, _avg_scores_i = None, None, None, None
        _cube_lm_krank_ces_i, _cube_krank_scores_i = None, None
        if each_subcube_rowsz == 1:
            _avg_sim1 = s_im1_r0
            self.onerow_subcube_cnt += 1
        else:
            merged_score_im1 = [tup[0] for tup in leq_class]
            merged_sim1 = [tup[1] for tup in leq_class[0:1]]
            np_merged_score_im1 = numpy.array(merged_score_im1, dtype='float32')
            np_merged_sim1 = numpy.array(merged_sim1)
            # arithmetic mean
            _avg_sim1 = numpy.mean(np_merged_sim1, axis=0)
            # geometric mean, does not work
            #_avg_sim1 = numpy.power(numpy.prod(np_merged_sim1, axis=0),
            #                        1.0 / np_merged_sim1.shape[0])
            # harmonic mean
            #_avg_sim1 = np_merged_sim1.shape[0] / numpy.sum(1.0 / np_merged_sim1, axis=0)
            # weighted harmonic mean
            #assert(np_merged_sim1.shape[0] == np_merged_score_im1.shape[0])
            #_avg_sim1 = numpy.sum(np_merged_score_im1, axis=0) / numpy.sum(
            #    np_merged_score_im1[:, None, None] / np_merged_sim1, axis=0)
            # weighted mean
            #exp_score_im1 = numpy.exp(np_merged_score_im1 -
            #                          numpy.max(np_merged_score_im1, axis=0))
            #softmax_score_im1 = exp_score_im1 / exp_score_im1.sum()
            #_avg_sim1 = numpy.sum(softmax_score_im1[:, None, None] * np_merged_sim1,
            #                      axis=0)
            # quadratic mean, does not work
            #_avg_sim1 = numpy.power(numpy.mean(numpy.power(np_merged_sim1, 2), axis=0),
            #                        1.0 / np_merged_sim1.shape[0])
        # for tup in leq_class: watch the attention prob pi dist here ...
        if self.lm is not None and bidx >= 4:
            # TODO: sort the row dimension by the language-model word distribution
            debug('sort by lm: -3 -2 -1 => {} {} {}'.format(y_im3, y_im2, y_im1))
            if self.ngram == 2:
                gram = [y_im1]
            elif self.ngram == 3:
                gram = [y_im1] if y_im2 == -1 else [y_im2, y_im1]
            elif self.ngram == 4:
                gram = [y_im1] if (y_im3 == -1 and y_im2 == -1) else (
                    [y_im2, y_im1] if y_im3 == -1 else [y_im3, y_im2, y_im1])
            else:
                raise NotImplementedError
            lm_next_logps, next_ids = vocab_prob_given_ngram(
                self.lm, gram, self.tvcb, self.tvcb_i2w)
            np_lm_next_neg_logps = -numpy.asarray(lm_next_logps)
            np_next_ids = numpy.asarray(next_ids)
            _next_krank_ids = part_sort(np_lm_next_neg_logps, self.k - cnt_transed)
            _cube_lm_krank_ces_i = np_lm_next_neg_logps[_next_krank_ids]
            _next_krank_wids = np_next_ids[_next_krank_ids]
            for idx in gram:
                _log(idx if idx == -1 else self.tvcb_i2w[idx] + ' ', nl=False)
            _log('=> ', nl=False)
            for wid in _next_krank_wids:
                _log('{}({}) '.format(self.tvcb_i2w[wid],
                                      np_lm_next_neg_logps[wid]), nl=False)
            _log('')
            self.pop_subcube_approx_cache.append(None)
        else:
            # TODO: sort the row dimension by average scores
            debug('sort by average scores')
            _y_emb_im1, _avg_hi = self.fn_nh(y_im1, _avg_sim1)
            _, _avg_ai = self.fn_na(self.context, self.uh, _avg_hi)
            _avg_si = self.fn_ns(_avg_hi, _avg_ai)
            _avg_moi = self.fn_mo(_y_emb_im1, _avg_ai, _avg_si)
            _avg_scores_i = self.fn_pws(_avg_moi, self.ptv)  # the larger the better
            _avg_ces_i = self.fn_ce(_avg_scores_i).flatten()
            _next_krank_wids = part_sort(_avg_ces_i, self.k - cnt_transed)
            _cube_krank_scores_i = _cube_krank_ces_ith = _avg_ces_i[_next_krank_wids]
            self.pop_subcube_approx_cache.append(
                (_y_emb_im1, _avg_hi, _avg_ai, _avg_si, _avg_moi,
                 _avg_scores_i, _next_krank_wids, _cube_krank_ces_ith))
        self.push_subcube_approx_cache.append(None)

        # compare against None explicitly: truth-testing a numpy array raises
        # "The truth value of an array with more than one element is ambiguous"
        for i, tup in enumerate(leq_class):
            subcube.append([
                tup + (_avg_sim1,
                       None if _cube_lm_krank_ces_i is None
                       else _cube_lm_krank_ces_i[j],
                       None if _cube_krank_scores_i is None
                       else _cube_krank_scores_i[j],
                       wid, i, j, whichsubcub, each_subcube_rowsz)
                for j, wid in enumerate(_next_krank_wids)
            ])
            subcube_line_cache.append(None)
        cube.append(subcube)
        self.subcube_lines_cache.append(subcube_line_cache)
    self.printCube(cube)
    return cube
def create_cube(self, bidx, eq_classes):
    # eq_classes: (score_im1, y_im1, hi, ai, loc_in_prevb) NEW
    cube = []
    cnt_transed = len(self.translations)
    for whichsubcub, leq_class in eq_classes.iteritems():   # sub cube
        each_subcube_rowsz = len(leq_class)
        score_im1_r0, s_im1_r0, y_im1, y_im2, y_im3, _ = leq_class[0]
        subcube = []
        subcube_line_mergeout = []
        _avg_si, _avg_hi, _avg_ai, _avg_scores_i = None, None, None, None
        _cube_lm_krank_ces_i, _cube_krank_scores_i = None, None
        if each_subcube_rowsz == 1:
            _avg_sim1 = s_im1_r0
        else:
            merged_sim1 = [tup[1] for tup in leq_class]
            _avg_sim1 = numpy.mean(numpy.array(merged_sim1), axis=0)
        # for tup in leq_class: watch the attention prob pi dist here ...
        if self.lm is not None and bidx >= 4:
            # TODO: sort the row dimension by the language-model word distribution
            debug('sort by lm: -3 -2 -1 => {} {} {}'.format(y_im3, y_im2, y_im1))
            if self.ngram == 2:
                gram = [y_im1]
            elif self.ngram == 3:
                gram = [y_im1] if y_im2 == -1 else [y_im2, y_im1]
            elif self.ngram == 4:
                gram = [y_im1] if (y_im3 == -1 and y_im2 == -1) else (
                    [y_im2, y_im1] if y_im3 == -1 else [y_im3, y_im2, y_im1])
            else:
                raise NotImplementedError
            lm_next_logps, next_wids = vocab_prob_given_ngram(
                self.lm, gram, self.tvcb, self.tvcb_i2w)
            np_lm_next_logps = numpy.asarray(lm_next_logps)
            np_next_wids = numpy.asarray(next_wids)
            np_lm_next_neg_logps = -np_lm_next_logps
            _next_krank_ids = part_sort(np_lm_next_neg_logps, self.k - cnt_transed)
            _cube_lm_krank_ces_i = np_lm_next_neg_logps[_next_krank_ids]
            _next_krank_wids = np_next_wids[_next_krank_ids]
            for idx in gram:
                _log(idx if idx == -1 else self.tvcb_i2w[idx] + ' ', nl=False)
            _log('=> ', nl=False)
            for wid in _next_krank_wids:
                _log('{}({}) '.format(self.tvcb_i2w[wid],
                                      np_lm_next_neg_logps[wid]), nl=False)
            _log('')
            self.approx_items.append(None)
        else:
            # TODO: sort the row dimension by average scores
            debug('sort by average scores')
            _y_emb_im1, _avg_hi = self.fn_nh(y_im1, _avg_sim1)
            _, _avg_ai = self.fn_na(self.context, self.uh, _avg_hi)
            _avg_si = self.fn_ns(_avg_hi, _avg_ai)
            _avg_moi = self.fn_mo(_y_emb_im1, _avg_ai, _avg_si)
            _avg_scores_i = self.fn_pws(_avg_moi, self.ptv)  # the larger the better
            _avg_scores_i_flat = _avg_scores_i.flatten()
            _next_krank_ids = part_sort(-_avg_scores_i_flat, self.k - cnt_transed)
            _next_krank_wids = _next_krank_ids
            _cube_krank_scores_i = _avg_scores_i_flat[_next_krank_wids]
            #_avg_ces_i = self.fn_ce(_avg_scores_i).flatten()
            #_cube_krank_scores_i = _avg_ces_i[_next_krank_wids]
            self.approx_items.append((_y_emb_im1, _avg_hi, _avg_ai, _avg_si,
                                      _avg_moi, _avg_scores_i, _next_krank_wids))

        # compare against None explicitly: truth-testing a numpy array raises
        # "The truth value of an array with more than one element is ambiguous"
        for i, tup in enumerate(leq_class):
            subcube.append([
                tup + (_avg_sim1,
                       None if _cube_lm_krank_ces_i is None
                       else _cube_lm_krank_ces_i[j],
                       None if _cube_krank_scores_i is None
                       else _cube_krank_scores_i[j],
                       wid, i, j, whichsubcub, each_subcube_rowsz)
                for j, wid in enumerate(_next_krank_wids)
            ])
            subcube_line_mergeout.append(None)
        cube.append(subcube)
        self.cube_lines_mergeout.append(subcube_line_mergeout)

    # print the created cube before generating the current beam, for debugging ...
    debug('\n################################ CUBE ################################')
    nsubcube = len(cube)
    debug('MERGE => ', nl=False)
    for subcube_id in xrange(nsubcube):
        nmergings = len(cube[subcube_id])
        debug('{} '.format(nmergings), nl=False)
    debug('')
    for subcube_id in xrange(nsubcube):
        subcube = cube[subcube_id]
        nmergings = len(subcube)
        debug('Group: {} contains {} mergings:'.format(subcube_id, nmergings))
        for mergeid in xrange(nmergings):
            line_in_subcube = subcube[mergeid]
            first_item = line_in_subcube[0]
            score_im1, y_im1 = first_item[0], first_item[2]
            y_im1_w = None if y_im1 == -1 else self.tvcb_i2w[y_im1]
            debug('{}={}({: >7}) => '.format(y_im1, y_im1_w,
                                             format(score_im1, '0.2f')), nl=False)
            for cubetup in line_in_subcube:
                # cube tuple tail layout: (..., lm_ce, model_score, wid, i, j,
                # whichsubcub, rowsz), hence the negative indices below
                wid = cubetup[-5]
                lm_score = cubetup[-7]
                model_score = cubetup[-6]
                debug('{}={}({: >5}&+{: >5}={: >5}) | '.format(
                    wid, self.tvcb_i2w[wid],
                    None if lm_score is None else format(lm_score, '0.2f'),
                    None if model_score is None else format(model_score, '0.2f'),
                    None if model_score is None else format(score_im1 + model_score,
                                                            '0.2f')), nl=False)
            debug('')
    debug('######################################################################')
    return cube
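# `debug` and `_log` are the repo's logging helpers, defined elsewhere; the
# call sites above rely only on an optional `nl` flag that suppresses the
# trailing newline. A minimal stand-in sketch (the verbosity gating of the
# real `debug` is an assumption):
import sys

DEBUG = True  # assumed module-level verbosity flag

def _log(msg, nl=True):
    sys.stderr.write('{}{}'.format(msg, '\n' if nl else ''))

def debug(msg, nl=True):
    if DEBUG:
        _log(msg, nl=nl)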
def create_cube_batch(self, bidx, eq_classes):
    # eq_classes: (score_im1, y_im1, hi, ai, loc_in_prevb) NEW
    cube = []
    cnt_transed = len(self.translations)
    batch_y_im1, batch_s_im1, batch_y_emb = [], [], []
    for whichsubcub, leq_class in eq_classes.iteritems():   # sub cube
        each_subcube_rowsz = len(leq_class)
        score_im1_r0, s_im1_r0, y_im1, y_emb_im1, _ = leq_class[0]
        if len(s_im1_r0.shape) == 2:
            s_im1_r0 = s_im1_r0[0]
        _cube_krank_scores_i = None
        batch_y_im1.append(y_im1)
        batch_s_im1.append(s_im1_r0)
        batch_y_emb.append(y_emb_im1[0])
        self.push_subcube_approx_cache.append(None)

    np_batch_s_im1 = numpy.array(batch_s_im1, dtype='float32')
    #np_batch_y_im1 = numpy.array(batch_y_im1)
    np_batch_y_emb = numpy.array(batch_y_emb, dtype='float32')
    subcube_num = len(batch_y_im1)
    ctx = numpy.tile(self.context, [subcube_num, 1])
    uh = numpy.tile(self.uh, [subcube_num, 1])
    if np_batch_s_im1.shape[0] == 1 and 3 == len(np_batch_s_im1.shape):
        np_batch_s_im1 = np_batch_s_im1[0]
    _avg_si, _avg_hi, _avg_ai, _avg_scores_i = None, None, None, None
    if self.ifsplit:
        _avg_hi = self.fn_nh(np_batch_y_emb, np_batch_s_im1)
        _, _avg_ai = self.fn_na(ctx, uh, _avg_hi)
        next_states = self.fn_ns(_avg_hi, _avg_ai)
        _avg_moi = self.fn_mo(np_batch_y_emb, _avg_ai, next_states)
        _avg_scores_i = self.fn_pws(_avg_moi, self.ptv)  # the larger the better
        next_probs = self.fn_ce(_avg_scores_i)
    else:
        next_probs, next_states = self.fn_next(*[batch_y_im1, ctx, np_batch_s_im1])

    for which in range(len(eq_classes)):
        _avg_sim1, leq_class, next_prob = \
            batch_s_im1[which], eq_classes[which], next_probs[which]
        _avg_si = next_states if len(next_states) == 1 else next_states[which]
        each_subcube_rowsz = len(leq_class)
        next_prob_flat = next_prob.flatten()
        _next_krank_wids = part_sort(-next_prob_flat,
                                     self.k - len(self.translations))
        k_avg_loss_flat = -numpy.log(next_prob_flat[_next_krank_wids])
        self.pop_subcube_approx_cache.append((_avg_ai, _avg_si, k_avg_loss_flat))

        subcube = []
        subcube_line_cache = []
        for i, tup in enumerate(leq_class):
            #if i > 1: break
            subcube.append([
                tup + (_avg_sim1, k_avg_loss_flat[j], wid, i, j, which,
                       each_subcube_rowsz)
                for j, wid in enumerate(_next_krank_wids)
            ])
            subcube_line_cache.append(None)
        cube.append(subcube)
        self.subcube_lines_cache.append(subcube_line_cache)
    self.printCube(cube)
    return cube
def original_trans(self, x):
    x = x[0] if self.ifvalid else x  # numpy ndarray
    # subdict set [0, 2, 6, 29999, 333]
    self.ptv = numpy.asarray(x[1], dtype='int32') \
        if self.ifvalid and self.ifmv else None

    # k is the beam size we have
    x = numpy.asarray(x, dtype='int64')
    if x.ndim == 1:
        x = x[None, :]
    src_sent_len = x.shape[1]
    maxlen = src_sent_len * 2
    x = x.T

    sample = []
    sample_score = []

    live_k = 1
    dead_k = 0

    hyp_samples = [[]] * live_k
    hyp_scores = numpy.zeros(live_k).astype('float32')
    hyp_states = []

    # get initial state of decoder rnn and encoder context
    s_im1, ctx0, c_x0 = self.fn_init(x)
    y_im1 = [-1]  # indicator for the first target word (bos target)

    for ii in xrange(maxlen):
        # (src_sent_len, 1, 2*src_nhids) -> (src_sent_len, live_k, 2*src_nhids)
        ctx = numpy.tile(ctx0, [live_k, 1])
        debug('ctx')
        debug(ctx)
        c_x = numpy.tile(c_x0, [live_k, 1])
        debug('y_im1.................................................')
        debug(y_im1)
        debug('s_im1.................................................')
        debug(s_im1)
        yemb_im1, hi = self.fn_nh(y_im1, s_im1)
        debug('hi.................................................')
        debug(hi)
        pi, ai = self.fn_na(ctx, c_x, hi)
        debug('pi.................................................')
        debug(pi)
        debug('ai.................................................')
        debug(ai)
        s_im1 = s_i = self.fn_ns(hi, ai)  # note, s_im1 should be updated!
        debug('si')
        debug(s_i)
        mo = self.fn_mo(yemb_im1, ai, s_i)
        next_scores = self.fn_pws(mo, self.ptv)  # the larger the better
        next_ces = -next_scores if self.ifscore else self.fn_ce(next_scores)
        #cand_scores = hyp_scores[:, None] - numpy.log(next_scores)
        cand_scores = hyp_scores[:, None] + next_ces
        debug(str(ii) + ' ===============================================')
        debug('ce... i')
        debug(next_ces)
        cand_flat = cand_scores.flatten()
        # ranks_flat = cand_flat.argsort()[:(k - dead_k)]
        # we do not need to generate k candidates here: only k - dead_k more
        # candidates ending with eos are needed, so each previous candidate
        # only needs to expand k - dead_k ways
        ranks_flat = part_sort(cand_flat, self.k - dead_k)
        # print ranks_flat, cand_flat[ranks_flat[1]], cand_flat[ranks_flat[8]]
        voc_size = next_scores.shape[1]
        trans_indices = ranks_flat // voc_size
        word_indices = ranks_flat % voc_size
        costs = cand_flat[ranks_flat]
        debug('ce... prev i')
        debug(costs)

        new_hyp_samples = []
        new_hyp_scores = numpy.zeros(self.k - dead_k).astype('float32')
        new_hyp_states = []
        for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)):
            new_hyp_samples.append(hyp_samples[ti] + [wi])
            new_hyp_scores[idx] = copy.copy(costs[idx])
            new_hyp_states.append(copy.copy(s_i[ti]))  # here should be s_i !!!

        # check the finished samples
        new_live_k = 0
        hyp_samples = []
        hyp_scores = []
        hyp_states = []
        # current beam: a hypothesis ending with eos is finished and leaves the beam
        for idx in xrange(len(new_hyp_samples)):
            if new_hyp_samples[idx][-1] == self.eos_id:
                sample.append(new_hyp_samples[idx])
                sample_score.append(new_hyp_scores[idx])
                # print new_hyp_scores[idx], new_hyp_samples[idx]
                dead_k += 1
            else:
                new_live_k += 1
                hyp_samples.append(new_hyp_samples[idx])
                hyp_scores.append(new_hyp_scores[idx])
                hyp_states.append(new_hyp_states[idx])
        hyp_scores = numpy.array(hyp_scores)
        live_k = new_live_k

        debug('hyp_scores... prev i')
        debug(hyp_scores)
        debug('hyp_samples... prev i')
        for hyp_sample in hyp_samples:
            debug(hyp_sample)

        if new_live_k < 1:
            break
        if dead_k >= self.k:
            break

        y_im1 = numpy.array([w[-1] for w in hyp_samples])
        s_im1 = numpy.array(hyp_states)

    if live_k > 0:
        for idx in xrange(live_k):
            sample.append(hyp_samples[idx])
            sample_score.append(hyp_scores[idx])

    lengths = numpy.array([len(s) for s in sample])
    if self.ifnorm:
        avg_sample_score = sample_score / lengths
    else:
        avg_sample_score = sample_score
    sidx = numpy.argmin(avg_sample_score)

    best_sum_loss = sample_score[sidx]
    best_avg_loss = avg_sample_score[sidx]
    best_trans = sample[sidx]

    _log('@source length[{}], translation length(with eos)[{}], maxlen[{}], '
         'avg loss[{}]={}/{}'.format(src_sent_len, len(best_trans), maxlen,
                                     avg_sample_score[sidx], sample_score[sidx],
                                     lengths[sidx]))
    _log('init[{}] nh[{}] na[{}] ns[{}] mo[{}] ws[{}] ps[{}] p[{}]'.format(
        *self.lqc))
    return _filter_reidx(self.bos_id, self.eos_id, best_trans, self.tvcb_i2w,
                         self.ifmv, self.ptv)
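# `_filter_reidx` is imported from the repo's utilities; the sketch below is an
# assumption about its behavior and return value, not the actual helper: map
# sub-vocabulary indices back to full-vocabulary ids when a manual vocabulary
# is in use (ifmv), drop BOS/EOS, and return the id sequence with its words.
def _filter_reidx(bos_id, eos_id, best_trans, tvcb_i2w, ifmv=False, ptv=None):
    if ifmv and ptv is not None:
        best_trans = [ptv[wid] for wid in best_trans]  # sub-vocab -> full vocab
    ids = [wid for wid in best_trans if wid != bos_id and wid != eos_id]
    return ids, [tvcb_i2w[wid] for wid in ids]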
sys.stderr.write('use {}-gram language model\n'.format(lm.order))
state_in = kenlm.State()
lm.NullContextWrite(state_in)
v_prev_ngram_w = ['it', 'is', 'revealed']
v_prev_ngram_w = ['bolivia', 'holds', 'presidential', 'and']
v_prev_ngram_w = ['organization', 'of', 'american', 'states']
v_prev_ngram_w = ['according', 'the']
probs, wids = vocab_prob_given_ngram(
    lm, v_prev_ngram_w, trg_vocab, trg_vocab_i2w, given=False, wid=False)
np_probs = numpy.asarray(probs)
np_wids = numpy.asarray(wids)
probs_id = part_sort(-np_probs, 10)
# print probs_id
print np_probs[probs_id]
print np_wids[probs_id]
for i in np_wids[probs_id]:
    print trg_vocab_i2w[i],
# print probs
'''
i = 0
_k_rank_idx = part_sort(nprobs, 10)
_k_ith_neg_log_prob = nprobs[_k_rank_idx]
print _k_ith_neg_log_prob
for idx in _k_rank_idx:
    print words[idx],
print
'''
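# `vocab_prob_given_ngram` is imported from elsewhere in the repo; the sketch
# below is an assumption about its behavior, not the actual helper: score every
# target-vocabulary word with KenLM given an n-gram prefix, returning the
# log10-probabilities and the word ids. The `given` flag's real meaning is
# unknown and it is ignored here; `wid` is guessed to mean the prefix arrives
# as word ids (as in create_cube) rather than strings (as in the test above).
def vocab_prob_given_ngram(lm, gram, trg_vocab, trg_vocab_i2w,
                           given=True, wid=True):
    prefix = [trg_vocab_i2w[g] for g in gram] if wid else gram
    state = kenlm.State()
    lm.NullContextWrite(state)          # start from the null (non-BOS) context
    for w in prefix:
        out_state = kenlm.State()
        lm.BaseScore(state, w, out_state)
        state = out_state
    logps, wids = [], []
    for i in xrange(len(trg_vocab_i2w)):  # aligned so arrays index by word id
        out_state = kenlm.State()
        logps.append(lm.BaseScore(state, trg_vocab_i2w[i], out_state))
        wids.append(i)
    return logps, wids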
def beam_search_comb(self, np_src_sent):
    maxlen = self.maxlen
    hyp_scores = np.zeros(1).astype('float32')
    s_init, ctx0, c_x0 = self.fn_init(np_src_sent)  # np_src_sent (sl, 1), beam == 1
    detail = False
    y_emb_im1 = self.fn_emb([-1])
    init_beam_sm(self.beam, cnt=maxlen, init_state=s_init[0],
                 init_y_emb_im1=y_emb_im1)
    for i in range(1, maxlen + 1):  # beam search here
        if (i - 1) % 10 == 0:
            debug(str(i - 1))
        prevb = self.beam[i - 1]
        len_prevb = len(prevb)
        cands = []
        # batch states of previous beam
        s_im1 = np.array([b[1] for b in prevb])
        yemb_im1 = np.array([b[3][0] for b in prevb])
        # (src_sent_len, 1, 2*src_nhids) -> (src_sent_len, len_prevb, 2*src_nhids)
        context = np.tile(ctx0, [len_prevb, 1])
        c_x = np.tile(c_x0, [len_prevb, 1])
        if self.ifsplit:
            #yemb_im1, hi = self.fn_nh(y_im1, s_im1)
            #pi, ai = self.fn_na(context, c_x, hi)
            #si = self.fn_ns(hi, ai)
            #mo = self.fn_mo(yemb_im1, ai, si)
            #next_scores = self.fn_pws(mo, self.ptv)
            #next_probs = -next_scores if self.ifscore else self.fn_ce(next_scores)
            hi = self.fn_nh(yemb_im1, s_im1)
            pi, ai = self.fn_na(context, c_x, hi)
            si = self.fn_ns(hi, ai)
            mo = self.fn_mo(yemb_im1, ai, si)
            next_scores = self.fn_pws(mo, self.ptv)
            next_probs = -next_scores if self.ifscore else self.fn_ce(next_scores)
        else:
            y_im1 = np.array([b[2] for b in prevb])
            next_probs, si = self.fn_next(*[y_im1, context, s_im1])
        next_ces = -np.log(next_probs)
        cand_scores = hyp_scores[:, None] + next_ces
        cand_scores_flat = cand_scores.flatten()
        ranks_flat = part_sort(cand_scores_flat, self.k - len(self.translations))
        voc_size = next_ces.shape[1]
        prevb_id = ranks_flat // voc_size
        word_indices = ranks_flat % voc_size
        costs = cand_scores_flat[ranks_flat]
        for b in zip(costs, si[prevb_id], word_indices, prevb_id):
            if b[2] == self.eos_id:
                if self.ifnorm:
                    self.translations.append(((b[0] / i), b[0]) + b[2:] + (i, ))
                else:
                    self.translations.append((b[0], ) + b[2:] + (i, ))
                if len(self.translations) == self.k:
                    # output sentence, early stop, best one in k
                    debug('early stop! see {} samples ending with EOS.'.format(
                        self.k))
                    avg_bp = format(self.locrt[0] / self.locrt[1], '0.3f')
                    debug('average location of back pointers [{}/{}={}]'.format(
                        self.locrt[0], self.locrt[1], avg_bp))
                    sorted_samples = sorted(self.translations,
                                            key=lambda tup: tup[0])
                    best_sample = sorted_samples[0]
                    debug('translation length(with EOS) [{}]'.format(
                        best_sample[-1]))
                    for sample in sorted_samples:  # tuples
                        debug('{}'.format(sample))
                    return back_tracking(self.beam, best_sample, detail)
            else:
                # should be calculated when generating the item in current beam
                self.locrt[0] += (b[-1] + 1)
                self.locrt[1] += 1
                self.beam[i].append((b[0], b[1], b[2], self.fn_emb([b[2]]), b[3]))
        debug('beam {} ----------------------------'.format(i))
        for b in self.beam[i]:
            debug(b[0:1] + b[2:])  # do not output state
        hyp_scores = np.array([b[0] for b in self.beam[i]])

    # no early stop, back tracking
    avg_bp = format(self.locrt[0] / self.locrt[1], '0.3f')
    debug('average location of back pointers [{}/{}={}]'.format(
        self.locrt[0], self.locrt[1], avg_bp))
    if len(self.translations) == 0:
        debug('no early stop, no candidate ends with EOS, selecting from '
              'len {} candidates, may not end with EOS.'.format(maxlen))
        best_sample = ((self.beam[maxlen][0][0], ) +
                       self.beam[maxlen][0][2:] + (maxlen, ))
        debug('translation length(with EOS) [{}]'.format(best_sample[-1]))
        return back_tracking(self.beam, best_sample, detail)
    else:
        debug('no early stop, fewer than {} candidates end with EOS, selecting '
              'the best sample ending with EOS from {} samples.'.format(
                  self.k, len(self.translations)))
        sorted_samples = sorted(self.translations, key=lambda tup: tup[0])
        best_sample = sorted_samples[0]
        debug('translation length(with EOS) [{}]'.format(best_sample[-1]))
        for sample in sorted_samples:  # tuples
            debug('{}'.format(sample))
        return back_tracking(self.beam, best_sample, detail)
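# `init_beam_sm` is defined elsewhere in the repo; a sketch of the assumed
# behavior: make `beam` hold `cnt` + 1 empty per-step lists and seed step 0
# with the BOS hypothesis. The 5-tuple layout (score, state, y, y_emb, backptr)
# matches how beam_search_comb above and beam_search below unpack beam entries.
def init_beam_sm(beam, cnt=50, init_state=None, init_y_emb_im1=None):
    del beam[:]                      # reset in place; callers keep the reference
    for _ in xrange(cnt + 1):
        beam.append([])
    beam[0].append((0.0, init_state, -1, init_y_emb_im1, 0))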
def beam_search(self, np_src_sent):
    maxlen = self.maxlen
    s_init, context, c_x = self.fn_init(np_src_sent)  # np_src_sent (sl, 1), beam == 1
    # (1, trg_nhids), (src_len, 1, src_nhids*2)
    detail = False
    y_emb_im1 = self.fn_emb([-1])
    init_beam_sm(self.beam, cnt=maxlen, init_state=s_init,
                 init_y_emb_im1=y_emb_im1)
    for i in range(1, maxlen + 1):
        if (i - 1) % 10 == 0:
            debug(str(i - 1))
        cands = []
        for j in xrange(len(self.beam[i - 1])):  # size of last beam
            # (45.32, (beam, trg_nhids), -1, 0)
            #accum_loss_im1, accum_im1, _, s_im1, y_im1, bp_im1 = self.beam[i - 1][j]
            accum_im1, s_im1, y_im1, yemb_im1, bp_im1 = self.beam[i - 1][j]
            if self.ifsplit:
                #yemb_im1, hi = self.fn_nh(y_im1, s_im1)
                #pi, ai = self.fn_na(context, c_x, hi)  # pi: (src_len, ) sum == 1
                #si = self.fn_ns(hi, ai)
                #mo = self.fn_mo(yemb_im1, ai, si)
                #next_scores = self.fn_pws(mo, self.ptv)
                #next_probs = -next_scores if self.ifscore else self.fn_ce(next_scores)
                hi = self.fn_nh(yemb_im1, s_im1)
                _, ai = self.fn_na(context, c_x, hi)  # pi: (src_len, ) sum == 1
                si = self.fn_ns(hi, ai)
                mo = self.fn_mo(yemb_im1, ai, si)
                next_scores = self.fn_pws(mo, self.ptv)
                next_probs = -next_scores if self.ifscore else self.fn_ce(
                    next_scores)
            else:
                next_probs, si = self.fn_next(*[y_im1, context, s_im1])
            next_ces = -np.log(next_probs)
            next_ces_flat = next_ces.flatten()  # (1, vocsize) -> (vocsize, )
            ranks_idx_flat = part_sort(next_ces_flat,
                                       self.k - len(self.translations))
            #ranks_idx_flat = part_sort(next_ces_flat, self.k)
            k_avg_loss_flat = next_ces_flat[ranks_idx_flat]  # -log_p_y_given_x
            # for idx in ranks_idx_flat:
            #     print self.tvcb_i2w[idx],
            # print '\n'
            accum_i = accum_im1 + k_avg_loss_flat
            #accum_loss_i = self.loss_with_nlcp(accum_i, pi, bp_im1, j, i)
            #cands += [(accum_loss_i[idx], accum_i[idx], pi, si, wid, j)
            #          for idx, wid in enumerate(ranks_idx_flat)]
            cands += [(accum_i[idx], si, wid, self.fn_emb([wid]), j)
                      for idx, wid in enumerate(ranks_idx_flat)]
        k_ranks_flat = part_sort(
            np.asarray([cand[0] for cand in cands] + [np.inf]),
            self.k - len(self.translations))
        #k_ranks_flat = part_sort(np.asarray(
        #    [cand[0] for cand in cands] + [np.inf]), self.k)
        k_sorted_cands = [cands[r] for r in k_ranks_flat]
        for b in k_sorted_cands:
            if b[2] == self.eos_id:
                debug('add: {}'.format(((b[0] / i), b[0]) + b[-2:] + (i, )))
                if self.ifnorm:
                    self.translations.append(((b[0] / i), b[0]) + b[-2:] + (i, ))
                else:
                    self.translations.append((b[0], ) + b[-2:] + (i, ))
                if len(self.translations) == self.k:
                    # output sentence, early stop, best one in k
                    debug('early stop! see {} samples ending with EOS.'.format(
                        self.k))
                    avg_bp = format(self.locrt[0] / self.locrt[1], '0.3f')
                    debug('average location of back pointers [{}/{}={}]'.format(
                        self.locrt[0], self.locrt[1], avg_bp))
                    sorted_samples = sorted(self.translations,
                                            key=lambda tup: tup[0])
                    best_sample = sorted_samples[0]
                    debug('translation length(with EOS) [{}]'.format(
                        best_sample[-1]))
                    for sample in sorted_samples:  # tuples
                        debug('{}'.format(sample))
                    return back_tracking(self.beam, best_sample, detail)
            else:
                # should be calculated when generating the item in current beam
                self.locrt[0] += (b[-1] + 1)
                self.locrt[1] += 1
                self.beam[i].append(b)
        debug('beam {} ----------------------------'.format(i))
        for b in self.beam[i]:
            debug(b[0:2] + b[-2:])  # do not output state

    # no early stop, back tracking
    avg_bp = format(self.locrt[0] / self.locrt[1], '0.3f')
    debug('average location of back pointers [{}/{}={}]'.format(
        self.locrt[0], self.locrt[1], avg_bp))
    if len(self.translations) == 0:
        debug('no early stop, no candidate ends with EOS, selecting from '
              'len {} candidates, may not end with EOS.'.format(maxlen))
        best_sample = ((self.beam[maxlen][0][0], ) +
                       self.beam[maxlen][0][-2:] + (maxlen, ))
        debug('translation length(with EOS) [{}]'.format(best_sample[-1]))
        return back_tracking(self.beam, best_sample, detail)
    else:
        debug('no early stop, fewer than {} candidates end with EOS, selecting '
              'the best sample ending with EOS from {} samples.'.format(
                  self.k, len(self.translations)))
        sorted_samples = sorted(self.translations, key=lambda tup: tup[0])
        best_sample = sorted_samples[0]
        debug('translation length(with EOS) [{}]'.format(best_sample[-1]))
        for sample in sorted_samples:  # tuples
            debug('{}'.format(sample))
        return back_tracking(self.beam, best_sample, detail)
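# `back_tracking` is defined elsewhere in the repo; the sketch below assumes
# the layouts used above: best_sample ends with (..., wid, backptr, step) as in
# beam_search_comb, and each beam entry is (score, state, wid, emb, backptr).
# It follows back pointers from the final step to recover the word-id sequence
# (EOS included at the end).
def back_tracking(beam, best_sample, detail=False):
    wid, bp, step = best_sample[-3], best_sample[-2], best_sample[-1]
    seq = [wid]
    for i in reversed(xrange(1, step)):  # beam[0] holds only the BOS seed
        score, state, w, emb, prev_bp = beam[i][bp]
        seq.append(w)
        bp = prev_bp
    return seq[::-1]                     # source-order word ids, EOS last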