def _sample(self, input_item, trng, fs_init, fs_next, gen_sample): """ Sample from model. """ # unpack input item attributes return_hyp_graph = input_item.return_hyp_graph return_alignment = input_item.return_alignment suppress_unk = input_item.suppress_unk k = input_item.k seq = input_item.seq max_ratio = input_item.max_ratio maxlen = 200 #TODO: should be configurable if max_ratio: maxlen = int(max_ratio * len(seq)) return gen_sample(fs_init, fs_next, numpy.array(seq).T.reshape( [len(seq[0]), len(seq), 1]), trng=trng, k=k, maxlen=maxlen, stochastic=False, argmax=False, return_alignment=return_alignment, suppress_unk=suppress_unk, return_hyp_graph=return_hyp_graph)
def _translate(seq_x1, seq_x2=None, seq_y2=None):
    """
    Decode ``seq_x1``, optionally together with extra sequence pairs.

    Uses ``mm``, ``m``, ``k``, ``d_maxlen``, ``normalize``, ``tparams``,
    ``funcs``, ``options`` and ``trng`` from the enclosing scope.

    :param seq_x1: source sequence, passed on as an array of shape
        ``(len(seq_x1), 1)``.
    :param seq_x2: iterable of additional sequences; only read when
        ``mm != 0``.
    :param seq_y2: iterable of additional sequences; only read when
        ``mm != 0``.
    :return: ``(sample, score, action, gating)``. For ``k > 1`` these are
        the entries at the index of the lowest score; otherwise the decoder
        outputs are returned as they came back.
    """
    def as_column(tokens):
        # 1-D token list -> array of shape (len(tokens), 1)
        return numpy.array(tokens).reshape([len(tokens), 1])

    if mm == 0:
        # Translation memory disabled: plain decoder, with all-zero
        # placeholders (one per score) for action and gating.
        sample, score = gen_sample(
            tparams, funcs['init_xy'], funcs['next_xy'],
            as_column(seq_x1), options,
            rng=trng, k=k, maxlen=d_maxlen,
            stochastic=options['stochastic'], argmax=True)
        action = [0] * len(score)
        gating = [0] * len(score)
    else:
        extra_x = [as_column(x2) for x2 in seq_x2]
        extra_y = [as_column(y2) for y2 in seq_y2]
        sample, score, action, gating = gen_sample_multi(
            tparams, funcs, as_column(seq_x1), extra_x, extra_y, options,
            rng=trng, m=m, k=k, maxlen=d_maxlen,
            stochastic=options['stochastic'], argmax=True)

    if k > 1:
        if normalize:
            # Divide each score by the length of its hypothesis (in place).
            hyp_lengths = numpy.array([len(hyp) for hyp in sample])
            score /= hyp_lengths
        best = numpy.argmin(score)
        sample = sample[best]
        score = score[best]
        action = action[best]
        gating = gating[best]

    return sample, score, action, gating
def _translate(seq):
    """
    Decode ``seq`` and return either all hypotheses or the best one.

    Uses ``fs_init``, ``fs_next``, ``trng``, ``k``, ``normalize``, ``nbest``,
    ``return_alignment``, ``suppress_unk`` and ``return_hyp_graph`` from the
    enclosing scope.

    :param seq: nested sequence; transposed and reshaped to
        ``(len(seq[0]), len(seq), 1)`` before decoding.
    :return: ``(sample, score, word_probs, alignment, hyp_graph)``. When
        ``nbest`` is falsy the first four are the entries at the index of
        the lowest score; ``hyp_graph`` is always passed through whole.
    """
    x = numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1])
    sample, score, word_probs, alignment, hyp_graph = gen_sample(
        fs_init, fs_next, x,
        trng=trng, k=k, maxlen=200,
        stochastic=False, argmax=False,
        return_alignment=return_alignment,
        suppress_unk=suppress_unk,
        return_hyp_graph=return_hyp_graph)

    if normalize:
        # Length normalization: score divided by hypothesis length.
        hyp_lengths = numpy.array([len(hyp) for hyp in sample])
        score = score / hyp_lengths

    if nbest:
        return sample, score, word_probs, alignment, hyp_graph

    best = numpy.argmin(score)
    return (sample[best], score[best], word_probs[best], alignment[best],
            hyp_graph)
def _translate(seq):
    """
    Decode ``seq`` and return the best hypothesis with its attention record.

    Uses ``tparams``, ``f_init``, ``f_next``, ``options``, ``trng``, ``k``
    and ``normalize`` from the enclosing scope.

    :param seq: token sequence, passed on as an array of shape
        ``(len(seq), 1)``.
    :return: ``(sample[best], attention)`` where ``best`` is the index of
        the lowest score and ``attention`` is ``None`` when the decoder
        returned no attention record.
    """
    x = numpy.array(seq).reshape([len(seq), 1])
    decoded = gen_sample(tparams, f_init, f_next, x, options,
                         trng=trng, k=k, maxlen=200,
                         return_attention=True,
                         stochastic=False, argmax=False,
                         normalize=normalize)
    sample, score, attention_record = decoded[0], decoded[1], decoded[2]

    if normalize:
        # Length normalization: score divided by hypothesis length.
        hyp_lengths = numpy.array([len(hyp) for hyp in sample])
        score = score / hyp_lengths

    best = numpy.argmin(score)
    attention = None if attention_record is None else attention_record[best]
    return sample[best], attention
def _translate(seq):
    """
    Decode ``seq`` and return the best hypothesis with its boundary and chunk.

    Uses ``tparams``, ``f_init``, ``f_next_chunk``, ``f_next_word``,
    ``options``, ``trng``, ``ck``, ``wk``, ``k`` and ``normalize`` from the
    enclosing scope.

    :param seq: token sequence, passed on as an array of shape
        ``(len(seq), 1)``.
    :return: ``(sample[best], boundary[best], chunk[best])`` where ``best``
        is the index of the lowest score.
    """
    # Hard-wired off; the early return below only fires if this is flipped.
    be_stochastic = False

    x = numpy.array(seq).reshape([len(seq), 1])
    sample, boundary, chunk, score = gen_sample(
        tparams, f_init, f_next_chunk, f_next_word, x, options,
        trng=trng, maxlen=200,
        k_chunk=ck, k_word=wk, k=k,
        stochastic=be_stochastic, argmax=True, jointProb=False)

    if be_stochastic:
        return sample

    if normalize:
        # Length normalization: score divided by hypothesis length.
        hyp_lengths = numpy.array([len(hyp) for hyp in sample])
        score = score / hyp_lengths

    best = numpy.argmin(score)
    return sample[best], boundary[best], chunk[best]
def _translate(seq):
    """
    Decode ``seq`` and return the hypothesis with the lowest score.

    Uses ``tparams``, ``f_init``, ``f_next``, ``options``, ``trng``, ``k``
    and ``normalize`` from the enclosing scope.

    :param seq: token sequence, passed on as an array of shape
        ``(len(seq), 1)``.
    :return: ``sample[best]``, with scores divided by hypothesis length
        first when ``normalize`` is truthy.
    """
    x = numpy.array(seq).reshape([len(seq), 1])
    sample, score = gen_sample(tparams, f_init, f_next, x, options,
                               trng=trng, k=k, maxlen=200, stochastic=False)

    if normalize:
        hyp_lengths = numpy.array([len(hyp) for hyp in sample])
        score = score / hyp_lengths

    return sample[numpy.argmin(score)]
def _translate(seq):
    """
    Decode ``seq`` and return the hypothesis with the lowest score.

    Uses ``tparams``, ``f_init``, ``f_next``, ``options``, ``trng``, ``k``
    and ``normalize`` from the enclosing scope.

    :param seq: token sequence, passed on as an array of shape
        ``(len(seq), 1)``.
    :return: ``sample[best]``, with scores divided by hypothesis length
        first when ``normalize`` is truthy.
    """
    x = numpy.array(seq).reshape([len(seq), 1])
    # Second input expected by this gen_sample variant: a one-element int32
    # array holding 1. NOTE(review): its meaning is not visible from here.
    one = numpy.array([1], dtype=numpy.int32)

    sample, score = gen_sample(tparams, f_init, f_next, x, one, options,
                               trng=trng, k=k, maxlen=200, stochastic=False)

    if normalize:
        hyp_lengths = numpy.array([len(hyp) for hyp in sample])
        score = score / hyp_lengths

    best = numpy.argmin(score)
    return sample[best]
def _translate(seq):
    """
    Decode ``seq`` and return the hypothesis with the lowest score.

    Uses ``tparams``, ``f_init``, ``f_next``, ``options``, ``trng``, ``k``
    and ``normalize`` from the enclosing scope.

    :param seq: token sequence, passed on as an array of shape
        ``(len(seq), 1)``.
    :return: ``sample[best]``, with scores divided by hypothesis length
        first when ``normalize`` is truthy.
    """
    column = numpy.array(seq).reshape([len(seq), 1])
    sample, score = gen_sample(
        tparams, f_init, f_next, column, options,
        trng=trng, k=k, maxlen=200, stochastic=False, argmax=False)

    if normalize:
        # One length per hypothesis; longer hypotheses get their score
        # scaled down proportionally.
        score = score / numpy.array([len(hyp) for hyp in sample])

    winner = numpy.argmin(score)
    return sample[winner]
def translate(self, input_text, k=16, maxlen=50):
    """
    Translate ``input_text`` and return all hypotheses ordered by score.

    :param input_text: text converted to a token sequence via
        ``self.sent2seq``.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param maxlen: forwarded to ``gen_sample`` as ``maxlen``.
    :return: list of ``(score, sentence)`` tuples in ascending score order,
        where ``sentence`` is the space-joined output of ``self.seq2words``.
    """
    seq = self.sent2seq(input_text)
    column = numpy.array(seq).reshape([len(seq), 1])
    sample, scores = gen_sample(
        self.tparams, self.f_init, self.f_next, column, self.options,
        trng=self.trng, k=k, maxlen=maxlen, stochastic=False, argmax=False)

    # argsort yields hypothesis indices from lowest to highest score.
    return [
        (scores[index], ' '.join(self.seq2words(sample[index])))
        for index in numpy.argsort(scores)
    ]
def _translate(seq):
    """
    Decode ``seq`` and return all hypotheses or only the best one.

    Uses ``fs_init``, ``fs_next``, ``trng``, ``k``, ``normalize``, ``nbest``
    and ``suppress_unk`` from the enclosing scope.

    :param seq: token sequence, passed on as an array of shape
        ``(len(seq), 1)``.
    :return: ``(sample, score)`` when ``nbest`` is truthy, otherwise the
        single hypothesis ``sample[best]`` at the index of the lowest score.
    """
    x = numpy.array(seq).reshape([len(seq), 1])
    sample, score = gen_sample(fs_init, fs_next, x,
                               trng=trng, k=k, maxlen=200,
                               stochastic=False, argmax=False,
                               suppress_unk=suppress_unk)

    if normalize:
        # Length normalization: score divided by hypothesis length.
        hyp_lengths = numpy.array([len(hyp) for hyp in sample])
        score = score / hyp_lengths

    if nbest:
        return sample, score
    return sample[numpy.argmin(score)]
def _translate(seq):
    """
    Decode a list of input sequences and return all or the best hypothesis.

    Uses ``fs_init``, ``fs_next``, ``trng``, ``k``, ``normalize``, ``nbest``,
    ``return_alignment`` and ``suppress_unk`` from the enclosing scope.

    :param seq: iterable of nested sequences; each element ``s`` is
        transposed and reshaped to ``(len(s[0]), len(s), 1)``.
    :return: ``(sample, score, word_probs, alignment)``; when ``nbest`` is
        falsy, each is the entry at the index of the lowest score.
    """
    inputs = []
    for s in seq:
        inputs.append(numpy.array(s).T.reshape([len(s[0]), len(s), 1]))

    sample, score, word_probs, alignment = gen_sample(
        fs_init, fs_next, inputs,
        trng=trng, k=k, maxlen=200,
        stochastic=False, argmax=False,
        return_alignment=return_alignment,
        suppress_unk=suppress_unk)

    if normalize:
        # Length normalization: score divided by hypothesis length.
        hyp_lengths = numpy.array([len(hyp) for hyp in sample])
        score = score / hyp_lengths

    if nbest:
        return sample, score, word_probs, alignment

    best = numpy.argmin(score)
    return sample[best], score[best], word_probs[best], alignment[best]
def _translate(seq):
    """
    Decode ``seq`` and return the ``n_best`` lowest-scoring hypotheses.

    Uses ``tparams``, ``f_init``, ``f_next``, ``options``, ``trng``, ``k``,
    ``normalize`` and ``n_best`` from the enclosing scope.

    :param seq: token sequence, passed on as an array of shape
        ``(len(seq), 1)``.
    :return: ``(samples, scores)``. For ``n_best > 1`` both are arrays
        holding the ``n_best`` entries with the lowest scores, in ascending
        score order; otherwise they are the single entry with the lowest
        score.
    """
    # sample given an input sequence and obtain scores
    sample, score = gen_sample(tparams, f_init, f_next,
                               numpy.array(seq).reshape([len(seq), 1]),
                               options, trng=trng, k=k, maxlen=200,
                               stochastic=False, argmax=False)

    # normalize scores according to sequence lengths
    if normalize:
        lengths = numpy.array([len(s) for s in sample])
        score = score / lengths

    if n_best > 1:
        sidx = numpy.argsort(score)[:n_best]
    else:
        sidx = numpy.argmin(score)

    # Hypotheses usually differ in length. NumPy >= 1.24 refuses to build an
    # array from such ragged input, so fall back to an explicit 1-D object
    # array whose elements are the individual hypotheses.
    try:
        samples = numpy.array(sample)
    except ValueError:
        samples = numpy.empty(len(sample), dtype=object)
        for i, hyp in enumerate(sample):
            samples[i] = hyp

    return samples[sidx], numpy.array(score)[sidx]
def _translate(seq):
    """
    Return either the encoder context for ``seq`` or its best translation.

    Uses ``annotations_only``, ``tparams``, ``f_init``, ``f_next``,
    ``options``, ``trng``, ``k`` and ``normalize`` from the enclosing scope.

    :param seq: token sequence, passed on as an array of shape
        ``(len(seq), 1)``.
    :return: the second value returned by ``f_init`` when
        ``annotations_only`` is truthy; otherwise the hypothesis with the
        lowest (optionally length-normalized) score.
    """
    x = numpy.array(seq).reshape([len(seq), 1])

    if annotations_only:
        # Only the context is wanted; the first value from f_init is dropped.
        _, ctx = f_init(x)
        return ctx

    sample, score = gen_sample(tparams, f_init, f_next, x, options,
                               trng=trng, k=k, maxlen=200,
                               stochastic=False, argmax=False)

    if normalize:
        hyp_lengths = numpy.array([len(hyp) for hyp in sample])
        score = score / hyp_lengths

    best = numpy.argmin(score)
    return sample[best]
def _multi_sample(self, input_item, trng, fs_init, fs_next, gen_sample): """ Sample from model. """ # unpack input item attributes return_hyp_graph = input_item.return_hyp_graph return_alignment = input_item.return_alignment suppress_unk = input_item.suppress_unk k = input_item.k seq = input_item.seq aux_seqs = input_item.aux_seq if self._options[0]['multisource_type'] == 'init-decoder': init_decoder = True else: init_decoder = False extra_xs = [ numpy.array(aux).T.reshape([len(aux[0]), len(aux), 1]) for aux in aux_seqs ] return gen_sample( fs_init, fs_next, numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]), trng=trng, k=k, maxlen=200, stochastic=False, argmax=False, return_alignment=return_alignment, suppress_unk=suppress_unk, return_hyp_graph=return_hyp_graph, extra_xs= extra_xs, #[numpy.array(aux_seq).T.reshape([len(aux_seq[0]), len(aux_seq), 1])], init_decoder=init_decoder)
def _translate(seq):
    """
    Decode an already-prepared input and return all or the best hypothesis.

    Uses ``fs_init``, ``fs_next``, ``trng``, ``k``, ``normalize``, ``nbest``,
    ``return_alignment`` and ``suppress_unk`` from the enclosing scope.

    :param seq: decoder input, handed to ``gen_sample`` without reshaping.
    :return: ``(sample, score, word_probs, alignment)``; when ``nbest`` is
        falsy, each is the entry at the index of the lowest score.
    """
    beam = int(k)  # k is coerced to int before being passed on
    sample, score, word_probs, alignment = gen_sample(
        fs_init, fs_next, seq,
        trng=trng, k=beam, maxlen=200,
        stochastic=False, argmax=False,
        return_alignment=return_alignment,
        suppress_unk=suppress_unk)

    if normalize:
        # Length normalization, applied to the score object in place.
        hyp_lengths = numpy.array([len(hyp) for hyp in sample])
        score /= hyp_lengths

    if nbest:
        return sample, score, word_probs, alignment

    best = numpy.argmin(score)
    return sample[best], score[best], word_probs[best], alignment[best]
def _sample(self, input_item, trng, fs_init, fs_next, gen_sample): """ Sample from model. """ # unpack input item attributes return_hyp_graph = input_item.return_hyp_graph return_alignment = input_item.return_alignment suppress_unk = input_item.suppress_unk k = input_item.k seq = input_item.seq return gen_sample(fs_init, fs_next, numpy.array(seq).T.reshape( [len(seq[0]), len(seq), 1]), trng=trng, k=k, maxlen=200, stochastic=False, argmax=False, return_alignment=return_alignment, suppress_unk=suppress_unk, return_hyp_graph=return_hyp_graph)
def _translate(seq, left, right, write):
    """
    Decode ``seq`` with three extra inputs and return the best hypothesis.

    Uses ``tparams``, ``f_init``, ``f_next``, ``options``, ``trng``, ``k``
    and ``normalize`` from the enclosing scope.

    :param seq: token sequence, passed on as an array of shape
        ``(len(seq), 1)``.
    :param left: array passed on with a trailing axis added
        (``left[:, :, None]``).
    :param right: array passed on with a trailing axis added
        (``right[:, :, None]``).
    :param write: array passed on unchanged.
    :return: the hypothesis with the lowest (optionally length-normalized)
        score.
    """
    # Debug trace of the input shapes. Written as a single pre-formatted
    # string so it prints the same text on Python 2 and Python 3 (the
    # original ``print a, b, c, d`` statement is a SyntaxError on Python 3).
    print('%s %s %s %s' % (left.shape, right.shape, write.shape, len(seq)))

    # sample given an input sequence and obtain scores
    sample, score = gen_sample(tparams, f_init, f_next,
                               numpy.array(seq).reshape([len(seq), 1]),
                               left[:, :, None], right[:, :, None], write,
                               options, trng=trng, k=k, maxlen=200,
                               stochastic=False, argmax=False)

    # normalize scores according to sequence lengths
    if normalize:
        lengths = numpy.array([len(s) for s in sample])
        score = score / lengths

    sidx = numpy.argmin(score)
    return sample[sidx]
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         batch_size=1, opt_base=None, normalize=False,
         output_attention=False):
    """
    Translate ``source_file`` with a saved model and write to ``saveto``.

    :param model: path of the saved parameters (opened with ``numpy.load``);
        also the source of the configuration when ``opt_base`` is ``None``.
    :param dictionary: source dictionary path; defaults to
        ``options['dictionaries'][0]`` when ``None``.
    :param dictionary_target: target dictionary path; defaults to
        ``options['dictionaries'][1]`` when ``None``.
    :param source_file: text file with one whitespace-tokenized sentence per
        line. Blank lines are skipped and produce no output line.
    :param saveto: output path; one line is written per batch.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param batch_size: number of sentences handed to ``prepare_data`` at
        once.
    :param opt_base: optional alternative path to load the configuration
        from.
    :param normalize: when true, divide scores by hypothesis length before
        picking the lowest one.
    :param output_attention: accepted for interface compatibility; not used.
    """
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # load params
    if opt_base is None:
        options = load_config(model)
    else:
        options = load_config(opt_base)

    # Optimizer state (keys starting with 'adam_') is not needed here.
    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list, '')
    tparams = init_theano_params(params)

    # load source dictionary, dropping ids outside the source vocabulary
    if dictionary is None:
        dictionary = options['dictionaries'][0]
    word_dict = load_dict(dictionary)
    n_words_src = options['n_words_src']
    if n_words_src:
        # Iterate over a snapshot: deleting while iterating a live dict view
        # raises RuntimeError on Python 3.
        for key, idx in list(word_dict.items()):
            if idx >= n_words_src:
                del word_dict[key]

    # load target dictionary and invert it (id -> word)
    if dictionary_target is None:
        dictionary_target = options['dictionaries'][1]
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.items():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    def _send_jobs(fname):
        """Read ``fname`` and return one list of word ids per non-blank line."""
        retval = []
        with open(fname, 'r') as f:
            for line in f:
                words = line.strip().split()
                if not words:
                    continue
                # Unknown words map to id 1 (UNK).
                x = [word_dict.get(w, 1) for w in words]
                # Clamp only when a vocabulary limit is configured; comparing
                # against a falsy limit would turn every id into UNK.
                if n_words_src:
                    x = [ii if ii < n_words_src else 1 for ii in x]
                retval.append(x)
        logging.info('total %s sentences' % len(retval))
        return retval

    sources = _send_jobs(source_file)

    # Chunk the sentences into consecutive batches; the final batch may be
    # shorter. Stepping the range also covers the cases where there are
    # fewer sentences than batch_size, or none at all.
    batches = [
        prepare_data(sources[start:start + batch_size])
        for start in range(0, len(sources), batch_size)
    ]

    final_sentences = []
    f_init, f_next = build_sampler(tparams, options, use_noise, trng)
    for batch in batches:
        samples, scores, _, _, _ = gen_sample(
            [f_init], [f_next], batch[0],
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False)
        if normalize:
            lengths = numpy.array([len(s) for s in samples])
            scores = scores / lengths
        # NOTE(review): exactly one hypothesis is kept per batch, so with
        # batch_size > 1 the output has fewer lines than input sentences;
        # confirm this is intended.
        final_words = samples[numpy.argmin(scores)]
        final_sentences.append(
            ' '.join([word_idict_trg[w] for w in final_words]) + '\n')

    with open(saveto, 'w') as fout:
        for sentence in final_sentences:
            fout.write(sentence)
    print('Done')