def _get_top_k_sequences(self, log_probs, wordpiece_mask, k):
    batch_size = log_probs.size()[0]
    seq_length = log_probs.size()[1]
    beam_search = BeamSearch(self._end_index,
                             max_steps=seq_length,
                             beam_size=k,
                             per_node_beam_size=self._per_node_beam_size)
    # Add low log probabilities for the start and end tags used in the beam search.
    beam_log_probs = torch.nn.functional.pad(
        log_probs, pad=(0, 2, 0, 0, 0, 0), value=-1e7
    )
    start_predictions = beam_log_probs.new_full(
        (batch_size,), fill_value=self._start_index).long()
    # Shape: (batch_size, beam_size, seq_length)
    top_k_predictions, seq_log_probs = beam_search.search(
        start_predictions,
        {
            'log_probs': beam_log_probs,
            'wordpiece_mask': wordpiece_mask,
            'step_num': beam_log_probs.new_zeros((batch_size,)).long()
        },
        self.take_step)
    # Get rid of start and end tags if they slipped in.
    top_k_predictions[top_k_predictions > 2] = 0
    return top_k_predictions
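# Hedged sketch (an assumption, not from the source): a `take_step` callback
# with the (last_predictions, state) -> (log_probs, state) signature that an
# AllenNLP-style BeamSearch.search expects. The state keys match the dict
# passed above; the lookup logic is illustrative only.
def take_step(self, last_predictions, state):
    # All beam entries advance in lock step, so any element of step_num works.
    t = int(state['step_num'][0].item())
    # state['log_probs'] has shape (group_size, seq_length, num_tags + 2).
    next_log_probs = state['log_probs'][:, t]
    state = dict(state, step_num=state['step_num'] + 1)
    return next_log_probs, state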
def beam_sample(self, image_features, beam_size=5):
    batch_size = image_features.size(0)
    beam_searcher = BeamSearch(beam_size, batch_size, 17)
    # Initialize the LSTM states from the image features and repeat them
    # across the beam dimension.
    states = self.init_hidden_noise(image_features)
    states = (states[0].repeat(1, beam_size, 1).cuda(),
              states[1].repeat(1, beam_size, 1).cuda())
    # Embed the start symbol.
    words_feed = self.embed.word_embeddings([self.embed.START_SYMBOL] * batch_size) \
        .repeat(beam_size, 1).unsqueeze(1).cuda()
    outcaps = []  # guard against an unbound name if the loop never finishes a beam
    for i in range(self.max_sentence_length):
        hidden, states = self.lstm(words_feed, states)
        outputs = self.output_linear(hidden.squeeze(1))
        beam_indices, words_indices = beam_searcher.expand_beam(outputs=outputs)
        if len(beam_indices) == 0 or i == 15:
            generated_captions = beam_searcher.get_results()[:, 0]
            outcaps = self.embed.words_from_indices(
                generated_captions.cpu().numpy())
            break  # all beams finished (or the step limit was reached)
        else:
            words_feed = torch.stack([
                self.embed.word_embeddings_from_indices(words_indices)
            ]).view(beam_size, 1, -1).cuda()
    return " ".join(outcaps)  # .split(self.embed.END_SYMBOL)[0]
def beam_search(self, input_seq, beam_size=3, attentionOverrideMap=None,
                correctionMap=None, unk_map=None, beam_length=0.5,
                beam_coverage=0.5, max_length=MAX_LENGTH):
    torch.set_grad_enabled(False)

    input_seqs = [indexes_from_sentence(self.input_lang, input_seq)]
    input_lengths = [len(seq) for seq in input_seqs]
    input_batches = Variable(torch.LongTensor(input_seqs)).transpose(0, 1)

    if use_cuda:
        input_batches = input_batches.cuda()

    self.encoder.train(False)
    self.decoder.train(False)

    encoder_outputs, encoder_hidden = self.encoder(input_batches,
                                                   input_lengths, None)
    decoder_hidden = encoder_hidden

    beam_search = BeamSearch(self.decoder, encoder_outputs, decoder_hidden,
                             self.output_lang, beam_size, attentionOverrideMap,
                             correctionMap, unk_map, beam_length=beam_length,
                             beam_coverage=beam_coverage)
    result = beam_search.search()

    self.encoder.train(True)
    self.decoder.train(True)
    torch.set_grad_enabled(True)

    # Return a list of indexes, one for each word in the sentence, plus EOS.
    return result
def rnn_generate(gen_input_file, model_path, max_gen_len, beam_size,
                 word_dict_file):
    """
    Use an RNN model to generate sequences.

    :param gen_input_file: input file with one source sequence per line.
    :type gen_input_file: str
    :param model_path: path of the trained model.
    :type model_path: str
    :param max_gen_len: the maximum number of words to generate.
    :type max_gen_len: int
    :param beam_size: beam width.
    :type beam_size: int
    :param word_dict_file: path of the word dictionary ("{word: id}").
    :type word_dict_file: str
    :return: None; prediction results are saved to the output file.
    """
    assert os.path.exists(gen_input_file), "test file does not exist!"
    assert os.path.exists(model_path), "trained model does not exist!"
    assert os.path.exists(word_dict_file), "word dictionary file does not exist!"

    # load word dictionary
    word_2_ids = load_dict(word_dict_file)
    try:
        UNK_ID = word_2_ids["<unk>"]
    except KeyError:
        logger.fatal("the word dictionary must contain a <unk> token!")
        sys.exit(-1)

    # initialize paddle
    paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)

    # load the trained model
    pred_words = rnn_lm(
        len(word_2_ids),
        conf.emb_dim,
        conf.hidden_size,
        conf.stacked_rnn_num,
        conf.rnn_type,
        is_infer=True)
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(model_path, "r"))
    inferer = paddle.inference.Inference(
        output_layer=pred_words, parameters=parameters)
    generator = BeamSearch(inferer, word_dict_file, beam_size, max_gen_len)

    # generate text
    with open(conf.gen_file, "r") as fin, open(conf.gen_result, "w") as fout:
        for idx, line in enumerate(fin):
            fout.write("%d\t%s" % (idx, line))
            for gen_res in generator.gen_a_sentence([
                    word_2_ids.get(w, UNK_ID)
                    for w in line.lower().strip().split()
            ]):
                fout.write("%s\n" % gen_res)
            fout.write("\n")
def main():
    english = tokenize("data/100ktok.low.en")
    spanish = tokenize("data/100ktok.low.es")
    training_set, held_out_set, test_set = get_datasets(english, spanish)
    translations = get_word_translations("100000_trans.txt")
    search = BeamSearch(training_set, held_out_set, translations)
    print search.translate(test_set[8])
def main(): """ Creates a temporary file for the given input which is used to create a dataset, that is then evaluated on the given model. The generated summary is printed to standard out. """ args, unknown_args = prepare_arg_parser().parse_known_args() model_file = args.model_file with suppress_stdout_stderr(): model, _optimizer, vocab, _stats, cfg = train.load_model( model_file, unknown_args ) _, filename = tempfile.mkstemp() try: with open(filename, "a") as f: input_ = sys.stdin.read() article = preprocess.parse(input_) print(f"{article}\tSUMMARY_STUB", file=f) with suppress_stdout_stderr(): dataset = Dataset(filename, vocab, cfg) batch = next(dataset.generator(1, cfg.pointer)) # don't enforce any min lengths (useful for short cmdline summaries") setattr(cfg, "min_summary_length", 1) bs = BeamSearch(model, cfg=cfg) summary = evaluate.batch_to_text(bs, batch)[0] print(f"SUMMARY:\n{summary}") finally: os.remove(filename)
def translate(self, docs):
    """Translate a batch of documents."""
    batch_size = docs.inp.size(0)
    spt_ids = self.spt_ids_C
    decode_strategy = BeamSearch(self.beam_size, batch_size, self.n_best,
                                 self.min_length, self.max_length, spt_ids,
                                 self.eos_mapping)
    return self._translate_batch_with_strategy(docs, decode_strategy)
def beam_search(self, initial_sequence, forbid_movies=None, temperature=1,
                **kwargs):
    """
    Beam search sentence generation
    :param initial_sequence: list giving the initial sequence of tokens
    :param forbid_movies: optional set of movie ids that must not be generated
    :param kwargs: additional parameters to pass to model forward pass
        (e.g. a conditioning context)
    :return: the final list of beams
    """
    beam_search = BeamSearch(self.beam_size, initial_sequence,
                             self.word2id["</s>"])
    beams = beam_search.beams
    if forbid_movies is None:
        forbid_movies = set()  # guard: .union below requires a set
    for i in range(self.max_sequence_length):
        # compute probabilities for each beam
        probabilities = []
        for beam in beams:
            # add batch dimension
            model_input = Variable(torch.LongTensor(beam.sequence)).unsqueeze(0)
            if self.model.cuda_available:
                model_input = model_input.cuda()
            beam_forbidden_movies = forbid_movies.union(beam.mentioned_movies)
            prob = self.model(input=model_input,
                              lengths=[len(beam.sequence)],
                              log_probabilities=False,
                              forbid_movies=beam_forbidden_movies,
                              temperature=temperature,
                              **kwargs)
            # get probabilities for the next token to generate
            probabilities.append(prob[0, -1, :].cpu())
        # update beams
        beams = beam_search.search(probabilities,
                                   n_gram_block=self.n_gram_block)
        # replace movie names with the corresponding words
        for beam in beams:
            if beam.sequence[-1] > len(self.word2id):
                # update the list of movies mentioned, to prevent repeated
                # recommendations
                beam.mentioned_movies.add(beam.sequence[-1] - len(self.word2id))
                beam.sequence[-1:] = replace_movie_with_words(
                    beam.sequence[-1], self.movieId2name, self.word2id)
    return beams
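# Hedged sketch (an assumption, not from the source): the minimal Beam record
# the loop above relies on -- a token sequence, its score, and the set of
# movie ids already mentioned.
class Beam:
    def __init__(self, sequence, score=0.0):
        self.sequence = list(sequence)   # token ids generated so far
        self.score = score               # cumulative log-probability
        self.mentioned_movies = set()    # movie ids already produced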
def __init__(self, encoder, decoder, decoding_style="greedy",
             special_tokens_dict=None, max_decoding_steps=128, beam_width=10):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

    if decoding_style not in ["greedy", "beam_search"]:
        print(f"{decoding_style} is not an allowed parameter")
        decoding_style = "greedy"
    self.decoding_style = decoding_style

    if special_tokens_dict is None:
        self.special_tokens_dict = {"<pad>": 0, "<bos>": 1,
                                    "<eos>": 2, "<unk>": 3}
    else:
        self.special_tokens_dict = special_tokens_dict

    self.max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self.special_tokens_dict["<eos>"],
                                   max_decoding_steps, beam_width)
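# Hedged sketch (an assumption, not from the source): how an inference method
# might dispatch on `decoding_style`. `_greedy_decode` and `_beam_decode` are
# hypothetical helpers, named here only for illustration.
def generate(self, source_tokens):
    encoder_state = self.encoder(source_tokens)
    if self.decoding_style == "beam_search":
        return self._beam_decode(encoder_state)
    return self._greedy_decode(encoder_state)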
def main():
    english = tokenize("data/100ktok.low.en")
    spanish = tokenize("data/100ktok.low.es")
    training_set, test_set, translated_set = get_datasets(english, spanish)
    translations = get_word_translations("3000_trans.txt")
    search = BeamSearch(training_set, translations)
    test_output = open('trans_beam.txt', 'w')
    true_output = open('trans_true.txt', 'w')
    for i in range(len(test_set)):
        print "Translating sentence", i, "..."
        test_output.write(' '.join(search.translate(test_set[i])) + "\n")
        true_output.write(' '.join(translated_set[i]) + "\n")
    test_output.close()
    true_output.close()
def main():
    english = tokenize("data/100ktok.low.en")
    spanish = tokenize("data/100ktok.low.es")
    training_set, test_set, translated_set = get_datasets(english, spanish)
    translations = get_word_translations("3000_trans.txt")
    print "Original Sentence:", ' '.join(test_set[0])
    translator = DirectTrans(translations)
    print "Direct Translation:", ' '.join(translator.translate(test_set[0]))
    test_output = open('trans_beam.txt', 'w')
    true_output = open('trans_true.txt', 'w')
    search = BeamSearch(training_set, translations)
    print "Beam Translation:", ' '.join(search.translate(test_set[0]))
    print "True Translation:", ' '.join(translated_set[0])
def train(dataset, dataval, y_max_length, steps_per_epoch, vocab, params):
    print("training")
    print(params.__dict__)
    # seq2seq = Seq2seq_attention(len(vocab), params)
    seq2seq = Seq2seq_attention(
        len(vocab), params,
        embedding_matrix=fasttext_embedding(params, sentences=None))
    if params.finetune:
        seq2seq.restore_checkpoint()
        seq2seq.encoder.embedding.trainable = True
        seq2seq.decoder.embedding.trainable = True
        seq2seq.decoder.fc1.trainable = True
    else:
        seq2seq.encoder.embedding.trainable = False
        seq2seq.decoder.embedding.trainable = False
        seq2seq.decoder.fc1.trainable = False

    # it = iter(dataval)
    # inp, out = next(it)
    beam_search = BeamSearch(seq2seq, params.beam_size, vocab.bos, vocab.eos,
                             y_max_length)
    # seq2seq.compare_input_output(inp, vocab, y_max_length, out, beam_search)
    seq2seq.summary()
    # seq2seq.encoder.summary()
    # seq2seq.decoder.summary()

    # def my_loss(truth, preds):
    #     return sum(tf_rouge_l(preds, truth, vocab.eos))

    def callback():
        print("train set:")
        it = iter(dataset.unbatch())
        for _ in range(3):
            inp, out = next(it)
            seq2seq.compare_input_output(inp, vocab, y_max_length, out,
                                         beam_search)
        print("validation set:")
        it = iter(dataval)
        for _ in range(3):
            inp, out = next(it)
            seq2seq.compare_input_output(inp, vocab, y_max_length, out,
                                         beam_search)

    # seq2seq.train_epoch(dataset, epochs, steps_per_epoch, vocab.bos,
    #                     restore_checkpoint=True, dataval=dataval, callback=None)
    seq2seq.train_epoch(dataset, params.epochs, steps_per_epoch, vocab.bos,
                        y_max_length, restore_checkpoint=params.restore,
                        dataval=dataval,
                        epoch_verbosity=params.epoch_verbosity,
                        callback=callback)
def generate(self, input_variable, batch_size):
    input_variable = input_variable.view(batch_size, -1)
    encoder_hidden = self.encoder.init_hidden(batch_size)
    encoder_outputs, encoder_hidden = self.encoder(input_variable,
                                                   encoder_hidden)
    decoder_hidden = encoder_hidden
    # start decoding from the SOS token
    decoder_inputs = [(Variable(torch.LongTensor([[SOS_token]])),
                       decoder_hidden)]
    beam = BeamSearch(self.vocab_size2, self.beam_size, decoder_hidden)
    # beam search loop
    for di in xrange(self.target_length):
        decoder_outputs = []
        for decoder_input, decoder_hidden in decoder_inputs:
            decoder_output, decoder_hidden, _ = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_outputs.append((decoder_output, decoder_hidden))
        decoder_inputs = beam.beam_search(decoder_outputs)
    return beam.generate(self.generate_num)
def eva_a_phi(phi):
    na, nnh, nh, nw = phi

    # choose a dataset to train (mscoco, flickr8k, flickr30k)
    dataset = 'mscoco'
    data_dir = osp.join(DATA_ROOT, dataset)

    from model.ra import Model

    # settings
    mb = 64       # mini-batch size
    lr = 0.0002   # learning rate
    # nh = 512    # size of LSTM's hidden state
    # nnh = 512   # hidden size of attention mlp
    # nw = 512    # size of word embedding vector
    # na = 512    # size of the region features after dimensionality reduction
    name = 'ra'   # model name; 'ra' = 'region attention'
    vocab_freq = 'freq5'  # vocabulary filtered of words with frequency < 5

    print '... loading data {}'.format(dataset)
    train_set = Reader(batch_size=mb, data_split='train',
                       vocab_freq=vocab_freq, stage='train',
                       data_dir=data_dir, feature_file='features_30res.h5',
                       topic_switch='off')
    valid_set = Reader(batch_size=1, data_split='val',
                       vocab_freq=vocab_freq, stage='val',
                       data_dir=data_dir, feature_file='features_30res.h5',
                       caption_switch='off', topic_switch='off')

    npatch, nimg = train_set.features.shape[1:]
    nout = len(train_set.vocab)
    save_dir = '{}-nnh{}-nh{}-nw{}-na{}-mb{}-V{}'.\
        format(dataset.lower(), nnh, nh, nw, na, mb, nout)
    save_dir = osp.join(SAVE_ROOT, save_dir)

    model_file, m = find_last_snapshot(save_dir, resume_training=False)
    os.system('cp model/ra.py {}/'.format(save_dir))
    logger = Logger(save_dir)
    logger.info('... building')
    model = Model(name=name, nimg=nimg, nnh=nnh, nh=nh, na=na, nw=nw,
                  nout=nout, npatch=npatch, model_file=model_file)

    # start training
    bs = BeamSearch([model], beam_size=1, num_cadidates=100, max_length=20)
    best = train(model, bs, train_set, valid_set, save_dir, lr, display=100,
                 starting=m, endding=20, validation=2000, life=10,
                 logger=logger)
    average_models(best=best, L=6, model_dir=save_dir, model_name=name + '.h5')

    # evaluation
    np.save('data_dir', data_dir)
    np.save('save_dir', save_dir)
    os.system('python valid_time.py')
    scores = np.load('scores.npy')
    running_time = np.load('running_time.npy')
    print 'cider:', scores[-1], 'B1-4,C:', scores, 'running time:', running_time

    return scores, running_time
def __init__(self, model, batch_reader, model_config, data_config, vocab,
             data_loader):
    self.model = model
    self.batch_reader = batch_reader
    self.model_config = model_config
    self.data_config = data_config
    self.vocab = vocab
    self.data_loader = data_loader
    self.saver = tf.train.Saver()
    self.session = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True))
    self.restore_model_flag = self.restore_model()
    self.bs = BeamSearch(
        self.model,
        self.model_config.beam_size,
        self.data_loader.word_to_id(self.data_config.sentence_start),
        self.data_loader.word_to_id(self.data_config.sentence_end),
        self.model_config.abstract_length)
def transformers_generate_txt(txt, params):
    """
    txt - English sentence to translate.
    """
    nlp = spacy.load('en_core_web_sm')
    txt = contractions.fix(txt)
    enc_input = tf.expand_dims(
        tf.constant([tokenize_eng[tok.text.lower()] for tok in nlp(txt)]), 0)

    depth = config['transformer']['dmodel'] / config['transformer']['num_heads']
    dmodel = config['transformer']['dmodel']
    num_blocks = config['transformer']['num_blocks']
    num_heads = config['transformer']['num_heads']

    transformer = Transformer(num_blocks=num_blocks,
                              dmodel=dmodel,
                              depth=depth,
                              num_heads=num_heads,
                              inp_vocab_size=config['dataloader']['eng_vocab'],
                              tar_vocab_size=config['dataloader']['ger_vocab'])
    trainer = TrainerTransformer(transformer, config)
    trainer.restore_checkpoint(config['transformer']['ckpt_dir'])

    dec_input = tf.reshape(tokenize_ger['<sos>'], (1, 1))
    dec_seq_mask = unidirectional_input_mask(enc_input, dec_input)
    logits = transformer(enc_input, dec_input, dec_seq_mask=dec_seq_mask)

    # beam search; `config['transformer']['k', 1]` in the original is invalid
    # dict indexing -- `.get('k', 1)` (beam width, default 1) is the likely intent
    bs = BeamSearch(config['transformer'].get('k', 1), trainer.transformer)
    sents = bs.call(enc_input, logits, params.dec_max_len)
    output = [[detokenize_ger[idx] for idx in sent] for sent in sents]
    return [" ".join(sent) for sent in output]
def parse_options():
    parser = argparse.ArgumentParser()
    Train.add_parse_options(parser)
    Encoder.add_parse_options(parser)
    AttnDecoder.add_parse_options(parser)
    Seq2SeqModel.add_parse_options(parser)
    LMModel.add_parse_options(parser)
    BeamSearch.add_parse_options(parser)

    parser.add_argument("-dev", default=False, action="store_true",
                        help="Get dev set results using the last saved model")
    parser.add_argument("-test", default=False, action="store_true",
                        help="Get test results using the last saved model")
    args = parser.parse_args()
    args = vars(args)
    return process_args(args)
def main():
    # T=5
    n_51 = Node(idx=51)
    n_52 = Node(idx=52)

    # T=4
    n_41 = Node(idx=41, next_nodes=[n_51])
    n_42 = Node(idx=42, next_nodes=[n_51])
    n_43 = Node(idx=43, next_nodes=[n_52])

    # T=3
    n_31 = Node(idx=31, next_nodes=[n_41, n_42, n_43])

    # T=2
    n_21 = Node(idx=21, next_nodes=[n_31])
    n_22 = Node(idx=22, next_nodes=[n_31])
    n_23 = Node(idx=23, next_nodes=[n_31])

    # T=1
    n_11 = Node(idx=11, next_nodes=[n_21, n_22])
    n_12 = Node(idx=12)
    n_13 = Node(idx=13, next_nodes=[n_23])

    # T=0
    n_root = Node(idx=0, next_nodes=[n_11, n_12, n_13])

    # Beam search
    n_beam = 3
    l_times = 5
    beam_search = BeamSearch(n_beam, l_times)
    seqs = beam_search.find(n_root)

    # Check nodes
    for i, seq in enumerate(seqs):
        print("Sequence {}".format(i))
        for n in seq:
            print(n.idx)
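# Hedged sketch (an assumption, not from the source): the minimal Node class
# the example above relies on -- an integer id plus outgoing edges.
class Node:
    def __init__(self, idx, next_nodes=None):
        self.idx = idx                      # node identifier
        self.next_nodes = next_nodes or []  # children reachable at the next step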
def __init__(self, embedder, hidden_dim, num_layers, dropout_p, attention,
             beam_width):
    super(LstmDecoder, self).__init__()
    self._embedder = embedder
    self._hidden_dim = hidden_dim
    self._num_layers = num_layers
    self._attention = attention

    decoder_input_dim = self._embedder.get_embed_dim()
    decoder_input_dim += self._hidden_dim  # for input feeding

    self._lstm = StackedLSTM(decoder_input_dim, self._hidden_dim, num_layers,
                             dropout_p)
    self._output_projection_layer = nn.Linear(
        self._hidden_dim, self._embedder.get_vocab_size())

    self._start_index = self._embedder.get_init_token_idx()
    self._eos_index = self._embedder.get_eos_token_idx()
    self._pad_index = self._embedder.get_pad_token_idx()
    self._max_decoding_steps = 100
    self._beam_search = BeamSearch(self._eos_index, self._max_decoding_steps,
                                   beam_width)
def beamsearch_hamcycle(pred, W, beam_size=2):
    N = W.size(-1)
    batch_size = W.size(0)
    BS = BeamSearch(beam_size, batch_size, N)
    trans_probs = pred.gather(1, BS.get_current_state())
    for step in range(N - 1):
        BS.advance(trans_probs, step + 1)
        trans_probs = pred.gather(1, BS.get_current_state())
    ends = torch.zeros(batch_size, 1).type(dtype_l)
    # extract the best paths
    Paths = BS.get_hyp(ends)
    # compute the cost of each path
    Costs = compute_cost_path(Paths, W)
    return Costs, Paths
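# Hedged usage sketch (an assumption, not from the source): decoding
# Hamiltonian cycles from a batch of predicted transition matrices. `pred`
# holds per-node transition probabilities and `W` the edge-weight matrices;
# the random inputs here are illustrative only.
import torch

batch_size, N = 8, 10
W = torch.rand(batch_size, N, N)  # edge weights (symmetric in a real setup)
pred = torch.softmax(torch.rand(batch_size, N, N), dim=-1)
costs, paths = beamsearch_hamcycle(pred, W, beam_size=2)
print(costs.shape, paths.shape)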
def __init__(self, config, train_vocab, labels_vocab, is_train=True,
             use_attention=True, beam_search=None):
    self.use_attention = use_attention
    self.beam_search = None
    if beam_search:
        self.beam_search = BeamSearch(self)
    self.config = config
    self.encoder_inputs = None
    self.decoder_inputs = None
    self.grad_norm = None
    self.train_vocab = train_vocab
    self.labels_vocab = labels_vocab
    self.build(is_train)
def __init__(self, config, train_vocab, is_train=True, use_attention=True,
             beam_search=False, bidirectional=True, pointer=True):
    self.use_attention = use_attention
    self.beam_search = None
    if beam_search:
        self.beam_search = BeamSearch(self)
    self.pointer = pointer
    self.bidirectional = bidirectional
    self.config = config
    self.config.vocab_size = len(train_vocab.tok2id) + 1
    self.encoder_inputs = None
    self.decoder_inputs = None
    self.grad_norm = None
    self.vocab = train_vocab
    self.build(is_train)
def generate_summaries(model, dataset, cfg, limit=math.inf, shuffle=False,
                       pbar=None):
    """
    Generate summaries using the given `model` on the given `dataset`.
    Expects the given model to be in eval mode.

    :param model: Use this model for evaluation
    :param dataset: The dataset to evaluate on
    :param cfg: The `Config` used for the given model, from which we get info
        on whether it uses pointer generation or not
    :param limit: Limit the pairs evaluated to this many
    :param shuffle: Whether to shuffle the dataset before yielding batches
    :param pbar: Optional progress bar (tqdm) to update with progress
    """
    batch_size = 1  # beam_search currently only supports batch_size 1
    bs = BeamSearch(model, cfg=cfg)
    references = []
    hypothesis = []
    with torch.no_grad():
        generator = dataset.generator(batch_size, cfg.pointer, shuffle)
        for idx, batch in enumerate(generator):
            hyps = batch_to_text(bs, batch)
            refs = [" ".join(e.tgt) for e in batch.examples]
            hypothesis.extend(hyps)
            references.extend(refs)
            if batch_size * idx >= limit:
                break
            if pbar is not None:
                pbar.update(batch_size)
    if pbar is not None:
        pbar.close()
    return (hypothesis, references)
def check_train_results(dataval, y_max_length, steps_per_epoch, vocab, params):
    print("check_train_results")
    seq2seq = Seq2seq_attention(
        len(vocab), params,
        embedding_matrix=fasttext_embedding(params, sentences=None))
    it = iter(dataval)
    inp, out = next(it)
    beam_search = BeamSearch(seq2seq, 9, vocab.bos, vocab.eos, y_max_length)
    # seq2seq.compare_input_output(inp, vocab, y_max_length, out, beam_search)
    seq2seq.summary()

    def callback():
        for _ in range(10):
            inp, out = next(it)
            seq2seq.compare_input_output(inp, vocab, y_max_length, out,
                                         beam_search)

    # seq2seq.train_epoch(dataset, 5, steps_per_epoch, vocab.bos,
    #                     restore_checkpoint=True, dataval=dataval, callback=None)
    # seq2seq.train_epoch(dataset, 5, steps_per_epoch, vocab.bos,
    #                     restore_checkpoint=True, dataval=dataval, callback=callback)
    seq2seq.restore_checkpoint()
    callback()
def _beam_search_decoding(self, imgs, beam_size):
    B = imgs.size(0)
    # use batch_size * beam_size as the new batch size
    imgs = tile(imgs, beam_size, dim=0)
    enc_outs, hiddens = self.model.encode(imgs)
    dec_states, O_t = self.model.init_decoder(enc_outs, hiddens)

    new_B = imgs.size(0)
    # first decoding step's input
    tgt = torch.ones(new_B, 1).long() * START_TOKEN
    beam = BeamSearch(beam_size, B)
    for t in range(self.max_len):
        tgt = beam.current_predictions.unsqueeze(1)
        dec_states, O_t, probs = self.step_decoding(
            dec_states, O_t, enc_outs, tgt)
        log_probs = torch.log(probs)

        beam.advance(log_probs)
        any_beam_is_finished = beam.is_finished.any()
        if any_beam_is_finished:
            beam.update_finished()
            if beam.done:
                break

        select_indices = beam.current_origin
        if any_beam_is_finished:
            # reorder decoder states to match the surviving beams
            h, c = dec_states
            h = h.index_select(0, select_indices)
            c = c.index_select(0, select_indices)
            dec_states = (h, c)
            O_t = O_t.index_select(0, select_indices)

    # collect the best hypothesis of each example
    formulas_idx = torch.stack([hyps[1] for hyps in beam.hypotheses], dim=0)
    results = self._idx2formulas(formulas_idx)
    return results
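# Hedged sketch (an assumption, not from the source): a `tile` helper in the
# OpenNMT style, as used above -- it repeats each batch element `count` times
# along `dim`, so a batch of size B becomes B * count, grouped per example.
def tile(x, count, dim=0):
    perm = list(range(x.dim()))
    if dim != 0:
        # move the tiled dimension to the front
        perm[0], perm[dim] = perm[dim], perm[0]
        x = x.permute(perm).contiguous()
    out_size = list(x.size())
    out_size[0] *= count
    batch = x.size(0)
    x = x.view(batch, -1) \
         .transpose(0, 1) \
         .repeat(count, 1) \
         .transpose(0, 1) \
         .contiguous() \
         .view(*out_size)
    if dim != 0:
        x = x.permute(perm).contiguous()
    return x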
def generate_beamsearch(self, src: torch.Tensor, maxlen: int, bos_index: int,
                        pad_index: int, unk_index: int, eos_index: int,
                        vocab_size: int, beam_size: int = 3,
                        no_repeat_ngram_size: int = 0):
    # obtain device information
    device = next(self.parameters()).device

    _, batch_size = src.shape
    src_key_padding_mask = (src == pad_index).T  # batch_size x srclen
    memory = self.encode(src, src_key_padding_mask=src_key_padding_mask)

    # <BOS> tgt seq for generation
    tgt = torch.LongTensor(maxlen, batch_size,
                           beam_size).fill_(pad_index).to(device)
    tgt[0, :, :] = torch.LongTensor(batch_size,
                                    beam_size).fill_(bos_index).to(device)
    scores = torch.zeros(batch_size, beam_size, maxlen).to(device)
    scores[:, :, 0] = torch.ones(batch_size, beam_size).to(device)
    active_beams = [0]  # up to beam_size beams

    search = BeamSearch(vocab_size, pad_index, unk_index, eos_index)
    ngram_blocking = NgramBlocking(no_repeat_ngram_size)

    # once a beam emits <eos>, force all following probability mass onto <eos>
    log_probs_after_eos = torch.FloatTensor(
        batch_size, beam_size, self.out_vocab_size).fill_(float("-inf")).cpu()
    log_probs_after_eos[:, :, eos_index] = 0.

    best_n_indices = tgt.new_full((batch_size, len(active_beams)), bos_index)
    for i in range(1, maxlen):
        # if every beam's last prediction is <eos>, we can leave the loop
        if (best_n_indices == eos_index).all():
            break

        # generate probabilities for all active beams (lprobs)
        lprobs = torch.zeros(batch_size, len(active_beams),
                             vocab_size).to(device)
        for j in range(len(active_beams)):
            tgt_key_padding_mask = (tgt[:i, :, active_beams[j]] ==
                                    pad_index).T  # batch_size x len(tgt)
            tgt_mask = self.transformer.generate_square_subsequent_mask(
                i).to(device)
            decode_prob = self.decode(
                tgt[:i, :, active_beams[j]],
                memory,
                tgt_mask=tgt_mask,
                tgt_key_padding_mask=tgt_key_padding_mask)
            pred_prob = self.linear(decode_prob)
            lprobs[:, j, :] = pred_prob[-1, :]

        # update lprobs for n-gram blocking
        if no_repeat_ngram_size > 0:
            for batch_idx in range(batch_size):
                for beam_idx in range(len(active_beams)):
                    lprobs[batch_idx, beam_idx] = ngram_blocking.update(
                        i - 1, tgt[:i, batch_idx, beam_idx],
                        lprobs[batch_idx, beam_idx])

        expanded_indices = best_n_indices.detach().cpu().unsqueeze(-1).expand(
            (batch_size, len(active_beams), self.out_vocab_size))
        clean_lprobs = torch.where(
            expanded_indices == eos_index,
            log_probs_after_eos[:, :len(active_beams)],
            F.log_softmax(lprobs.detach().cpu(), dim=-1))

        # run the beam search step and select the top-k beams
        best_n_scores, best_n_indices, best_n_beams = search.step(
            i, clean_lprobs,
            scores.index_select(
                1, torch.tensor(active_beams, device=device)).detach().cpu(),
            beam_size)

        # take the top results; more optimization is possible here,
        # e.g., avoiding <eos> beams
        best_n_scores = best_n_scores[:, :beam_size]
        best_n_indices = best_n_indices[:, :beam_size]
        best_n_beams = best_n_beams[:, :beam_size]

        # update results
        tgt = tgt.gather(
            2, best_n_beams.unsqueeze(0).expand(maxlen, batch_size,
                                                -1).to(device))
        tgt[i, :, :] = best_n_indices
        scores[:, :, i] = best_n_scores
        active_beams = range(beam_size)

    return tgt[:, :, 0]
def generate(self, title='晚安', genre=3):
    if self.model is None:
        raise Exception("has no model")
    temperature = 1
    topk = 15
    context_tokens = []
    assert genre in [0, 1, 2, 3]
    # the four classical genres: 5/7-character quatrain, 5/7-character regulated verse
    text_genre_list = ['五言绝句', '七言绝句', '五言律诗', '七言律诗']
    genre_code_list = ['wuyanjue', 'qiyanjue', 'wuyanlv', 'qiyanlv']
    text_genre = text_genre_list[genre]
    genre_code = genre_code_list[genre]
    ids = self.title_to_ids[text_genre]
    context_tokens.append(ids)
    context_tokens.append(100)
    context_tokens.extend(
        self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.tokenize(title)))
    context_tokens.append(4282)  # 4282 is '#'

    out = None
    while out is None:
        # Alternative generators kept for reference:
        # generator = CheckedGenerator(model=self.model, context=context_tokens,
        #                              tokenizer=self.tokenizer,
        #                              checker=self.checker, genre=genre_code,
        #                              temperature=temperature,
        #                              top_k=topk, device=self.device)
        # generator = BaseGenerator(model=self.model, context=context_tokens,
        #                           tokenizer=self.tokenizer,
        #                           temperature=temperature,
        #                           top_k=topk, device=self.device)
        # out = generator.sample_sequence()
        generator = BeamSearch(model=self.model,
                               context=context_tokens,
                               tokenizer=self.tokenizer,
                               temperature=temperature,
                               beam_size=3,
                               mode=2,
                               genre=genre,
                               top_k=topk,
                               device=self.device)
        out = generator.beam_sequence()

    out = out.tolist()
    text = self.tokenizer.convert_ids_to_tokens(out[0])
    text = text[:-1]
    text = ''.join(text)
    text = text.split('#')[-1]
    return text
logging.info('# init data')
training_iter = train_batches.make_one_shot_iterator()
val_iter = eval_batches.make_initializable_iterator()

logging.info("# Load model")
m = Transformer(hp)

# get ops
loss, train_op, global_step, train_summaries = m.train(xs, ys)
y_hat, eval_summaries = m.eval(xs, ys)

token2idx, idx2token = _load_vocab(hp.vocab)
bs = BeamSearch(m, hp.beam_size,
                list(idx2token.keys())[2],
                list(idx2token.keys())[3],
                idx2token, hp.maxlen2,
                m.x, m.decoder_inputs, m.logits)

logging.info("# Session")
saver = tf.train.Saver(max_to_keep=hp.num_epochs)
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    ckpt = tf.train.latest_checkpoint(hp.logdir)
    if ckpt is None:
        logging.info("Initializing from scratch")
        sess.run(tf.global_variables_initializer())
        save_variable_specs(os.path.join(hp.logdir, "specs"))
    else:
        saver.restore(sess, ckpt)

    summary_writer = tf.summary.FileWriter(hp.logdir, sess.graph)
def beam_search(enc_output, enc_bias, source_length):
    """beam search"""
    max_len = layers.fill_constant(shape=[1], dtype='int64', value=max_out_len)
    step_idx = layers.fill_constant(shape=[1], dtype='int64', value=0)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)

    caches_batch_size = batch_size * beam_size
    init_score = np.zeros([1, beam_size]).astype('float32')
    init_score[:, 1:] = -INF
    initial_log_probs = layers.assign(init_score)

    alive_log_probs = layers.expand(initial_log_probs, [batch_size, 1])
    # alive seq [batch_size, beam_size, 1]
    initial_ids = layers.zeros([batch_size, 1, 1], 'float32')
    alive_seq = layers.expand(initial_ids, [1, beam_size, 1])
    alive_seq = layers.cast(alive_seq, 'int64')

    enc_output = layers.unsqueeze(enc_output, axes=[1])
    enc_output = layers.expand(enc_output, [1, beam_size, 1, 1])
    enc_output = layers.reshape(enc_output, [caches_batch_size, -1, d_model])

    tgt_src_attn_bias = layers.unsqueeze(enc_bias, axes=[1])
    tgt_src_attn_bias = layers.expand(tgt_src_attn_bias,
                                      [1, beam_size, n_head, 1, 1])
    enc_bias_shape = layers.shape(tgt_src_attn_bias)
    tgt_src_attn_bias = layers.reshape(
        tgt_src_attn_bias,
        [-1, enc_bias_shape[2], enc_bias_shape[3], enc_bias_shape[4]])

    beam_search = BeamSearch(beam_size, batch_size, decode_alpha,
                             trg_vocab_size, d_model)

    caches = [{
        "k": layers.fill_constant(
            shape=[caches_batch_size, 0, d_model],
            dtype=enc_output.dtype,
            value=0),
        "v": layers.fill_constant(
            shape=[caches_batch_size, 0, d_model],
            dtype=enc_output.dtype,
            value=0)
    } for i in range(n_layer)]

    finished_seq = layers.zeros_like(alive_seq)
    finished_scores = layers.fill_constant([batch_size, beam_size],
                                           dtype='float32', value=-INF)
    finished_flags = layers.fill_constant([batch_size, beam_size],
                                          dtype='float32', value=0)

    with while_op.block():
        pos = layers.fill_constant([caches_batch_size, 1, 1],
                                   dtype='int64', value=1)
        pos = layers.elementwise_mul(pos, step_idx, axis=0)

        alive_seq_1 = layers.reshape(alive_seq, [caches_batch_size, -1])
        alive_seq_2 = alive_seq_1[:, -1:]
        alive_seq_2 = layers.unsqueeze(alive_seq_2, axes=[1])

        logits = wrap_decoder(
            trg_vocab_size, max_in_len, n_layer, n_head, d_key, d_value,
            d_model, d_inner_hid, prepostprocess_dropout, attention_dropout,
            relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing,
            embedding_sharing,
            dec_inputs=(alive_seq_2, alive_seq_2, pos, None, tgt_src_attn_bias),
            enc_output=enc_output, caches=caches, is_train=False,
            params_type=params_type)

        alive_seq_2, alive_log_probs_2, finished_seq_2, finished_scores_2, \
            finished_flags_2, caches_2 = beam_search.inner_func(
                step_idx, logits, alive_seq_1, alive_log_probs, finished_seq,
                finished_scores, finished_flags, caches, enc_output,
                tgt_src_attn_bias)

        layers.increment(x=step_idx, value=1.0, in_place=True)
        finish_cond = beam_search.is_finished(
            step_idx, source_length, alive_log_probs_2, finished_scores_2,
            finished_flags_2)

        layers.assign(alive_seq_2, alive_seq)
        layers.assign(alive_log_probs_2, alive_log_probs)
        layers.assign(finished_seq_2, finished_seq)
        layers.assign(finished_scores_2, finished_scores)
        layers.assign(finished_flags_2, finished_flags)

        for i in xrange(len(caches_2)):
            layers.assign(caches_2[i]["k"], caches[i]["k"])
            layers.assign(caches_2[i]["v"], caches[i]["v"])

        layers.logical_and(x=cond, y=finish_cond, out=cond)

    # fall back to the alive sequences for examples where no beam finished
    finished_flags = layers.reduce_sum(finished_flags, dim=1,
                                       keep_dim=True) / beam_size
    finished_flags = layers.cast(finished_flags, 'bool')
    mask = layers.cast(layers.reduce_any(input=finished_flags, dim=1,
                                         keep_dim=True), 'float32')
    mask = layers.expand(mask, [1, beam_size])
    mask2 = 1.0 - mask

    finished_seq = layers.cast(finished_seq, 'float32')
    alive_seq = layers.cast(alive_seq, 'float32')
    finished_seq = layers.elementwise_mul(finished_seq, mask, axis=0) + \
        layers.elementwise_mul(alive_seq, mask2, axis=0)
    finished_seq = layers.cast(finished_seq, 'int32')
    finished_scores = layers.elementwise_mul(finished_scores, mask, axis=0) + \
        layers.elementwise_mul(alive_log_probs, mask2)

    finished_seq.persistable = True
    finished_scores.persistable = True
    return finished_seq, finished_scores
class Prediction:
    def __init__(self, args):
        """
        :param args: parsed arguments providing the model dir (logdir) and
            the vocab file path
        """
        self.tf = import_tf(0)

        self.args = args
        self.model_dir = args.logdir
        self.vocab_file = args.vocab

        self.token2idx, self.idx2token = _load_vocab(args.vocab)

        hparams = Hparams()
        parser = hparams.parser
        self.hp = parser.parse_args()

        self.model = Transformer(self.hp)
        self._add_placeholder()
        self._init_graph()

    def _init_graph(self):
        """init graph"""
        self.ys = (self.input_y, None, None)
        self.xs = (self.input_x, None)
        self.memory = self.model.encode(self.xs, False)[0]
        self.logits = self.model.decode(self.xs, self.ys, self.memory,
                                        False)[0]

        ckpt = self.tf.train.get_checkpoint_state(
            self.model_dir).all_model_checkpoint_paths[-1]

        graph = self.logits.graph
        sess_config = self.tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True

        saver = self.tf.train.Saver()
        self.sess = self.tf.Session(config=sess_config, graph=graph)
        self.sess.run(self.tf.global_variables_initializer())
        self.tf.reset_default_graph()
        saver.restore(self.sess, ckpt)

        self.bs = BeamSearch(self.model,
                             self.hp.beam_size,
                             list(self.idx2token.keys())[2],
                             list(self.idx2token.keys())[3],
                             self.idx2token,
                             self.hp.maxlen2,
                             self.input_x,
                             self.input_y,
                             self.logits)

    def predict(self, content):
        """
        abstract prediction by beam search
        :param content: article content
        :return: prediction result
        """
        input_x = content.split()
        while len(input_x) < self.args.maxlen1:
            input_x.append('<pad>')
        input_x = input_x[:self.args.maxlen1]
        input_x = [self.token2idx.get(s, self.token2idx['<unk>'])
                   for s in input_x]

        memory = self.sess.run(self.memory,
                               feed_dict={self.input_x: [input_x]})
        return self.bs.search(self.sess, input_x, memory[0])

    def _add_placeholder(self):
        """add tensorflow placeholders"""
        self.input_x = self.tf.placeholder(dtype=self.tf.int32,
                                           shape=[None, self.args.maxlen1],
                                           name='input_x')
        self.input_y = self.tf.placeholder(dtype=self.tf.int32,
                                           shape=[None, None],
                                           name='input_y')
from model.gnic import Model
model_list = ['mscoco-gnic-nh512-nw512-mb64-V8843/gnic.h5.merge']
models = [Model(model_file=osp.join(SAVE_ROOT, m)) for m in model_list]
valid_set = Reader(batch_size=1, data_split='test', vocab_freq='freq5',
                   stage='test', data_dir=data_dir,
                   feature_file='features_1res.h5', caption_switch='off',
                   topic_switch='off', head=0, tail=1000)
bs = BeamSearch(models, beam_size=3, num_cadidates=500, max_length=20)
scores = validate(bs, valid_set)

if task == 'ss':
    from model.ss import Model
    model_list = ['mscoco-nh512-nw512-mb64-V8843/ss.h5.merge']
    models = [Model(model_file=osp.join(SAVE_ROOT, m)) for m in model_list]
    valid_set = Reader(batch_size=1, data_split='test', vocab_freq='freq5',
                       stage='val', data_dir=data_dir,
                       feature_file='features_1res.h5', topic_type='pred',
                       topic_file='lda_topics.h5',
def __init__(self):
    self.rev_vocab = self.load_char_vocab()
    # self.ckpt_file is presumably a class attribute defined elsewhere
    self.beam_search = BeamSearch(self.ckpt_file, self.rev_vocab)