def get_article_inputs(self, article):
  """Convert a single article into a padded, batch-tiled encoder input."""
  pad_id = self._vocab.WordToId(data.PAD_TOKEN)
  article_sentences = [sent.strip() for sent in
                       data.ToSentences(article, include_token=False)]

  enc_inputs = []
  # Convert first N sentences to word IDs, stripping existing <s> and </s>.
  for i in xrange(min(self._max_article_sentences, len(article_sentences))):
    enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)

  # Truncate over-long input so the batch assignment below cannot overflow.
  if len(enc_inputs) > self._hps.enc_timesteps:
    enc_inputs = enc_inputs[:self._hps.enc_timesteps]

  # Now len(enc_inputs) should be <= enc_timesteps.
  enc_input_len = len(enc_inputs)

  # Pad if necessary.
  while len(enc_inputs) < self._hps.enc_timesteps:
    enc_inputs.append(pad_id)

  # Tile the single example across every row of the batch.
  enc_batch = np.zeros(
      (self._hps.batch_size, self._hps.enc_timesteps), dtype=np.int32)
  enc_input_lens = np.zeros((self._hps.batch_size), dtype=np.int32)
  for i in xrange(self._hps.batch_size):
    enc_input_lens[i] = enc_input_len
    enc_batch[i, :] = enc_inputs[:]

  return (enc_batch, enc_input_lens)
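# A minimal, self-contained sketch of the truncate/pad/tile pattern used in
# get_article_inputs, assuming only numpy. The pad_and_tile name and all the
# example values are illustrative stand-ins for the self._vocab / self._hps
# fields above, not part of the original code.
import numpy as np

def pad_and_tile(ids, enc_timesteps, batch_size, pad_id):
  # Record the true (unpadded) length first, capped at enc_timesteps.
  true_len = min(len(ids), enc_timesteps)
  # Truncate if too long, then pad with pad_id up to enc_timesteps.
  ids = ids[:enc_timesteps] + [pad_id] * (enc_timesteps - true_len)
  # Tile the same example across every row of the batch.
  batch = np.tile(np.asarray(ids, dtype=np.int32), (batch_size, 1))
  lens = np.full((batch_size,), true_len, dtype=np.int32)
  return batch, lens

# Example: a 3-token input padded to 6 timesteps and tiled across 4 rows.
# pad_and_tile([7, 8, 9], enc_timesteps=6, batch_size=4, pad_id=0)
# -> batch.shape == (4, 6), lens == [3, 3, 3, 3]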
def _convertInputToModelTensor(self, article):
  """Convert a raw article string into (enc_inputs, enc_input_len, text)."""
  article_sentences = [sent.strip() for sent in
                       data.ToSentences(article, include_token=False)]

  pad_id = self._vocab.WordToId(data.PAD_TOKEN)
  enc_inputs = []
  # Convert first N sentences to word IDs, stripping existing <s> and </s>.
  for i in xrange(min(FLAGS.max_article_sentences, len(article_sentences))):
    enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)

  # Truncate over-long input to enc_timesteps.
  if len(enc_inputs) > self._hps.enc_timesteps:
    tf.logging.warning('Truncating the example - too long.\nenc:%d',
                       len(enc_inputs))
    enc_inputs = enc_inputs[:self._hps.enc_timesteps]

  # Now len(enc_inputs) should be <= enc_timesteps.
  enc_input_len = len(enc_inputs)

  # Pad if necessary.
  while len(enc_inputs) < self._hps.enc_timesteps:
    enc_inputs.append(pad_id)

  return (enc_inputs, enc_input_len, ' '.join(article_sentences))
def _Decode(self, article_text):
  """Run beam-search decoding over a single article string.

  Args:
    article_text: Raw input text to decode.

  Returns:
    A (QUESTION, ANSWER) tuple: the input text and the decoded output.
  """
  bs = beam_search.BeamSearch(
      self._model, self._hps.batch_size,
      self._vocab.WordToId(data.SENTENCE_START),
      self._vocab.WordToId(data.SENTENCE_END),
      self._hps.dec_timesteps)

  # Wrap the raw text in the corpus markup, then split it back into sentences.
  article = '<d><p><s>' + article_text + '</s></p></d>'
  article_sentences = [sent.strip() for sent in
                       data.ToSentences(article, include_token=False)]

  pad_id = self._vocab.WordToId(data.PAD_TOKEN)
  enc_inputs = []
  for i in xrange(min(100, len(article_sentences))):
    enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)
  enc_input_len = len(enc_inputs)
  while len(enc_inputs) < self._hps.enc_timesteps:
    enc_inputs.append(pad_id)

  # Tile the single example across the batch_size rows the model expects.
  article_batch_cp = [enc_inputs for _ in xrange(self._hps.batch_size)]
  article_lens_cp = [enc_input_len for _ in xrange(self._hps.batch_size)]

  best_beam = bs.BeamSearch(self._sess, article_batch_cp, article_lens_cp)[0]
  decode_output = [int(t) for t in best_beam.tokens[1:]]

  QUESTION = article_text
  answer = ' '.join(data.Ids2Words(decode_output, self._vocab))
  # Keep only the text before the first </s>, if one was generated.
  end_p = answer.find(data.SENTENCE_END, 0)
  if end_p != -1:
    answer = answer[:end_p]
  ANSWER = answer.replace('<UNK>', '')
  return QUESTION, ANSWER
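# A hedged sketch of the decode post-processing in _Decode: join decoded
# words into text, cut at the first sentence-end marker, and drop <UNK>
# placeholders. postprocess is an illustrative helper, not part of the model
# code; the token constants mirror data.SENTENCE_END / the <UNK> placeholder.
SENTENCE_END = '</s>'
UNKNOWN_TOKEN = '<UNK>'

def postprocess(decoded_words):
  text = ' '.join(decoded_words)
  # Keep only the text before the first </s>, if one was generated.
  end_p = text.find(SENTENCE_END)
  if end_p != -1:
    text = text[:end_p]
  # Remove unknown-word placeholders, as _Decode does for ANSWER.
  return text.replace(UNKNOWN_TOKEN, '').strip()

# Example:
# postprocess(['a', 'class', 'is', 'a', 'template', '</s>', 'junk'])
# -> 'a class is a template'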
def _FillInputQueue(self):
  """Fill input queue with ModelInput."""
  start_id = self._vocab.WordToId(data.SENTENCE_START)
  end_id = self._vocab.WordToId(data.SENTENCE_END)
  pad_id = self._vocab.WordToId(data.PAD_TOKEN)
  input_gen = self._TextGenerator(data.ExampleGen(self._data_path))
  while True:
    (article, abstract) = input_gen.next()
    article_sentences = [sent.strip() for sent in
                         data.ToSentences(article, include_token=False)]
    abstract_sentences = [sent.strip() for sent in
                          data.ToSentences(abstract, include_token=False)]

    enc_inputs = []
    # Use the <s> as the <GO> symbol for decoder inputs.
    dec_inputs = [start_id]

    # Convert first N sentences to word IDs, stripping existing <s> and </s>.
    for i in xrange(min(self._max_article_sentences,
                        len(article_sentences))):
      enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)
    for i in xrange(min(self._max_abstract_sentences,
                        len(abstract_sentences))):
      dec_inputs += data.GetWordIds(abstract_sentences[i], self._vocab)

    # Filter out too-short input.
    if (len(enc_inputs) < self._hps.min_input_len or
        len(dec_inputs) < self._hps.min_input_len):
      tf.logging.warning('Drop an example - too short.\nenc:%d\ndec:%d',
                         len(enc_inputs), len(dec_inputs))
      continue
    # If we're not truncating input, throw out too-long input.
    if not self._truncate_input:
      if (len(enc_inputs) > self._hps.enc_timesteps or
          len(dec_inputs) > self._hps.dec_timesteps):
        tf.logging.warning('Drop an example - too long.\nenc:%d\ndec:%d',
                           len(enc_inputs), len(dec_inputs))
        continue
    # If we are truncating input, do so if necessary.
    else:
      if len(enc_inputs) > self._hps.enc_timesteps:
        enc_inputs = enc_inputs[:self._hps.enc_timesteps]
      if len(dec_inputs) > self._hps.dec_timesteps:
        dec_inputs = dec_inputs[:self._hps.dec_timesteps]

    # targets is dec_inputs without <s> at beginning, plus </s> at end.
    targets = dec_inputs[1:]
    targets.append(end_id)

    # Now len(enc_inputs) should be <= enc_timesteps, and
    # len(targets) = len(dec_inputs) should be <= dec_timesteps.
    enc_input_len = len(enc_inputs)
    dec_output_len = len(targets)

    # Pad if necessary.
    while len(enc_inputs) < self._hps.enc_timesteps:
      enc_inputs.append(pad_id)
    while len(dec_inputs) < self._hps.dec_timesteps:
      dec_inputs.append(end_id)
    while len(targets) < self._hps.dec_timesteps:
      targets.append(end_id)

    element = ModelInput(enc_inputs, dec_inputs, targets, enc_input_len,
                         dec_output_len, ' '.join(article_sentences),
                         ' '.join(abstract_sentences))
    self._input_queue.put(element)
def _FillInputQueue(self):
  """Fill input queue with ModelInput.

  Token constants used below:
    SENTENCE_START = '<s>'
    SENTENCE_END = '</s>'
    UNKNOWN_TOKEN = '<UNK>'
    PAD_TOKEN = '<PAD>'
  """
  start_id = self._vocab.WordToId(data.SENTENCE_START)
  end_id = self._vocab.WordToId(data.SENTENCE_END)
  pad_id = self._vocab.WordToId(data.PAD_TOKEN)
  input_gen = self._TextGenerator(data.ExampleGen(self._data_path))
  while True:
    (article, abstract) = six.next(input_gen)
    # Yields individual sentences, each starting with <s> and ending with
    # </s>; with include_token=False those markers are stripped off.
    article_sentences = [sent.strip() for sent in
                         data.ToSentences(article, include_token=False)]
    abstract_sentences = [sent.strip() for sent in
                          data.ToSentences(abstract, include_token=False)]

    enc_inputs = []
    # Use the <s> as the <GO> symbol for decoder inputs, i.e. prepend <s>
    # to the decoder input sequence.
    dec_inputs = [start_id]

    # Convert first N sentences to word IDs, stripping existing <s> and </s>.
    for i in xrange(min(self._max_article_sentences,
                        len(article_sentences))):
      # Turn one sentence into a vector of word ids.
      enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)
    for i in xrange(min(self._max_abstract_sentences,
                        len(abstract_sentences))):
      dec_inputs += data.GetWordIds(abstract_sentences[i], self._vocab)

    # Filter out too-short input.
    if (len(enc_inputs) < self._hps.min_input_len or
        len(dec_inputs) < self._hps.min_input_len):
      tf.logging.warning('Drop an example - too short.\nenc:%d\ndec:%d',
                         len(enc_inputs), len(dec_inputs))
      continue
    # Input too long: drop it unless truncation is enabled.
    if not self._truncate_input:
      if (len(enc_inputs) > self._hps.enc_timesteps or
          len(dec_inputs) > self._hps.dec_timesteps):
        tf.logging.warning('Drop an example - too long.\nenc:%d\ndec:%d',
                           len(enc_inputs), len(dec_inputs))
        continue
    # If we are truncating input, do so if necessary.
    else:
      if len(enc_inputs) > self._hps.enc_timesteps:
        enc_inputs = enc_inputs[:self._hps.enc_timesteps]
      if len(dec_inputs) > self._hps.dec_timesteps:
        dec_inputs = dec_inputs[:self._hps.dec_timesteps]

    # targets is dec_inputs without <s> at the beginning, plus </s> at the
    # end: the decoder input starts with <s>, the target ends with </s>.
    targets = dec_inputs[1:]
    targets.append(end_id)

    # Now len(enc_inputs) should be <= enc_timesteps, and
    # len(targets) = len(dec_inputs) should be <= dec_timesteps.
    enc_input_len = len(enc_inputs)
    dec_output_len = len(targets)

    # Pad if shorter than the fixed lengths: dec_inputs is [<s>, ...],
    # targets is [..., </s>].
    while len(enc_inputs) < self._hps.enc_timesteps:
      enc_inputs.append(pad_id)  # <PAD>; enc_inputs contains no <s>/</s>.
    while len(dec_inputs) < self._hps.dec_timesteps:
      dec_inputs.append(end_id)
    while len(targets) < self._hps.dec_timesteps:
      targets.append(end_id)

    # Put the namedtuple on the queue: enc_inputs is the encoder input,
    # dec_inputs is the decoder input, targets is the decoder's output target.
    element = ModelInput(enc_inputs, dec_inputs, targets, enc_input_len,
                         dec_output_len, ' '.join(article_sentences),
                         ' '.join(abstract_sentences))
    self._input_queue.put(element)
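# A minimal sketch of how decoder inputs and targets relate in both
# _FillInputQueue variants above, assuming toy integer ids for <s>, </s>,
# and <PAD>. make_decoder_pair and the example values are illustrative
# stand-ins for the vocab lookups; targets are dec_inputs shifted left by
# one with </s> appended, and both are padded out to dec_timesteps.
def make_decoder_pair(abstract_ids, dec_timesteps, start_id, end_id):
  dec_inputs = [start_id] + abstract_ids   # <s> acts as the <GO> symbol.
  targets = dec_inputs[1:] + [end_id]      # Shift left, close with </s>.
  # Pad both sequences with </s>, matching the queue-filling code above.
  dec_inputs += [end_id] * (dec_timesteps - len(dec_inputs))
  targets += [end_id] * (dec_timesteps - len(targets))
  return dec_inputs, targets

# Example with start_id=1, end_id=2 and abstract ids [5, 6, 7]:
# make_decoder_pair([5, 6, 7], dec_timesteps=6, start_id=1, end_id=2)
# dec_inputs -> [1, 5, 6, 7, 2, 2]
# targets    -> [5, 6, 7, 2, 2, 2]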