def read_data_sets(train_dir, vocab, hps):
    start_id = vocab.WordToId(data.SENTENCE_START)
    end_id = vocab.WordToId(data.SENTENCE_END)
    pad_id = vocab.WordToId(data.PAD_TOKEN)
    articles_abstracts = data.getArticlesAndAbstracts(train_dir)
    enc_inputs = np.zeros((len(articles_abstracts), hps.enc_timesteps), dtype=np.int32)
    dec_inputs = np.zeros((len(articles_abstracts), hps.dec_timesteps), dtype=np.int32)
    targets = np.zeros((len(articles_abstracts), hps.dec_timesteps), dtype=np.int32)
    origin_articles = []
    origin_abstract = []
    for index, (article, abstract) in enumerate(articles_abstracts):
        # Use the <s> as the <GO> symbol for decoder inputs.
        enc_input = data.GetWordIds(article, vocab)
        dec_input = [start_id] + data.GetWordIds(abstract, vocab)

        # Truncate to the fixed timestep budget.
        enc_input = enc_input[:hps.enc_timesteps]
        dec_input = dec_input[:hps.dec_timesteps]

        # target is dec_input without <s> at the beginning, plus </s> at the end.
        target = dec_input[1:]
        target.append(end_id)

        # Now len(enc_input) <= enc_timesteps and
        # len(target) = len(dec_input) <= dec_timesteps.
        # Pad if necessary: encoder input with <PAD>, decoder input and
        # target with </s>.
        while len(enc_input) < hps.enc_timesteps:
            enc_input.append(pad_id)
        while len(dec_input) < hps.dec_timesteps:
            dec_input.append(end_id)
        while len(target) < hps.dec_timesteps:
            target.append(end_id)

        enc_inputs[index] = enc_input
        dec_inputs[index] = dec_input
        targets[index] = target
        origin_articles.append(article)
        origin_abstract.append(abstract)
    return DataSet(enc_inputs, dec_inputs, targets, origin_articles, origin_abstract)
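# --- Example (not part of the original code): a minimal, self-contained
# sketch of the truncate/shift/pad logic above, using made-up ids
# (start=1, end=2) instead of a real vocab. `_demo_pack` is a hypothetical
# helper written only for illustration.
def _demo_pack(abstract_ids, dec_timesteps, start_id=1, end_id=2):
    dec_input = ([start_id] + abstract_ids)[:dec_timesteps]
    target = dec_input[1:] + [end_id]  # shift left, close with </s>
    while len(dec_input) < dec_timesteps:
        dec_input.append(end_id)
    while len(target) < dec_timesteps:
        target.append(end_id)
    return dec_input, target

# dec_input is the abstract shifted right by <s>; target is shifted left
# and padded with </s>:
print(_demo_pack([7, 8, 9], 6))  # ([1, 7, 8, 9, 2, 2], [7, 8, 9, 2, 2, 2])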
def _extract_we_binary(output_file, vocab_file, we_dic):
    vocab = data.Vocab(vocab_file, 1000000)
    vsize = vocab.NumIds()
    output = codecs.open(output_file, "w", "utf-8")
    unknown_ids = [vocab.WordToId(UNKNOWN_TOKEN)]
    with open(we_dic, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        print "layer1_size:", layer1_size
        for line in xrange(vocab_size):
            # Read one space-terminated word, skipping newlines.
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            idx = data.GetWordIds(word, vocab)
            # Note: as written, only the "<s>" row is ever extracted; this
            # looks like a leftover debugging condition.
            if idx is not None and idx != unknown_ids and word == "<s>":
                print idx, ":", word
                output.write(word + ' ' + ' '.join(
                    map(str, np.fromstring(f.read(binary_len), dtype='float32'))) + '\n')
            else:
                # Unknown or unwanted word: skip over its vector.
                f.read(binary_len)
    output.close()
def _loadWord2VecGo(self):
    vsize = self._vocab.NumIds()
    emb_dim = self._hps.emb_dim
    print "vsize:", vsize
    print "emb_dim:", emb_dim
    if FLAGS.word2vec:
        # Initialize the matrix with random uniform values; rows found in
        # the word2vec file are overwritten below.
        initWE = np.random.uniform(-0.25, 0.25, (vsize, emb_dim)).astype(np.float32)
        print("Load word2vec file {}\n".format(FLAGS.word2vec))
        with open(FLAGS.word2vec, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * layer1_size
            print "start to read"
            for line in xrange(vocab_size):
                # Read one space-terminated word, skipping newlines.
                word = []
                while True:
                    ch = f.read(1)
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':
                        word.append(ch)
                idx = data.GetWordIds(word, self._vocab)
                if idx is not None:
                    initWE[idx] = np.fromstring(f.read(binary_len), dtype='float32')
                else:
                    f.read(binary_len)
        print "initWE loaded:", initWE
        return initWE
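# --- Example (not part of the original code): a self-contained sketch of the
# word2vec binary format that the reader loop above parses, assuming the
# classic layout: a "<vocab_size> <dim>" header line, then per entry the word,
# a space, <dim> raw float32 values, and a newline. Words/values are made up.
import io
import numpy as np

buf = io.BytesIO()
buf.write(b"2 3\n")
for w, vec in [(b"hello", [1.0, 2.0, 3.0]), (b"world", [4.0, 5.0, 6.0])]:
    buf.write(w + b" " + np.asarray(vec, dtype=np.float32).tobytes() + b"\n")
buf.seek(0)

vocab_size, dim = map(int, buf.readline().split())
binary_len = np.dtype('float32').itemsize * dim
for _ in range(vocab_size):
    word = b""
    while True:  # read one space-terminated word, skipping newlines
        ch = buf.read(1)
        if ch == b" ":
            break
        if ch != b"\n":
            word += ch
    vec = np.frombuffer(buf.read(binary_len), dtype=np.float32)
    print("{} {}".format(word.decode(), vec))
# hello [ 1.  2.  3.]
# world [ 4.  5.  6.]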
def _loadWord2Vec(self):
    vsize = self._vocab.NumIds()
    emb_dim = self._hps.emb_dim
    print "vsize:", vsize
    print "emb_dim:", emb_dim
    if FLAGS.word2vec:
        # Initialize the matrix with random uniform values; rows found in
        # the word2vec file are overwritten below.
        initWE = np.random.uniform(-0.25, 0.25, (vsize, emb_dim)).astype(np.float32)
        print("Load word2vec file {}\n".format(FLAGS.word2vec))
        f = codecs.open(FLAGS.word2vec, "r")
        for line in f:
            # Text format: "<word> <v1> <v2> ... <vN>" per line.
            fields = line.split(" ")
            word = fields[0]
            value = " ".join(x for x in fields[1:])
            idx = data.GetWordIds(word, self._vocab)
            if idx is not None:
                initWE[idx] = np.fromstring(value, dtype='float32', sep=' ')
        f.close()
        print "initWE loaded:", initWE
        return initWE
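# --- Example (not part of the original code): the text embedding format
# parsed above is one whitespace-separated line per word. Values are made up;
# np.fromstring(..., sep=' ') is deprecated in newer numpy but matches the
# code above.
import numpy as np

line = "hello 0.1 -0.2 0.3\n"
fields = line.split(" ")
word, values = fields[0], " ".join(fields[1:])
vec = np.fromstring(values, dtype='float32', sep=' ')
print("{} {}".format(word, vec))  # hello [ 0.1 -0.2  0.3]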
def _Decode(self, article_text):
    """Decode a single article with beam search.

    Args:
      article_text: The raw question/article string to decode.

    Returns:
      A (QUESTION, ANSWER) tuple of the input text and the decoded output.
    """
    bs = beam_search.BeamSearch(
        self._model, self._hps.batch_size,
        self._vocab.WordToId(data.SENTENCE_START),
        self._vocab.WordToId(data.SENTENCE_END),
        self._hps.dec_timesteps)

    # Wrap the raw text in document/paragraph/sentence tags and encode it.
    article = "<d><p><s>" + article_text + "</s></p></d>"
    article_sentences = [
        sent.strip() for sent in data.ToSentences(article, include_token=False)
    ]
    pad_id = self._vocab.WordToId(data.PAD_TOKEN)
    enc_inputs = []
    for i in xrange(min(100, len(article_sentences))):
        enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)
    enc_input_len = len(enc_inputs)
    while len(enc_inputs) < self._hps.enc_timesteps:
        enc_inputs.append(pad_id)

    # Replicate the single example across the batch. The batch size (4) and
    # enc_timesteps (120) are hard-coded here.
    w, h = 120, 4
    article_batch_cp = [[0 for x in range(w)] for y in range(h)]
    for i in range(h):
        article_batch_cp[i] = enc_inputs
    article_lens_cp = [enc_input_len for _ in range(h)]

    best_beam = bs.BeamSearch(self._sess, article_batch_cp, article_lens_cp)[0]
    decode_output = [int(t) for t in best_beam.tokens[1:]]

    QUESTION = article_text
    answer = ' '.join(data.Ids2Words(decode_output, self._vocab))
    # Cut the decoded text at the first sentence-end token, if any, and drop
    # unknown-word placeholders.
    end_p = answer.find(data.SENTENCE_END, 0)
    if end_p != -1:
        answer = answer[:end_p]
    ANSWER = answer.replace('<UNK>', '')
    return QUESTION, ANSWER
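# --- Example (not part of the original code): the post-processing at the end
# of _Decode, shown standalone. The literal tokens mirror data.SENTENCE_END
# and the <UNK> placeholder.
decoded = "the answer is <UNK> here </s> trailing beam junk"
end_p = decoded.find("</s>")
if end_p != -1:
    decoded = decoded[:end_p]  # keep text up to the first sentence end
print(decoded.replace("<UNK>", ""))  # "the answer is  here "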
def _add_seq2seq(self):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        encoder_inputs = tf.unpack(tf.transpose(self._articles))
        decoder_inputs = tf.unpack(tf.transpose(self._abstracts))
        targets = tf.unpack(tf.transpose(self._targets))
        loss_weights = tf.unpack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        sess = tf.get_default_session()
        print sess

        with tf.variable_scope('Embedding'), tf.device('/gpu:0'):
            # Embedding shared by the inputs and outputs; not trainable, so
            # the word2vec vectors assigned below stay fixed.
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                trainable=False,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            sess.run(tf.initialize_all_variables())

            if FLAGS.word2vec:
                # Initial matrix with random uniform values; rows found in
                # the word2vec file are overwritten.
                initW = np.random.uniform(-0.25, 0.25, (vsize, hps.emb_dim))
                print("Load word2vec file {}\n".format(FLAGS.word2vec))
                with open(FLAGS.word2vec, "rb") as f:
                    header = f.readline()
                    vocab_size, layer1_size = map(int, header.split())
                    binary_len = np.dtype('float32').itemsize * layer1_size
                    for line in xrange(vocab_size):
                        word = []
                        while True:
                            ch = f.read(1)
                            if ch == ' ':
                                word = ''.join(word)
                                break
                            if ch != '\n':
                                word.append(ch)
                        idx = data.GetWordIds(word, self._vocab)
                        if idx is not None:
                            initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
                        else:
                            f.read(binary_len)
                sess.run(embedding.assign(initW))

            # Debug stub: overrides the real encoder inputs with fixed ids.
            encoder_inputs = [2, 4, 6, 8]
            emb_encoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                                  for x in encoder_inputs]
            print emb_encoder_inputs
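# --- Example (not part of the original code): a numpy sketch of what
# tf.nn.embedding_lookup does above. Each integer id selects the matching
# row of the [vsize, emb_dim] embedding matrix.
import numpy as np

embedding = np.arange(12, dtype=np.float32).reshape(6, 2)  # vsize=6, emb_dim=2
ids = [2, 4]
print(embedding[ids])  # rows 2 and 4: [[4. 5.] [8. 9.]]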
def fill_input_quest(self, quest):
    start_id = myServer.batcher._vocab.WordToId(data.SENTENCE_START)
    end_id = myServer.batcher._vocab.WordToId(data.SENTENCE_END)
    pad_id = myServer.batcher._vocab.WordToId(data.PAD_TOKEN)

    quest = ' '.join(self.get_words(quest))
    article_sentences = quest.strip()
    abstract_sentences = article_sentences

    enc_inputs = []
    # Use the <s> as the <GO> symbol for decoder inputs.
    dec_inputs = [start_id]
    enc_inputs += data.GetWordIds(article_sentences, myServer.batcher._vocab)
    dec_inputs += data.GetWordIds(abstract_sentences, myServer.batcher._vocab)

    # Truncate over-long input.
    if len(enc_inputs) > myServer.batcher._hps.enc_timesteps:
        enc_inputs = enc_inputs[:myServer.batcher._hps.enc_timesteps]
    if len(dec_inputs) > myServer.batcher._hps.dec_timesteps:
        dec_inputs = dec_inputs[:myServer.batcher._hps.dec_timesteps]

    # targets is dec_inputs without <s> at the beginning, plus </s> at the
    # end; <s> was already prepended above, </s> is appended here.
    targets = dec_inputs[1:]
    targets.append(end_id)
    enc_input_len = len(enc_inputs)
    dec_output_len = len(targets)

    # Pad if shorter than the fixed lengths: dec_inputs is [<s>, ...],
    # targets is [..., </s>].
    while len(enc_inputs) < myServer.batcher._hps.enc_timesteps:
        enc_inputs.append(pad_id)
    while len(dec_inputs) < myServer.batcher._hps.dec_timesteps:
        dec_inputs.append(end_id)
    while len(targets) < myServer.batcher._hps.dec_timesteps:
        targets.append(end_id)

    # Wrap everything in a ModelInput namedtuple for the input queue.
    element = ModelInput(enc_inputs, dec_inputs, targets, enc_input_len,
                         dec_output_len, article_sentences, abstract_sentences)
    return element
def _FillInputQueue(self):
    """Fill the input queue one example (line) at a time."""
    pad_id = self._vocab.WordToId(parameter_config.PAD_TOKEN)
    if self._hps.mode == 'train':
        input_gen = self._TextGenerator(
            data.ExampleGen(os.path.join(self._data_path, '*')))
    else:
        input_gen = self._TextGenerator(
            data.ExampleGen(os.path.join(self._data_path, '*'), 1))
    while True:
        try:
            (index_id, target, sentence) = input_gen.next()
        except (GeneratorExit, StopIteration):
            break
        enc_inputs = data.GetWordIds(sentence.strip(), self._vocab)
        target = int(target)

        # Filter out too-short input.
        if len(enc_inputs) < self._hps.min_input_len:
            continue
        # If we're not truncating input, throw out too-long input.
        if not self._truncate_input:
            if len(enc_inputs) > self._hps.enc_timesteps:
                continue
        # If we are truncating input, do so if necessary.
        else:
            if len(enc_inputs) > self._hps.enc_timesteps:
                enc_inputs = enc_inputs[:self._hps.enc_timesteps]

        enc_input_len = len(enc_inputs)
        # Pad if necessary.
        while len(enc_inputs) < self._hps.enc_timesteps:
            enc_inputs.append(pad_id)

        element = ModelInput(index_id, target, enc_inputs, enc_input_len)
        self._input_queue.put(element)
def _extract_we_text(output_file, vocab_file, we_dic):
    vocab = data.Vocab(vocab_file, 1000000)
    vsize = vocab.NumIds()
    # Track which vocab words have no embedding in the word-embedding file.
    m = copy.deepcopy(vocab._word_to_id)
    unknown_ids = [vocab.WordToId(UNKNOWN_TOKEN)]
    output = codecs.open(output_file, "w", "utf-8")
    with open(we_dic, "rb") as f:
        for line in f:
            fields = line.split(" ")
            word = fields[0].strip()
            value = " ".join(x for x in fields[1:])
            idx = data.GetWordIds(word, vocab)
            if idx is not None and idx != unknown_ids and word in m:
                del m[word]
                output.write(word + ' ' + value)
    print "====:", m
    print "---:", len(m)
    output.close()

    # This guarantees that the words in the WE file and the words in the
    # vocab file are the same: drop vocab entries (other than the structural
    # tags below) that never appeared in the embedding file.
    del m['<s>']
    del m['</s>']
    del m['<d>']
    del m['</d>']
    del m['<p>']
    del m['</p>']
    tt = m.keys()
    vocab_new = vocab_file + "_new"
    with open(vocab_file, 'r') as f:
        with open(vocab_new, 'w') as g:
            for line in f.readlines():
                # Note: this is a substring test, so a short leftover word
                # can also filter out unrelated vocab lines.
                if all(w not in line for w in tt):
                    g.write(line)
            if '<UNK>' in m:
                g.write('<UNK> 0\n')
            if '<PAD>' in m:
                g.write('<PAD> 0\n')
    shutil.move(vocab_new, vocab_file)
def _loadWord2VecGo(self, emb_dim):
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    vsize = self._vocab.NumIds()
    with tf.variable_scope('goEmbedding'), tf.device('/gpu:0'):
        embedding = tf.get_variable(
            'embedding', [vsize, emb_dim], dtype=tf.float32,
            trainable=False,
            initializer=tf.truncated_normal_initializer(stddev=1e-4))
        sess.run(tf.initialize_all_variables())
        if FLAGS.word2vec:
            # Initial matrix with random uniform values; rows found in the
            # word2vec file are overwritten.
            initW = np.random.uniform(-0.25, 0.25, (vsize, emb_dim))
            print("Load word2vec file {}\n".format(FLAGS.word2vec))
            with open(FLAGS.word2vec, "rb") as f:
                header = f.readline()
                vocab_size, layer1_size = map(int, header.split())
                binary_len = np.dtype('float32').itemsize * layer1_size
                for line in xrange(vocab_size):
                    word = []
                    while True:
                        ch = f.read(1)
                        if ch == ' ':
                            word = ''.join(word)
                            break
                        if ch != '\n':
                            word.append(ch)
                    idx = data.GetWordIds(word, self._vocab)
                    if idx is not None:
                        initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
                    else:
                        f.read(binary_len)
            # Look up one row before and after the assign to verify that the
            # word2vec weights actually took effect.
            print "embedding first loaded:", embedding
            print(sess.run(tf.nn.embedding_lookup(embedding, 2)))
            sess.run(embedding.assign(initW))
            print "function loaded:", embedding
            print(sess.run(tf.nn.embedding_lookup(embedding, 2)))
    return sess
def _loadWord2Vec(self, embedding, emb_dim):
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    vsize = self._vocab.NumIds()
    sess.run(tf.initialize_all_variables())
    if FLAGS.word2vec:
        # Initial matrix with random uniform values; rows found in the
        # word2vec file are overwritten.
        initW = np.random.uniform(-0.25, 0.25, (vsize, emb_dim))
        print("Load word2vec file {}\n".format(FLAGS.word2vec))
        f = codecs.open(FLAGS.word2vec, "r")
        for line in f:
            # Renamed from `str` to avoid shadowing the builtin.
            fields = line.split(" ")
            word = fields[0]
            value = " ".join(x for x in fields[1:])
            idx = data.GetWordIds(word, self._vocab)
            if idx is not None:
                initW[idx] = np.fromstring(value, dtype='float32', sep=' ')
        f.close()
        sess.run(embedding.assign(initW))
        sess.run(embedding)
def _add_seq2seq_old(self, sess):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        encoder_inputs = tf.unpack(tf.transpose(self._articles))
        decoder_inputs = tf.unpack(tf.transpose(self._abstracts))
        targets = tf.unpack(tf.transpose(self._targets))
        loss_weights = tf.unpack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        with tf.variable_scope('Embedding'), tf.device('/gpu:0'):
            # Embedding shared by the inputs and outputs; frozen so the
            # word2vec vectors assigned below stay fixed.
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                trainable=False,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            sess.run(tf.initialize_all_variables())

            if FLAGS.word2vec:
                initW = np.random.uniform(-0.25, 0.25, (vsize, hps.emb_dim))
                print("Load word2vec file {}\n".format(FLAGS.word2vec))
                with open(FLAGS.word2vec, "rb") as f:
                    header = f.readline()
                    vocab_size, layer1_size = map(int, header.split())
                    binary_len = np.dtype('float32').itemsize * layer1_size
                    for line in xrange(vocab_size):
                        word = []
                        while True:
                            ch = f.read(1)
                            if ch == ' ':
                                word = ''.join(word)
                                break
                            if ch != '\n':
                                word.append(ch)
                        idx = data.GetWordIds(word, self._vocab)
                        if idx is not None:
                            initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
                        else:
                            f.read(binary_len)
                # Sanity check: look up one row before and after the assign.
                print "embedding first loaded:"
                print(sess.run(tf.nn.embedding_lookup(embedding, 2)))
                sess.run(embedding.assign(initW))
                print "word2vec weights assigned:"
                print(sess.run(tf.nn.embedding_lookup(embedding, 2)))
            emb_encoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                                  for x in encoder_inputs]
            emb_decoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                                  for x in decoder_inputs]

        # Matrix-factorization sketch (commented out): run an SVD on the
        # embedded inputs, keep singular values until their share of the
        # spectrum passes a threshold, rebuild a truncated diagonal matrix,
        # and project to shrink either the embedding dim (new_embedding=u*s,
        # [vsize, 64]) or the word length (new_embedding=v*s, [N, 128]).
        ## s, u, v = tf.svd(emb_encoder_inputs, compute_uv=True)
        ## ... pick j leading singular values, build a [j, j] diagonal matrix
        ## new_eigenMatrix, then:
        ## emb_encoder_inputs = tf.batch_matmul(u[:, :j], new_eigenMatrix)

        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                # Bidirectional LSTM encoder cells with dropout.
                cell_fw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=False)
                cell_bw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=False)
                cell_fw = tf.nn.rnn_cell.DropoutWrapper(
                    cell_fw, input_keep_prob=hps.input_dropout,
                    output_keep_prob=hps.output_dropout)
                cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                    cell_bw, input_keep_prob=hps.input_dropout,
                    output_keep_prob=hps.output_dropout)
                (emb_encoder_inputs, fw_state, _) = tf.nn.bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs
        print "fw_state:", fw_state

        with tf.variable_scope('output_projection'):
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hps.mode == 'decode':
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)
            cell = tf.nn.rnn_cell.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell, input_keep_prob=hps.input_dropout,
                output_keep_prob=hps.output_dropout)
            # Each bidirectional output is [batch, 2*num_hidden]; reshaping to
            # [batch, 1, 2*num_hidden] and concatenating over timesteps yields
            # the [batch, enc_timesteps, 2*num_hidden] attention memory.
            encoder_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoder_outputs
            ]
            self._enc_top_states = tf.concat(1, encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feed.
            initial_state_attention = (hps.mode == 'decode')
            decoder_outputs, self._dec_out_state = tf.nn.seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)
            print "====emb_decoder_inputs:", emb_decoder_inputs
            print "====self._dec_in_state:", self._dec_in_state
            print "====self._enc_top_states:", self._enc_top_states
            print "====decoder_outputs:", decoder_outputs
            print "====self._dec_out_state:", self._dec_out_state

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/gpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(1, [
                    tf.reshape(x, [hps.batch_size, 1]) for x in best_outputs
                ])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/gpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    tf.logging.info('num_sampled%s', hps.num_softmax_samples)
                    return tf.nn.sampled_softmax_loss(
                        w_t, v, inputs, labels, hps.num_softmax_samples, vsize)

            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights, sampled_loss_func)
            else:
                self._loss = tf.nn.seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.scalar_summary('loss', tf.minimum(12.0, self._loss))
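# --- Example (not part of the original code): a rough numpy sketch of the
# weighted sequence loss used above. Per timestep it takes the softmax
# cross-entropy of the target id and masks padded steps with the loss
# weights; TF's sequence_loss normalizes slightly differently, so this is
# illustrative only. Logits/targets/weights below are made up.
import numpy as np

def _demo_sequence_loss(logits, targets, weights):
    # logits: list of [batch, vsize] arrays; targets/weights: lists of [batch]
    total, denom = 0.0, 0.0
    for t in range(len(logits)):
        z = logits[t] - logits[t].max(axis=1, keepdims=True)
        probs = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
        xent = -np.log(probs[np.arange(len(targets[t])), targets[t]])
        total += (xent * weights[t]).sum()
        denom += weights[t].sum()
    return total / denom

logits = [np.array([[2.0, 0.0, 0.0]]), np.array([[0.0, 2.0, 0.0]])]
targets = [np.array([0]), np.array([1])]
weights = [np.array([1.0]), np.array([0.0])]  # second step is padding
print(_demo_sequence_loss(logits, targets, weights))  # ~0.24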
def _FillInputQueue(self):
    """Fill input queue with ModelInput."""
    start_id = self._vocab.WordToId(data.SENTENCE_START)
    end_id = self._vocab.WordToId(data.SENTENCE_END)
    pad_id = self._vocab.WordToId(data.PAD_TOKEN)
    input_gen = self._TextGenerator(data.ExampleGen(self._data_path))
    while True:
        (article, abstract) = input_gen.next()
        article_sentences = [
            sent.strip()
            for sent in data.ToSentences(article, include_token=False)
        ]
        abstract_sentences = [
            sent.strip()
            for sent in data.ToSentences(abstract, include_token=False)
        ]

        enc_inputs = []
        # Use the <s> as the <GO> symbol for decoder inputs.
        dec_inputs = [start_id]

        # Convert first N sentences to word IDs, stripping existing <s> and </s>.
        for i in xrange(min(self._max_article_sentences,
                            len(article_sentences))):
            enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)
        for i in xrange(min(self._max_abstract_sentences,
                            len(abstract_sentences))):
            dec_inputs += data.GetWordIds(abstract_sentences[i], self._vocab)

        # Filter out too-short input.
        if (len(enc_inputs) < self._hps.min_input_len or
                len(dec_inputs) < self._hps.min_input_len):
            tf.logging.warning('Drop an example - too short.\nenc:%d\ndec:%d',
                               len(enc_inputs), len(dec_inputs))
            continue
        # If we're not truncating input, throw out too-long input.
        if not self._truncate_input:
            if (len(enc_inputs) > self._hps.enc_timesteps or
                    len(dec_inputs) > self._hps.dec_timesteps):
                tf.logging.warning('Drop an example - too long.\nenc:%d\ndec:%d',
                                   len(enc_inputs), len(dec_inputs))
                continue
        # If we are truncating input, do so if necessary.
        else:
            if len(enc_inputs) > self._hps.enc_timesteps:
                enc_inputs = enc_inputs[:self._hps.enc_timesteps]
            if len(dec_inputs) > self._hps.dec_timesteps:
                dec_inputs = dec_inputs[:self._hps.dec_timesteps]

        # targets is dec_inputs without <s> at beginning, plus </s> at end.
        targets = dec_inputs[1:]
        targets.append(end_id)

        # Now len(enc_inputs) should be <= enc_timesteps, and
        # len(targets) = len(dec_inputs) should be <= dec_timesteps.
        enc_input_len = len(enc_inputs)
        dec_output_len = len(targets)

        # Pad if necessary.
        while len(enc_inputs) < self._hps.enc_timesteps:
            enc_inputs.append(pad_id)
        while len(dec_inputs) < self._hps.dec_timesteps:
            dec_inputs.append(end_id)
        while len(targets) < self._hps.dec_timesteps:
            targets.append(end_id)

        element = ModelInput(enc_inputs, dec_inputs, targets, enc_input_len,
                             dec_output_len, ' '.join(article_sentences),
                             ' '.join(abstract_sentences))
        self._input_queue.put(element)
def _FillInputQueue(self):
    """Fills input queue with ModelInput."""
    # Input gets padded with pad ids; output gets a start id and is padded
    # with end ids.
    pad_id = self._input_vocab.WordToId(data.PAD_TOKEN)
    end_id = self._output_vocab.WordToId(data.SENTENCE_END)
    input_gen = self._TextGenerator(data.ExampleGen(self._data_path))
    while True:
        (source, targets) = next(input_gen)
        # target = choice(targets)
        target = targets[0]

        # Convert sentences to word IDs, stripping existing <s> and </s>.
        enc_inputs = data.GetWordIds(source, self._input_vocab)
        dec_inputs_gen = data.GetWordIds(target, self._output_vocab)
        dec_inputs_cop = data.GetWordIndices(target, source, self._input_vocab,
                                             position_based_indexing=True)

        # Filter out too-short input.
        if len(enc_inputs) < self._config.min_input_len:
            tf.logging.warning('Drop an example - input too short: %d (min: %d)',
                               len(enc_inputs), self._config.min_input_len)
            continue
        if len(dec_inputs_gen) < self._config.min_input_len:
            tf.logging.warning('Drop an example - output too short: %d (min: %d)',
                               len(dec_inputs_gen), self._config.min_input_len)
            continue
        # If we're not truncating input, throw out too-long input.
        if not self._truncate_input:
            if len(enc_inputs) > self._config.max_input_len:
                tf.logging.warning('Drop an example - input too long: %d (max: %d)',
                                   len(enc_inputs), self._config.max_input_len)
                continue
            if len(dec_inputs_gen) > self._config.max_output_len:
                tf.logging.warning('Drop an example - output too long: %d (max: %d)',
                                   len(dec_inputs_gen), self._config.max_output_len)
                continue
        # If we are truncating input, do so if necessary.
        else:
            if len(enc_inputs) > self._config.max_input_len:
                enc_inputs = enc_inputs[:self._config.max_input_len]
                # Copy positions beyond the truncated input no longer exist.
                dec_inputs_cop = [
                    pos if pos <= self._config.max_input_len else 0
                    for pos in dec_inputs_cop
                ]
            if len(dec_inputs_gen) > self._config.max_output_len:
                dec_inputs_gen = dec_inputs_gen[:self._config.max_output_len]
                dec_inputs_cop = dec_inputs_cop[:self._config.max_output_len]

        # dec_targets_gen is dec_inputs_gen without <s> at beginning, plus </s> at end.
        dec_targets_gen = dec_inputs_gen[1:]
        dec_targets_gen.append(end_id)
        # dec_targets_cop is dec_inputs_cop without <s> at beginning, plus the
        # end position (len(enc_inputs)) at end.
        dec_targets_cop = dec_inputs_cop[1:]
        end_position = len(enc_inputs)
        dec_targets_cop.append(end_position)

        enc_input_len = len(enc_inputs)
        dec_output_len = len(dec_targets_gen)  # equal to len(dec_targets_cop)

        # Pad if necessary.
        while len(enc_inputs) < self._config.max_input_len:
            enc_inputs.append(pad_id)
        while len(dec_inputs_gen) < self._config.max_output_len:
            dec_inputs_gen.append(end_id)
        while len(dec_targets_gen) < self._config.max_output_len:
            dec_targets_gen.append(end_id)
        while len(dec_targets_cop) < self._config.max_output_len:
            dec_targets_cop.append(end_position)

        element = ModelInput(enc_inputs, dec_inputs_gen, dec_targets_gen,
                             dec_targets_cop, enc_input_len, dec_output_len,
                             source, targets)
        self._input_queue.put(element)
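# --- Example (not part of the original code): a sketch of the two target
# streams built above. Generator targets are vocab ids shifted left with </s>
# appended; copier targets are positions into the source, with
# end_position = len(enc_inputs) serving as the stop marker. Ids are made up.
end_id = 3
enc_inputs = [11, 12, 13, 14]    # four source token ids
dec_inputs_gen = [1, 21, 22]     # [<s>, w1, w2] as vocab ids
dec_inputs_cop = [0, 2, 4]       # 1-based positions into the source

dec_targets_gen = dec_inputs_gen[1:] + [end_id]        # [21, 22, 3]
end_position = len(enc_inputs)                         # 4
dec_targets_cop = dec_inputs_cop[1:] + [end_position]  # [2, 4, 4]
print("{} {}".format(dec_targets_gen, dec_targets_cop))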
def _FillInputQueue(self):
    """Fill input queue with ModelInput.

    SENTENCE_START = '<s>'
    SENTENCE_END = '</s>'
    UNKNOWN_TOKEN = '<UNK>'
    PAD_TOKEN = '<PAD>'
    """
    start_id = self._vocab.WordToId(data.SENTENCE_START)
    end_id = self._vocab.WordToId(data.SENTENCE_END)
    pad_id = self._vocab.WordToId(data.PAD_TOKEN)
    input_gen = self._TextGenerator(data.ExampleGen(self._data_path))
    while True:
        (article, abstract) = six.next(input_gen)
        # ToSentences yields individual sentences that start with <s> and
        # end with </s>; include_token=False strips those tags.
        article_sentences = [
            sent.strip()
            for sent in data.ToSentences(article, include_token=False)
        ]
        abstract_sentences = [
            sent.strip()
            for sent in data.ToSentences(abstract, include_token=False)
        ]

        enc_inputs = []
        # Use the <s> as the <GO> symbol, prepended to the decoder inputs.
        dec_inputs = [start_id]

        # Convert first N sentences to word IDs, stripping existing <s> and </s>.
        for i in xrange(min(self._max_article_sentences,
                            len(article_sentences))):
            # Turn one sentence into a list of word ids.
            enc_inputs += data.GetWordIds(article_sentences[i], self._vocab)
        for i in xrange(min(self._max_abstract_sentences,
                            len(abstract_sentences))):
            dec_inputs += data.GetWordIds(abstract_sentences[i], self._vocab)

        # Filter out too-short input.
        if (len(enc_inputs) < self._hps.min_input_len or
                len(dec_inputs) < self._hps.min_input_len):
            tf.logging.warning('Drop an example - too short.\nenc:%d\ndec:%d',
                               len(enc_inputs), len(dec_inputs))
            continue
        # Input too long: drop it, unless we are truncating.
        if not self._truncate_input:
            if (len(enc_inputs) > self._hps.enc_timesteps or
                    len(dec_inputs) > self._hps.dec_timesteps):
                tf.logging.warning('Drop an example - too long.\nenc:%d\ndec:%d',
                                   len(enc_inputs), len(dec_inputs))
                continue
        # If we are truncating input, do so if necessary.
        else:
            if len(enc_inputs) > self._hps.enc_timesteps:
                enc_inputs = enc_inputs[:self._hps.enc_timesteps]
            if len(dec_inputs) > self._hps.dec_timesteps:
                dec_inputs = dec_inputs[:self._hps.dec_timesteps]

        # targets is dec_inputs without <s> at beginning, plus </s> at end:
        # the decoder input starts with <s>, the target ends with </s>.
        targets = dec_inputs[1:]
        targets.append(end_id)

        # Now len(enc_inputs) should be <= enc_timesteps, and
        # len(targets) = len(dec_inputs) should be <= dec_timesteps.
        enc_input_len = len(enc_inputs)
        dec_output_len = len(targets)

        # Pad if shorter than the fixed lengths: dec_inputs is [<s>, ...],
        # targets is [..., </s>]; enc_inputs contains no <s>/</s> and is
        # padded with <PAD>.
        while len(enc_inputs) < self._hps.enc_timesteps:
            enc_inputs.append(pad_id)
        while len(dec_inputs) < self._hps.dec_timesteps:
            dec_inputs.append(end_id)
        while len(targets) < self._hps.dec_timesteps:
            targets.append(end_id)

        # Put the namedtuple on the queue: enc_inputs is the encoder input,
        # dec_inputs the decoder input, targets the decoding target.
        element = ModelInput(enc_inputs, dec_inputs, targets, enc_input_len,
                             dec_output_len, ' '.join(article_sentences),
                             ' '.join(abstract_sentences))
        self._input_queue.put(element)