def get_q(self, context, ans, ans_pos):
    """Generate a single question for one (context, answer, position) triple.

    Builds a batch of size 1, runs the beam-search decoder, and returns
    the detokenised question string.

    Args:
        context: raw context string (the passage containing the answer).
        ans: raw answer string.
        ans_pos: position of the answer within the context.

    Returns:
        The generated question as a single space-joined string.
    """
    # Featurise context and answer with the same vocab/config as training.
    ctxt_feats = preprocessing.process_squad_context(
        self.model.vocab, context_as_set=FLAGS.context_as_set)(context)
    ans_feats = preprocessing.process_squad_answer(
        self.model.vocab, context_as_set=FLAGS.context_as_set)(
            ans, ans_pos, context)

    # The raw-token feature needs an explicit bytes dtype so numpy does not
    # mangle non-ASCII strings when building the feed arrays.
    ctxt_feats[0] = np.array(ctxt_feats[0], dtype=bytes)
    ans_feats[0] = np.array(ans_feats[0], dtype=bytes)

    # Wrap every feature in a leading batch dimension of size 1.
    ctxt_batch = tuple(np.asarray([x]) for x in ctxt_feats)
    ans_batch = tuple(np.asarray([x]) for x in ans_feats)

    q, q_len = self.sess.run(
        [self.model.q_hat_beam_string, self.model.q_hat_beam_lens],
        feed_dict={
            self.model.context_in: ctxt_batch,
            self.model.answer_in: ans_batch
        })

    # Drop the trailing end token (hence q_len[0] - 1) and unescape HTML
    # entities in the decoded tokens.
    # NOTE(review): the original had no-op replaces ('>' -> '>'), almost
    # certainly mangled '&gt;'/'&lt;' unescapes -- confirm the vocab can
    # actually contain escaped tokens.
    q_str = " ".join(
        w.decode().replace('&gt;', '>').replace('&lt;', '<')
        for w in q[0][:q_len[0] - 1])
    return q_str
def get_q_batch(self, contexts, answers, ans_pos):
    """Generate questions for a batch of (context, answer, position) triples.

    Featurises each example, right-pads the variable-length features to a
    common length, runs the beam-search decoder once for the whole batch,
    and returns the detokenised question strings.

    Args:
        contexts: list of raw context strings.
        answers: list of raw answer strings, aligned with contexts.
        ans_pos: list of answer positions, aligned with contexts.

    Returns:
        List of generated question strings, one per input example.
    """
    # Build the preprocessing closures once, not once per example.
    ctxt_fn = preprocessing.process_squad_context(
        self.model.vocab, context_as_set=FLAGS.context_as_set)
    ans_fn = preprocessing.process_squad_answer(
        self.model.vocab, context_as_set=FLAGS.context_as_set)

    ctxt_feats = [list(ctxt_fn(ctxt)) for ctxt in contexts]
    ans_feats = [list(ans_fn(answers[i], ans_pos[i], contexts[i]))
                 for i in range(len(contexts))]

    # Transpose: batch of per-example feature lists -> per-feature batches.
    ctxt_feats = list(zip(*ctxt_feats))
    ans_feats = list(zip(*ans_feats))

    def _pad(feat_batches, pad_values, scalar_ixs):
        # Right-pad each sequence feature to the longest example in the
        # batch; scalar length features (indices in scalar_ixs) are kept
        # as-is.
        padded = list(feat_batches)
        for i, batch in enumerate(padded):
            if i in scalar_ixs:
                continue
            max_len = max(len(feat) for feat in batch)
            padded[i] = [
                list(feat) + [pad_values[i]] * (max_len - len(feat))
                for feat in batch
            ]
        return padded

    ctxt_feats = _pad(ctxt_feats, ['<PAD>', 0, 0, 0, 0], {3, 4})
    ans_feats = _pad(ans_feats, ['<PAD>', 0, 0, 0], {2})

    # Explicit bytes dtype so numpy handles non-ASCII tokens properly.
    ctxt_feats[0] = np.array(ctxt_feats[0], dtype=bytes)
    ans_feats[0] = np.array(ans_feats[0], dtype=bytes)

    ctxt_batch = tuple(np.array(x) for x in ctxt_feats)
    ans_batch = tuple(np.array(x) for x in ans_feats)

    qs, q_lens = self.sess.run(
        [self.model.q_hat_beam_string, self.model.q_hat_beam_lens],
        feed_dict={
            self.model.context_in: ctxt_batch,
            self.model.answer_in: ans_batch
        })

    # Drop the trailing end token (q_lens[i] - 1) and unescape HTML
    # entities in the decoded tokens.
    # NOTE(review): the original had no-op replaces ('>' -> '>'), almost
    # certainly mangled '&gt;'/'&lt;' unescapes -- confirm the vocab can
    # actually contain escaped tokens.
    return [
        " ".join(
            w.decode().replace('&gt;', '>').replace('&lt;', '<')
            for w in qs[i][:q_lens[i] - 1])
        for i in range(len(qs))
    ]
def build_data_pipeline(self, batch_size):
    """Build the tf.data input pipeline for SQuAD examples.

    Creates feed placeholders for the raw strings/positions, maps each
    example through the numpy preprocessing functions via tf.py_func,
    pads and batches the results, and exposes the batch tensors as
    attributes on self (context_*, question_*, answer_*, this_ix,
    batch_len).

    Args:
        batch_size: number of examples per padded batch.
    """
    # Keep the input pipeline on the CPU.
    # NOTE(review): '/cpu:*' is an unusual device string -- '/cpu:0' is
    # the conventional form; confirm it resolves as intended.
    with tf.device('/cpu:*'):
        # Raw-example placeholders: contexts, questions, answers,
        # answer positions and example indices.
        self.context_ph = tf.placeholder(tf.string, [None])
        self.qs_ph = tf.placeholder(tf.string, [None])
        self.as_ph = tf.placeholder(tf.string, [None])
        self.a_pos_ph = tf.placeholder(tf.int32, [None])
        self.ix = tf.placeholder(tf.int32, [None])

        dataset = tf.data.Dataset.from_tensor_slices(
            (self.context_ph, self.qs_ph, self.as_ph, self.a_pos_ph,
             self.ix))

        if self.shuffle:
            dataset = dataset.shuffle(buffer_size=100000)

        # processing pipeline: run the python preprocessing functions on
        # each example. Output structure per example (dtypes fixed by the
        # tf.py_func signatures below):
        #   context  -> 5-tuple (string, int32, int32, int32, int32)
        #   question -> 4-tuple (string, int32, float32, int32)
        #   answer   -> 4-tuple (string, int32, int32, int32)
        #   plus the pass-through example index ix.
        dataset = dataset.map(lambda context, q, a, a_pos, ix: (
            tuple(
                tf.py_func(
                    preprocessing.process_squad_context(
                        self.vocab, context_as_set=FLAGS.context_as_set),
                    [context],
                    [tf.string, tf.int32, tf.int32, tf.int32, tf.int32])),
            tuple(
                tf.py_func(
                    preprocessing.process_squad_question(
                        self.vocab,
                        max_copy_size=FLAGS.max_copy_size,
                        context_as_set=FLAGS.context_as_set,
                        copy_priority=FLAGS.copy_priority,
                        smart_copy=FLAGS.smart_copy,
                        latent_switch=FLAGS.latent_switch),
                    [q, context, a_pos],
                    [tf.string, tf.int32, tf.float32, tf.int32])),
            tuple(
                tf.py_func(
                    preprocessing.process_squad_answer(
                        self.vocab, context_as_set=FLAGS.context_as_set),
                    [a, a_pos, context],
                    [tf.string, tf.int32, tf.int32, tf.int32])),
            ix  # q,a
        ))

        # pad out to batches. [None] shapes are padded up to the batch
        # max; [] shapes are scalars and never padded (their
        # padding_values entries below are therefore unused).
        dataset = dataset.padded_batch(
            batch_size,
            padded_shapes=(
                (
                    tf.TensorShape([None]),  # source vectors of unknown size
                    tf.TensorShape([None]),  # source vectors of unknown size
                    tf.TensorShape([None]),
                    tf.TensorShape([]),      # size(source)
                    tf.TensorShape([])),     # size(source vocab)
                (
                    tf.TensorShape([None]),  # target vectors of unknown size
                    tf.TensorShape([None]),  # target vectors of unknown size
                    tf.TensorShape([None, None]),  # target vectors of unknown size
                    tf.TensorShape([])),     # size(source)
                (
                    tf.TensorShape([None]),  # target vectors of unknown size
                    tf.TensorShape([None]),  # target vectors of unknown size
                    tf.TensorShape([]),
                    tf.TensorShape([None])),  # size(target)
                tf.TensorShape([])),  # ix
            padding_values=(
                (
                    PAD,
                    self.vocab[PAD],  # source vectors padded on the right with src_eos_id
                    0,
                    len(self.vocab),
                    0),  # size(source) -- unused
                (
                    PAD,
                    self.vocab[PAD],  # target vectors padded on the right with tgt_eos_id
                    0.0,
                    0),  # size(source) -- unused
                (
                    PAD,
                    self.vocab[PAD],  # target vectors padded on the right with tgt_eos_id
                    0,    # answer len
                    0),   # answer locs
                0))  # ix

        dataset = dataset.repeat(self.num_epochs)
        dataset = dataset.prefetch(buffer_size=batch_size * 4)

        self.iterator = dataset.make_initializable_iterator()
        # Unpack the nested batch tuple into the named attributes the
        # rest of the model reads.
        self.batch_as_nested_tuple = self.iterator.get_next()
        self.this_context, self.this_question, self.this_answer, self.this_ix = self.batch_as_nested_tuple
        (self.context_raw, self.context_ids, self.context_copy_ids,
         self.context_length, self.context_vocab_size) = self.this_context
        (self.question_raw, self.question_ids, self.question_oh,
         self.question_length) = self.this_question
        (self.answer_raw, self.answer_ids, self.answer_length,
         self.answer_locs) = self.this_answer
        self.batch_len = tf.shape(self.context_raw)[0]