Example 1
    def get_q(self, context, ans, ans_pos):
        # Process and create a batch of 1
        ctxt_feats = preprocessing.process_squad_context(
            self.model.vocab, context_as_set=FLAGS.context_as_set)(context)
        ans_feats = preprocessing.process_squad_answer(
            self.model.vocab,
            context_as_set=FLAGS.context_as_set)(ans, ans_pos, context)

        # Store raw tokens as bytes so TensorFlow handles unicode correctly
        ctxt_feats[0] = np.array(ctxt_feats[0], dtype=bytes)
        ans_feats[0] = np.array(ans_feats[0], dtype=bytes)

        # Wrap each feature in a leading batch dimension of size 1
        ctxt_dict = tuple(np.asarray([x]) for x in ctxt_feats)
        ans_dict = tuple(np.asarray([x]) for x in ans_feats)

        q, q_len = self.sess.run(
            [self.model.q_hat_beam_string, self.model.q_hat_beam_lens],
            feed_dict={
                self.model.context_in: ctxt_dict,
                self.model.answer_in: ans_dict
            })
        # Join the decoded tokens (dropping the end token) and escape HTML brackets
        q_str = " ".join([
            w.decode().replace('>', '&gt;').replace('<', '&lt;')
            for w in q[0][:q_len[0] - 1]
        ])
        return q_str
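
For reference, a minimal sketch of how `get_q` might be called. The `generator` object, the sample context, and the answer span are illustrative assumptions; only the method itself comes from the snippet above.

    # Hypothetical usage sketch -- `generator` stands in for an instance of the
    # class that defines get_q, with a loaded model and an open tf.Session.
    context = "The Eiffel Tower was completed in 1889 and is located in Paris."
    answer = "1889"
    answer_pos = context.index(answer)  # character offset of the answer span

    question = generator.get_q(context, answer, answer_pos)
    print(question)  # a single decoded, HTML-escaped question string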
Example 2
    def get_q_batch(self, contexts, answers, ans_pos):
        # Process and create a batch
        ctxt_feats = [
            list(preprocessing.process_squad_context(
                self.model.vocab,
                context_as_set=FLAGS.context_as_set)(contexts[i]))
            for i in range(len(contexts))
        ]
        ans_feats = [
            list(preprocessing.process_squad_answer(
                self.model.vocab,
                context_as_set=FLAGS.context_as_set)(
                    answers[i], ans_pos[i], contexts[i]))
            for i in range(len(contexts))
        ]

        # Now zip to get batches of features, not batches of examples
        ctxt_feats = list(zip(*ctxt_feats))
        ans_feats = list(zip(*ans_feats))

        # pad
        ctxt_pad = ['<PAD>', 0, 0, 0, 0]
        ans_pad = ['<PAD>', 0, 0, 0]
        for i in range(len(ctxt_feats)):
            if i in [3, 4]:  # skip the scalar features (length, vocab size)
                continue
            max_len = max(len(feat) for feat in ctxt_feats[i])
            ctxt_feats[i] = [
                list(feat) + [ctxt_pad[i]] * (max_len - len(feat))
                for feat in ctxt_feats[i]
            ]

        for i in range(len(ans_feats)):
            if i in [2]:  # skip the scalar length feature
                continue
            max_len = max(len(feat) for feat in ans_feats[i])
            ans_feats[i] = [
                list(feat) + [ans_pad[i]] * (max_len - len(feat))
                for feat in ans_feats[i]
            ]

        # Store raw tokens as bytes so TensorFlow handles unicode correctly
        ctxt_feats[0] = np.array(ctxt_feats[0], dtype=bytes)
        ans_feats[0] = np.array(ans_feats[0], dtype=bytes)

        ctxt_dict = tuple(np.array(x) for x in ctxt_feats)
        ans_dict = tuple(np.array(x) for x in ans_feats)

        qs, q_lens = self.sess.run(
            [self.model.q_hat_beam_string, self.model.q_hat_beam_lens],
            feed_dict={
                self.model.context_in: ctxt_dict,
                self.model.answer_in: ans_dict
            })
        q_str = [" ".join([w.decode().replace('>','&gt;').replace('<','&lt;') for w in qs[i][:q_lens[i]-1]]) for i in range(len(qs))]
        return q_str
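
The zip-and-pad step above is the heart of the batching logic: per-example feature tuples are transposed into per-feature lists, and each ragged token feature is right-padded to the longest example in the batch. A self-contained sketch of that idea, using made-up (tokens, ids, length) features rather than the real preprocessing output:

    import numpy as np

    # Illustrative per-example features: (tokens, token_ids, length)
    examples = [
        (['the', 'cat'], [3, 7], 2),
        (['a', 'dog', 'barked'], [2, 9, 4], 3),
    ]

    # Transpose a batch of examples into a tuple of per-feature batches
    tokens, ids, lengths = (list(f) for f in zip(*examples))

    # Right-pad the ragged features to the longest example; lengths stay scalar
    max_len = max(lengths)
    tokens = [seq + ['<PAD>'] * (max_len - len(seq)) for seq in tokens]
    ids = [seq + [0] * (max_len - len(seq)) for seq in ids]

    batch = (np.array(tokens, dtype=bytes), np.array(ids), np.array(lengths))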
Example 3
    def build_data_pipeline(self, batch_size):
        with tf.device('/cpu:*'):
            self.context_ph = tf.placeholder(tf.string, [None])
            self.qs_ph = tf.placeholder(tf.string, [None])
            self.as_ph = tf.placeholder(tf.string, [None])
            self.a_pos_ph = tf.placeholder(tf.int32, [None])
            self.ix = tf.placeholder(tf.int32, [None])

            dataset = tf.data.Dataset.from_tensor_slices(
                (self.context_ph, self.qs_ph, self.as_ph, self.a_pos_ph,
                 self.ix))

            if self.shuffle:
                dataset = dataset.shuffle(buffer_size=100000)

            # processing pipeline
            dataset = dataset.map(lambda context, q, a, a_pos, ix: (
                tuple(
                    tf.py_func(
                        preprocessing.process_squad_context(
                            self.vocab, context_as_set=FLAGS.context_as_set),
                        [context],
                        [tf.string, tf.int32, tf.int32, tf.int32, tf.int32])),
                tuple(
                    tf.py_func(
                        preprocessing.process_squad_question(
                            self.vocab,
                            max_copy_size=FLAGS.max_copy_size,
                            context_as_set=FLAGS.context_as_set,
                            copy_priority=FLAGS.copy_priority,
                            smart_copy=FLAGS.smart_copy,
                            latent_switch=FLAGS.latent_switch), [
                                q, context, a_pos
                            ], [tf.string, tf.int32, tf.float32, tf.int32])),
                tuple(
                    tf.py_func(
                        preprocessing.process_squad_answer(
                            self.vocab, context_as_set=FLAGS.context_as_set), [
                                a, a_pos, context
                            ], [tf.string, tf.int32, tf.int32, tf.int32])), ix
            ))

            # pad out to batches; shapes follow the feature tuples unpacked below
            dataset = dataset.padded_batch(
                batch_size,
                padded_shapes=(
                    (
                        tf.TensorShape([None]),  # context_raw
                        tf.TensorShape([None]),  # context_ids
                        tf.TensorShape([None]),  # context_copy_ids
                        tf.TensorShape([]),  # context_length
                        tf.TensorShape([])),  # context_vocab_size
                    (
                        tf.TensorShape([None]),  # question_raw
                        tf.TensorShape([None]),  # question_ids
                        tf.TensorShape([None, None]),  # question_oh
                        tf.TensorShape([])),  # question_length
                    (
                        tf.TensorShape([None]),  # answer_raw
                        tf.TensorShape([None]),  # answer_ids
                        tf.TensorShape([]),  # answer_length
                        tf.TensorShape([None])),  # answer_locs
                    tf.TensorShape([])),  # ix
                padding_values=(
                    (
                        PAD,  # context_raw padded with the PAD token
                        self.vocab[PAD],  # context_ids padded with the PAD id
                        0,  # context_copy_ids
                        len(self.vocab),  # context_length -- scalar, unused
                        0),  # context_vocab_size -- scalar, unused
                    (
                        PAD,  # question_raw padded with the PAD token
                        self.vocab[PAD],  # question_ids padded with the PAD id
                        0.0,  # question_oh
                        0),  # question_length -- scalar, unused
                    (
                        PAD,  # answer_raw padded with the PAD token
                        self.vocab[PAD],  # answer_ids padded with the PAD id
                        0,  # answer_length -- scalar, unused
                        0),  # answer_locs
                    0))  # ix

            dataset = dataset.repeat(self.num_epochs)

            dataset = dataset.prefetch(buffer_size=batch_size * 4)

            self.iterator = dataset.make_initializable_iterator()
            self.batch_as_nested_tuple = self.iterator.get_next()
            self.this_context, self.this_question, self.this_answer, self.this_ix = self.batch_as_nested_tuple
            (self.context_raw, self.context_ids, self.context_copy_ids,
             self.context_length, self.context_vocab_size) = self.this_context
            (self.question_raw, self.question_ids, self.question_oh,
             self.question_length) = self.this_question
            (self.answer_raw, self.answer_ids, self.answer_length,
             self.answer_locs) = self.this_answer

            self.batch_len = tf.shape(self.context_raw)[0]
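
For context, a rough sketch of how this TF 1.x input pipeline might be driven. The `loader` name and the sample SQuAD-style fields are assumptions for illustration; the placeholder, iterator, and output attributes are the ones defined in the snippet above.

    # Hypothetical driver -- `loader` is an instance of the class that calls
    # build_data_pipeline; the data below is made up for illustration.
    contexts = ["The Eiffel Tower was completed in 1889 ."]
    questions = ["When was the Eiffel Tower completed ?"]
    answers = ["1889"]
    answer_positions = [contexts[0].index(answers[0])]  # character offsets
    ixs = list(range(len(contexts)))

    with tf.Session() as sess:
        # Bind the raw fields to the placeholders when initialising the iterator
        sess.run(loader.iterator.initializer, feed_dict={
            loader.context_ph: contexts,
            loader.qs_ph: questions,
            loader.as_ph: answers,
            loader.a_pos_ph: answer_positions,
            loader.ix: ixs,
        })

        # Each run of the output tensors pulls one padded batch
        context_ids, question_ids = sess.run(
            [loader.context_ids, loader.question_ids])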