Example #1
class HighDimDQN(DQN):
    def __init__(self, reading_ph, pose_ph, input_tensor, action_dim,
                 memory_size):
        self.reading_ph = reading_ph
        self.pose_ph = pose_ph
        self.input = input_tensor

        self.action_dim = action_dim
        self.action_ph = tf.placeholder(tf.float32, [None, action_dim])

        reading_dim = reading_ph.get_shape().as_list()[-1]
        pose_dim = pose_ph.get_shape().as_list()[-1]
        self.memory = GeneralMemory(
            memory_size,
            [reading_dim, pose_dim, 0, 1, reading_dim, pose_dim, 0])

    def train(self, session, batch=None, discount=.97):
        reading, pose, actions, rewards, next_reading, next_pose, terminals = self.memory.sample(
            batch)
        action_value = session.run(self.action_value, {
            self.reading_ph: reading,
            self.pose_ph: pose
        })
        next_state_value = session.run(self.action_value, {
            self.reading_ph: next_reading,
            self.pose_ph: next_pose
        })
        observed_value = rewards + discount * \
            np.max(next_state_value, 1, keepdims=True)
        # terminal transitions: value of receiving the final reward forever
        observed_value[terminals] = rewards[terminals] / (1 - discount)
        action_value[np.arange(len(actions)), actions] = observed_value[:, 0]

        _, l = session.run(
            [self.train_op, self._loss], {
                self.reading_ph: reading,
                self.pose_ph: pose,
                self.action_ph: action_value
            })
        return l

    def policy(self, session, reading, pose):
        return session.run(self.action_value, {
            self.reading_ph: [reading],
            self.pose_ph: [pose]
        })[0]

    def memorize(self, reading, pose, action, reward, next_reading, next_pose,
                 terminal):
        self.memory.add(
            [reading, pose, action, reward, next_reading, next_pose, terminal])
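
Usage note: a minimal driver sketch for HighDimDQN. GeneralMemory and the base DQN's network construction come from the surrounding repo and are not shown; env and all sizes below are hypothetical placeholders, not part of the original code.

import numpy as np
import tensorflow as tf

reading_ph = tf.placeholder(tf.float32, [None, 150], "reading")  # assumed dims
pose_ph = tf.placeholder(tf.float32, [None, 3], "pose")
agent = HighDimDQN(reading_ph, pose_ph, input_tensor=None,
                   action_dim=4, memory_size=10000)
# the base class's graph construction (action_value, train_op) must run here

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    reading, pose = env.reset()  # hypothetical environment API
    for _ in range(10000):
        action = int(np.argmax(agent.policy(session, reading, pose)))
        next_reading, next_pose, reward, terminal = env.step(action)
        agent.memorize(reading, pose, action, reward,
                       next_reading, next_pose, terminal)
        agent.train(session, batch=32)
        reading, pose = ((next_reading, next_pose) if not terminal
                         else env.reset())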
Example #2
class ImaginationDQN(DQN):
    def __init__(self, state_dim, action_dim, memory_size):
        super(ImaginationDQN, self).__init__(state_dim, action_dim,
                                             memory_size)
        self.state_dim = state_dim
        self.reward_ph = tf.placeholder(tf.float32, [None, 1], "rewards")
        self.next_state_ph = tf.placeholder(tf.float32, [None, state_dim],
                                            "next_states")
        self.termination_ph = tf.placeholder(tf.float32, [None, 1],
                                             'terminations')
        self.fictional_memory = GeneralMemory(memory_size, state_dim, 0, 1,
                                              state_dim, 1)

    def initializeImagination(self, layer_dims=[30, 30]):
        with tf.variable_scope("imagination"):
            flow = self.state
            for i, size in enumerate(layer_dims):
                flow = fullyConnected("layer%i" % i, flow, size, tf.nn.relu,
                                      .003)

            predict_state = fullyConnected("state_prediction",
                                           flow,
                                           self.action_dim * self.state_dim,
                                           initializer=.003)
            predict_state = tf.reshape(predict_state,
                                       (-1, self.action_dim, self.state_dim))
            predict_reward = fullyConnected("reward_prediction",
                                            flow,
                                            self.action_dim,
                                            initializer=.003)
            predict_reward = tf.reshape(predict_reward,
                                        (-1, self.action_dim, 1))
            predict_termination = fullyConnected("termination_prediction",
                                                 flow,
                                                 self.action_dim,
                                                 activation=tf.nn.sigmoid,
                                                 initializer=.003)
            predict_termination = tf.reshape(predict_termination,
                                             (-1, self.action_dim, 1))

        indexes = tf.stack([tf.range(0,
                                     tf.shape(flow)[0]), self.action_ph],
                           axis=1)
        self.predict_state = tf.gather_nd(predict_state, indexes)
        self.predict_reward = tf.gather_nd(predict_reward, indexes)
        self.predict_termination = tf.gather_nd(predict_termination, indexes)

        self.imagination_loss = tf.reduce_mean(tf.square(
            self.next_state_ph - self.predict_state)) +\
            tf.reduce_mean(tf.square(self.predict_reward - self.reward_ph))

        self.imagination_train_op = tf.train.AdamOptimizer(.01).minimize(
            self.imagination_loss)

    def imagine(self, session, batch, step=1):
        state = self.memory.sample(batch)[0]
        for _ in range(step):
            action = np.argmax(session.run(self.action_value,
                                           {self.state: state}),
                               axis=1)

            reward, next_state, termination = session.run(
                [
                    self.predict_reward, self.predict_state,
                    self.predict_termination
                ], {
                    self.state: state,
                    self.action_ph: action
                })
            self.fictional_memory.addBatch(state, action, reward, next_state,
                                           termination)
            state = next_state

    def trainImagination(self, session, batch):
        state, action, reward, next_state, termination = self.memory.sample(
            batch)
        imagination_loss, _ = session.run(
            [self.imagination_loss, self.imagination_train_op], {
                self.state: state,
                self.action_ph: action,
                self.next_state_ph: next_state,
                self.reward_ph: reward,
                self.termination_ph: termination[:, None]
            })
        return imagination_loss

    def train(self, session, batch, discount=.97, step=1):
        state, action, reward, next_state, termination = self.memory.sample(
            batch)
        imagination_loss, _ = session.run(
            [self.imagination_loss, self.imagination_train_op], {
                self.state: state,
                self.action_ph: action,
                self.next_state_ph: next_state,
                self.reward_ph: reward,
                self.termination_ph: termination[:, None]
            })

        next_state_value = session.run(self.target_action_value,
                                       {self.state: next_state})
        observed_value = reward + discount * \
            np.max(next_state_value, 1, keepdims=True)
        observed_value[termination] = reward[termination]  # / (1 - discount)

        _, loss = session.run(
            [self.train_op, self._loss], {
                self.state: state,
                self.action_ph: action,
                self.action_value_ph: observed_value[:, 0]
            })

        for _ in range(step):
            state, action, reward, next_state, termination = self.fictional_memory.sample(
                batch)
            next_state_value = session.run(self.target_action_value,
                                           {self.state: next_state})
            observed_value = reward + discount * \
                np.max(next_state_value, 1, keepdims=True)
            observed_value = observed_value * \
                (1 - termination) + reward * termination  # / (1 - discount)

            _, l = session.run(
                [self.train_op, self._loss], {
                    self.state: state,
                    self.action_ph: action,
                    self.action_value_ph: observed_value[:, 0]
                })
            loss += l

        session.run(self.update_op)
        return loss / (step + 1), imagination_loss
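
Both the real and the imagined updates above build the same Bellman target, observed = reward + discount * max_a' Q(s', a'), masking terminal transitions so no bootstrapped value leaks past the end of an episode. A self-contained numpy illustration of the masked form used in the imagined branch (all numbers made up):

import numpy as np

discount = 0.97
reward = np.array([[1.0], [0.0], [2.0]])        # shape (batch, 1)
next_q = np.array([[0.5, 1.5],                  # Q(s', a') per action
                   [2.0, 0.2],
                   [0.1, 0.1]])
termination = np.array([[0.0], [0.0], [1.0]])   # 1 where the episode ended

bootstrap = discount * np.max(next_q, 1, keepdims=True)
observed_value = (reward + bootstrap) * (1 - termination) + reward * termination
print(observed_value)  # [[2.455] [1.94] [2.]]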
Example #3
class VisualDQN(ImaginationDQN):
    def __init__(self, frame_shape, action_dim, memory_size):
        self.action_dim = action_dim
        self.memory_size = memory_size
        state_dim = 51  # provisional; the true encoded size is set in initializeImagination
        frame_shape = list(frame_shape)
        self.frame_shape = frame_shape
        frame_shape[2] = 1  # force single-channel (grayscale) frames
        self.frame_ph = tf.placeholder(tf.float32, [None] + frame_shape[:2],
                                       "frame")
        self.action_ph = tf.placeholder(tf.int32, [None], "actions")
        self.reward_ph = tf.placeholder(tf.float32, [None, 1], "rewards")
        self.next_frame_ph = tf.placeholder(tf.float32,
                                            [None] + frame_shape[:2],
                                            "next_frame")
        self.termination_ph = tf.placeholder(tf.float32, [None, 1],
                                             'terminations')

        self.action_value_ph = tf.placeholder(tf.float32, [None],
                                              "action_values")

        self.memory = GeneralMemory(memory_size, frame_shape[:-1], 0, 1,
                                    frame_shape[:-1], -1)
        self.fictional_memory = GeneralMemory(memory_size, state_dim, 0, 1,
                                              state_dim, 1)

    def initializeImagination(self, before_action=[30], after_action=[30]):
        initializer = tf.random_uniform_initializer(-0.005, 0.005)
        stride = (2, 2)
        patch = (4, 4)
        depth = (64, 128, 128, 128)

        depth = (1, ) + depth

        shapes = [self.frame_shape[:2]]
        for i in range(len(depth)):
            next_shape = [
                -(-shapes[i][0] // stride[0]), -(-shapes[i][1] // stride[1])
            ]
            shapes.append(next_shape)

        print("Conv dimensions are", shapes)
        shapes = list(reversed(shapes))

        conv_params = []
        for i in range(len(depth[:-1])):
            w = tf.get_variable("conv_weight_%i" % i,
                                shape=patch + (depth[i], depth[i + 1]),
                                initializer=initializer,
                                dtype=tf.float32)
            b = tf.Variable(tf.constant(.01, shape=[depth[i + 1]]))
            conv_params.append((w, b))

        deconv_params = []
        for i in range(len(depth[:-1])):
            w = tf.get_variable("deconv_weight_%i" % i,
                                shape=patch + (depth[i], depth[i + 1]),
                                initializer=initializer,
                                dtype=tf.float32)
            b = tf.Variable(tf.constant(.01, shape=[depth[i]]))
            deconv_params.append((w, b))
        deconv_params = list(reversed(deconv_params))

        def _makeConv(tensor):
            flow = tensor
            for w, b in conv_params:
                print("in conv", flow.get_shape(), w.get_shape())
                flow = tf.nn.conv2d(flow, w, [1, 1, 1, 1], padding='SAME') + b
                flow = tf.nn.max_pool(flow, (1, ) + stride + (1, ),
                                      (1, ) + stride + (1, ),
                                      padding='SAME')
                flow = tf.nn.relu(flow)
            return flow

        def _makeDeconv(tensor):
            flow = tensor
            batch_size = tf.shape(tensor)[0]  # dynamic batch size
            for i, (w, b) in enumerate(deconv_params):
                # note: the deconv bias `b` is created above but never applied
                next_depth = w.get_shape().as_list()[-2]
                height, width = shapes[i + 2]
                print("hw", height, width)
                flow = tf.nn.conv2d_transpose(
                    flow,
                    w,
                    strides=(1, ) + stride + (1, ),
                    output_shape=[batch_size, height, width, next_depth],
                    padding="SAME")
                print("deconv shape", [batch_size, height, width, next_depth])
            return flow

        batch_size = tf.shape(self.frame_ph)[0]
        frame = tf.expand_dims(self.frame_ph, 3)
        next_frame = tf.expand_dims(self.next_frame_ph, 3)
        encoded_1 = _makeConv(frame)
        deconved_1 = _makeDeconv(encoded_1)
        loss_1 = tf.reduce_mean(tf.square(frame - deconved_1))
        self.deconved_1 = deconved_1

        encoded_2 = _makeConv(next_frame)
        deconved_2 = _makeDeconv(encoded_2)
        loss_2 = tf.reduce_mean(tf.square(next_frame - deconved_2))
        self.deconved_2 = deconved_2

        self.conv_loss = loss_1 + loss_2
        self.conv_train_op = tf.train.AdamOptimizer(.001).minimize(
            self.conv_loss)

        shape = encoded_1.get_shape().as_list()
        print("encoded dims", shape)
        encoded_1_flat = tf.reshape(encoded_1,
                                    (-1, shape[3] * shape[2] * shape[1]))
        self.state = encoded_1_flat

        encoded_2_flat = tf.reshape(encoded_2,
                                    (-1, shape[3] * shape[2] * shape[1]))
        self.next_state = encoded_2_flat

        self.state_dim = self.state.get_shape().as_list()[1]
        print("state dim", self.state_dim)

        # rebuild the fictional memory now that the true encoded size is known
        self.fictional_memory = GeneralMemory(self.memory_size, self.state_dim,
                                              0, 1, self.state_dim, 1)

        self.state_ph = tf.placeholder(
            tf.float32, [shape[0], shape[1] * shape[2] * shape[3]],
            name='encoded')
        self.next_state_ph = tf.placeholder(
            tf.float32, [shape[0], shape[1] * shape[2] * shape[3]],
            name="next_encoded")

        indexes = tf.stack(
            [tf.range(0,
                      tf.shape(self.action_ph)[0]), self.action_ph], axis=1)

        with tf.variable_scope("imagination"):
            net = self.state_ph
            for i, size in enumerate(before_action[:-1]):
                net = fullyConnected("before_action_layer_%i" % i,
                                     net,
                                     size,
                                     tf.nn.relu,
                                     initializer=.003)
            net = fullyConnected("layer_0",
                                 net,
                                 self.action_dim * before_action[-1],
                                 tf.nn.relu,
                                 initializer=.003)
            net = tf.reshape(net, (-1, self.action_dim, before_action[-1]))
            net = tf.gather_nd(net, indexes)

            for i, size in enumerate(after_action):
                net = fullyConnected("layer%i" % (i + 1), net, size,
                                     tf.nn.relu, .003)

            self.predict_state = fullyConnected("state_prediction",
                                                net,
                                                self.state_dim,
                                                initializer=.003)
            self.predict_reward = fullyConnected("reward_prediction",
                                                 net,
                                                 1,
                                                 initializer=.003)
            self.predict_termination = fullyConnected("termination_prediction",
                                                      net,
                                                      1,
                                                      activation=tf.nn.sigmoid,
                                                      initializer=.003)

        self.imagination_single_loss = tf.square(self.next_state_ph -
                                                 self.predict_state)
        self.imagination_loss = tf.reduce_mean(self.imagination_single_loss) +\
            tf.reduce_mean(tf.square(self.predict_reward - self.reward_ph)) +\
            tf.reduce_mean(
                tf.square(self.predict_termination - self.termination_ph))

        self.imagination_train_op = tf.train.AdamOptimizer(.001).minimize(
            self.imagination_loss)

    def imagine(self, session, batch, step=1):
        if step == 0:
            return
        frame = self.memory.sample(batch)[0]
        state = session.run(self.state, {self.frame_ph: frame})
        for _ in range(step):
            action = np.argmax(session.run(self.action_value,
                                           {self.state_ph: state}),
                               axis=1)

            reward, next_state, termination = session.run(
                [
                    self.predict_reward, self.predict_state,
                    self.predict_termination
                ], {
                    self.state_ph: state,
                    self.action_ph: action
                })
            self.fictional_memory.addBatch(state, action, reward, next_state,
                                           termination)
            state = next_state

    def train(self, session, batch=None, discount=.97, step=0):
        state, action, reward, next_state, termination = self.memory.sample(
            batch)
        # reading = reading/20
        # next_reading = next_reading/20

        if step:
            r = session.run([self.deconved_1, self.deconved_2], {
                self.frame_ph: state,
                self.next_frame_ph: next_state
            })
            print(np.mean(state - r[0][:, :, :, 0]),
                  np.mean(next_state - r[1][:, :, :, 0]))
            fig = figure()
            fig.add_subplot(1, 3, 1)
            imshow(state[0], cmap='Greys_r')

            fig.add_subplot(1, 3, 2)
            imshow(r[0][0, :, :, 0], cmap='Greys_r')

            fig.add_subplot(1, 3, 3)
            imshow(state[0] - r[0][0, :, :, 0], cmap='Greys_r')
            show()
            raise RuntimeError("debug visualization finished")

        _, conv_loss, enc_1, enc_2 = session.run(
            [self.conv_train_op, self.conv_loss, self.state, self.next_state],
            {
                self.frame_ph: state,
                self.next_frame_ph: next_state
            })
        # print(enc_1)
        imagination_loss, _ = session.run(
            [self.imagination_loss, self.imagination_train_op], {
                self.state_ph: enc_1,
                self.action_ph: action,
                self.next_state_ph: enc_2,
                self.reward_ph: reward,
                self.termination_ph: termination[:, None]
            })

        next_state_value = session.run(self.target_action_value,
                                       {self.state_ph: enc_2})
        observed_value = reward + discount * \
            np.max(next_state_value, 1, keepdims=True)
        observed_value[termination] = reward[termination]  # / (1 - discount)

        _, loss, sl = session.run(
            [self.train_op, self._loss, self.single_loss], {
                self.state_ph: enc_1,
                self.action_ph: action,
                self.action_value_ph: observed_value[:, 0]
            })

        for _ in range(step):
            state, action, reward, next_state, termination = self.fictional_memory.sample(
                batch)
            next_state_value = session.run(self.target_action_value,
                                           {self.state_ph: next_state})
            observed_value = reward + discount * \
                np.max(next_state_value, 1, keepdims=True)
            observed_value = observed_value * \
                (1 - termination) + reward * termination  # / (1 - discount)

            _, l = session.run(
                [self.train_op, self._loss], {
                    self.state_ph: state,
                    self.action_ph: action,
                    self.action_value_ph: observed_value[:, 0]
                })
            loss += l

        # session.run(self.update_op)
        return loss / (step + 1), imagination_loss, np.mean(conv_loss)

    def initialize(self, layer_dims=[30, 30], optimizer=None):
        def _make():
            flow = self.state_ph
            for i, size in enumerate(layer_dims):
                flow = fullyConnected("layer%i" % i,
                                      flow,
                                      size,
                                      tf.nn.relu,
                                      initializer=.003)

            return fullyConnected("output_layer",
                                  flow,
                                  self.action_dim,
                                  initializer=.003)

        with tf.variable_scope('learner'):
            self.action_value = _make()
        with tf.variable_scope('target'):
            self.target_action_value = _make()

        self.update_op = copyScopeVars('learner', 'target')

        row = tf.range(0, tf.shape(self.action_value)[0])
        indexes = tf.stack([row, self.action_ph], axis=1)
        action_value = tf.gather_nd(self.action_value, indexes)

        self.single_loss = tf.square(action_value - self.action_value_ph)
        self._loss = tf.reduce_mean(self.single_loss)

        self.optimizer = optimizer or tf.train.AdamOptimizer(.001)

        # parameters = getScopeParameters('learner')
        # grads = tf.gradients(self._loss, parameters)
        # self.train_op = self.optimizer.apply_gradients(zip(grads, parameters))

        self.train_op = self.optimizer.minimize(self._loss)

    def policy(self, session, frame):
        encoded = session.run(self.state, {self.frame_ph: [frame]})
        return session.run(self.action_value, {self.state_ph: encoded})[0]

    def memorize(self, frame, action, reward, next_frame, terminal):
        self.memory.add(frame, action, reward, next_frame, terminal)
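
The shape bookkeeping in initializeImagination relies on the -(-x // s) idiom, which is integer ceiling division and therefore matches the output size of 'SAME'-padded pooling with stride s. A standalone check (the 84x84 input is only an illustrative size, not the repo's):

stride = (2, 2)
shape = [84, 84]
shapes = [shape]
for _ in range(4):  # one entry per conv/pool stage
    shape = [-(-shape[0] // stride[0]), -(-shape[1] // stride[1])]
    shapes.append(shape)
print(shapes)  # [[84, 84], [42, 42], [21, 21], [11, 11], [6, 6]]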
Example #4
class HighDimImaginationDQN(ImaginationDQN):
    def __init__(self, reading_dim, pose_dim, action_dim, memory_size):
        self.action_dim = action_dim
        self.reading_dim = reading_dim
        self.memory_size = memory_size

        self.reading_ph = tf.placeholder(tf.float32, [None, reading_dim],
                                         "reading")
        self.pose_ph = tf.placeholder(tf.float32, [None, pose_dim], "pose")
        self.action_ph = tf.placeholder(tf.int32, [None], "actions")
        self.reward_ph = tf.placeholder(tf.float32, [None, 1], "rewards")
        self.next_reading_ph = tf.placeholder(tf.float32, [None, reading_dim],
                                              "next_reading")
        self.next_pose_ph = tf.placeholder(tf.float32, [None, pose_dim],
                                           "next_pose")
        self.termination_ph = tf.placeholder(tf.float32, [None, 1],
                                             'terminations')

        self.action_value_ph = tf.placeholder(tf.float32, [None],
                                              "action_values")

        self.memory = GeneralMemory(memory_size, reading_dim, pose_dim, 0, 1,
                                    reading_dim, pose_dim, -1)

    def initializeImagination(self, layer_dims=[30, 30]):
        # A convolutional reading encoder (a conv/deconv autoencoder mirroring
        # VisualDQN.initializeImagination above) was here but is disabled;
        # the pose alone serves as the state.
        self.state = self.pose_ph
        self.next_state = self.next_pose_ph

        state_dim = self.state.get_shape().as_list()[1]
        self.fictional_memory = GeneralMemory(self.memory_size, state_dim, 0,
                                              1, state_dim, 1)

        print("state shape", state_dim)

        with tf.variable_scope("imagination"):
            flow = self.state
            for i, size in enumerate(layer_dims):
                flow = fullyConnected("layer%i" % i, flow, size, tf.nn.relu,
                                      .003)

            predict_state = fullyConnected("state_prediction",
                                           flow,
                                           self.action_dim * state_dim,
                                           initializer=.003)
            predict_state = tf.reshape(predict_state,
                                       (-1, self.action_dim, state_dim))
            predict_reward = fullyConnected("reward_prediction",
                                            flow,
                                            self.action_dim,
                                            initializer=.003)
            predict_reward = tf.reshape(predict_reward,
                                        (-1, self.action_dim, 1))
            predict_termination = fullyConnected("termination_prediction",
                                                 flow,
                                                 self.action_dim,
                                                 activation=tf.nn.sigmoid,
                                                 initializer=.003)
            predict_termination = tf.reshape(predict_termination,
                                             (-1, self.action_dim, 1))

        indexes = tf.stack([tf.range(0,
                                     tf.shape(flow)[0]), self.action_ph],
                           axis=1)
        self.predict_state = tf.gather_nd(predict_state, indexes)
        self.predict_reward = tf.gather_nd(predict_reward, indexes)
        self.predict_termination = tf.gather_nd(predict_termination, indexes)

        self.imagination_single_loss = tf.square(self.next_state -
                                                 self.predict_state)
        self.imagination_loss = tf.reduce_mean(self.imagination_single_loss) +\
            tf.reduce_mean(tf.square(self.predict_reward - self.reward_ph)) +\
            tf.reduce_mean(
                tf.square(self.predict_termination - self.termination_ph))

        self.imagination_train_op = tf.train.AdamOptimizer(.01).minimize(
            self.imagination_loss)

    def imagine(self, session, batch, step=1):
        if step == 0:
            return
        pose = self.memory.sample(batch)[1]

        state = session.run(self.state, {self.pose_ph: pose})
        for _ in range(step):
            action = np.argmax(session.run(self.action_value,
                                           {self.state: state}),
                               axis=1)

            reward, next_state, termination = session.run(
                [
                    self.predict_reward, self.predict_state,
                    self.predict_termination
                ], {
                    self.state: state,
                    self.action_ph: action
                })
            self.fictional_memory.addBatch(state, action, reward, next_state,
                                           termination)
            state = next_state

    def train(self, session, batch=None, discount=.97, step=0):
        reading, pose, action, reward, next_reading, next_pose, termination = self.memory.sample(
            batch)

        imagination_loss, _ = session.run(
            [self.imagination_loss, self.imagination_train_op], {
                self.pose_ph: pose,
                self.action_ph: action,
                self.next_pose_ph: next_pose,
                self.reward_ph: reward,
                self.termination_ph: termination[:, None]
            })

        next_state_value = session.run(self.target_action_value,
                                       {self.pose_ph: next_pose})
        observed_value = reward + discount * \
            np.max(next_state_value, 1, keepdims=True)
        observed_value[termination] = reward[termination] / (1 - discount)

        _, loss = session.run(
            [self.train_op, self._loss], {
                self.pose_ph: pose,
                self.action_ph: action,
                self.action_value_ph: observed_value[:, 0]
            })

        for _ in range(step):
            state, action, reward, next_state, termination = self.fictional_memory.sample(
                batch)
            next_state_value = session.run(self.target_action_value,
                                           {self.state: next_state})
            observed_value = reward + discount * \
                np.max(next_state_value, 1, keepdims=True)
            observed_value = observed_value * \
                (1 - termination) + reward * termination / (1 - discount)

            _, l = session.run(
                [self.train_op, self._loss], {
                    self.state: state,
                    self.action_ph: action,
                    self.action_value_ph: observed_value[:, 0]
                })
            loss += l

        return loss / (step + 1), imagination_loss, 0  # np.mean(conv_loss)

    def policy(self, session, reading, pose):
        # encoded = session.run(self.encoded, {self.reading_ph: [reading]})
        return session.run(self.action_value, {self.pose_ph: [pose]})[0]

    def memorize(self, reading, pose, action, reward, next_reading, next_pose,
                 terminal):
        self.memory.add(reading, pose, action, reward, next_reading, next_pose,
                        terminal)
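
Every class here selects the tensor slice for the taken action by pairing tf.range(batch_size) with the action indices and applying tf.gather_nd. For reference, the numpy equivalent is plain fancy indexing:

import numpy as np

q = np.array([[0.1, 0.9, 0.3],
              [0.7, 0.2, 0.4]])
actions = np.array([1, 0])

# same selection as tf.gather_nd(q, tf.stack([tf.range(2), actions], axis=1))
selected = q[np.arange(len(actions)), actions]
print(selected)  # [0.9 0.7]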
Example #5
class ExpActorCritic2(ActorCritic):
    def __init__(self, state_dim, action_dim, memory_size, shared_net=[]):
        self.action_dim = action_dim
        self.state = tf.placeholder(tf.float32, (None, state_dim),
                                    name='state')

        self.shared = self.state
        with tf.variable_scope("shared"):
            for i, layer in enumerate(shared_net):
                self.shared = fullyConnected("layer_%i" % i, self.shared,
                                             layer, tf.nn.relu)

        self.action_ph = tf.placeholder(tf.int32, (None, ))
        self.grad_ph = tf.placeholder(tf.float32, (None, 1))
        self.value_ph = tf.placeholder(tf.float32, (None, 1))

        self.memory = GeneralMemory(memory_size, state_dim, 0, 1, state_dim,
                                    -1)
        self.com_memory = GeneralMemory(memory_size, state_dim, 0, 1,
                                        state_dim, -1)
        self.com_average = RunningAverage(.001)
        # latest critic gradients, refreshed by train(); starts high so
        # sleep() always trains at least once
        self.g = np.array([np.inf])

    def initializeActor(self, layers=[400, 300], optimizer=None, entropy=0.0):
        optimizer = optimizer or tf.train.AdamOptimizer(.0001)

        def _make():
            net = self.shared
            for i, layer in enumerate(layers):
                net = fullyConnected("layer_%i" % i, net, layer, tf.nn.relu,
                                     .01)
            actions = fullyConnected("actor_output", net, self.action_dim,
                                     tf.nn.softmax, .01)
            return actions

        with tf.variable_scope("actor"):
            self.actor = _make()

        row = tf.expand_dims(tf.range(0, tf.shape(self.actor)[0]), axis=1)
        col = tf.expand_dims(self.action_ph, axis=1)
        indexes = tf.concat([row, col], axis=1)
        action_probability = tf.gather_nd(self.actor, indexes)

        self.single_loss = tf.log(action_probability + 1e-8) * self.grad_ph
        self.actor_loss = -tf.reduce_sum(self.single_loss)
        if entropy != 0.0:
            self.actor_loss -= entropyLoss(self.actor) * entropy

        self.train_actor = optimizer.minimize(self.actor_loss)

    def initializeCritic(self, layers=[400, 300], optimizer=None):
        optimizer = optimizer or tf.train.AdamOptimizer(.01)

        def _make():
            net = self.shared
            for i, layer in enumerate(layers):
                net = fullyConnected("layer_%i" % i, net, layer, tf.nn.relu,
                                     .1)
            return fullyConnected("critic_output", net, 1, initializer=.001)

        with tf.variable_scope("critic"):
            self.critic = _make()

        self.critic_single_loss = tf.square(self.value_ph - self.critic)
        self.critic_loss = tf.reduce_mean(self.critic_single_loss)
        self.get_grad = tf.gradients(self.critic_single_loss, self.critic)
        self.clipped_grad_ph = tf.placeholder(tf.float32, [None, 1])

        variables = getScopeParameters("critic")
        grads = tf.gradients(self.critic, variables, self.clipped_grad_ph)
        clipped = zip(grads, variables)

        self.train_critic = optimizer.apply_gradients(clipped)

    def _train(self, memory, session, batch_size, discount=.97):
        try:
            states, actions, rewards, next_states, terminals = memory.sample(
                batch_size)
        except ValueError:
            # requested buffer has too few samples yet; fall back to the main one
            states, actions, rewards, next_states, terminals = self.memory.sample(
                batch_size)
        next_state_value = session.run(self.critic, {self.state: next_states})
        state_value = session.run(self.critic, {self.state: states})
        observed_value = rewards + discount * next_state_value
        observed_value[terminals] = rewards[terminals] / (1 - discount)

        grad, single_loss, loss = session.run(
            [self.get_grad, self.critic_single_loss, self.critic_loss], {
                self.value_ph: observed_value,
                self.state: states
            })

        self.com_average.update(loss)
        return states, actions, observed_value - state_value, grad[0]

    def train(self, session, batch_size, discount=.97):
        s, a, v, g = self._train(self.memory, session, batch_size, discount)
        cs, ca, cv, cg = self._train(self.com_memory, session, batch_size,
                                     discount)
        # keep the latest critic gradients so grad() / sleep() can poll them
        self.g = np.concatenate([g, cg])

        session.run(
            self.train_critic, {
                self.state: np.concatenate([s, cs]),
                self.clipped_grad_ph: self.g
            })

        _, actor_l = session.run(
            [self.train_actor, self.actor_loss], {
                self.state: np.concatenate([s, cs]),
                self.action_ph: np.concatenate([a, ca]),
                self.grad_ph: np.concatenate([v, cv])
            })

        return actor_l

    def grad(self):
        return np.min(np.abs(self.g))

    def memorize(self, state, action, reward, next_state, terminal, session):
        next_state_value = session.run(self.critic,
                                       {self.state: [next_state]})[0][0]
        state_value = session.run(self.critic, {self.state: [state]})[0][0]
        observed_value = reward + .97 * next_state_value
        loss = np.square(observed_value - state_value)

        if loss < self.com_average.avg or self.memory.count() == 0:
            self.memory.add(state, action, reward, next_state, terminal)
            return 0, loss
        else:
            self.com_memory.add(state, action, reward, next_state, terminal)
            return 1, loss

    def sleep(self, session, duration):
        start_time = time()
        while self.grad() > .001:
            self.train(session, 8)
            if time() - start_time > duration:
                print("time up")
                return
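
RunningAverage is repo-local and not shown here; judging from how com_average.update(loss) and com_average.avg are used above, a plausible minimal stand-in is an exponential moving average. The sketch below is an assumption, not the repo's actual implementation:

class RunningAverage:
    """Hypothetical stand-in: exponential moving average with rate alpha."""

    def __init__(self, alpha):
        self.alpha = alpha
        self.avg = 0.0

    def update(self, value):
        self.avg += self.alpha * (value - self.avg)
        return self.avg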
Example #6
class ExpActorCritic(ActorCritic):
    def __init__(self, state_dim, action_dim, memory_size, shared_net=[]):
        self.action_dim = action_dim
        self.state = tf.placeholder(tf.float32, (None, state_dim))

        self.shared = self.state
        with tf.variable_scope("shared"):
            for i, layer in enumerate(shared_net):
                self.shared = fullyConnected("layer_%i" % i, self.shared,
                                             layer, tf.nn.relu)

        self.action_ph = tf.placeholder(tf.int32, (None, ))
        self.grad_ph = tf.placeholder(tf.float32, (None, 1))
        self.value_ph = tf.placeholder(tf.float32, (None, 1))

        self.memory = GeneralMemory(memory_size, state_dim, 0, 1, state_dim,
                                    -1)
        self.memory_average = RunningAverage(.01)
        self.com_memory = GeneralMemory(memory_size, state_dim, 0, 1,
                                        state_dim, -1)
        self.com_average = RunningAverage(.01)

    def initializeActor(self, layers=[400, 300], optimizer=None, entropy=0.0):
        optimizer = optimizer or tf.train.AdamOptimizer(.0001)

        def _make():
            net = self.shared
            for i, layer in enumerate(layers):
                net = fullyConnected("layer_%i" % i, net, layer, tf.nn.relu,
                                     .01)
            actions = fullyConnected("actor_output", net, self.action_dim,
                                     tf.nn.softmax, .01)
            return actions

        with tf.variable_scope("actor"):
            self.actor = _make()

        row = tf.expand_dims(tf.range(0, tf.shape(self.actor)[0]), axis=1)
        col = tf.expand_dims(self.action_ph, axis=1)
        indexes = tf.concat([row, col], axis=1)
        action_probability = tf.gather_nd(self.actor, indexes)

        self.single_loss = tf.log(action_probability + 1e-8) * self.grad_ph
        self.actor_loss = -tf.reduce_sum(self.single_loss)
        if entropy != 0.0:
            self.actor_loss -= entropyLoss(self.actor) * entropy

        self.train_actor = optimizer.minimize(self.actor_loss)

    def initializeCritic(self, layers=[400, 300], optimizer=None):
        optimizer = optimizer or tf.train.AdamOptimizer(.01)

        def _make():
            net = self.shared
            for i, layer in enumerate(layers):
                net = fullyConnected("layer_%i" % i, net, layer, tf.nn.relu,
                                     .1)
            return fullyConnected("critic_output", net, 1, initializer=.001)

        with tf.variable_scope("critic"):
            self.critic = _make()

        self.critic_single_loss = tf.square(self.value_ph - self.critic)
        self.critic_loss = tf.reduce_mean(self.critic_single_loss)

        self.train_critic = optimizer.minimize(self.critic_loss)

    def train(self, session, batch_size, discount=.97):
        try:
            states, actions, rewards, next_states, terminals = self.memory.sample(
                batch_size)
        except ValueError:
            return 0
        next_state_value = session.run(self.critic, {self.state: next_states})
        state_value = session.run(self.critic, {self.state: states})
        observed_value = rewards + discount * next_state_value
        observed_value[terminals] = rewards[terminals] / (1 - discount)

        single_loss, loss, _ = session.run(
            [self.critic_single_loss, self.critic_loss, self.train_critic], {
                self.value_ph: observed_value,
                self.state: states
            })

        self.memory_average.update(loss)
        # tops = np.squeeze(single_loss > self.com_average.avg)
        # self.com_memory.addBatch(states[tops], actions[tops], rewards[
        #                          tops], next_states[tops], terminals[tops])

        _, actor_l = session.run(
            [self.train_actor, self.actor_loss], {
                self.state: states,
                self.action_ph: actions,
                self.grad_ph: observed_value - state_value
            })
        return loss  # , 0  # sum(tops)

    def train2(self, session, batch_size, discount=.97):
        states, actions, rewards, next_states, terminals = self.com_memory.sample(
            batch_size)
        next_state_value = session.run(self.critic, {self.state: next_states})
        # state_value = session.run(
        #     self.critic, {self.state: states})
        observed_value = rewards + discount * next_state_value
        observed_value[terminals] = rewards[terminals] / (1 - discount)

        com_single_loss, com_loss, _ = session.run(
            [self.critic_single_loss, self.critic_loss, self.train_critic], {
                self.value_ph: observed_value,
                self.state: states
            })
        self.com_average.update(com_loss)
        return com_loss

    def memorize(self, state, action, reward, next_state, terminal, session):
        next_state_value = session.run(self.critic,
                                       {self.state: [next_state]})[0]
        state_value = session.run(self.critic, {self.state: [state]})[0]
        observed_value = reward + .97 * next_state_value
        loss = np.square(observed_value - state_value)
        if loss > self.com_average.avg:
            self.com_memory.add(state, action, reward, next_state, terminal)
            return 1
        self.memory.add(state, action, reward, next_state, terminal)
        return 0
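
memorize routes each transition by its one-step TD error: transitions whose squared error exceeds the running average go to com_memory, the rest to the ordinary buffer. A toy, self-contained version of just that routing decision (all names and numbers are illustrative):

import numpy as np

def route(reward, state_value, next_state_value, avg_loss, discount=0.97):
    """Return 1 for the 'surprising' buffer, 0 for the ordinary one."""
    observed = reward + discount * next_state_value
    loss = np.square(observed - state_value)
    return int(loss > avg_loss)

print(route(reward=1.0, state_value=0.5, next_state_value=0.4, avg_loss=0.5))  # 1
print(route(reward=0.0, state_value=0.1, next_state_value=0.1, avg_loss=0.5))  # 0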