Example #1
    def __init__(self,
                 name,
                 model,
                 learning_rate=0.01,
                 state_size=4,
                 action_size=2,
                 hidden_size=128,
                 batch_size=64,
                 context_size=32):

        with tf.variable_scope(name):
            self._model = model

            self._context_x = tf.placeholder(tf.float32,
                                             [None, context_size, state_size])
            self._context_y = tf.placeholder(tf.int32,
                                             [None, context_size, action_size])
            self._target_x = tf.placeholder(tf.float32, [None, state_size])

            self._query = self._target_x

            self._actions = tf.placeholder(tf.int32, [batch_size],
                                           name='actions')

            self.output = tf.keras.layers.Flatten()(self._target_x)
            self.output = tf.keras.layers.Dense(32,
                                                activation='relu')(self.output)
            self.output = tf.keras.layers.Dense(32,
                                                activation='relu')(self.output)
            self.output = tf.keras.layers.Dense(32,
                                                activation='relu')(self.output)
            self.output = tf.keras.layers.Dense(32,
                                                activation='relu')(self.output)
            self.output = tf.keras.layers.Dense(action_size,
                                                activation=None)(self.output)

            #self.output = supervised_snail(_target_x, 1, 256)

            #self.rep = tf.squeeze(tf.concat([self.mu, self.sigma], axis=1))

            #self.output = model(self._query, 1)

            self.name = name

            self._targetQs = tf.placeholder(tf.float32,
                                            [batch_size, action_size],
                                            name='target')
            self.reward = tf.placeholder(tf.float32, [batch_size],
                                         name='reward')
            self.discount = tf.constant(0.99,
                                        shape=[batch_size],
                                        dtype=tf.float32,
                                        name='discount')

            q_loss, q_learning = trfl.qlearning(self.output, self._actions,
                                                self.reward, self.discount,
                                                self._targetQs)
            self.loss = tf.reduce_mean(q_loss)
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(
                self.loss)
Example #2
    def __init__(self,
                 learning_rate=0.01,
                 state_size=4,
                 action_size=2,
                 hidden_size=10,
                 batch_size=20,
                 name='QNetwork'):
        # state inputs to the Q-network
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size],
                                          name='inputs')

            # One hot encode the actions to later choose the Q-value for the action
            self.actions_ = tf.placeholder(tf.int32, [batch_size],
                                           name='actions')
            #one_hot_actions = tf.one_hot(self.actions_, action_size)

            # Target Q values for training
            #self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')

            # ReLU hidden layers
            self.fc1 = tf.contrib.layers.fully_connected(
                self.inputs_, hidden_size)
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, hidden_size)

            # Linear output layer
            self.output = tf.contrib.layers.fully_connected(self.fc2,
                                                            action_size,
                                                            activation_fn=None)

            #Non trfl way from tutorial: https://github.com/udacity/deep-learning/blob/master/reinforcement/Q-learning-cart.ipynb
            ### Train with loss (targetQ - Q)^2
            # output has length 2, for two actions. This next line chooses
            # one value from output (per row) according to the one-hot encoded actions.
            #             self.Q = tf.reduce_sum(tf.multiply(self.output, one_hot_actions), axis=1)

            #             self.loss = tf.reduce_mean(tf.square(self.targetQs_ - self.Q))
            #             self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

            #TRFL way
            self.targetQs_ = tf.placeholder(tf.float32,
                                            [batch_size, action_size],
                                            name='target')
            self.reward = tf.placeholder(tf.float32, [batch_size],
                                         name="reward")
            self.discount = tf.constant(0.99,
                                        shape=[batch_size],
                                        dtype=tf.float32,
                                        name="discount")

            #TRFL qlearning
            qloss, q_learning = trfl.qlearning(self.output, self.actions_,
                                               self.reward, self.discount,
                                               self.targetQs_)
            self.loss = tf.reduce_mean(qloss)
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(
                self.loss)
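A minimal usage sketch for the network above, assuming the __init__ shown belongs to a class named QNetwork and TF 1.x graph mode; the class name and the random batch below are illustrative assumptions, not part of the original example:

import numpy as np
import tensorflow as tf

mainQN = QNetwork(name='main', batch_size=20)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # One training step on a randomly generated batch of transitions.
    states = np.random.rand(20, 4).astype(np.float32)   # [batch_size, state_size]
    actions = np.random.randint(0, 2, size=20)          # [batch_size]
    rewards = np.random.rand(20).astype(np.float32)     # [batch_size]
    next_states = np.random.rand(20, 4).astype(np.float32)

    # Bootstrap the target Q-values from the next states.
    target_qs = sess.run(mainQN.output,
                         feed_dict={mainQN.inputs_: next_states})

    loss, _ = sess.run([mainQN.loss, mainQN.opt],
                       feed_dict={mainQN.inputs_: states,
                                  mainQN.actions_: actions,
                                  mainQN.reward: rewards,
                                  mainQN.targetQs_: target_qs})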
Example #3
    def __init__(self,
                 name,
                 learning_rate=0.01,
                 state_size=4,
                 action_size=2,
                 hidden_size=128,
                 batch_size=64):

        with tf.variable_scope(name):

            #Input placeholder
            self._target_x = tf.placeholder(tf.float32, [None, state_size])

            # Action placeholder
            self._actions = tf.placeholder(tf.int32, [batch_size],
                                           name='actions')

            # Snail network. This is where all the work happens.
            self.output = supervised_snail(self._target_x, 1, hidden_size)
            self.output = tf.keras.layers.Dense(action_size,
                                                activation=None)(self.output)

            self.name = name

            self._targetQs = tf.placeholder(tf.float32,
                                            [batch_size, action_size],
                                            name='target')
            self.reward = tf.placeholder(tf.float32, [batch_size],
                                         name='reward')
            self.discount = tf.constant(0.99,
                                        shape=[batch_size],
                                        dtype=tf.float32,
                                        name='discount')

            q_loss, q_learning = trfl.qlearning(self.output, self._actions,
                                                self.reward, self.discount,
                                                self._targetQs)
            self.loss = tf.reduce_mean(q_loss)
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(
                self.loss)
Example #4
    def __init__(self,
                 name,
                 learning_rate=0.01,
                 state_size=[80, 80, 3],
                 action_size=6,
                 hidden_size=10,
                 batch_size=20):
        # state inputs to the Q-network
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(
                tf.float32,
                [None, state_size[0], state_size[1], state_size[2]],
                name='inputs')

            # Actions for the QNetwork:
            # One-hot vector, with each action being as follows:
            # (look_left, look_right, strafe_left, strafe_right, forward, backward)
            # These are mapped to the deepmind-lab (not one-hot) actions with the same names
            # defined in ACTIONS

            # One hot encode the actions to later choose the Q-value for the action
            self.actions_ = tf.placeholder(tf.int32, [batch_size],
                                           name='actions')
            # one_hot_actions = tf.one_hot(self.actions_, action_size)

            # Target Q values for training
            # self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')

            # ReLU hidden layers
            self.conv1 = tf.contrib.layers.conv2d(self.inputs_,
                                                  output_filters_conv1,
                                                  kernel_size=8,
                                                  stride=2)
            self.conv2 = tf.contrib.layers.conv2d(self.conv1,
                                                  output_filters_conv2,
                                                  kernel_size=4,
                                                  stride=2)
            self.conv3 = tf.contrib.layers.conv2d(self.conv2,
                                                  output_filters_conv3,
                                                  kernel_size=4,
                                                  stride=1)

            self.fc1 = tf.contrib.layers.fully_connected(
                tf.reshape(self.conv3, [
                    -1,
                    self.conv3.shape[1] * self.conv3.shape[2] *
                    self.conv3.shape[3]
                ]), hidden_size)

            # Linear output layer
            self.output = tf.contrib.layers.fully_connected(self.fc1,
                                                            action_size,
                                                            activation_fn=None)

            # tf.summary.histogram("output", self.output)

            print("Network shapes:")
            print(self.conv1.shape)
            print(self.conv2.shape)
            print(self.conv3.shape)
            print(self.fc1.shape)
            print(self.output.shape)

            self.name = name

            #TRFL way
            self.targetQs_ = tf.placeholder(tf.float32,
                                            [batch_size, action_size],
                                            name='target')
            self.reward = tf.placeholder(tf.float32, [batch_size],
                                         name="reward")
            self.discount = tf.constant(gamma,
                                        shape=[batch_size],
                                        dtype=tf.float32,
                                        name="discount")

            #TRFL qlearning
            qloss, q_learning = trfl.qlearning(self.output, self.actions_,
                                               self.reward, self.discount,
                                               self.targetQs_)
            self.loss = tf.reduce_mean(qloss)
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(
                self.loss)
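The convolutional example above references output_filters_conv1, output_filters_conv2, output_filters_conv3 and gamma, which are defined elsewhere in the original module; the values below are illustrative assumptions, not taken from the source:

# Assumed module-level constants used by the network above (illustrative values).
output_filters_conv1 = 32    # filters in the first conv layer
output_filters_conv2 = 64    # filters in the second conv layer
output_filters_conv3 = 64    # filters in the third conv layer
gamma = 0.99                 # discount factor fed into trfl.qlearning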
Example #5
    def _forward(self, inputs: Any) -> None:
        data = tree.map_structure(
            lambda v: tf.expand_dims(v, axis=0) if len(v.shape) <= 1 else v,
            inputs.data)
        data = tf2_utils.batch_to_sequence(data)

        observations, actions, rewards, discounts, _, extra = data

        core_state = tree.map_structure(lambda s: s[:, 0, :],
                                        inputs.data.extras["core_states"])
        core_message = tree.map_structure(lambda s: s[:, 0, :],
                                          inputs.data.extras["core_messages"])
        T = actions[self._agents[0]].shape[0]

        # Use the fact that the end of an episode always carries the reward
        # to find episode lengths. These are used to mask the loss.
        ep_end = tf.argmax(tf.math.abs(rewards[self._agents[0]]), axis=0)

        with tf.GradientTape(persistent=True) as tape:
            q_network_losses: Dict[str, NestedArray] = {
                agent: {
                    "q_value_loss": tf.zeros(())
                }
                for agent in self._agents
            }

            state = {agent: core_state[agent][0] for agent in self._agents}
            target_state = {
                agent: core_state[agent][0]
                for agent in self._agents
            }

            message = {agent: core_message[agent][0] for agent in self._agents}
            target_message = {
                agent: core_message[agent][0]
                for agent in self._agents
            }

            # _target_q_networks must be 1 step ahead
            target_channel = self._communication_module.process_messages(
                target_message)
            for agent in self._agents:
                agent_key = self.agent_net_keys[agent]
                (q_targ, m), s = self._target_q_networks[agent_key](
                    observations[agent].observation[0],
                    target_state[agent],
                    target_channel[agent],
                )
                target_state[agent] = s
                target_message[agent] = m

            for t in range(1, T, 1):
                channel = self._communication_module.process_messages(message)
                target_channel = self._communication_module.process_messages(
                    target_message)

                for agent in self._agents:
                    agent_key = self.agent_net_keys[agent]

                    # Cast the additional discount
                    # to match the environment discount dtype.

                    discount = tf.cast(self._discount,
                                       dtype=discounts[agent][0].dtype)

                    (q_targ, m), s = self._target_q_networks[agent_key](
                        observations[agent].observation[t],
                        target_state[agent],
                        target_channel[agent],
                    )

                    target_state[agent] = s
                    target_message[agent] = tf.math.multiply(
                        m, observations[agent].observation[t][:, :1])

                    (q, m), s = self._q_networks[agent_key](
                        observations[agent].observation[t - 1],
                        state[agent],
                        channel[agent],
                    )

                    state[agent] = s
                    message[agent] = tf.math.multiply(
                        m, observations[agent].observation[t - 1][:, :1])

                    # Mask target
                    q_targ = tf.concat(
                        [[q_targ[i]]
                         if t <= ep_end[i] else [tf.zeros_like(q_targ[i])]
                         for i in range(q_targ.shape[0])],
                        axis=0,
                    )

                    loss, _ = trfl.qlearning(
                        q,
                        actions[agent][t - 1],
                        rewards[agent][t - 1],
                        discount * discounts[agent][t],
                        q_targ,
                    )

                    # Index loss (mask ended episodes)
                    if not tf.reduce_any(t - 1 <= ep_end):
                        continue

                    loss = tf.reduce_mean(loss[t - 1 <= ep_end])
                    # loss = tf.reduce_mean(loss)
                    q_network_losses[agent]["q_value_loss"] += loss

        self._q_network_losses = q_network_losses
        self.tape = tape
Example #6
#!/usr/bin/env python
# coding:utf8

# pip install tensorflow  # version 1.8 or later
# pip install git+git://github.com/deepmind/trfl.git
import tensorflow as tf
import trfl

# Q-values for the previous and next timesteps, shape [batch_size, num_actions].
q_tm1 = tf.get_variable("q_tm1",
                        initializer=[[1., 1., 0.], [1., 2., 0.]],
                        dtype=tf.float32)
q_t = tf.get_variable("q_t",
                      initializer=[[0., 1., 0.], [1., 2., 0.]],
                      dtype=tf.float32)

# Action indices, discounts and rewards, shape [batch_size].
a_tm1 = tf.constant([0, 1], dtype=tf.int32)
r_t = tf.constant([1, 1], dtype=tf.float32)
pcont_t = tf.constant([0, 1], dtype=tf.float32)  # the discount factor

# Q-learning loss, and auxiliary data.
loss, q_learning = trfl.qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t)

reduced_loss = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
train_op = optimizer.minimize(reduced_loss)
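To execute this TF 1.x graph, the ops above can be run in a session; a minimal sketch (the loop length and printing are illustrative):

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10):
        # Each run applies one Adam step to minimize the mean Q-learning loss.
        loss_value, _ = sess.run([reduced_loss, train_op])
        print(step, loss_value)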
Example #7
def q_learning(vision_model_dict, agent_model_dict, target_agent_model_dict,
               inputs, batch_size, kp_type, agent_size, mask_threshold,
               patch_sizes, kpt_encoder_type, mp_steps, img_size, lsp_layers,
               window_size, gamma, double_q, n_step_q):
    """
	:param vision_model_dict:
	:param agent_model_dict:
	:param target_agent_model_dict:
	:param inputs: bottom_up_kpt inputs [batch, T, dims]
	:param batch_size: (int)
	:param kp_type: (str) "transporter" or "permakey" type of keypoint used for bottom-up processing
	:param agent_size: (int) size of agent lstm
	:param mask_threshold: (float)
	:param patch_sizes: (int) size of patch size for "permakey" keypoints
	:param kpt_encoder_type: (str) "cnn" for conv-net "gnn" for graph-net
	:param mp_steps: (int) number of message-passing steps in GNNs
	:param img_size: (int) size of input image (H for H x H img)
	:param lsp_layers: (tuple) of layers for "permakey" keypoints
	:param window_size: (int) size of window used for recurrent q-learning
	:param gamma: (float) discount factor
	:param double_q: (bool) True if using double q-learning
	:param n_step_q: (int) 'n' value used for n-step q-learning
	:return:
	bottom_up_maps: keypoint gaussian masks
	bottom_up_features: bottom-up keypoint features
	"""

    # unpacking elements from sampled trajectories from buffer
    obses_tm1, a_tm1, r_t, dones = (inputs[0][0], inputs[0][1], inputs[0][2],
                                    inputs[0][3])

    obses_tm1 = tf.cast(obses_tm1,
                        dtype=tf.float32) / 255.0  # (batch, T, H, W)

    # reshaping obs tensor (batch, T, H, W, C) -> (batch*T, H, W, C)
    obses_tm1_shape = obses_tm1.shape
    obses_tm1 = tf.reshape(obses_tm1, [
        obses_tm1_shape[0] * obses_tm1_shape[1], obses_tm1_shape[2],
        obses_tm1_shape[3], obses_tm1_shape[4]
    ])

    # 1 single forward pass of kpt-module for T-steps of frames
    vis_forward_start = time.time()
    bottom_up_maps, encoder_features, kpt_centers = vision_forward_pass(
        obses_tm1, vision_model_dict, lsp_layers, kp_type, patch_sizes,
        img_size)

    # reshaping tensors from (b*T, ...) -> (b, T, ...)
    bup_map_shape = bottom_up_maps.shape
    bottom_up_maps = tf.reshape(bottom_up_maps, [
        obses_tm1_shape[0], obses_tm1_shape[1], bup_map_shape[1],
        bup_map_shape[2], bup_map_shape[3]
    ])
    enc_feat_shape = encoder_features.shape
    encoder_features = tf.reshape(encoder_features, [
        obses_tm1_shape[0], obses_tm1_shape[1], enc_feat_shape[1],
        enc_feat_shape[2], enc_feat_shape[3]
    ])
    kpt_c_shape = kpt_centers.shape
    kpt_centers = tf.reshape(kpt_centers, [
        obses_tm1_shape[0], obses_tm1_shape[1], kpt_c_shape[1], kpt_c_shape[2]
    ])

    # splitting outputs into 2 parts  targets = (1:T) and qs = (0:T-1)
    bottom_up_maps_tm1 = bottom_up_maps[:, n_step_q:-1, :, :, :]
    bottom_up_maps_t = bottom_up_maps[:, n_step_q + 1:, :, :, :]
    encoder_features_tm1 = encoder_features[:, n_step_q:-1, :, :, :]
    encoder_features_t = encoder_features[:, n_step_q + 1:, :, :, :]
    kpt_centers_tm1 = kpt_centers[:, n_step_q:-1, :, :]
    kpt_centers_t = kpt_centers[:, n_step_q + 1:, :, :]

    # collecting a_tm1, r_t and dones for n'th step bootstrapping
    a_tm1, r_t = tf.cast(a_tm1, dtype=tf.int32), tf.cast(r_t, dtype=tf.float32)
    a_tm1, r_t = a_tm1[:, n_step_q:-1, :], r_t[:, 0:-1, :]
    dones = tf.cast(dones, dtype=tf.float32)
    dones = dones[:, n_step_q + 1:, 1]  # dones for q_t's
    # switch batch and time axes so actions/dones are time-major,
    # i.e. (b, T, ..) -> (T, b, ..), matching the stacked q-value sequences below
    a_tm1 = tf.transpose(a_tm1, perm=[1, 0, 2])
    dones = tf.transpose(dones, perm=[1, 0])

    # reshaping tensors again (ugh!) (b, T-1, ...) -> (b*(T-1), ...)
    bup_tm1_shape = bottom_up_maps_tm1.shape
    bottom_up_maps_tm1 = tf.reshape(
        bottom_up_maps_tm1,
        [-1, bup_tm1_shape[2], bup_tm1_shape[3], bup_tm1_shape[4]])
    bottom_up_maps_t = tf.reshape(bottom_up_maps_t, bottom_up_maps_tm1.shape)

    enc_tm1_shape = encoder_features_tm1.shape
    encoder_features_tm1 = tf.reshape(
        encoder_features_tm1,
        [-1, enc_tm1_shape[2], enc_tm1_shape[3], enc_tm1_shape[4]])
    encoder_features_t = tf.reshape(encoder_features_t,
                                    encoder_features_tm1.shape)

    kptc_tm1_shape = kpt_centers_tm1.shape
    kpt_centers_tm1 = tf.reshape(kpt_centers_tm1,
                                 [-1, kptc_tm1_shape[2], kptc_tm1_shape[3]])
    kpt_centers_t = tf.reshape(kpt_centers_t, kpt_centers_tm1.shape)

    # compute keypoint encodings
    kpts_features_tm1 = encode_keypoints(
        bottom_up_maps_tm1,
        encoder_features_tm1,
        kpt_centers_tm1,
        mask_threshold,
        kp_type,
        kpt_encoder_type,
        mp_steps,
        True,
        pos_net=agent_model_dict.get("pos_net"),
        kpt_encoder=agent_model_dict.get("kpt_encoder"),
        node_encoder=agent_model_dict.get(
            "node_enc"))  # passes none if not available

    kpts_features_t = encode_keypoints(
        bottom_up_maps_t,
        encoder_features_t,
        kpt_centers_t,
        mask_threshold,
        kp_type,
        kpt_encoder_type,
        mp_steps,
        True,
        pos_net=target_agent_model_dict.get("pos_net"),
        kpt_encoder=target_agent_model_dict.get("kpt_encoder"),
        node_encoder=target_agent_model_dict.get(
            "node_enc"))  # passes none if not available

    # reshaping back the time axis (b*T, dims) -> (b, T, dims)
    kpts_features_tm1 = tf.expand_dims(kpts_features_tm1, axis=1)
    kpts_tm1_shape = kpts_features_tm1.shape
    kpts_features_tm1 = tf.reshape(
        kpts_features_tm1, [batch_size, window_size, kpts_tm1_shape[-1]])

    kpts_features_t = tf.expand_dims(kpts_features_t, axis=1)
    kpts_t_shape = kpts_features_t.shape
    kpts_features_t = tf.reshape(kpts_features_t,
                                 [batch_size, window_size, kpts_t_shape[-1]])

    # RNN computation
    q_tm1_seq = []
    q_t_seq = []
    q_t_selector_seq = []

    # reset lstm state at start of update as in R-DQN random updates
    c_tm1 = tf.Variable(tf.zeros((batch_size, agent_size)), trainable=True)
    h_tm1 = tf.Variable(tf.zeros((batch_size, agent_size)), trainable=True)
    h_t_sel = tf.Variable(tf.zeros((batch_size, agent_size)), trainable=True)
    c_t_sel = tf.Variable(tf.zeros((batch_size, agent_size)), trainable=True)
    h_t = tf.Variable(tf.zeros((batch_size, agent_size)),
                      trainable=False)  # td_targets
    c_t = tf.Variable(tf.zeros((batch_size, agent_size)),
                      trainable=False)  # td_targets
    rnn_unroll_start = time.time()

    # RNN unrolling
    for seq_idx in tf.range(window_size):
        s_tm1 = kpts_features_tm1[:, seq_idx, :]
        s_t = kpts_features_t[:, seq_idx, :]
        # double_q action selection step
        if double_q:
            q_t_selector, h_t_sel, c_t_sel = agent_model_dict["agent_net"](
                s_t, [h_t_sel, c_t_sel], training=True)
            q_t_selector_seq.append(q_t_selector)

        q_tm1, h_tm1, c_tm1 = agent_model_dict["agent_net"](s_tm1,
                                                            [h_tm1, c_tm1],
                                                            training=True)
        q_tm1_seq.append(q_tm1)
        q_t, h_t, c_t = target_agent_model_dict["agent_net"](s_t, [h_t, c_t],
                                                             training=False)
        q_t_seq.append(q_t)
    # print("RNN for loop unrolling took %s" % (time.time() - rnn_unroll_start))

    q_tm1 = tf.convert_to_tensor(q_tm1_seq, dtype=tf.float32)
    q_t = tf.convert_to_tensor(q_t_seq, dtype=tf.float32)

    # compute cumulative discounted reward over 'n' steps
    # (see the standalone NumPy sketch after this example)
    if n_step_q > 1:
        exponents = tf.constant(np.arange(n_step_q), dtype=tf.float32)
        discounts = tf.math.pow(gamma, exponents)
        # slice r_t [b, T] into moving windows of [b, t-k, k]  # cumsum over k steps
        r_t = tf.transpose(r_t, perm=[1, 0, 2])
        r_t_sliced = tf.convert_to_tensor(
            [r_t[t:t + n_step_q, :, :] for t in range(window_size)],
            dtype=tf.float32)
        r_t_sliced = tf.squeeze(tf.transpose(r_t_sliced, perm=[0, 2, 1, 3]))
        r_t_sl_shape = r_t_sliced.shape
        # reshape (batch, T, n) -> (batch*T, n)
        r_t_sliced = tf.reshape(
            r_t_sliced, [r_t_sl_shape[0] * r_t_sl_shape[1], r_t_sl_shape[2]])
        # r_t_slices [T*batch, n_steps] x  discounts [n_steps, 1]
        r_t = tf.linalg.matvec(r_t_sliced, discounts)
        r_t = tf.reshape(r_t, [r_t_sl_shape[0], r_t_sl_shape[1]])

    # reshape again to make tensors compatible with trfl API
    q_tm1_shape = q_tm1.shape
    q_tm1 = tf.reshape(q_tm1,
                       [q_tm1_shape[0] * q_tm1_shape[1], q_tm1_shape[2]])
    q_t = tf.reshape(q_t, [q_tm1_shape[0] * q_tm1_shape[1], q_tm1_shape[2]])
    a_tm1_shape = a_tm1.shape
    a_tm1 = tf.squeeze(
        tf.reshape(a_tm1, [a_tm1_shape[0] * a_tm1_shape[1], a_tm1_shape[2]]))
    r_t_shape = r_t.shape
    r_t = tf.reshape(r_t, [r_t_shape[0] * r_t_shape[1]])
    dones_shape = dones.shape
    dones = tf.reshape(dones, [dones_shape[0] * dones_shape[1]])

    p_cont = 0.0
    if n_step_q == 1:
        # discount factor (at t=1) for bootstrapped value
        p_cont = tf.math.multiply(tf.ones((dones.shape)) - dones, gamma)
    elif n_step_q > 1:
        # discount factor (at t=n+1) accordingly for bootstrapped value
        p_cont = tf.math.multiply(
            tf.ones((dones.shape)) - dones, tf.math.pow(gamma, n_step_q))

    loss, extra = 0.0, None
    if not double_q:
        loss, extra = trfl.qlearning(q_tm1, a_tm1, r_t, p_cont, q_t)
    elif double_q:
        q_t_selector = tf.convert_to_tensor(q_t_selector_seq, dtype=tf.float32)
        q_t_selector = tf.reshape(
            q_t_selector, [q_tm1_shape[0] * q_tm1_shape[1], q_tm1_shape[2]])
        loss, extra = trfl.double_qlearning(q_tm1, a_tm1, r_t, p_cont, q_t,
                                            q_t_selector)

    # average over batch_dim = (batch*time)
    loss = tf.reduce_mean(loss, axis=0)
    # print("Inside q_learning bellman updates took %4.5f" % (time.time() - q_backup_start))
    return loss, extra
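The slicing-and-matvec block above (taken when n_step_q > 1) computes n-step discounted returns; the following standalone NumPy sketch shows the same aggregation on illustrative shapes, not the original tensors:

import numpy as np

n_step_q, gamma = 3, 0.99
rewards = np.array([[0.0, 1.0, 0.0, 0.5, 1.0, 0.0]])    # (batch=1, T)
discounts = gamma ** np.arange(n_step_q)                 # [1, gamma, gamma^2]

# Sliding windows of n_step_q rewards starting at each timestep t ...
windows = np.stack([rewards[:, t:t + n_step_q]
                    for t in range(rewards.shape[1] - n_step_q + 1)],
                   axis=1)                               # (batch, T - n + 1, n)
# ... dotted with the discount vector give the n-step return for each t.
n_step_returns = windows @ discounts                     # (batch, T - n + 1)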