def __init__(self, name, model, learning_rate=0.01, state_size=4, action_size=2,
             hidden_size=128, batch_size=64, context_size=32):
    with tf.variable_scope(name):
        self._model = model

        # Context and query placeholders (the context inputs are only used by the
        # commented-out snail/model variants below).
        self._context_x = tf.placeholder(tf.float32, [None, context_size, state_size])
        self._context_y = tf.placeholder(tf.int32, [None, context_size, action_size])
        self._target_x = tf.placeholder(tf.float32, [None, state_size])
        self._query = self._target_x
        self._actions = tf.placeholder(tf.int32, [batch_size], name='actions')

        # Simple feed-forward Q-network head.
        self.output = tf.keras.layers.Flatten()(self._target_x)
        self.output = tf.keras.layers.Dense(32, activation='relu')(self.output)
        self.output = tf.keras.layers.Dense(32, activation='relu')(self.output)
        self.output = tf.keras.layers.Dense(32, activation='relu')(self.output)
        self.output = tf.keras.layers.Dense(32, activation='relu')(self.output)
        self.output = tf.keras.layers.Dense(action_size, activation=None)(self.output)
        # self.output = supervised_snail(_target_x, 1, 256)
        # self.rep = tf.squeeze(tf.concat([self.mu, self.sigma], axis=1))
        # self.output = model(self._query, 1)

        self.name = name

        # TRFL Q-learning loss and Adam optimizer.
        self._targetQs = tf.placeholder(tf.float32, [batch_size, action_size], name='target')
        self.reward = tf.placeholder(tf.float32, [batch_size], name='reward')
        self.discount = tf.constant(0.99, shape=[batch_size], dtype=tf.float32, name='discount')

        q_loss, q_learning = trfl.qlearning(self.output, self._actions, self.reward,
                                            self.discount, self._targetQs)
        self.loss = tf.reduce_mean(q_loss)
        self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
def __init__(self, learning_rate=0.01, state_size=4, action_size=2,
             hidden_size=10, batch_size=20, name='QNetwork'):
    # State inputs to the Q-network.
    with tf.variable_scope(name):
        self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')

        # One-hot encode the actions to later choose the Q-value for the action.
        self.actions_ = tf.placeholder(tf.int32, [batch_size], name='actions')
        # one_hot_actions = tf.one_hot(self.actions_, action_size)

        # Target Q values for training.
        # self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')

        # ReLU hidden layers.
        self.fc1 = tf.contrib.layers.fully_connected(self.inputs_, hidden_size)
        self.fc2 = tf.contrib.layers.fully_connected(self.fc1, hidden_size)

        # Linear output layer.
        self.output = tf.contrib.layers.fully_connected(self.fc2, action_size,
                                                        activation_fn=None)

        # Non-TRFL way, from the tutorial:
        # https://github.com/udacity/deep-learning/blob/master/reinforcement/Q-learning-cart.ipynb
        ### Train with loss (targetQ - Q)^2
        # output has length 2, for two actions. This next line chooses
        # one value from output (per row) according to the one-hot encoded actions.
        # self.Q = tf.reduce_sum(tf.multiply(self.output, one_hot_actions), axis=1)
        # self.loss = tf.reduce_mean(tf.square(self.targetQs_ - self.Q))
        # self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

        # TRFL way
        self.targetQs_ = tf.placeholder(tf.float32, [batch_size, action_size], name='target')
        self.reward = tf.placeholder(tf.float32, [batch_size], name="reward")
        self.discount = tf.constant(0.99, shape=[batch_size], dtype=tf.float32, name="discount")

        # TRFL qlearning
        qloss, q_learning = trfl.qlearning(self.output, self.actions_, self.reward,
                                           self.discount, self.targetQs_)
        self.loss = tf.reduce_mean(qloss)
        self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
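# Usage sketch (an assumption, not part of the original snippet): one training step
# for the __init__ above, assuming it belongs to a class named QNetwork and that
# `states`, `actions`, `rewards` (replay-sampled numpy arrays) and `target_q_values`
# (target-network Q(s', .) of shape [batch_size, action_size]) already exist.
main_qn = QNetwork(name='main')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    loss_value, _ = sess.run(
        [main_qn.loss, main_qn.opt],
        feed_dict={
            main_qn.inputs_: states,             # [batch_size, state_size]
            main_qn.actions_: actions,           # [batch_size] int action indices
            main_qn.reward: rewards,             # [batch_size]
            main_qn.targetQs_: target_q_values,  # [batch_size, action_size]
        })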
def __init__(self, name, learning_rate=0.01, state_size=4, action_size=2,
             hidden_size=128, batch_size=64):
    with tf.variable_scope(name):
        # Input placeholder.
        self._target_x = tf.placeholder(tf.float32, [None, state_size])
        # Action placeholder.
        self._actions = tf.placeholder(tf.int32, [batch_size], name='actions')

        # SNAIL network. This is where all the work happens.
        self.output = supervised_snail(self._target_x, 1, hidden_size)
        self.output = tf.keras.layers.Dense(action_size, activation=None)(self.output)

        self.name = name

        self._targetQs = tf.placeholder(tf.float32, [batch_size, action_size], name='target')
        self.reward = tf.placeholder(tf.float32, [batch_size], name='reward')
        self.discount = tf.constant(0.99, shape=[batch_size], dtype=tf.float32, name='discount')

        q_loss, q_learning = trfl.qlearning(self.output, self._actions, self.reward,
                                            self.discount, self._targetQs)
        self.loss = tf.reduce_mean(q_loss)
        self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
def __init__(self, name, learning_rate=0.01, state_size=[80, 80, 3], action_size=6,
             hidden_size=10, batch_size=20):
    # NOTE: output_filters_conv1/2/3 and gamma are assumed to be defined at module level.
    # State inputs to the Q-network.
    with tf.variable_scope(name):
        self.inputs_ = tf.placeholder(
            tf.float32, [None, state_size[0], state_size[1], state_size[2]], name='inputs')

        # Actions for the QNetwork:
        # One-hot vector, with each action being as follows:
        # (look_left, look_right, strafe_left, strafe_right, forward, backward)
        # These are mapped to the deepmind-lab (not one-hot) actions with the same names
        # defined in ACTIONS.
        # One-hot encode the actions to later choose the Q-value for the action.
        self.actions_ = tf.placeholder(tf.int32, [batch_size], name='actions')
        # one_hot_actions = tf.one_hot(self.actions_, action_size)

        # Target Q values for training.
        # self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')

        # ReLU hidden layers.
        self.conv1 = tf.contrib.layers.conv2d(self.inputs_, output_filters_conv1,
                                              kernel_size=8, stride=2)
        self.conv2 = tf.contrib.layers.conv2d(self.conv1, output_filters_conv2,
                                              kernel_size=4, stride=2)
        self.conv3 = tf.contrib.layers.conv2d(self.conv2, output_filters_conv3,
                                              kernel_size=4, stride=1)
        self.fc1 = tf.contrib.layers.fully_connected(
            tf.reshape(self.conv3,
                       [-1, self.conv3.shape[1] * self.conv3.shape[2] * self.conv3.shape[3]]),
            hidden_size)

        # Linear output layer.
        self.output = tf.contrib.layers.fully_connected(self.fc1, action_size,
                                                        activation_fn=None)
        # tf.summary.histogram("output", self.output)

        print("Network shapes:")
        print(self.conv1.shape)
        print(self.conv2.shape)
        print(self.conv3.shape)
        print(self.fc1.shape)
        print(self.output.shape)

        self.name = name

        # TRFL way
        self.targetQs_ = tf.placeholder(tf.float32, [batch_size, action_size], name='target')
        self.reward = tf.placeholder(tf.float32, [batch_size], name="reward")
        self.discount = tf.constant(gamma, shape=[batch_size], dtype=tf.float32, name="discount")

        # TRFL qlearning
        qloss, q_learning = trfl.qlearning(self.output, self.actions_, self.reward,
                                           self.discount, self.targetQs_)
        self.loss = tf.reduce_mean(qloss)
        self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
def _forward(self, inputs: Any) -> None:
    data = tree.map_structure(
        lambda v: tf.expand_dims(v, axis=0) if len(v.shape) <= 1 else v, inputs.data)
    data = tf2_utils.batch_to_sequence(data)

    observations, actions, rewards, discounts, _, extra = data

    core_state = tree.map_structure(lambda s: s[:, 0, :],
                                    inputs.data.extras["core_states"])
    core_message = tree.map_structure(lambda s: s[:, 0, :],
                                      inputs.data.extras["core_messages"])

    T = actions[self._agents[0]].shape[0]

    # Use fact that end of episode always has the reward to
    # find episode lengths. This is used to mask loss.
    ep_end = tf.argmax(tf.math.abs(rewards[self._agents[0]]), axis=0)

    with tf.GradientTape(persistent=True) as tape:
        q_network_losses: Dict[str, NestedArray] = {
            agent: {"q_value_loss": tf.zeros(())} for agent in self._agents
        }

        state = {agent: core_state[agent][0] for agent in self._agents}
        target_state = {agent: core_state[agent][0] for agent in self._agents}

        message = {agent: core_message[agent][0] for agent in self._agents}
        target_message = {agent: core_message[agent][0] for agent in self._agents}

        # _target_q_networks must be 1 step ahead.
        target_channel = self._communication_module.process_messages(target_message)
        for agent in self._agents:
            agent_key = self.agent_net_keys[agent]
            (q_targ, m), s = self._target_q_networks[agent_key](
                observations[agent].observation[0],
                target_state[agent],
                target_channel[agent],
            )
            target_state[agent] = s
            target_message[agent] = m

        for t in range(1, T, 1):
            channel = self._communication_module.process_messages(message)
            target_channel = self._communication_module.process_messages(target_message)

            for agent in self._agents:
                agent_key = self.agent_net_keys[agent]

                # Cast the additional discount
                # to match the environment discount dtype.
                discount = tf.cast(self._discount, dtype=discounts[agent][0].dtype)

                (q_targ, m), s = self._target_q_networks[agent_key](
                    observations[agent].observation[t],
                    target_state[agent],
                    target_channel[agent],
                )
                target_state[agent] = s
                target_message[agent] = tf.math.multiply(
                    m, observations[agent].observation[t][:, :1])

                (q, m), s = self._q_networks[agent_key](
                    observations[agent].observation[t - 1],
                    state[agent],
                    channel[agent],
                )
                state[agent] = s
                message[agent] = tf.math.multiply(
                    m, observations[agent].observation[t - 1][:, :1])

                # Mask target.
                q_targ = tf.concat(
                    [[q_targ[i]] if t <= ep_end[i] else [tf.zeros_like(q_targ[i])]
                     for i in range(q_targ.shape[0])],
                    axis=0,
                )

                loss, _ = trfl.qlearning(
                    q,
                    actions[agent][t - 1],
                    rewards[agent][t - 1],
                    discount * discounts[agent][t],
                    q_targ,
                )

                # Index loss (mask ended episodes).
                if not tf.reduce_any(t - 1 <= ep_end):
                    continue

                loss = tf.reduce_mean(loss[t - 1 <= ep_end])
                # loss = tf.reduce_mean(loss)
                q_network_losses[agent]["q_value_loss"] += loss

    self._q_network_losses = q_network_losses
    self.tape = tape
#!/usr/bin/env python
# coding:utf8
# pip install tensorflow  # version 1.8 or later
# pip install git+git://github.com/deepmind/trfl.git
import tensorflow as tf
import trfl

# Q-values for the previous and next timesteps, shape [batch_size, num_actions].
q_tm1 = tf.get_variable("q_tm1", initializer=[[1., 1., 0.], [1., 2., 0.]], dtype=tf.float32)
q_t = tf.get_variable("q_t", initializer=[[0., 1., 0.], [1., 2., 0.]], dtype=tf.float32)

# Action indices, discounts and rewards, shape [batch_size].
a_tm1 = tf.constant([0, 1], dtype=tf.int32)
r_t = tf.constant([1, 1], dtype=tf.float32)
pcont_t = tf.constant([0, 1], dtype=tf.float32)  # the discount factor

# Q-learning loss, and auxiliary data.
loss, q_learning = trfl.qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t)

reduced_loss = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
train_op = optimizer.minimize(reduced_loss)
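# Running the example above requires a TF1-style session; a minimal sketch
# (assumed, not part of the original snippet):
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10):
        _, loss_value = sess.run([train_op, reduced_loss])
        print(step, loss_value)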
def q_learning(vision_model_dict, agent_model_dict, target_agent_model_dict, inputs,
               batch_size, kp_type, agent_size, mask_threshold, patch_sizes,
               kpt_encoder_type, mp_steps, img_size, lsp_layers, window_size, gamma,
               double_q, n_step_q):
    """
    :param vision_model_dict: (dict) of vision/keypoint network modules
    :param agent_model_dict: (dict) of online agent network modules
    :param target_agent_model_dict: (dict) of target agent network modules
    :param inputs: bottom_up_kpt inputs [batch, T, dims]
    :param batch_size: (int)
    :param kp_type: (str) "transporter" or "permakey" type of keypoint used for
        bottom-up processing
    :param agent_size: (int) size of agent lstm
    :param mask_threshold: (float)
    :param patch_sizes: (int) size of patch size for "permakey" keypoints
    :param kpt_encoder_type: (str) "cnn" for conv-net, "gnn" for graph-net
    :param mp_steps: (int) number of message-passing steps in GNNs
    :param img_size: (int) size of input image (H for H x H img)
    :param lsp_layers: (tuple) of layers for "permakey" keypoints
    :param window_size: (int) size of window used for recurrent q-learning
    :param gamma: (float) discount factor
    :param double_q: (bool) True if using double q-learning
    :param n_step_q: (int) 'n' value used for n-step q-learning
    :return: bottom_up_maps: keypoint gaussian masks
             bottom_up_features: bottom-up keypoint features
    """
    # NOTE: assumes `time`, `np` (numpy), vision_forward_pass and encode_keypoints
    # are imported/defined elsewhere in the module.

    # Unpacking elements from sampled trajectories from buffer.
    obses_tm1, a_tm1, r_t, dones = inputs[0][0], inputs[0][1], inputs[0][2], inputs[0][3]

    obses_tm1 = tf.cast(obses_tm1, dtype=tf.float32) / 255.0  # (batch, T, H, W)

    # Reshaping obs tensor (batch, T, H, W, C) -> (batch*T, H, W, C).
    obses_tm1_shape = obses_tm1.shape
    obses_tm1 = tf.reshape(obses_tm1, [
        obses_tm1_shape[0] * obses_tm1_shape[1], obses_tm1_shape[2],
        obses_tm1_shape[3], obses_tm1_shape[4]
    ])

    # 1 single forward pass of kpt-module for T steps of frames.
    vis_forward_start = time.time()
    bottom_up_maps, encoder_features, kpt_centers = vision_forward_pass(
        obses_tm1, vision_model_dict, lsp_layers, kp_type, patch_sizes, img_size)

    # Reshaping tensors from (b*T, ...) -> (b, T, ...).
    bup_map_shape = bottom_up_maps.shape
    bottom_up_maps = tf.reshape(bottom_up_maps, [
        obses_tm1_shape[0], obses_tm1_shape[1], bup_map_shape[1], bup_map_shape[2],
        bup_map_shape[3]
    ])
    enc_feat_shape = encoder_features.shape
    encoder_features = tf.reshape(encoder_features, [
        obses_tm1_shape[0], obses_tm1_shape[1], enc_feat_shape[1], enc_feat_shape[2],
        enc_feat_shape[3]
    ])
    kpt_c_shape = kpt_centers.shape
    kpt_centers = tf.reshape(kpt_centers, [
        obses_tm1_shape[0], obses_tm1_shape[1], kpt_c_shape[1], kpt_c_shape[2]
    ])

    # Splitting outputs into 2 parts: targets = (1:T) and qs = (0:T-1).
    bottom_up_maps_tm1, bottom_up_maps_t = (
        bottom_up_maps[:, n_step_q:-1, :, :, :], bottom_up_maps[:, n_step_q + 1:, :, :, :])
    encoder_features_tm1, encoder_features_t = (
        encoder_features[:, n_step_q:-1, :, :, :], encoder_features[:, n_step_q + 1:, :, :, :])
    kpt_centers_tm1, kpt_centers_t = (
        kpt_centers[:, n_step_q:-1, :, :], kpt_centers[:, n_step_q + 1:, :, :])

    # Collecting a_tm1, r_t and dones for n'th step bootstrapping.
    a_tm1, r_t = tf.cast(a_tm1, dtype=tf.int32), tf.cast(r_t, dtype=tf.float32)
    a_tm1, r_t = a_tm1[:, n_step_q:-1, :], r_t[:, 0:-1, :]
    dones = tf.cast(dones, dtype=tf.float32)
    dones = dones[:, n_step_q + 1:, 1]  # dones for q_t's

    # Switching batch and time axis to align all inputs, i.e. (T, b, ..) -> (b, T, ..).
    a_tm1 = tf.transpose(a_tm1, perm=[1, 0, 2])
    dones = tf.transpose(dones, perm=[1, 0])

    # Reshaping tensors again (ugh!) (b, T-1, ...) -> (b*(T-1), ...).
    bup_tm1_shape = bottom_up_maps_tm1.shape
    bottom_up_maps_tm1 = tf.reshape(
        bottom_up_maps_tm1, [-1, bup_tm1_shape[2], bup_tm1_shape[3], bup_tm1_shape[4]])
    bottom_up_maps_t = tf.reshape(bottom_up_maps_t, bottom_up_maps_tm1.shape)

    enc_tm1_shape = encoder_features_tm1.shape
    encoder_features_tm1 = tf.reshape(
        encoder_features_tm1, [-1, enc_tm1_shape[2], enc_tm1_shape[3], enc_tm1_shape[4]])
    encoder_features_t = tf.reshape(encoder_features_t, encoder_features_tm1.shape)

    kptc_tm1_shape = kpt_centers_tm1.shape
    kpt_centers_tm1 = tf.reshape(kpt_centers_tm1,
                                 [-1, kptc_tm1_shape[2], kptc_tm1_shape[3]])
    kpt_centers_t = tf.reshape(kpt_centers_t, kpt_centers_tm1.shape)

    # Compute keypoint encodings.
    kpts_features_tm1 = encode_keypoints(
        bottom_up_maps_tm1, encoder_features_tm1, kpt_centers_tm1, mask_threshold,
        kp_type, kpt_encoder_type, mp_steps, True,
        pos_net=agent_model_dict.get("pos_net"),
        kpt_encoder=agent_model_dict.get("kpt_encoder"),
        node_encoder=agent_model_dict.get("node_enc"))  # passes None if not available

    kpts_features_t = encode_keypoints(
        bottom_up_maps_t, encoder_features_t, kpt_centers_t, mask_threshold,
        kp_type, kpt_encoder_type, mp_steps, True,
        pos_net=target_agent_model_dict.get("pos_net"),
        kpt_encoder=target_agent_model_dict.get("kpt_encoder"),
        node_encoder=target_agent_model_dict.get("node_enc"))  # passes None if not available

    # Reshaping back the time axis (b*T, dims) -> (b, T, dims).
    kpts_features_tm1 = tf.expand_dims(kpts_features_tm1, axis=1)
    kpts_tm1_shape = kpts_features_tm1.shape
    kpts_features_tm1 = tf.reshape(kpts_features_tm1,
                                   [batch_size, window_size, kpts_tm1_shape[-1]])
    kpts_features_t = tf.expand_dims(kpts_features_t, axis=1)
    kpts_t_shape = kpts_features_t.shape
    kpts_features_t = tf.reshape(kpts_features_t,
                                 [batch_size, window_size, kpts_t_shape[-1]])

    # RNN computation.
    q_tm1_seq = []
    q_t_seq = []
    q_t_selector_seq = []

    # Reset LSTM state at start of update, as in R-DQN random updates.
    c_tm1 = tf.Variable(tf.zeros((batch_size, agent_size)), trainable=True)
    h_tm1 = tf.Variable(tf.zeros((batch_size, agent_size)), trainable=True)
    h_t_sel = tf.Variable(tf.zeros((batch_size, agent_size)), trainable=True)
    c_t_sel = tf.Variable(tf.zeros((batch_size, agent_size)), trainable=True)
    h_t = tf.Variable(tf.zeros((batch_size, agent_size)), trainable=False)  # td_targets
    c_t = tf.Variable(tf.zeros((batch_size, agent_size)), trainable=False)  # td_targets

    rnn_unroll_start = time.time()
    # RNN unrolling.
    for seq_idx in tf.range(window_size):
        s_tm1 = kpts_features_tm1[:, seq_idx, :]
        s_t = kpts_features_t[:, seq_idx, :]

        # double_q action selection step.
        if double_q:
            q_t_selector, h_t_sel, c_t_sel = agent_model_dict["agent_net"](
                s_t, [h_t_sel, c_t_sel], training=True)
            q_t_selector_seq.append(q_t_selector)

        q_tm1, h_tm1, c_tm1 = agent_model_dict["agent_net"](
            s_tm1, [h_tm1, c_tm1], training=True)
        q_tm1_seq.append(q_tm1)

        q_t, h_t, c_t = target_agent_model_dict["agent_net"](
            s_t, [h_t, c_t], training=False)
        q_t_seq.append(q_t)
    # print("RNN for loop unrolling took %s" % (time.time() - rnn_unroll_start))

    q_tm1 = tf.convert_to_tensor(q_tm1_seq, dtype=tf.float32)
    q_t = tf.convert_to_tensor(q_t_seq, dtype=tf.float32)

    # Compute cumulative reward over 'n' steps.
    if n_step_q > 1:
        l = tf.constant(np.array(list(range(n_step_q))), dtype=tf.float32)
        discounts = tf.math.pow(gamma, l)

        # Slice r_t [b, T] into moving windows of [b, t-k, k] and cumsum over k steps.
        r_t = tf.transpose(r_t, perm=[1, 0, 2])
        r_t_sliced = tf.convert_to_tensor(
            [r_t[t:t + n_step_q, :, :] for t in range(window_size)], dtype=tf.float32)
        r_t_sliced = tf.squeeze(tf.transpose(r_t_sliced, perm=[0, 2, 1, 3]))
        r_t_sl_shape = r_t_sliced.shape

        # Reshape (batch, T, n) -> (batch*T, n).
        r_t_sliced = tf.reshape(
            r_t_sliced, [r_t_sl_shape[0] * r_t_sl_shape[1], r_t_sl_shape[2]])

        # r_t_sliced [T*batch, n_steps] x discounts [n_steps, 1]
        r_t = tf.linalg.matvec(r_t_sliced, discounts)
        r_t = tf.reshape(r_t, [r_t_sl_shape[0], r_t_sl_shape[1]])

    # Reshape again to make tensors compatible with the trfl API.
    q_tm1_shape = q_tm1.shape
    q_tm1 = tf.reshape(q_tm1, [q_tm1_shape[0] * q_tm1_shape[1], q_tm1_shape[2]])
    q_t = tf.reshape(q_t, [q_tm1_shape[0] * q_tm1_shape[1], q_tm1_shape[2]])

    a_tm1_shape = a_tm1.shape
    a_tm1 = tf.squeeze(
        tf.reshape(a_tm1, [a_tm1_shape[0] * a_tm1_shape[1], a_tm1_shape[2]]))

    r_t_shape = r_t.shape
    r_t = tf.reshape(r_t, [r_t_shape[0] * r_t_shape[1]])

    dones_shape = dones.shape
    dones = tf.reshape(dones, [dones_shape[0] * dones_shape[1]])

    p_cont = 0.0
    if n_step_q == 1:
        # Discount factor (at t=1) for the bootstrapped value.
        p_cont = tf.math.multiply(tf.ones((dones.shape)) - dones, gamma)
    elif n_step_q > 1:
        # Discount factor (at t=n+1) for the bootstrapped value.
        p_cont = tf.math.multiply(
            tf.ones((dones.shape)) - dones, tf.math.pow(gamma, n_step_q))

    loss, extra = 0.0, None
    if not double_q:
        loss, extra = trfl.qlearning(q_tm1, a_tm1, r_t, p_cont, q_t)
    elif double_q:
        q_t_selector = tf.convert_to_tensor(q_t_selector_seq, dtype=tf.float32)
        q_t_selector = tf.reshape(
            q_t_selector, [q_tm1_shape[0] * q_tm1_shape[1], q_tm1_shape[2]])
        loss, extra = trfl.double_qlearning(q_tm1, a_tm1, r_t, p_cont, q_t, q_t_selector)

    # Average over batch_dim = (batch*time).
    loss = tf.reduce_mean(loss, axis=0)
    # print("Inside q_learning bellman updates took %4.5f" % (time.time() - q_backup_start))

    return loss, extra
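# Hedged sketch (an assumption, not from the original source) of how the loss
# returned by q_learning above might be used in a TF2 eager training step.
# `optimizer` and the choice of trainable variables are hypothetical; the
# q_learning arguments reuse the names from the function signature above.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

with tf.GradientTape() as tape:
    loss, extra = q_learning(
        vision_model_dict, agent_model_dict, target_agent_model_dict, inputs,
        batch_size, kp_type, agent_size, mask_threshold, patch_sizes,
        kpt_encoder_type, mp_steps, img_size, lsp_layers, window_size, gamma,
        double_q, n_step_q)

# Gradients w.r.t. the online agent network only, as one plausible choice.
trainable_vars = agent_model_dict["agent_net"].trainable_variables
grads = tape.gradient(loss, trainable_vars)
optimizer.apply_gradients(zip(grads, trainable_vars))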