def build_graph(self, graph):
    self.env.seed(self.random_seed)
    np.random.seed(self.random_seed)
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        # Graph of the LSTM model of the world
        input_scope = tf.VariableScope(reuse=False, name="inputs")
        with tf.variable_scope(input_scope):
            self.state_input_plh = tf.placeholder(
                tf.float32,
                shape=[None, None, self.m_params['env_state_size']],
                name='state_input_plh')
            self.action_input_plh = tf.placeholder(
                tf.int32, shape=[None, None, 1], name='action_input_plh')
            self.mask_plh = tf.placeholder(
                tf.float32, shape=[None, None, 1], name="mask_plh")

            input_shape = tf.shape(self.state_input_plh)
            dynamic_batch_size, dynamic_num_steps = input_shape[0], input_shape[1]

            action_input = tf.one_hot(
                indices=tf.squeeze(self.action_input_plh, 2),
                depth=self.m_params['nb_actions'])
            m_inputs = tf.concat(
                [self.state_input_plh, action_input], 2, name="m_inputs")

        m_scope = tf.VariableScope(reuse=False, name="m")
        with tf.variable_scope(m_scope):
            self.state_reward_preds, self.m_final_state, self.m_initial_state = capacities.predictive_model(
                self.m_params, m_inputs, dynamic_batch_size, None,
                summary_collections=[self.M_SUMMARIES])

        fixed_m_scope = tf.VariableScope(reuse=False, name='FixedM')
        with tf.variable_scope(fixed_m_scope):
            self.update_m_fixed_vars_op = capacities.fix_scope(m_scope)

        m_training_scope = tf.VariableScope(reuse=False, name='m_training')
        with tf.variable_scope(m_training_scope):
            self.m_next_states = tf.placeholder(
                tf.float32,
                shape=[None, None, self.m_params['env_state_size']],
                name="m_next_states")
            self.m_rewards = tf.placeholder(
                tf.float32, shape=[None, None, 1], name="m_rewards")
            y_true = tf.concat([self.m_rewards, self.m_next_states], 2)

            with tf.control_dependencies([self.state_reward_preds]):
                # Masked MSE between predicted (reward, next state) and the observed ones
                self.m_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.state_reward_preds - y_true) * self.mask_plh)
            tf.summary.scalar('m_loss', self.m_loss, collections=[self.M_SUMMARIES])

            m_adam = tf.train.AdamOptimizer(self.m_params['lr'])
            self.m_global_step = tf.Variable(0, trainable=False, name="m_global_step")
            tf.summary.scalar('m_global_step', self.m_global_step,
                              collections=[self.M_SUMMARIES])
            self.m_train_op = m_adam.minimize(
                self.m_loss, global_step=self.m_global_step)

        self.all_m_summary_t = tf.summary.merge_all(key=self.M_SUMMARIES)

        # Graph of the controller
        c_scope = tf.VariableScope(reuse=False, name="c")
        c_summary_collection = [self.C_SUMMARIES]
        with tf.variable_scope(c_scope):
            # c_cell = LSTMCell(
            #     num_units=self.c_params['nb_units'],
            #     initializer=tf.truncated_normal_initializer(
            #         mean=self.c_params['initial_mean'],
            #         stddev=self.c_params['initial_stddev']))
            # self.c_initial_state = c_cell.zero_state(dynamic_batch_size, dtype=tf.float32)
            # c_c_h_states, self.c_final_state = tf.nn.dynamic_rnn(
            #     c_cell, self.state_input_plh, initial_state=self.c_initial_state)
            # c_c_states, c_h_states = tf.split(
            #     value=c_c_h_states,
            #     num_or_size_splits=[self.c_params['nb_units'], self.c_params['nb_units']],
            #     axis=2)
            # # Compute the Controller projection
            # self.probs_t, self.actions_t = projection_func(c_h_states)

            m_params = self.m_params
            model_func = lambda m_inputs, m_state: capacities.predictive_model(
                m_params, m_inputs, dynamic_batch_size, m_state)
            c_params = self.c_params
            projection_func = lambda inputs: capacities.projection(c_params, inputs)
            cm_cell = CMCell(
                num_units=self.c_params['nb_units'],
                m_units=self.m_params['nb_units'],
                fixed_model_scope=fixed_m_scope,
                model_func=model_func,
                projection_func=projection_func,
                num_proj=self.c_params['nb_actions'],
                initializer=tf.truncated_normal_initializer(
                    mean=self.c_params['initial_mean'],
                    stddev=self.c_params['initial_stddev']))
            self.cm_initial_state = cm_cell.zero_state(dynamic_batch_size, dtype=tf.float32)
            probs_and_actions_t, self.cm_final_state = tf.nn.dynamic_rnn(
                cm_cell, self.state_input_plh, initial_state=self.cm_initial_state)
            self.probs_t, actions_t = tf.split(
                value=probs_and_actions_t,
                num_or_size_splits=[self.c_params['nb_actions'], 1],
                axis=2)
            self.actions_t = tf.cast(actions_t, tf.int32)

            # Helper tensor used for inference
            self.action_t = self.actions_t[0, 0, 0]

        c_training_scope = tf.VariableScope(reuse=False, name='c_training')
        with tf.variable_scope(c_training_scope):
            self.c_rewards_plh = tf.placeholder(
                tf.float32, shape=[None, None, 1], name="c_rewards_plh")
            baseline = tf.reduce_mean(self.c_rewards_plh)

            batch_size, num_steps = tf.shape(self.actions_t)[0], tf.shape(self.actions_t)[1]
            line_indices = tf.matmul(  # Line indices
                tf.reshape(tf.range(0, batch_size), [-1, 1]),
                tf.ones([1, num_steps], dtype=tf.int32))
            column_indices = tf.matmul(  # Column indices
                tf.ones([batch_size, 1], dtype=tf.int32),
                tf.reshape(tf.range(0, num_steps), [1, -1]))
            depth_indices = tf.squeeze(self.actions_t, 2)
            stacked_actions = tf.stack(
                [line_indices, column_indices, depth_indices], 2)

            with tf.control_dependencies([self.probs_t]):
                log_probs = tf.expand_dims(
                    tf.log(tf.gather_nd(self.probs_t, stacked_actions)), 2)
                masked_log_probs = log_probs * self.mask_plh
                # Score-function (REINFORCE) loss with the batch-average reward as baseline
                self.c_loss = tf.reduce_mean(-tf.reduce_sum(
                    masked_log_probs * (self.c_rewards_plh - baseline), 1))
            tf.summary.scalar('c_loss', self.c_loss, collections=c_summary_collection)

            c_adam = tf.train.AdamOptimizer(self.c_params['lr'])
            self.c_global_step = tf.Variable(
                0, trainable=False, name="global_step",
                collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES],
                dtype=tf.int32)
            tf.summary.scalar('c_global_step', self.c_global_step,
                              collections=c_summary_collection)
            self.c_train_op = c_adam.minimize(
                self.c_loss, global_step=self.c_global_step)

        self.all_c_summary_t = tf.summary.merge_all(key=self.C_SUMMARIES)

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")
        self.episode_id_sum = tf.summary.scalar('episode_id', self.episode_id)
        self.time, self.inc_time_op = capacities.counter("time")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
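# The controller loss above relies on building [batch, time, action] index triples
# with tf.stack so that tf.gather_nd can pick each taken action's probability out of
# the [batch, time, nb_actions] probability tensor. A minimal, standalone sketch of
# that indexing trick (assuming TensorFlow 1.x; shapes and values are made up here):
import numpy as np
import tensorflow as tf

probs = tf.placeholder(tf.float32, shape=[None, None, 3], name="probs")    # [B, T, nb_actions]
actions = tf.placeholder(tf.int32, shape=[None, None, 1], name="actions")  # [B, T, 1]

batch_size, num_steps = tf.shape(actions)[0], tf.shape(actions)[1]
line_indices = tf.matmul(tf.reshape(tf.range(0, batch_size), [-1, 1]),
                         tf.ones([1, num_steps], dtype=tf.int32))          # row index per cell
column_indices = tf.matmul(tf.ones([batch_size, 1], dtype=tf.int32),
                           tf.reshape(tf.range(0, num_steps), [1, -1]))    # timestep index per cell
depth_indices = tf.squeeze(actions, 2)                                     # chosen action per cell
stacked = tf.stack([line_indices, column_indices, depth_indices], 2)       # [B, T, 3]
picked_probs = tf.gather_nd(probs, stacked)                                # [B, T]

with tf.Session() as sess:
    out = sess.run(picked_probs, feed_dict={
        probs: np.full((2, 2, 3), 1.0 / 3, dtype=np.float32),
        actions: np.array([[[0], [2]], [[1], [1]]], dtype=np.int32),
    })
    print(out)  # every entry is ~0.333: the probability of the action actually taken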
def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs_plh = tf.placeholder(tf.int32, shape=[None], name="inputs_plh")

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.Qs = tf.get_variable(
                'Qs',
                shape=[self.nb_state, self.action_space.n],
                initializer=tf.constant_initializer(self.initial_q_value),
                dtype=tf.float32)
            tf.summary.histogram('Qarray', self.Qs)
            self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

        fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
        with tf.variable_scope(fixed_q_scope):
            self.update_fixed_vars_op = capacities.fix_scope(q_scope)

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            if 'UCB' in self.config and self.config['UCB']:
                self.actions_t, self.probs_t = capacities.tabular_UCB(
                    self.Qs, self.inputs_plh)
            else:
                self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                    self.inputs_plh, self.q_preds_t, self.nb_state,
                    self.env.action_space.n, self.N0, self.min_eps)
            self.action_t = self.actions_t[0]
            self.q_value_t = self.q_preds_t[0][self.action_t]

        # Experience replay part
        with tf.variable_scope('Learning'):
            with tf.variable_scope(fixed_q_scope, reuse=True):
                fixed_Qs = tf.get_variable('Qs')

            self.rewards_plh = tf.placeholder(tf.float32, shape=[None], name="rewards_plh")
            self.next_states_plh = tf.placeholder(tf.int32, shape=[None], name="next_states_plh")

            # Note that we use the fixed Qs to create the targets
            self.targets_t = capacities.get_q_learning_target(
                fixed_Qs, self.rewards_plh, self.next_states_plh, self.discount)
            self.loss, self.train_op = capacities.tabular_learning_with_lr(
                self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh,
                self.actions_t, self.targets_t)

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")
        self.event_count, self.inc_event_count_op = capacities.counter("event_count")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
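# capacities.fix_scope and capacities.get_q_learning_target are project-specific
# helpers not shown here. As a rough, standalone illustration of the underlying
# pattern only (a frozen copy of the Q-table used to build targets of the form
# r + gamma * max_a Q'(s', a); TensorFlow 1.x assumed, all names hypothetical):
import tensorflow as tf

nb_state, nb_action, gamma = 4, 2, 0.99
with tf.variable_scope('QValues'):
    Qs = tf.get_variable('Qs', shape=[nb_state, nb_action],
                         initializer=tf.zeros_initializer(), dtype=tf.float32)
with tf.variable_scope('FixedQValues'):
    fixed_Qs = tf.get_variable('Qs', shape=[nb_state, nb_action],
                               trainable=False, dtype=tf.float32)
# Op that refreshes the frozen copy from the learned table
update_fixed_op = tf.assign(fixed_Qs, Qs)

rewards = tf.placeholder(tf.float32, shape=[None], name='rewards')
next_states = tf.placeholder(tf.int32, shape=[None], name='next_states')
# Targets are computed against the frozen table, so they stay put while Qs is updated
targets = rewards + gamma * tf.reduce_max(tf.gather(fixed_Qs, next_states), axis=1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_fixed_op)
    print(sess.run(targets, feed_dict={rewards: [1.0], next_states: [2]}))  # [1.0]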
def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs = tf.placeholder(
            tf.float32,
            shape=[None, self.observation_space.shape[0] + 1],
            name='inputs')

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.q_values = tf.squeeze(capacities.value_f(self.q_params, self.inputs))
            self.action_t = capacities.eps_greedy(
                self.inputs, self.q_values, self.env.action_space.n, self.N0, self.min_eps)
            self.q_t = self.q_values[self.action_t]

        fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
        with tf.variable_scope(fixed_q_scope):
            self.update_fixed_vars_op = capacities.fix_scope(q_scope)

        with tf.variable_scope('ExperienceReplay'):
            self.er_inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name="ERInputs")
            self.er_actions = tf.placeholder(tf.int32, shape=[None], name="ERActions")
            self.er_rewards = tf.placeholder(tf.float32, shape=[None], name="ERReward")
            self.er_next_states = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name="ERNextState")

            with tf.variable_scope(q_scope, reuse=True):
                er_q_values = capacities.value_f(self.q_params, self.er_inputs)
            er_stacked_actions = tf.stack(
                [tf.range(0, tf.shape(self.er_actions)[0]), self.er_actions], 1)
            er_qs = tf.gather_nd(er_q_values, er_stacked_actions)

            with tf.variable_scope(fixed_q_scope, reuse=True):
                er_fixed_next_q_values = capacities.value_f(self.q_params, self.er_next_states)
            with tf.variable_scope(q_scope, reuse=True):
                er_next_q_values = capacities.value_f(self.q_params, self.er_next_states)
            # Double Q-learning style target: the next action is selected with the
            # online network and evaluated with the fixed (target) network
            er_next_max_action_t = tf.cast(tf.argmax(er_next_q_values, 1), tf.int32)
            er_next_stacked_actions = tf.stack(
                [tf.range(0, tf.shape(self.er_next_states)[0]), er_next_max_action_t], 1)
            er_next_qs = tf.gather_nd(er_fixed_next_q_values, er_next_stacked_actions)

            er_target_qs1 = tf.stop_gradient(self.er_rewards + self.discount * er_next_qs)
            er_target_qs2 = self.er_rewards
            er_stacked_targets = tf.stack([er_target_qs1, er_target_qs2], 1)
            # The last state column is used as a 0/1 flag to pick either the
            # bootstrapped target or the reward-only target per transition
            select_targets = tf.stack(
                [tf.range(0, tf.shape(self.er_next_states)[0]),
                 tf.cast(self.er_next_states[:, -1], tf.int32)], 1)
            er_target_qs = tf.gather_nd(er_stacked_targets, select_targets)

            self.er_loss = 0.5 * tf.reduce_sum(tf.square(er_target_qs - er_qs))

            er_adam = tf.train.AdamOptimizer(self.lr)
            self.global_step = tf.Variable(
                0, trainable=False, name="global_step",
                collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
            self.er_train_op = er_adam.minimize(self.er_loss, global_step=self.global_step)

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")
        self.timestep, self.inc_timestep_op = capacities.counter("timestep")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
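# The ExperienceReplay block above stacks the bootstrapped target
# r + gamma * Q'(s', a*) with the reward-only target r and selects one of the two
# per row via what looks like a terminal flag stored in the last state column.
# A standalone sketch of that selection trick with toy numbers (TensorFlow 1.x assumed):
import tensorflow as tf

rewards = tf.placeholder(tf.float32, shape=[None], name='rewards')
next_qs = tf.placeholder(tf.float32, shape=[None], name='next_qs')
done_flags = tf.placeholder(tf.float32, shape=[None], name='done_flags')  # 0.0 or 1.0
discount = 0.9

target_qs1 = tf.stop_gradient(rewards + discount * next_qs)  # non-terminal target
target_qs2 = rewards                                         # terminal target
stacked_targets = tf.stack([target_qs1, target_qs2], 1)      # [batch, 2]
select = tf.stack([tf.range(0, tf.shape(rewards)[0]), tf.cast(done_flags, tf.int32)], 1)
target_qs = tf.gather_nd(stacked_targets, select)            # picks column 0 or 1 per row

with tf.Session() as sess:
    print(sess.run(target_qs, feed_dict={
        rewards: [1.0, 1.0], next_qs: [2.0, 2.0], done_flags: [0.0, 1.0],
    }))  # [2.8, 1.0]: bootstrap for the first transition, plain reward for the terminal one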
def build_graph(self, graph):
    np.random.seed(self.random_seed)
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs = tf.placeholder(
            tf.float32,
            shape=[None, None, self.policy_params['nb_inputs']],
            name='inputs')
        input_shape = tf.shape(self.inputs)
        dynamic_batch_size, dynamic_num_steps = input_shape[0], input_shape[1]
        inputs_mat = tf.reshape(self.inputs, [-1, self.policy_params['nb_inputs']])

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            probs, actions = capacities.policy(self.policy_params, inputs_mat)
            self.probs = tf.reshape(
                probs,
                [dynamic_batch_size, dynamic_num_steps, self.policy_params['nb_outputs']])
            self.actions = tf.reshape(actions, [dynamic_batch_size, dynamic_num_steps, 1])
            self.action_t = self.actions[0, 0, 0]

        critic_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(critic_scope):
            critic_values_mat = capacities.value_f(self.critic_params, inputs_mat)
            self.critic_values = tf.reshape(
                critic_values_mat,
                [dynamic_batch_size, dynamic_num_steps, self.critic_params['nb_outputs']])

        fixed_critic_scope = tf.VariableScope(reuse=False, name='FixedQValues')
        with tf.variable_scope(fixed_critic_scope):
            self.update_fixed_vars_op = capacities.fix_scope(critic_scope)

        with tf.variable_scope('Training'):
            self.expected_rewards = tf.placeholder(
                tf.float32, shape=[None, None, 1], name="reward")
            self.mask_plh = tf.placeholder(
                tf.float32, shape=[None, None, 1], name="mask_plh")

            batch_size, num_steps = tf.shape(self.actions)[0], tf.shape(self.actions)[1]
            line_indices = tf.matmul(  # Line indices
                tf.reshape(tf.range(0, batch_size), [-1, 1]),
                tf.ones([1, num_steps], dtype=tf.int32))
            column_indices = tf.matmul(  # Column indices
                tf.ones([batch_size, 1], dtype=tf.int32),
                tf.reshape(tf.range(0, num_steps), [1, -1]))
            depth_indices = tf.cast(tf.squeeze(self.actions, 2), tf.int32)
            stacked_actions = tf.stack(
                [line_indices, column_indices, depth_indices], 2)

            log_probs = tf.expand_dims(
                tf.log(tf.gather_nd(self.probs, stacked_actions)), 2)
            # Policy gradient with the (stop-gradient) critic value as baseline
            self.policy_loss = tf.reduce_mean(-tf.reduce_sum(
                (log_probs * (self.expected_rewards - tf.stop_gradient(self.critic_values)))
                * self.mask_plh, 1))

            adam = tf.train.AdamOptimizer(self.lr)
            self.train_policy_op = adam.minimize(self.policy_loss)

            self.rewards = tf.placeholder(tf.float32, shape=[None, None, 1], name="reward")
            self.next_states = tf.placeholder(
                tf.float32,
                shape=[None, None, self.critic_params['nb_inputs']],
                name="next_states")
            with tf.variable_scope(fixed_critic_scope, reuse=True):
                next_states_mat = tf.reshape(
                    self.next_states, [-1, self.critic_params['nb_inputs']])
                next_critic_values_mat = capacities.value_f(self.critic_params, next_states_mat)
                next_critic_values = tf.reshape(
                    next_critic_values_mat,
                    [dynamic_batch_size, dynamic_num_steps, self.critic_params['nb_outputs']])

            target_critics1 = tf.stop_gradient(self.rewards + self.discount * next_critic_values)
            target_critics2 = self.rewards
            stacked_targets = tf.stack(
                [tf.squeeze(target_critics1, 2), tf.squeeze(target_critics2, 2)], 2)

            # Same row/column/flag indexing trick as above: the last feature of
            # next_states selects the bootstrapped or reward-only critic target
            batch_size, num_steps = tf.shape(self.next_states)[0], tf.shape(self.next_states)[1]
            line_indices = tf.matmul(  # Line indices
                tf.reshape(tf.range(0, batch_size), [-1, 1]),
                tf.ones([1, num_steps], dtype=tf.int32))
            column_indices = tf.matmul(  # Column indices
                tf.ones([batch_size, 1], dtype=tf.int32),
                tf.reshape(tf.range(0, num_steps), [1, -1]))
            depth_indices = tf.cast(self.next_states[:, :, -1], tf.int32)
            select_targets = tf.stack(
                [line_indices, column_indices, depth_indices], 2)

            target_critics = tf.expand_dims(tf.gather_nd(stacked_targets, select_targets), 2)
            self.critic_loss = 0.5 * tf.reduce_sum(
                tf.square(target_critics - self.critic_values) * self.mask_plh)

            adam = tf.train.AdamOptimizer(self.critic_lr)
            self.global_step = tf.Variable(
                0, trainable=False, name="global_step",
                collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
            self.train_critic_op = adam.minimize(self.critic_loss, global_step=self.global_step)

        self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
        self.policy_loss_sum_t = tf.summary.scalar('policy_loss', self.policy_loss_plh)
        self.critic_loss_plh = tf.placeholder(tf.float32, shape=[])
        self.critic_loss_sum_t = tf.summary.scalar('critic_loss', self.critic_loss_plh)
        # self.loss_plh = tf.placeholder(tf.float32, shape=[])
        # self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('av_score', self.score_plh)

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
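# The policy update above is a masked score-function gradient with the critic as a
# baseline: loss = -sum_t log pi(a_t|s_t) * (R_t - V(s_t)), with stop_gradient on the
# critic so the baseline does not receive policy gradients. A standalone sketch on a
# single toy sequence, mirroring that structure with placeholders (TensorFlow 1.x
# assumed; the numbers are made up):
import numpy as np
import tensorflow as tf

log_probs = tf.placeholder(tf.float32, shape=[None, None, 1])        # log pi(a_t | s_t)
expected_rewards = tf.placeholder(tf.float32, shape=[None, None, 1]) # returns R_t
critic_values = tf.placeholder(tf.float32, shape=[None, None, 1])    # V(s_t), the baseline
mask = tf.placeholder(tf.float32, shape=[None, None, 1])             # 1 for real steps, 0 for padding

policy_loss = tf.reduce_mean(-tf.reduce_sum(
    log_probs * (expected_rewards - tf.stop_gradient(critic_values)) * mask, 1))

with tf.Session() as sess:
    loss = sess.run(policy_loss, feed_dict={
        log_probs: np.log([[[0.5], [0.5], [0.5]]]),
        expected_rewards: [[[2.0], [1.0], [0.0]]],
        critic_values: [[[1.0], [1.0], [1.0]]],
        mask: [[[1.0], [1.0], [0.0]]],  # last step is padding and does not contribute
    })
    print(loss)  # -(log 0.5 * (2 - 1) + log 0.5 * (1 - 1)) ~= 0.693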