def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs_plh = tf.placeholder(tf.int32, shape=[None], name="inputs_plh")

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.Qs = tf.get_variable(
                'Qs',
                shape=[self.nb_state, self.action_space.n],
                initializer=tf.constant_initializer(self.initial_q_value),
                dtype=tf.float32,
            )
            tf.summary.histogram('Qarray', self.Qs)
            self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            if 'UCB' in self.config and self.config['UCB']:
                self.actions_t, self.probs_t = capacities.tabular_UCB(
                    self.Qs, self.inputs_plh
                )
            else:
                self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                    self.inputs_plh, self.q_preds_t, self.nb_state,
                    self.env.action_space.n, self.N0, self.min_eps
                )
            self.action_t = self.actions_t[0]
            self.q_value_t = self.q_preds_t[0][self.action_t]

        learning_scope = tf.VariableScope(reuse=False, name='Learning')
        with tf.variable_scope(learning_scope):
            self.rewards_plh = tf.placeholder(tf.float32, shape=[None], name="rewards_plh")
            self.next_states_plh = tf.placeholder(tf.int32, shape=[None], name="next_states_plh")
            self.next_actions_plh = tf.placeholder(tf.int32, shape=[None], name="next_actions_plh")

            targets_t = capacities.get_td_target(
                self.Qs, self.rewards_plh, self.next_states_plh,
                self.next_actions_plh, self.discount
            )
            # When bootstrapping, the target is non-stationary, so we need a learning rate
            self.loss, self.train_op = capacities.tabular_learning_with_lr(
                self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh,
                self.actions_t, targets_t
            )

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

        return graph

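# Illustrative sketch (an assumption, not the actual helper from `capacities`):
# get_td_target is presumably the on-policy SARSA target
# r + discount * Q[s', a'], built from the fed next states and next actions.
# A minimal version, only meant to clarify what the Learning scope above feeds
# into tabular_learning_with_lr, could look like this:
import tensorflow as tf

def get_td_target_sketch(Qs, rewards_t, next_states_t, next_actions_t, discount):
    # Look up Q[s', a'] for each transition in the batch
    next_indices = tf.stack([next_states_t, next_actions_t], axis=1)
    next_q_values = tf.gather_nd(Qs, next_indices)
    # Bootstrapped target; stop_gradient keeps gradients from flowing through the target
    return tf.stop_gradient(rewards_t + discount * next_q_values)
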
def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs_plh = tf.placeholder(tf.int32, shape=[None], name="inputs_plh")

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.Qs = tf.get_variable(
                'Qs',
                shape=[self.nb_state, self.action_space.n],
                initializer=tf.constant_initializer(self.initial_q_value),
                dtype=tf.float32,
            )
            tf.summary.histogram('Qarray', self.Qs)
            self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

        # Fixed copy of the Q values, refreshed via update_fixed_vars_op, used for stable targets
        fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
        with tf.variable_scope(fixed_q_scope):
            self.update_fixed_vars_op = capacities.fix_scope(q_scope)

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            if 'UCB' in self.config and self.config['UCB']:
                self.actions_t, self.probs_t = capacities.tabular_UCB(
                    self.Qs, self.inputs_plh
                )
            else:
                self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                    self.inputs_plh, self.q_preds_t, self.nb_state,
                    self.env.action_space.n, self.N0, self.min_eps
                )
            self.action_t = self.actions_t[0]
            self.q_value_t = self.q_preds_t[0][self.action_t]

        # Experience replay part
        with tf.variable_scope('Learning'):
            with tf.variable_scope(fixed_q_scope, reuse=True):
                fixed_Qs = tf.get_variable('Qs')

            self.rewards_plh = tf.placeholder(tf.float32, shape=[None], name="rewards_plh")
            self.next_states_plh = tf.placeholder(tf.int32, shape=[None], name="next_states_plh")

            # Note that we use the fixed Qs to create the targets
            self.targets_t = capacities.get_q_learning_target(
                fixed_Qs, self.rewards_plh, self.next_states_plh, self.discount
            )
            self.loss, self.train_op = capacities.tabular_learning_with_lr(
                self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh,
                self.actions_t, self.targets_t
            )

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")
        self.event_count, self.inc_event_count_op = capacities.counter("event_count")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

        return graph

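# Illustrative sketch (an assumption, not the actual helper from `capacities`):
# get_q_learning_target presumably builds the off-policy Q-learning target
# r + discount * max_a Q_fixed[s', a] from the frozen copy of the Q table, which
# is what keeps the targets stable between calls to update_fixed_vars_op.
# A minimal version could look like this:
import tensorflow as tf

def get_q_learning_target_sketch(fixed_Qs, rewards_t, next_states_t, discount):
    # Q-values of all actions in each next state, read from the fixed table
    next_q_values = tf.gather(fixed_Qs, next_states_t)   # shape [batch, nb_actions]
    max_next_q = tf.reduce_max(next_q_values, axis=1)     # greedy bootstrap value
    # Treat the target as a constant so gradients only flow through the live Qs
    return tf.stop_gradient(rewards_t + discount * max_next_q)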