def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs_plh = tf.placeholder(tf.int32, shape=[None], name="inputs_plh")

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.Qs = tf.get_variable(
                'Qs',
                shape=[self.nb_state, self.action_space.n],
                initializer=tf.constant_initializer(self.initial_q_value),
                dtype=tf.float32)
            tf.summary.histogram('Qarray', self.Qs)
            self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            if 'UCB' in self.config and self.config['UCB']:
                self.actions_t, self.probs_t = capacities.tabular_UCB(
                    self.Qs, self.inputs_plh)
            else:
                self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                    self.inputs_plh, self.q_preds_t, self.nb_state,
                    self.env.action_space.n, self.N0, self.min_eps)
            self.action_t = self.actions_t[0]
            self.q_value_t = self.q_preds_t[0][self.action_t]

        learning_scope = tf.VariableScope(reuse=False, name='Learning')
        with tf.variable_scope(learning_scope):
            self.rewards_plh = tf.placeholder(tf.float32, shape=[None], name="rewards_plh")
            self.next_states_plh = tf.placeholder(tf.int32, shape=[None], name="next_states_plh")
            self.next_actions_plh = tf.placeholder(tf.int32, shape=[None], name="next_actions_plh")

            targets_t = capacities.get_td_target(
                self.Qs, self.rewards_plh, self.next_states_plh,
                self.next_actions_plh, self.discount)
            # When bootstrapping, the target is non-stationary, so we need a learning rate
            self.loss, self.train_op = capacities.tabular_learning_with_lr(
                self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh,
                self.actions_t, targets_t)

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
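# The helpers used above (capacities.get_td_target, capacities.tabular_learning_with_lr, ...)
# belong to the author's `capacities` module, whose implementation is not shown here.
# As a rough illustration only, the SARSA-style TD target this graph bootstraps from
# amounts to the following NumPy computation. `sarsa_td_target_sketch` is a
# hypothetical name for this sketch, not the actual capacities code.
import numpy as np

def sarsa_td_target_sketch(Qs, rewards, next_states, next_actions, discount):
    """target_t = r_t + discount * Q[s_{t+1}, a_{t+1}] for each transition."""
    Qs = np.asarray(Qs, dtype=np.float32)
    rewards = np.asarray(rewards, dtype=np.float32)
    return rewards + discount * Qs[next_states, next_actions]

# Example: Q table [[0., 0.], [2., 0.]], one transition with reward 1.0 to state 1
# where action 0 is taken next, discount 0.9 -> target = 1.0 + 0.9 * Q[1, 0] = 2.8.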
def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs_plh = tf.placeholder(tf.int32, shape=[None], name="inputs_plh")

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.Qs = tf.get_variable(
                'Qs',
                shape=[self.nb_state, self.action_space.n],
                initializer=tf.constant_initializer(self.initial_q_value),
                dtype=tf.float32)
            tf.summary.histogram('Qarray', self.Qs)
            self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

        fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
        with tf.variable_scope(fixed_q_scope):
            self.update_fixed_vars_op = capacities.fix_scope(q_scope)

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            if 'UCB' in self.config and self.config['UCB']:
                self.actions_t, self.probs_t = capacities.tabular_UCB(
                    self.Qs, self.inputs_plh)
            else:
                self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                    self.inputs_plh, self.q_preds_t, self.nb_state,
                    self.env.action_space.n, self.N0, self.min_eps)
            self.action_t = self.actions_t[0]
            self.q_value_t = self.q_preds_t[0][self.action_t]

        # Experience replay part
        with tf.variable_scope('Learning'):
            with tf.variable_scope(fixed_q_scope, reuse=True):
                fixed_Qs = tf.get_variable('Qs')

            self.rewards_plh = tf.placeholder(tf.float32, shape=[None], name="rewards_plh")
            self.next_states_plh = tf.placeholder(tf.int32, shape=[None], name="next_states_plh")

            # Note that we use the fixed Qs to create the targets
            self.targets_t = capacities.get_q_learning_target(
                fixed_Qs, self.rewards_plh, self.next_states_plh, self.discount)
            self.loss, self.train_op = capacities.tabular_learning_with_lr(
                self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh,
                self.actions_t, self.targets_t)

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")
        self.event_count, self.inc_event_count_op = capacities.counter("event_count")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
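# `capacities.fix_scope` and `capacities.get_q_learning_target` are also part of the
# author's capacities module. Judging from how they are wired above, fixing the scope
# presumably copies the trained Q table into non-trained "fixed" variables, and the
# Q-learning target then bootstraps from that frozen copy:
# target_t = r_t + discount * max_a fixed_Q[s_{t+1}, a].
# A hypothetical NumPy sketch of that target (an assumption, not the actual helper):
import numpy as np

def q_learning_target_sketch(fixed_Qs, rewards, next_states, discount):
    """Off-policy target built from a frozen copy of the Q table."""
    fixed_Qs = np.asarray(fixed_Qs, dtype=np.float32)
    rewards = np.asarray(rewards, dtype=np.float32)
    return rewards + discount * fixed_Qs[next_states].max(axis=1)

# Bootstrapping from a frozen copy keeps the target stable between runs of
# update_fixed_vars_op, which is the usual motivation for fixed Q targets.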
def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs_plh = tf.placeholder(tf.int32, shape=[None], name="inputs_plh")

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.Qs = tf.get_variable(
                'Qs',
                shape=[self.nb_state, self.action_space.n],
                initializer=tf.constant_initializer(self.initial_q_value),
                dtype=tf.float32)
            tf.summary.histogram('Qarray', self.Qs)
            self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            if 'UCB' in self.config and self.config['UCB']:
                self.actions_t, self.probs_t = capacities.tabular_UCB(
                    self.Qs, self.inputs_plh)
            else:
                self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                    self.inputs_plh, self.q_preds_t, self.nb_state,
                    self.env.action_space.n, self.N0, self.min_eps)
            self.action_t = self.actions_t[0]
            self.q_value_t = self.q_preds_t[0][self.action_t]

        et_scope = tf.VariableScope(reuse=False, name='EligibilityTraces')
        with tf.variable_scope(et_scope):
            et, update_et_op, self.reset_et_op = capacities.eligibility_traces(
                self.Qs, self.inputs_plh, self.actions_t, self.discount,
                self.lambda_value)

        with tf.variable_scope('Learning'):
            self.rewards_plh = tf.placeholder(tf.float32, shape=[None], name="rewards_plh")
            self.next_states_plh = tf.placeholder(tf.int32, shape=[None], name="next_states_plh")

            self.targets_t = capacities.get_q_learning_target(
                self.Qs, self.rewards_plh, self.next_states_plh, self.discount)
            target = self.targets_t[0]

            state_action_pairs = tf.stack([self.inputs_plh, self.actions_t], 1)
            estimate = tf.gather_nd(self.Qs, state_action_pairs)[0]
            err_estimate = target - estimate

            global_step = tf.Variable(
                0, trainable=False, name="global_step",
                collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
            lr = tf.train.exponential_decay(
                tf.constant(self.lr, dtype=tf.float32), global_step,
                self.lr_decay_steps, 0.5, staircase=True)
            tf.summary.scalar('lr', lr)
            inc_global_step = global_step.assign_add(1)
            with tf.control_dependencies([update_et_op, inc_global_step]):
                self.loss = tf.reduce_sum(err_estimate * et)
                self.train_op = tf.assign_add(self.Qs, lr * err_estimate * et)

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
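# `capacities.eligibility_traces` is not shown here; from the way `et` is multiplied
# with the TD error and added into Qs via tf.assign_add, it presumably maintains a
# per-(state, action) trace table the same shape as Qs, together with ops to update
# and reset it. The train_op above then applies a backward-view Q(lambda) step:
# Qs += lr * td_error * et. A hypothetical NumPy sketch of one such step with
# accumulating traces (assumptions about the helper, not its actual code):
import numpy as np

def q_lambda_step_sketch(Qs, et, s, a, td_error, lr, discount, lambda_value):
    """Decay all traces, bump the visited (s, a) trace, then update every entry."""
    et = discount * lambda_value * et   # decay existing traces
    et[s, a] += 1.0                     # accumulating trace for the visited pair
    Qs = Qs + lr * td_error * et        # credit all recently visited pairs at once
    return Qs, et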