Example #1
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs_plh = tf.placeholder(tf.int32, shape=[None], name="inputs_plh")
            
            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
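                # Tabular Q-values: one row per state, one column per action,
                # all initialised to initial_q_value.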
                self.Qs = tf.get_variable(
                    'Qs',
                    shape=[self.nb_state, self.action_space.n],
                    initializer=tf.constant_initializer(self.initial_q_value),
                    dtype=tf.float32)
                tf.summary.histogram('Qarray', self.Qs)
                self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                if 'UCB' in self.config and self.config['UCB']:
                    self.actions_t, self.probs_t = capacities.tabular_UCB(
                        self.Qs, self.inputs_plh)
                else:
                    self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                        self.inputs_plh, self.q_preds_t, self.nb_state,
                        self.env.action_space.n, self.N0, self.min_eps)
                self.action_t = self.actions_t[0]
                self.q_value_t = self.q_preds_t[0][self.action_t]

            learning_scope = tf.VariableScope(reuse=False, name='Learning')
            with tf.variable_scope(learning_scope):
                self.rewards_plh = tf.placeholder(tf.float32, shape=[None], name="rewards_plh")
                self.next_states_plh = tf.placeholder(tf.int32, shape=[None], name="next_states_plh")
                self.next_actions_plh = tf.placeholder(tf.int32, shape=[None], name="next_actions_plh")

                targets_t = capacities.get_td_target(
                    self.Qs, self.rewards_plh, self.next_states_plh,
                    self.next_actions_plh, self.discount)
                # When bootstrapping, the target is non-stationary, so we need a learning rate
                self.loss, self.train_op = capacities.tabular_learning_with_lr(
                    self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh,
                    self.actions_t, targets_t)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

        return graph
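
A minimal usage sketch for the SARSA-style example above. `agent` (an instance of the class this method belongs to) and the gym-style discrete environment `env` are assumptions; only the placeholder and op names come from the code:

import tensorflow as tf

graph = agent.build_graph(tf.Graph())        # `agent` is an assumption
with graph.as_default():
    init_op = tf.global_variables_initializer()

with tf.Session(graph=graph) as sess:
    sess.run(init_op)

    state = env.reset()                      # `env` is an assumption
    action = sess.run(agent.action_t, feed_dict={agent.inputs_plh: [state]})
    next_state, reward, done, _ = env.step(action)
    next_action = sess.run(agent.action_t,
                           feed_dict={agent.inputs_plh: [next_state]})

    # One TD update. Note that actions_t is re-sampled by the policy during
    # this run, so this only illustrates which placeholders the op expects.
    _, loss = sess.run([agent.train_op, agent.loss], feed_dict={
        agent.inputs_plh: [state],
        agent.rewards_plh: [reward],
        agent.next_states_plh: [next_state],
        agent.next_actions_plh: [next_action],
    })
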
Example #2
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs_plh = tf.placeholder(tf.int32,
                                             shape=[None],
                                             name="inputs_plh")

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.Qs = tf.get_variable(
                    'Qs',
                    shape=[self.nb_state, self.action_space.n],
                    initializer=tf.constant_initializer(self.initial_q_value),
                    dtype=tf.float32)
                tf.summary.histogram('Qarray', self.Qs)
                self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

            fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
            with tf.variable_scope(fixed_q_scope):
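                # Frozen copy of the Q-values; running update_fixed_vars_op
                # syncs it with the live 'QValues' scope.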
                self.update_fixed_vars_op = capacities.fix_scope(q_scope)

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                if 'UCB' in self.config and self.config['UCB']:
                    self.actions_t, self.probs_t = capacities.tabular_UCB(
                        self.Qs, self.inputs_plh)
                else:
                    self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                        self.inputs_plh, self.q_preds_t, self.nb_state,
                        self.env.action_space.n, self.N0, self.min_eps)
                self.action_t = self.actions_t[0]
                self.q_value_t = self.q_preds_t[0][self.action_t]

            # Experience replay part
            with tf.variable_scope('Learning'):
                with tf.variable_scope(fixed_q_scope, reuse=True):
                    fixed_Qs = tf.get_variable('Qs')

                self.rewards_plh = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="rewards_plh")
                self.next_states_plh = tf.placeholder(tf.int32,
                                                      shape=[None],
                                                      name="next_states_plh")

                # Note that we use the fixed Qs to create the targets
                self.targets_t = capacities.get_q_learning_target(
                    fixed_Qs, self.rewards_plh, self.next_states_plh,
                    self.discount)
                self.loss, self.train_op = capacities.tabular_learning_with_lr(
                    self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh,
                    self.actions_t, self.targets_t)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")
            self.event_count, self.inc_event_count_op = capacities.counter(
                "event_count")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
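
A hedged driver sketch for the fixed-target variant above. `agent`, `env` and the refresh period UPDATE_EVERY are assumptions; the op and placeholder names come from the graph:

import tensorflow as tf

UPDATE_EVERY = 100                           # refresh period (assumption)

graph = agent.build_graph(tf.Graph())        # `agent`, `env` are assumptions
with graph.as_default():
    init_op = tf.global_variables_initializer()

with tf.Session(graph=graph) as sess:
    sess.run(init_op)
    sess.run(agent.update_fixed_vars_op)     # start with fixed Qs == live Qs

    state, done, step = env.reset(), False, 0
    while not done:
        action = sess.run(agent.action_t,
                          feed_dict={agent.inputs_plh: [state]})
        next_state, reward, done, _ = env.step(action)

        # Targets are built from the frozen Q-values; the live Qs are updated.
        sess.run([agent.train_op, agent.inc_event_count_op], feed_dict={
            agent.inputs_plh: [state],
            agent.rewards_plh: [reward],
            agent.next_states_plh: [next_state],
        })

        step += 1
        if step % UPDATE_EVERY == 0:         # periodically refresh the copy
            sess.run(agent.update_fixed_vars_op)
        state = next_state
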
Example #3
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs_plh = tf.placeholder(tf.int32,
                                             shape=[None],
                                             name="inputs_plh")

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.Qs = tf.get_variable(
                    'Qs',
                    shape=[self.nb_state, self.action_space.n],
                    initializer=tf.constant_initializer(self.initial_q_value),
                    dtype=tf.float32)
                tf.summary.histogram('Qarray', self.Qs)
                self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                if 'UCB' in self.config and self.config['UCB']:
                    self.actions_t, self.probs_t = capacities.tabular_UCB(
                        self.Qs, self.inputs_plh)
                else:
                    self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                        self.inputs_plh, self.q_preds_t, self.nb_state,
                        self.env.action_space.n, self.N0, self.min_eps)
                self.action_t = self.actions_t[0]
                self.q_value_t = self.q_preds_t[0][self.action_t]

            et_scope = tf.VariableScope(reuse=False, name='EligibilityTraces')
            with tf.variable_scope(et_scope):
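                # Per-(state, action) eligibility traces; reset_et_op clears
                # them (typically at the start of an episode).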
                et, update_et_op, self.reset_et_op = capacities.eligibility_traces(
                    self.Qs, self.inputs_plh, self.actions_t, self.discount,
                    self.lambda_value)

            with tf.variable_scope('Learning'):
                self.rewards_plh = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="rewards_plh")
                self.next_states_plh = tf.placeholder(tf.int32,
                                                      shape=[None],
                                                      name="next_states_plh")

                self.targets_t = capacities.get_q_learning_target(
                    self.Qs, self.rewards_plh, self.next_states_plh,
                    self.discount)
                target = self.targets_t[0]
                state_action_pairs = tf.stack(
                    [self.inputs_plh, self.actions_t], 1)
                estimate = tf.gather_nd(self.Qs, state_action_pairs)[0]
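                # TD error between the Q-learning target and the current
                # estimate for the visited (state, action) pair.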
                err_estimate = target - estimate

                global_step = tf.Variable(0,
                                          trainable=False,
                                          name="global_step",
                                          collections=[
                                              tf.GraphKeys.GLOBAL_STEP,
                                              tf.GraphKeys.GLOBAL_VARIABLES
                                          ])
                lr = tf.train.exponential_decay(tf.constant(self.lr,
                                                            dtype=tf.float32),
                                                global_step,
                                                self.lr_decay_steps,
                                                0.5,
                                                staircase=True)
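                # Staircase decay with rate 0.5: the learning rate halves
                # every lr_decay_steps global steps.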
                tf.summary.scalar('lr', lr)
                inc_global_step = global_step.assign_add(1)
                with tf.control_dependencies([update_et_op, inc_global_step]):
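                    # Traces and global step are refreshed first; every Q
                    # entry then moves by lr * TD-error * its trace.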
                    self.loss = tf.reduce_sum(err_estimate * et)
                    self.train_op = tf.assign_add(self.Qs,
                                                  lr * err_estimate * et)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
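
A hedged sketch of an episode loop for the eligibility-trace variant above. `agent`, `env` and nb_episodes are assumptions; the op and placeholder names come from the graph:

import tensorflow as tf

nb_episodes = 500                            # assumption

graph = agent.build_graph(tf.Graph())        # `agent`, `env` are assumptions
with graph.as_default():
    init_op = tf.global_variables_initializer()

with tf.Session(graph=graph) as sess:
    sess.run(init_op)
    for _ in range(nb_episodes):
        sess.run(agent.reset_et_op)          # clear the traces each episode
        state, done = env.reset(), False
        while not done:
            action = sess.run(agent.action_t,
                              feed_dict={agent.inputs_plh: [state]})
            next_state, reward, done, _ = env.step(action)
            # train_op also refreshes the traces and the global step through
            # the control dependencies declared in the graph.
            sess.run(agent.train_op, feed_dict={
                agent.inputs_plh: [state],
                agent.rewards_plh: [reward],
                agent.next_states_plh: [next_state],
            })
            state = next_state
        sess.run(agent.inc_ep_id_op)         # bump the episode counter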