Example #1
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name='inputs')

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                self.probs, self.actions = capacities.policy(
                    self.policy_params, self.inputs)
            self.action_t = tf.squeeze(self.actions, 1)[0]
            # self.action_t = tf.Print(self.action_t, data=[self.probs, self.action_t], message="self.probs, self.action_t:")

            v_scope = tf.VariableScope(reuse=False, name='VValues')
            with tf.variable_scope(v_scope):
                vs = capacities.value_f(self.v_params, self.inputs)

            with tf.control_dependencies([self.probs, vs]):
                with tf.variable_scope('Training'):
                    stacked_actions = tf.stack([
                        tf.range(0,
                                 tf.shape(self.actions)[0]),
                        tf.squeeze(self.actions, 1)
                    ], 1)

                    self.rewards = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="rewards")
                    self.next_states = tf.placeholder(
                        tf.float32,
                        shape=[None, self.observation_space.shape[0] + 1],
                        name="next_states")
                    self.next_actions = tf.placeholder(tf.int32,
                                                       shape=[None],
                                                       name="next_actions")

                    with tf.variable_scope(v_scope, reuse=True):
                        next_vs = tf.squeeze(
                            capacities.value_f(self.v_params,
                                               self.next_states), 1)

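                    # The extra input column (presumably an episode-done flag) picks
                    # between the bootstrapped target r + discount * V(s') and the
                    # reward-only target used for terminal transitions.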
                    with tf.variable_scope('TargetVs'):
                        target_vs1 = tf.stop_gradient(self.rewards +
                                                      self.discount * next_vs)
                        target_vs2 = self.rewards
                        stacked_targets = tf.stack([target_vs1, target_vs2], 1)
                        select_targets = tf.stack([
                            tf.range(0,
                                     tf.shape(self.next_states)[0]),
                            tf.cast(self.next_states[:, -1], tf.int32)
                        ], 1)
                        target_vs = tf.gather_nd(stacked_targets,
                                                 select_targets)

                    log_probs = tf.log(
                        tf.gather_nd(self.probs, stacked_actions))

                    with tf.control_dependencies([log_probs, target_vs]):
                        self.v_loss = 1 / 2 * tf.reduce_sum(
                            tf.square(target_vs - vs))
                        v_adam = tf.train.AdamOptimizer(self.v_lr)
                        self.v_global_step = tf.Variable(0,
                                                         trainable=False,
                                                         name="v_global_step")
                        self.v_train_op = v_adam.minimize(
                            self.v_loss, global_step=self.v_global_step)

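                        # Actor update: the TD error acts as the advantage estimate;
                        # stop_gradient keeps policy gradients out of the critic.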
                        td = target_vs - vs
                        self.policy_loss = -tf.reduce_sum(
                            log_probs * tf.stop_gradient(td))
                        policy_adam = tf.train.AdamOptimizer(self.policy_lr)
                        self.policy_global_step = tf.Variable(
                            0,
                            trainable=False,
                            name="policy_global_step",
                            collections=[
                                tf.GraphKeys.GLOBAL_STEP,
                                tf.GraphKeys.GLOBAL_VARIABLES
                            ])
                        self.policy_train_op = policy_adam.minimize(
                            self.policy_loss,
                            global_step=self.policy_global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.policy_loss_sum_t = tf.summary.scalar('policy_loss',
                                                       self.policy_loss_plh)
            self.v_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.v_loss_sum_t = tf.summary.scalar('v_loss', self.v_loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
Example #2
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name='inputs')

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                self.probs, self.actions = capacities.policy(
                    self.policy_params, self.inputs)
            self.action_t = tf.squeeze(self.actions, 1)[0]

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.q_values = capacities.value_f(self.q_params, self.inputs)
            self.q = self.q_values[0, tf.stop_gradient(self.action_t)]

            with tf.variable_scope('Training'):
                stacked_actions = tf.stack([
                    tf.range(0,
                             tf.shape(self.actions)[0]),
                    tf.squeeze(self.actions, 1)
                ], 1)
                qs = tf.gather_nd(self.q_values, stacked_actions)
                log_probs = tf.log(tf.gather_nd(self.probs, stacked_actions))
                self.policy_loss = -tf.reduce_sum(
                    log_probs * tf.stop_gradient(qs))

                self.rewards = tf.placeholder(tf.float32,
                                              shape=[None],
                                              name="rewards")
                self.next_states = tf.placeholder(
                    tf.float32,
                    shape=[None, self.observation_space.shape[0] + 1],
                    name="next_states")
                self.next_actions = tf.placeholder(tf.int32,
                                                   shape=[None],
                                                   name="next_actions")
                with tf.variable_scope(q_scope, reuse=True):
                    next_q_values = capacities.value_f(self.q_params,
                                                       self.next_states)
                next_stacked_actions = tf.stack([
                    tf.range(0,
                             tf.shape(self.next_actions)[0]), self.next_actions
                ], 1)
                next_qs = tf.gather_nd(next_q_values, next_stacked_actions)
                target_qs1 = tf.stop_gradient(self.rewards +
                                              self.discount * next_qs)
                target_qs2 = self.rewards
                stacked_targets = tf.stack([target_qs1, target_qs2], 1)
                select_targets = tf.stack([
                    tf.range(0,
                             tf.shape(self.next_states)[0]),
                    tf.cast(self.next_states[:, -1], tf.int32)
                ], 1)
                target_qs = tf.gather_nd(stacked_targets, select_targets)
                self.q_loss = 1 / 2 * tf.reduce_sum(tf.square(target_qs - qs))

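                # Single optimizer over the combined objective: q_scale_lr weights
                # the critic (Q) loss against the policy loss.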
                self.loss = self.policy_loss + self.q_scale_lr * self.q_loss

                adam = tf.train.AdamOptimizer(self.lr)
                self.global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ])
                self.train_op = adam.minimize(self.loss,
                                              global_step=self.global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.policy_loss_sum_t = tf.summary.scalar('policy_loss',
                                                       self.policy_loss_plh)
            self.q_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.q_loss_sum_t = tf.summary.scalar('q_loss', self.q_loss_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
Example #3
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name='inputs')

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.q_values = tf.squeeze(
                    capacities.value_f(self.q_params, self.inputs))

            self.action_t = capacities.eps_greedy(self.inputs, self.q_values,
                                                  self.env.action_space.n,
                                                  self.N0, self.min_eps)
            self.q_t = self.q_values[self.action_t]

            fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
            with tf.variable_scope(fixed_q_scope):
                self.update_fixed_vars_op = capacities.fix_scope(q_scope)

            with tf.variable_scope('ExperienceReplay'):
                self.er_inputs = tf.placeholder(
                    tf.float32,
                    shape=[None, self.observation_space.shape[0] + 1],
                    name="ERInputs")
                self.er_actions = tf.placeholder(tf.int32,
                                                 shape=[None],
                                                 name="ERInputs")
                self.er_rewards = tf.placeholder(tf.float32,
                                                 shape=[None],
                                                 name="ERReward")
                self.er_next_states = tf.placeholder(
                    tf.float32,
                    shape=[None, self.observation_space.shape[0] + 1],
                    name="ERNextState")

                with tf.variable_scope(q_scope, reuse=True):
                    er_q_values = capacities.value_f(self.q_params,
                                                     self.er_inputs)
                er_stacked_actions = tf.stack([
                    tf.range(0,
                             tf.shape(self.er_actions)[0]), self.er_actions
                ], 1)
                er_qs = tf.gather_nd(er_q_values, er_stacked_actions)

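                # Double-DQN-style target: the online QValues network selects the
                # greedy next action, while the fixed copy evaluates it.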
                with tf.variable_scope(fixed_q_scope, reuse=True):
                    er_fixed_next_q_values = capacities.value_f(
                        self.q_params, self.er_next_states)
                with tf.variable_scope(q_scope, reuse=True):
                    er_next_q_values = capacities.value_f(
                        self.q_params, self.er_next_states)
                er_next_max_action_t = tf.cast(tf.argmax(er_next_q_values, 1),
                                               tf.int32)
                er_next_stacked_actions = tf.stack([
                    tf.range(0,
                             tf.shape(self.er_next_states)[0]),
                    er_next_max_action_t
                ], 1)
                er_next_qs = tf.gather_nd(er_fixed_next_q_values,
                                          er_next_stacked_actions)

                er_target_qs1 = tf.stop_gradient(self.er_rewards +
                                                 self.discount * er_next_qs)
                er_target_qs2 = self.er_rewards
                er_stacked_targets = tf.stack([er_target_qs1, er_target_qs2],
                                              1)
                select_targets = tf.stack([
                    tf.range(0,
                             tf.shape(self.er_next_states)[0]),
                    tf.cast(self.er_next_states[:, -1], tf.int32)
                ], 1)
                er_target_qs = tf.gather_nd(er_stacked_targets, select_targets)

                self.er_loss = 1 / 2 * tf.reduce_sum(
                    tf.square(er_target_qs - er_qs))
                er_adam = tf.train.AdamOptimizer(self.lr)
                self.global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ])
                self.er_train_op = er_adam.minimize(
                    self.er_loss, global_step=self.global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")
            self.timestep, self.inc_timestep_op = capacities.counter(
                "timestep")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
Example #4
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.N0_t = tf.constant(self.N0, tf.float32, name='N_0')
            self.N = tf.Variable(0.,
                                 dtype=tf.float32,
                                 name='N',
                                 trainable=False)
            self.min_eps_t = tf.constant(self.min_eps,
                                         tf.float32,
                                         name='min_eps')

            self.inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name='inputs')

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.q_values = tf.squeeze(
                    capacities.value_f(self.q_params, self.inputs))

            self.action_t = capacities.eps_greedy(self.inputs, self.q_values,
                                                  self.env.action_space.n,
                                                  self.N0, self.min_eps)
            self.q_t = self.q_values[self.action_t]

            with tf.variable_scope('Training'):
                self.reward = tf.placeholder(tf.float32,
                                             shape=[],
                                             name="reward")
                self.next_state = tf.placeholder(
                    tf.float32,
                    shape=[1, self.observation_space.shape[0] + 1],
                    name="nextState")
                self.next_action = tf.placeholder(tf.int32,
                                                  shape=[],
                                                  name="nextAction")

                with tf.variable_scope(q_scope, reuse=True):
                    next_q_values = tf.squeeze(
                        capacities.value_f(self.q_params, self.next_state))
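                # One-step SARSA-style target: bootstrap from Q(s', a') for the fed
                # next action, or fall back to the raw reward when the flag at
                # column 4 of next_state (presumably the done indicator) is set.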
                target_q1 = tf.stop_gradient(self.reward + self.discount *
                                             next_q_values[self.next_action])
                target_q2 = self.reward
                is_done = tf.cast(self.next_state[0, 4], tf.bool)
                target_q = tf.where(is_done, target_q2, target_q1)
                with tf.control_dependencies([target_q]):
                    self.loss = 1 / 2 * tf.square(target_q - self.q_t)

                adam = tf.train.AdamOptimizer(self.lr)
                self.global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ])
                self.train_op = adam.minimize(self.loss,
                                              global_step=self.global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
Example #5
    def build_graph(self, graph):
        np.random.seed(self.random_seed)
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(tf.float32, shape=[None, None, self.policy_params['nb_inputs']], name='inputs')
            input_shape = tf.shape(self.inputs)
            dynamic_batch_size, dynamic_num_steps = input_shape[0], input_shape[1]
            inputs_mat = tf.reshape(self.inputs, [-1, self.policy_params['nb_inputs']])

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                probs, actions = capacities.policy(self.policy_params, inputs_mat)
                self.probs = tf.reshape(probs, [dynamic_batch_size, dynamic_num_steps, self.policy_params['nb_outputs']])
                self.actions = tf.reshape(actions, [dynamic_batch_size, dynamic_num_steps, 1])
            self.action_t = self.actions[0, 0, 0]

            critic_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(critic_scope):
                critic_values_mat = capacities.value_f(self.critic_params, inputs_mat)
                self.critic_values = tf.reshape(critic_values_mat, [dynamic_batch_size, dynamic_num_steps, self.critic_params['nb_outputs']])

            fixed_critic_scope = tf.VariableScope(reuse=False, name='FixedQValues')
            with tf.variable_scope(fixed_critic_scope):
                self.update_fixed_vars_op = capacities.fix_scope(critic_scope)

            with tf.variable_scope('Training'):
                self.expected_rewards = tf.placeholder(tf.float32, shape=[None, None, 1], name="expected_rewards")
                self.mask_plh = tf.placeholder(tf.float32, shape=[None, None, 1], name="mask_plh")

                batch_size, num_steps = tf.shape(self.actions)[0], tf.shape(self.actions)[1]
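                # Build (batch, time, action) index triples so tf.gather_nd can pick
                # the probability of the chosen action at every timestep.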
                line_indices = tf.matmul( # Line indices
                    tf.reshape(tf.range(0, batch_size), [-1, 1])
                    , tf.ones([1, num_steps], dtype=tf.int32)
                )
                column_indices = tf.matmul( # Column indices
                    tf.ones([batch_size, 1], dtype=tf.int32)
                    , tf.reshape(tf.range(0, num_steps), [1, -1])
                )
                depth_indices = tf.cast(tf.squeeze(self.actions, 2), tf.int32)
                stacked_actions = tf.stack(
                    [line_indices, column_indices, depth_indices], 2
                )
                
                log_probs = tf.expand_dims(tf.log(tf.gather_nd(self.probs, stacked_actions)), 2)
                self.policy_loss = tf.reduce_mean(-tf.reduce_sum(
                    (log_probs *
                     (self.expected_rewards - tf.stop_gradient(self.critic_values))) *
                    self.mask_plh, 1))

                adam = tf.train.AdamOptimizer(self.lr)
                self.train_policy_op = adam.minimize(self.policy_loss)

                self.rewards = tf.placeholder(tf.float32, shape=[None, None, 1], name="reward")
                self.next_states = tf.placeholder(tf.float32, shape=[None, None, self.critic_params['nb_inputs']], name="next_states")
                with tf.variable_scope(fixed_critic_scope, reuse=True):
                    next_states_mat = tf.reshape(self.next_states, [-1, self.critic_params['nb_inputs']])
                    next_critic_values_mat = capacities.value_f(self.critic_params, next_states_mat)
                    next_critic_values = tf.reshape(next_critic_values_mat, [dynamic_batch_size, dynamic_num_steps, self.critic_params['nb_outputs']])

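                # Bootstrapped targets come from the fixed critic copy; as in the
                # other examples, the last column of next_states (presumably a done
                # flag) switches between the bootstrapped and reward-only targets.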
                target_critics1 = tf.stop_gradient(self.rewards + self.discount * next_critic_values)
                target_critics2 = self.rewards
                stacked_targets = tf.stack([tf.squeeze(target_critics1, 2), tf.squeeze(target_critics2, 2)], 2)

                batch_size, num_steps = tf.shape(self.next_states)[0], tf.shape(self.next_states)[1]
                line_indices = tf.matmul( # Line indices
                    tf.reshape(tf.range(0, batch_size), [-1, 1])
                    , tf.ones([1, num_steps], dtype=tf.int32)
                )
                column_indices = tf.matmul( # Column indices
                    tf.ones([batch_size, 1], dtype=tf.int32)
                    , tf.reshape(tf.range(0, num_steps), [1, -1])
                )
                depth_indices = tf.cast(self.next_states[:, :, -1], tf.int32)
                select_targets = tf.stack(
                    [line_indices, column_indices, depth_indices], 2
                )

                target_critics = tf.expand_dims(tf.gather_nd(stacked_targets, select_targets), 2)
                self.critic_loss = 1/2 * tf.reduce_sum(tf.square(target_critics - self.critic_values) * self.mask_plh)

                adam = tf.train.AdamOptimizer(self.critic_lr)
                self.global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ])
                self.train_critic_op = adam.minimize(
                    self.critic_loss, global_step=self.global_step)

            self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.policy_loss_sum_t = tf.summary.scalar('policy_loss', self.policy_loss_plh)
            self.critic_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.critic_loss_sum_t = tf.summary.scalar('critic_loss', self.critic_loss_plh)
            # self.loss_plh = tf.placeholder(tf.float32, shape=[])
            # self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('av_score', self.score_plh)

            self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

        return graph