Example #1
    def build_graph(self, graph):
        self.env.seed(self.random_seed)
        np.random.seed(self.random_seed)
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            # Graph of the LSTM world model (M)
            input_scope = tf.VariableScope(reuse=False, name="inputs")
            with tf.variable_scope(input_scope):
                self.state_input_plh = tf.placeholder(
                    tf.float32,
                    shape=[None, None, self.m_params['env_state_size']],
                    name='state_input_plh')
                self.action_input_plh = tf.placeholder(tf.int32,
                                                       shape=[None, None, 1],
                                                       name='action_input_plh')
                self.mask_plh = tf.placeholder(tf.float32,
                                               shape=[None, None, 1],
                                               name="mask_plh")

                input_shape = tf.shape(self.state_input_plh)
                dynamic_batch_size = input_shape[0]
                dynamic_num_steps = input_shape[1]

                action_input = tf.one_hot(
                    indices=tf.squeeze(self.action_input_plh, 2),
                    depth=self.m_params['nb_actions'])
                m_inputs = tf.concat([self.state_input_plh, action_input],
                                     2,
                                     name="m_inputs")

            m_scope = tf.VariableScope(reuse=False, name="m")
            with tf.variable_scope(m_scope):
                self.state_reward_preds, self.m_final_state, self.m_initial_state = capacities.predictive_model(
                    self.m_params,
                    m_inputs,
                    dynamic_batch_size,
                    None,
                    summary_collections=[self.M_SUMMARIES])

            fixed_m_scope = tf.VariableScope(reuse=False, name='FixedM')
            with tf.variable_scope(fixed_m_scope):
                self.update_m_fixed_vars_op = capacities.fix_scope(m_scope)

            m_training_scope = tf.VariableScope(reuse=False, name='m_training')
            with tf.variable_scope(m_training_scope):
                self.m_next_states = tf.placeholder(
                    tf.float32,
                    shape=[None, None, self.m_params['env_state_size']],
                    name="m_next_states")
                self.m_rewards = tf.placeholder(tf.float32,
                                                shape=[None, None, 1],
                                                name="m_rewards")
                y_true = tf.concat([self.m_rewards, self.m_next_states], 2)

                with tf.control_dependencies([self.state_reward_preds]):
                    self.m_loss = 0.5 * tf.reduce_mean(
                        tf.square(self.state_reward_preds - y_true) *
                        self.mask_plh)
                    tf.summary.scalar('m_loss',
                                      self.m_loss,
                                      collections=[self.M_SUMMARIES])

                m_adam = tf.train.AdamOptimizer(self.m_params['lr'])
                self.m_global_step = tf.Variable(0,
                                                 trainable=False,
                                                 name="m_global_step")
                tf.summary.scalar('m_global_step',
                                  self.m_global_step,
                                  collections=[self.M_SUMMARIES])
                self.m_train_op = m_adam.minimize(
                    self.m_loss, global_step=self.m_global_step)

            self.all_m_summary_t = tf.summary.merge_all(key=self.M_SUMMARIES)

            # Graph of the controller
            c_scope = tf.VariableScope(reuse=False, name="c")
            c_summary_collection = [self.C_SUMMARIES]
            with tf.variable_scope(c_scope):
                # c_cell = LSTMCell(
                #     num_units=self.c_params['nb_units']
                #     , initializer=tf.truncated_normal_initializer(
                #         mean=self.c_params['initial_mean']
                #         , stddev=self.c_params['initial_stddev']
                #     )
                # )
                # self.c_initial_state = c_cell.zero_state(dynamic_batch_size, dtype=tf.float32)
                # c_c_h_states, self.c_final_state = tf.nn.dynamic_rnn(c_cell, self.state_input_plh, initial_state=self.c_initial_state)
                # c_c_states, c_h_states = tf.split(value=c_c_h_states, num_or_size_splits=[self.c_params['nb_units'], self.c_params['nb_units']], axis=2)
                # # Compute the Controller projection
                # self.probs_t, self.actions_t = projection_func(c_h_states)
                m_params = self.m_params
                model_func = lambda m_inputs, m_state: capacities.predictive_model(
                    m_params, m_inputs, dynamic_batch_size, m_state)
                c_params = self.c_params
                projection_func = lambda inputs: capacities.projection(
                    c_params, inputs)
                cm_cell = CMCell(num_units=self.c_params['nb_units'],
                                 m_units=self.m_params['nb_units'],
                                 fixed_model_scope=fixed_m_scope,
                                 model_func=model_func,
                                 projection_func=projection_func,
                                 num_proj=self.c_params['nb_actions'],
                                 initializer=tf.truncated_normal_initializer(
                                     mean=self.c_params['initial_mean'],
                                     stddev=self.c_params['initial_stddev']))

                self.cm_initial_state = cm_cell.zero_state(dynamic_batch_size,
                                                           dtype=tf.float32)
                probs_and_actions_t, self.cm_final_state = tf.nn.dynamic_rnn(
                    cm_cell,
                    self.state_input_plh,
                    initial_state=self.cm_initial_state)
                self.probs_t, actions_t = tf.split(
                    value=probs_and_actions_t,
                    num_or_size_splits=[self.c_params['nb_actions'], 1],
                    axis=2)
                self.actions_t = tf.cast(actions_t, tf.int32)
                # helper tensor used for inference
                self.action_t = self.actions_t[0, 0, 0]

            c_training_scope = tf.VariableScope(reuse=False, name='c_training')
            with tf.variable_scope(c_training_scope):
                self.c_rewards_plh = tf.placeholder(tf.float32,
                                                    shape=[None, None, 1],
                                                    name="c_rewards_plh")

                baseline = tf.reduce_mean(self.c_rewards_plh)

                batch_size = tf.shape(self.actions_t)[0]
                num_steps = tf.shape(self.actions_t)[1]
                line_indices = tf.matmul(  # Row indices
                    tf.reshape(tf.range(0, batch_size), [-1, 1]),
                    tf.ones([1, num_steps], dtype=tf.int32))
                column_indices = tf.matmul(  # Column indices
                    tf.ones([batch_size, 1], dtype=tf.int32),
                    tf.reshape(tf.range(0, num_steps), [1, -1]))
                depth_indices = tf.squeeze(self.actions_t, 2)
                stacked_actions = tf.stack(
                    [line_indices, column_indices, depth_indices], 2)

                with tf.control_dependencies([self.probs_t]):
                    log_probs = tf.expand_dims(
                        tf.log(tf.gather_nd(self.probs_t, stacked_actions)), 2)
                    masked_log_probs = log_probs * self.mask_plh
                    self.c_loss = tf.reduce_mean(-tf.reduce_sum(
                        masked_log_probs * (self.c_rewards_plh - baseline), 1))
                    tf.summary.scalar('c_loss',
                                      self.c_loss,
                                      collections=c_summary_collection)

                c_adam = tf.train.AdamOptimizer(self.c_params['lr'])
                self.c_global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ],
                    dtype=tf.int32)
                tf.summary.scalar('c_global_step',
                                  self.c_global_step,
                                  collections=c_summary_collection)
                self.c_train_op = c_adam.minimize(
                    self.c_loss, global_step=self.c_global_step)

            self.all_c_summary_t = tf.summary.merge_all(key=self.C_SUMMARIES)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")
            self.episode_id_sum = tf.summary.scalar('episode_id',
                                                    self.episode_id)
            self.time, self.inc_time_op = capacities.counter("time")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
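
The example above wires an LSTM world model (M) to a controller (C) and trains the controller with REINFORCE; the per-timestep probability of each sampled action is extracted by building (row, column, action) index triples and applying tf.gather_nd. Below is a minimal standalone sketch of that indexing trick, assuming the TF 1.x API used throughout this page; the tensor names and shapes are illustrative and not taken from the original repository.

import numpy as np
import tensorflow as tf  # TF 1.x assumed (use tf.compat.v1 under TF 2)

# probs: [batch, steps, nb_actions], actions: [batch, steps] -- illustrative shapes
probs = tf.placeholder(tf.float32, shape=[None, None, 3], name="probs")
actions = tf.placeholder(tf.int32, shape=[None, None], name="actions")

batch_size, num_steps = tf.shape(probs)[0], tf.shape(probs)[1]
# Row index repeated across columns, column index repeated across rows
row_indices = tf.matmul(tf.reshape(tf.range(batch_size), [-1, 1]),
                        tf.ones([1, num_steps], dtype=tf.int32))
col_indices = tf.matmul(tf.ones([batch_size, 1], dtype=tf.int32),
                        tf.reshape(tf.range(num_steps), [1, -1]))
gather_idx = tf.stack([row_indices, col_indices, actions], 2)  # [batch, steps, 3]
picked_probs = tf.gather_nd(probs, gather_idx)                 # [batch, steps]

with tf.Session() as sess:
    p = np.full((2, 4, 3), 1.0 / 3.0, dtype=np.float32)
    a = np.zeros((2, 4), dtype=np.int32)
    print(sess.run(picked_probs, {probs: p, actions: a}))  # every entry is ~0.333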

Example #2
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs_plh = tf.placeholder(tf.int32,
                                             shape=[None],
                                             name="inputs_plh")

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.Qs = tf.get_variable(
                    'Qs',
                    shape=[self.nb_state, self.action_space.n],
                    initializer=tf.constant_initializer(self.initial_q_value),
                    dtype=tf.float32)
                tf.summary.histogram('Qarray', self.Qs)
                self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

            fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
            with tf.variable_scope(fixed_q_scope):
                self.update_fixed_vars_op = capacities.fix_scope(q_scope)

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                if 'UCB' in self.config and self.config['UCB']:
                    self.actions_t, self.probs_t = capacities.tabular_UCB(
                        self.Qs, self.inputs_plh)
                else:
                    self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                        self.inputs_plh, self.q_preds_t, self.nb_state,
                        self.env.action_space.n, self.N0, self.min_eps)
                self.action_t = self.actions_t[0]
                self.q_value_t = self.q_preds_t[0][self.action_t]

            # Experience replay part
            with tf.variable_scope('Learning'):
                with tf.variable_scope(fixed_q_scope, reuse=True):
                    fixed_Qs = tf.get_variable('Qs')

                self.rewards_plh = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="rewards_plh")
                self.next_states_plh = tf.placeholder(tf.int32,
                                                      shape=[None],
                                                      name="next_states_plh")

                # Note that we use the fixed Qs to create the targets
                self.targets_t = capacities.get_q_learning_target(
                    fixed_Qs, self.rewards_plh, self.next_states_plh,
                    self.discount)
                self.loss, self.train_op = capacities.tabular_learning_with_lr(
                    self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh,
                    self.actions_t, self.targets_t)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")
            self.event_count, self.inc_event_count_op = capacities.counter(
                "event_count")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
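
This tabular Q-learning example keeps a frozen copy of the Q table in a separate "FixedQValues" scope and refreshes it via update_fixed_vars_op; the helper capacities.fix_scope is not shown on this page, but it presumably mirrors the trainable variables of a scope into non-trainable copies. A hedged sketch of what such a helper might look like follows (a hypothetical stand-in, not the actual capacities.fix_scope):

import tensorflow as tf  # TF 1.x assumed

def fix_scope_sketch(source_scope):
    """Create non-trainable mirrors of every trainable variable found in
    source_scope and return an op that copies the current values into them.
    Call it inside the variable_scope that should hold the frozen copies."""
    assign_ops = []
    for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=source_scope.name):
        fixed = tf.get_variable(var.op.name.split('/')[-1],
                                shape=var.get_shape(),
                                dtype=var.dtype.base_dtype,
                                trainable=False)
        assign_ops.append(tf.assign(fixed, var))
    return tf.group(*assign_ops)

Used the way the example uses fix_scope, calling this inside `with tf.variable_scope(fixed_q_scope):` would create a FixedQValues/Qs variable and return an op that snapshots QValues/Qs into it, which is consistent with the later `tf.get_variable('Qs')` lookup under the fixed scope.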

Example #3
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name='inputs')

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.q_values = tf.squeeze(
                    capacities.value_f(self.q_params, self.inputs))

            self.action_t = capacities.eps_greedy(self.inputs, self.q_values,
                                                  self.env.action_space.n,
                                                  self.N0, self.min_eps)
            self.q_t = self.q_values[self.action_t]

            fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
            with tf.variable_scope(fixed_q_scope):
                self.update_fixed_vars_op = capacities.fix_scope(q_scope)

            with tf.variable_scope('ExperienceReplay'):
                self.er_inputs = tf.placeholder(
                    tf.float32,
                    shape=[None, self.observation_space.shape[0] + 1],
                    name="ERInputs")
                self.er_actions = tf.placeholder(tf.int32,
                                                 shape=[None],
                                                 name="ERActions")
                self.er_rewards = tf.placeholder(tf.float32,
                                                 shape=[None],
                                                 name="ERReward")
                self.er_next_states = tf.placeholder(
                    tf.float32,
                    shape=[None, self.observation_space.shape[0] + 1],
                    name="ERNextState")

                with tf.variable_scope(q_scope, reuse=True):
                    er_q_values = capacities.value_f(self.q_params,
                                                     self.er_inputs)
                er_stacked_actions = tf.stack(
                    [tf.range(0, tf.shape(self.er_actions)[0]),
                     self.er_actions], 1)
                er_qs = tf.gather_nd(er_q_values, er_stacked_actions)

                with tf.variable_scope(fixed_q_scope, reuse=True):
                    er_fixed_next_q_values = capacities.value_f(
                        self.q_params, self.er_next_states)
                with tf.variable_scope(q_scope, reuse=True):
                    er_next_q_values = capacities.value_f(
                        self.q_params, self.er_next_states)
                er_next_max_action_t = tf.cast(tf.argmax(er_next_q_values, 1),
                                               tf.int32)
                er_next_stacked_actions = tf.stack(
                    [tf.range(0, tf.shape(self.er_next_states)[0]),
                     er_next_max_action_t], 1)
                er_next_qs = tf.gather_nd(er_fixed_next_q_values,
                                          er_next_stacked_actions)

                er_target_qs1 = tf.stop_gradient(self.er_rewards +
                                                 self.discount * er_next_qs)
                er_target_qs2 = self.er_rewards
                er_stacked_targets = tf.stack([er_target_qs1, er_target_qs2],
                                              1)
                select_targets = tf.stack(
                    [tf.range(0, tf.shape(self.er_next_states)[0]),
                     tf.cast(self.er_next_states[:, -1], tf.int32)], 1)
                er_target_qs = tf.gather_nd(er_stacked_targets, select_targets)

                self.er_loss = 0.5 * tf.reduce_sum(
                    tf.square(er_target_qs - er_qs))
                er_adam = tf.train.AdamOptimizer(self.lr)
                self.global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ])
                self.er_train_op = er_adam.minimize(
                    self.er_loss, global_step=self.global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")
            self.timestep, self.inc_timestep_op = capacities.counter(
                "timestep")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
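
The experience-replay block above forms Double-DQN-style targets: the online Q network picks the argmax action at the next state, the frozen network evaluates it, and the last input column is used to switch terminal transitions to the raw reward. The sketch below shows the same target computation with an explicit done flag instead of the stacked-targets/gather_nd selection used in the example; TF 1.x API, and all placeholder names are illustrative.

import tensorflow as tf  # TF 1.x assumed

discount = 0.99
rewards = tf.placeholder(tf.float32, [None], name="rewards")
dones = tf.placeholder(tf.float32, [None], name="dones")  # 1.0 on terminal steps
next_q_online = tf.placeholder(tf.float32, [None, 4], name="next_q_online")
next_q_fixed = tf.placeholder(tf.float32, [None, 4], name="next_q_fixed")

# Online net selects the action, fixed net evaluates it (Double DQN)
best_next_actions = tf.cast(tf.argmax(next_q_online, 1), tf.int32)
gather_idx = tf.stack([tf.range(tf.shape(rewards)[0]), best_next_actions], 1)
next_qs = tf.gather_nd(next_q_fixed, gather_idx)

# Bootstrapped target, cut off after terminal states; stop_gradient keeps the
# target out of the backward pass, as in the example above
targets = tf.stop_gradient(rewards + (1.0 - dones) * discount * next_qs)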

Example #4
    def build_graph(self, graph):
        np.random.seed(self.random_seed)
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(tf.float32, shape=[None, None, self.policy_params['nb_inputs']], name='inputs')
            input_shape = tf.shape(self.inputs)
            dynamic_batch_size, dynamic_num_steps = input_shape[0], input_shape[1]
            inputs_mat = tf.reshape(self.inputs, [-1, self.policy_params['nb_inputs']])

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                probs, actions = capacities.policy(self.policy_params, inputs_mat)
                self.probs = tf.reshape(probs, [dynamic_batch_size, dynamic_num_steps, self.policy_params['nb_outputs']])
                self.actions = tf.reshape(actions, [dynamic_batch_size, dynamic_num_steps, 1])
            self.action_t = self.actions[0, 0, 0]

            critic_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(critic_scope):
                critic_values_mat = capacities.value_f(self.critic_params, inputs_mat)
                self.critic_values = tf.reshape(critic_values_mat, [dynamic_batch_size, dynamic_num_steps, self.critic_params['nb_outputs']])

            fixed_critic_scope = tf.VariableScope(reuse=False, name='FixedQValues')
            with tf.variable_scope(fixed_critic_scope):
                self.update_fixed_vars_op = capacities.fix_scope(critic_scope)

            with tf.variable_scope('Training'):
                self.expected_rewards = tf.placeholder(tf.float32, shape=[None, None, 1], name="reward")
                self.mask_plh = tf.placeholder(tf.float32, shape=[None, None, 1], name="mask_plh")

                batch_size, num_steps = tf.shape(self.actions)[0], tf.shape(self.actions)[1]
                line_indices = tf.matmul( # Row indices
                    tf.reshape(tf.range(0, batch_size), [-1, 1])
                    , tf.ones([1, num_steps], dtype=tf.int32)
                )
                column_indices = tf.matmul( # Column indices
                    tf.ones([batch_size, 1], dtype=tf.int32)
                    , tf.reshape(tf.range(0, num_steps), [1, -1])
                )
                depth_indices = tf.cast(tf.squeeze(self.actions, 2), tf.int32)
                stacked_actions = tf.stack(
                    [line_indices, column_indices, depth_indices], 2
                )
                
                log_probs = tf.expand_dims(tf.log(tf.gather_nd(self.probs, stacked_actions)), 2)
                self.policy_loss = tf.reduce_mean( - tf.reduce_sum((log_probs * (self.expected_rewards - tf.stop_gradient(self.critic_values))) * self.mask_plh, 1))

                adam = tf.train.AdamOptimizer(self.lr)
                self.train_policy_op = adam.minimize(self.policy_loss)

                self.rewards = tf.placeholder(tf.float32, shape=[None, None, 1], name="rewards")
                self.next_states = tf.placeholder(tf.float32, shape=[None, None, self.critic_params['nb_inputs']], name="next_states")
                with tf.variable_scope(fixed_critic_scope, reuse=True):
                    next_states_mat = tf.reshape(self.next_states, [-1, self.critic_params['nb_inputs']])
                    next_critic_values_mat = capacities.value_f(self.critic_params, next_states_mat)
                    next_critic_values = tf.reshape(next_critic_values_mat, [dynamic_batch_size, dynamic_num_steps, self.critic_params['nb_outputs']])

                target_critics1 = tf.stop_gradient(self.rewards + self.discount * next_critic_values)
                target_critics2 = self.rewards
                stacked_targets = tf.stack([tf.squeeze(target_critics1, 2), tf.squeeze(target_critics2, 2)], 2)

                batch_size, num_steps = tf.shape(self.next_states)[0], tf.shape(self.next_states)[1]
                line_indices = tf.matmul( # Row indices
                    tf.reshape(tf.range(0, batch_size), [-1, 1])
                    , tf.ones([1, num_steps], dtype=tf.int32)
                )
                column_indices = tf.matmul( # Column indices
                    tf.ones([batch_size, 1], dtype=tf.int32)
                    , tf.reshape(tf.range(0, num_steps), [1, -1])
                )
                depth_indices = tf.cast(self.next_states[:, :, -1], tf.int32)
                select_targets = tf.stack(
                    [line_indices, column_indices, depth_indices], 2
                )

                target_critics = tf.expand_dims(tf.gather_nd(stacked_targets, select_targets), 2)
                self.critic_loss = 0.5 * tf.reduce_sum(tf.square(target_critics - self.critic_values) * self.mask_plh)

                adam = tf.train.AdamOptimizer(self.critic_lr)
                self.global_step = tf.Variable(0, trainable=False, name="global_step", collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
                self.train_critic_op = adam.minimize(self.critic_loss, global_step=self.global_step)

            self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.policy_loss_sum_t = tf.summary.scalar('policy_loss', self.policy_loss_plh)
            self.critic_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.critic_loss_sum_t = tf.summary.scalar('critic_loss', self.critic_loss_plh)
            # self.loss_plh = tf.placeholder(tf.float32, shape=[])
            # self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('av_score', self.score_plh)

            self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

        return graph
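
This last example is an advantage actor-critic: the policy loss is masked REINFORCE with the critic value subtracted as a baseline through tf.stop_gradient, and the critic itself is regressed toward a one-step TD target built from a frozen copy of the value network. Below is a compact sketch of the baseline-subtracted policy loss on already-gathered log-probabilities (TF 1.x, illustrative placeholder names; the gather_nd step that produces log_probs is the same trick shown after Example #1).

import tensorflow as tf  # TF 1.x assumed

log_probs = tf.placeholder(tf.float32, [None, None, 1], name="log_probs")  # log pi(a_t|s_t)
returns = tf.placeholder(tf.float32, [None, None, 1], name="returns")      # expected rewards
values = tf.placeholder(tf.float32, [None, None, 1], name="values")        # critic estimates
mask = tf.placeholder(tf.float32, [None, None, 1], name="mask")            # 1.0 on valid steps

# Advantage = return - baseline; stop_gradient keeps the policy update from
# pushing gradients into the critic, mirroring policy_loss in the example above
advantage = returns - tf.stop_gradient(values)
policy_loss = tf.reduce_mean(-tf.reduce_sum(log_probs * advantage * mask, 1))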