Example #1
    def build_graph(self, graph):
        np.random.seed(self.random_seed)
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            # Dims: bs x num_steps x state_size
            self.inputs = tf.placeholder(tf.float32, shape=[None, None, self.policy_params['nb_inputs']], name='inputs')
            input_shape = tf.shape(self.inputs)
            dynamic_batch_size, dynamic_num_steps = input_shape[0], input_shape[1]

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                policy_inputs = tf.reshape(self.inputs, [-1, self.policy_params['nb_inputs']])
                probs, actions = capacities.policy(self.policy_params, policy_inputs)
                self.probs = tf.reshape(probs, [dynamic_batch_size, dynamic_num_steps, self.policy_params['nb_outputs']])
                self.actions = tf.reshape(actions, [dynamic_batch_size, dynamic_num_steps, 1])
            self.action_t = self.actions[0, 0, 0]

            with tf.variable_scope('Training'):
                self.rewards = tf.placeholder(tf.float32, shape=[None, None, 1], name="reward")
                self.mask_plh = tf.placeholder(tf.float32, shape=[None, None, 1], name="mask_plh")

                baseline = tf.reduce_mean(self.rewards)
                
                batch_size, num_steps = tf.shape(self.actions)[0], tf.shape(self.actions)[1]
                line_indices = tf.matmul( # Line indices
                    tf.reshape(tf.range(0, batch_size), [-1, 1])
                    , tf.ones([1, num_steps], dtype=tf.int32)
                )
                column_indices = tf.matmul( # Column indices
                    tf.ones([batch_size, 1], dtype=tf.int32)
                    , tf.reshape(tf.range(0, num_steps), [1, -1])
                )
                depth_indices = tf.cast(tf.squeeze(self.actions, 2), tf.int32)
                stacked_actions = tf.stack(
                    [line_indices, column_indices, depth_indices], 2
                )

                log_probs = tf.expand_dims(tf.log(tf.gather_nd(self.probs, stacked_actions)), 2)
                # Sum over the sequence, then average over the batch
                self.loss = tf.reduce_mean( - tf.reduce_sum((log_probs * (self.rewards - baseline)) * self.mask_plh, 1))

                adam = tf.train.AdamOptimizer(self.lr)
                self.global_step = tf.Variable(0, trainable=False, name="global_step", collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
                self.train_op = adam.minimize(self.loss, global_step=self.global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('av_score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

        return graph
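A minimal NumPy sketch (not from the original code) of the indexing trick used in the Training block above: row, column and action indices are stacked so that a single gather picks, for every sequence and timestep, the probability of the action that was actually taken.

import numpy as np

batch_size, num_steps, nb_actions = 2, 3, 4
probs = np.random.rand(batch_size, num_steps, nb_actions)
probs /= probs.sum(axis=2, keepdims=True)                              # stand-in policy outputs
actions = np.random.randint(nb_actions, size=(batch_size, num_steps))  # chosen actions

rows = np.repeat(np.arange(batch_size)[:, None], num_steps, axis=1)    # line indices
cols = np.repeat(np.arange(num_steps)[None, :], batch_size, axis=0)    # column indices
picked = probs[rows, cols, actions]   # same selection as tf.gather_nd(self.probs, stacked_actions)
assert picked.shape == (batch_size, num_steps)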
Example #2
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name='inputs')

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                self.probs, self.actions = capacities.policy(
                    self.policy_params, self.inputs)
            self.action_t = tf.squeeze(self.actions, 1)[0]

            with tf.variable_scope('Training'):
                self.rewards = tf.placeholder(tf.float32,
                                              shape=[None],
                                              name="reward")
                stacked_actions = tf.stack([
                    tf.range(0,
                             tf.shape(self.actions)[0]),
                    tf.squeeze(self.actions, 1)
                ], 1)
                log_probs = tf.log(tf.gather_nd(self.probs, stacked_actions))
                # log_probs = tf.Print(log_probs, data=[tf.shape(self.probs), tf.shape(self.actions), tf.shape(log_probs)], message="tf.shape(log_probs):")
                self.loss = -tf.reduce_sum(log_probs * self.rewards)

                adam = tf.train.AdamOptimizer(self.lr)
                self.global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ])
                self.train_op = adam.minimize(self.loss,
                                              global_step=self.global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
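The loss above is the plain REINFORCE objective. A tiny sketch (mine, with made-up numbers) of the same computation: the log-probability of each chosen action is weighted by the value fed through the rewards placeholder, presumably the return computed outside the graph.

import numpy as np

probs = np.array([[0.7, 0.3],
                  [0.2, 0.8]])           # policy outputs for two visited states
actions = np.array([0, 1])               # actions actually taken
returns = np.array([1.0, 0.5])           # returns credited to each action

log_probs = np.log(probs[np.arange(len(actions)), actions])
loss = -np.sum(log_probs * returns)      # matches -tf.reduce_sum(log_probs * self.rewards)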
Example #3
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs_plh = tf.placeholder(tf.int32, shape=[None], name="inputs_plh")
            
            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.Qs = tf.get_variable('Qs'
                    , shape=[self.nb_state, self.action_space.n]
                    , initializer=tf.constant_initializer(self.initial_q_value)
                    , dtype=tf.float32
                )
                tf.summary.histogram('Qarray', self.Qs)
                self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                if 'UCB' in self.config and self.config['UCB']:
                    self.actions_t, self.probs_t = capacities.tabular_UCB(
                        self.Qs, self.inputs_plh
                    )    
                else:
                    self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                        self.inputs_plh, self.q_preds_t, self.nb_state, self.env.action_space.n, self.N0, self.min_eps
                    )
                self.action_t = self.actions_t[0]
                self.q_value_t = self.q_preds_t[0][self.action_t]

            learning_scope = tf.VariableScope(reuse=False, name='Learning')
            with tf.variable_scope(learning_scope):
                self.rewards_plh = tf.placeholder(tf.float32, shape=[None], name="rewards_plh")
                self.next_states_plh = tf.placeholder(tf.int32, shape=[None], name="next_states_plh")
                self.next_actions_plh = tf.placeholder(tf.int32, shape=[None], name="next_actions_plh")

                targets_t = capacities.get_td_target(self.Qs, self.rewards_plh, self.next_states_plh, self.next_actions_plh, self.discount)
                # When bootstrapping, the target is non-stationary, so we need a learning rate
                self.loss, self.train_op = capacities.tabular_learning_with_lr(
                    self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh, self.actions_t, targets_t
                )

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

        return graph
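capacities.get_td_target is not shown here; given that it receives the next states and next actions, it presumably builds the SARSA-style target r + discount * Q[s', a'], roughly as in this sketch (assumption mine):

import numpy as np

Qs = np.zeros((5, 2))                    # nb_state x nb_actions table
rewards = np.array([1.0])
next_states = np.array([3])
next_actions = np.array([1])
discount = 0.99

td_targets = rewards + discount * Qs[next_states, next_actions]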
Example #4
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name='inputs')

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                self.probs, self.actions = capacities.policy(
                    self.policy_params, self.inputs)
            self.action_t = tf.squeeze(self.actions, 1)[0]
            # self.action_t = tf.Print(self.action_t, data=[self.probs, self.action_t], message="self.probs, self.action_t:")

            v_scope = tf.VariableScope(reuse=False, name='VValues')
            with tf.variable_scope(v_scope):
                vs = capacities.value_f(self.v_params, self.inputs)

            with tf.control_dependencies([self.probs, vs]):
                with tf.variable_scope('Training'):
                    stacked_actions = tf.stack([
                        tf.range(0,
                                 tf.shape(self.actions)[0]),
                        tf.squeeze(self.actions, 1)
                    ], 1)

                    self.rewards = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="rewards")
                    self.next_states = tf.placeholder(
                        tf.float32,
                        shape=[None, self.observation_space.shape[0] + 1],
                        name="next_states")
                    self.next_actions = tf.placeholder(tf.int32,
                                                       shape=[None],
                                                       name="next_actions")

                    with tf.variable_scope(v_scope, reuse=True):
                        next_vs = tf.squeeze(
                            capacities.value_f(self.v_params,
                                               self.next_states), 1)

                    with tf.variable_scope('TargetVs'):
                        target_vs1 = tf.stop_gradient(self.rewards +
                                                      self.discount * next_vs)
                        target_vs2 = self.rewards
                        stacked_targets = tf.stack([target_vs1, target_vs2], 1)
                        select_targets = tf.stack([
                            tf.range(0,
                                     tf.shape(self.next_states)[0]),
                            tf.cast(self.next_states[:, -1], tf.int32)
                        ], 1)
                        target_vs = tf.gather_nd(stacked_targets,
                                                 select_targets)

                    log_probs = tf.log(
                        tf.gather_nd(self.probs, stacked_actions))

                    with tf.control_dependencies([log_probs, target_vs]):
                        self.v_loss = 1 / 2 * tf.reduce_sum(
                            tf.square(target_vs - vs))
                        v_adam = tf.train.AdamOptimizer(self.v_lr)
                        self.v_global_step = tf.Variable(0,
                                                         trainable=False,
                                                         name="v_global_step")
                        self.v_train_op = v_adam.minimize(
                            self.v_loss, global_step=self.v_global_step)

                        td = target_vs - vs
                        self.policy_loss = -tf.reduce_sum(
                            log_probs * tf.stop_gradient(td))
                        policy_adam = tf.train.AdamOptimizer(self.policy_lr)
                        self.policy_global_step = tf.Variable(
                            0,
                            trainable=False,
                            name="policy_global_step",
                            collections=[
                                tf.GraphKeys.GLOBAL_STEP,
                                tf.GraphKeys.GLOBAL_VARIABLES
                            ])
                        self.policy_train_op = policy_adam.minimize(
                            self.policy_loss,
                            global_step=self.policy_global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.policy_loss_sum_t = tf.summary.scalar('policy_loss',
                                                       self.policy_loss_plh)
            self.v_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.v_loss_sum_t = tf.summary.scalar('v_loss', self.v_loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
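A small numeric sketch (assumptions mine) of the two losses wired up above: the critic is regressed onto the bootstrapped target, and the actor is weighted by the TD error, which tf.stop_gradient keeps from back-propagating into the value head. The done flag is read from the last input feature, exactly like the select_targets trick in the graph.

import numpy as np

rewards, discount = np.array([1.0, 1.0]), 0.99
vs      = np.array([0.5, 0.4])           # V(s) from the value head
next_vs = np.array([0.4, 0.0])           # V(s'); second transition is terminal
done    = np.array([0, 1])               # last feature of next_states

targets = np.where(done == 1, rewards, rewards + discount * next_vs)
td      = targets - vs
v_loss  = 0.5 * np.sum((targets - vs) ** 2)
log_probs   = np.log(np.array([0.6, 0.7]))
policy_loss = -np.sum(log_probs * td)    # td treated as a constant for the actor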
Example #5
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name='inputs')

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                self.probs, self.actions = capacities.policy(
                    self.policy_params, self.inputs)
            self.action_t = tf.squeeze(self.actions, 1)[0]

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.q_values = capacities.value_f(self.q_params, self.inputs)
            self.q = self.q_values[0, tf.stop_gradient(self.action_t)]

            with tf.variable_scope('Training'):
                stacked_actions = tf.stack([
                    tf.range(0,
                             tf.shape(self.actions)[0]),
                    tf.squeeze(self.actions, 1)
                ], 1)
                qs = tf.gather_nd(self.q_values, stacked_actions)
                log_probs = tf.log(tf.gather_nd(self.probs, stacked_actions))
                self.policy_loss = -tf.reduce_sum(
                    log_probs * tf.stop_gradient(qs))

                self.rewards = tf.placeholder(tf.float32,
                                              shape=[None],
                                              name="rewards")
                self.next_states = tf.placeholder(
                    tf.float32,
                    shape=[None, self.observation_space.shape[0] + 1],
                    name="next_states")
                self.next_actions = tf.placeholder(tf.int32,
                                                   shape=[None],
                                                   name="next_actions")
                with tf.variable_scope(q_scope, reuse=True):
                    next_q_values = capacities.value_f(self.q_params,
                                                       self.next_states)
                next_stacked_actions = tf.stack([
                    tf.range(0,
                             tf.shape(self.next_actions)[0]), self.next_actions
                ], 1)
                next_qs = tf.gather_nd(next_q_values, next_stacked_actions)
                target_qs1 = tf.stop_gradient(self.rewards +
                                              self.discount * next_qs)
                target_qs2 = self.rewards
                stacked_targets = tf.stack([target_qs1, target_qs2], 1)
                select_targets = tf.stack([
                    tf.range(0,
                             tf.shape(self.next_states)[0]),
                    tf.cast(self.next_states[:, -1], tf.int32)
                ], 1)
                target_qs = tf.gather_nd(stacked_targets, select_targets)
                self.q_loss = 1 / 2 * tf.reduce_sum(tf.square(target_qs - qs))

                self.loss = self.policy_loss + self.q_scale_lr * self.q_loss

                adam = tf.train.AdamOptimizer(self.lr)
                self.global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ])
                self.train_op = adam.minimize(self.loss,
                                              global_step=self.global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.policy_loss_sum_t = tf.summary.scalar('policy_loss',
                                                       self.policy_loss_plh)
            self.q_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.q_loss_sum_t = tf.summary.scalar('q_loss', self.q_loss_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
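The target-selection trick above stacks the bootstrapped target and the terminal target, then picks one per transition using the done flag stored in the last feature of next_states. A NumPy sketch (mine) of that selection:

import numpy as np

rewards  = np.array([1.0, 1.0])
next_qs  = np.array([2.0, 3.0])
done     = np.array([0, 1])              # last column of next_states
discount = 0.99

stacked = np.stack([rewards + discount * next_qs, rewards], axis=1)   # [N, 2]
targets = stacked[np.arange(len(rewards)), done]                      # bootstrap if not done, else reward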
Example #6
    def build_graph(self, graph):
        self.env.seed(self.random_seed)
        np.random.seed(self.random_seed)
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            # Graph of the LSTM model of the world
            input_scope = tf.VariableScope(reuse=False, name="inputs")
            with tf.variable_scope(input_scope):
                self.state_input_plh = tf.placeholder(
                    tf.float32,
                    shape=[None, None, self.m_params['env_state_size']],
                    name='state_input_plh')
                self.action_input_plh = tf.placeholder(tf.int32,
                                                       shape=[None, None, 1],
                                                       name='action_input_plh')
                self.mask_plh = tf.placeholder(tf.float32,
                                               shape=[None, None, 1],
                                               name="mask_plh")

                input_shape = tf.shape(self.state_input_plh)
                dynamic_batch_size, dynamic_num_steps = input_shape[
                    0], input_shape[1]

                action_input = tf.one_hot(indices=tf.squeeze(
                    self.action_input_plh, 2),
                                          depth=self.m_params['nb_actions'])
                m_inputs = tf.concat([self.state_input_plh, action_input],
                                     2,
                                     name="m_inputs")

            m_scope = tf.VariableScope(reuse=False, name="m")
            with tf.variable_scope(m_scope):
                self.state_reward_preds, self.m_final_state, self.m_initial_state = capacities.predictive_model(
                    self.m_params,
                    m_inputs,
                    dynamic_batch_size,
                    None,
                    summary_collections=[self.M_SUMMARIES])

            fixed_m_scope = tf.VariableScope(reuse=False, name='FixedM')
            with tf.variable_scope(fixed_m_scope):
                self.update_m_fixed_vars_op = capacities.fix_scope(m_scope)

            m_training_scope = tf.VariableScope(reuse=False, name='m_training')
            with tf.variable_scope(m_training_scope):
                self.m_next_states = tf.placeholder(
                    tf.float32,
                    shape=[None, None, self.m_params['env_state_size']],
                    name="m_next_states")
                self.m_rewards = tf.placeholder(tf.float32,
                                                shape=[None, None, 1],
                                                name="m_rewards")
                y_true = tf.concat([self.m_rewards, self.m_next_states], 2)

                with tf.control_dependencies([self.state_reward_preds]):
                    self.m_loss = 1 / 2 * tf.reduce_mean(
                        tf.square(self.state_reward_preds - y_true) *
                        self.mask_plh)
                    tf.summary.scalar('m_loss',
                                      self.m_loss,
                                      collections=[self.M_SUMMARIES])

                m_adam = tf.train.AdamOptimizer(self.m_params['lr'])
                self.m_global_step = tf.Variable(0,
                                                 trainable=False,
                                                 name="m_global_step")
                tf.summary.scalar('m_global_step',
                                  self.m_global_step,
                                  collections=[self.M_SUMMARIES])
                self.m_train_op = m_adam.minimize(
                    self.m_loss, global_step=self.m_global_step)

            self.all_m_summary_t = tf.summary.merge_all(key=self.M_SUMMARIES)

            # Graph of the controller
            c_scope = tf.VariableScope(reuse=False, name="c")
            c_summary_collection = [self.C_SUMMARIES]
            with tf.variable_scope(c_scope):
                # c_cell = LSTMCell(
                #     num_units=self.c_params['nb_units']
                #     , initializer=tf.truncated_normal_initializer(
                #         mean=self.c_params['initial_mean']
                #         , stddev=self.c_params['initial_stddev']
                #     )
                # )
                # self.c_initial_state = c_cell.zero_state(dynamic_batch_size, dtype=tf.float32)
                # c_c_h_states, self.c_final_state = tf.nn.dynamic_rnn(c_cell, self.state_input_plh, initial_state=self.c_initial_state)
                # c_c_states, c_h_states = tf.split(value=c_c_h_states, num_or_size_splits=[self.c_params['nb_units'], self.c_params['nb_units']], axis=2)
                # # Compute the Controller projection
                # self.probs_t, self.actions_t = projection_func(c_h_states)
                m_params = self.m_params
                model_func = lambda m_inputs, m_state: capacities.predictive_model(
                    m_params, m_inputs, dynamic_batch_size, m_state)
                c_params = self.c_params
                projection_func = lambda inputs: capacities.projection(
                    c_params, inputs)
                cm_cell = CMCell(num_units=self.c_params['nb_units'],
                                 m_units=self.m_params['nb_units'],
                                 fixed_model_scope=fixed_m_scope,
                                 model_func=model_func,
                                 projection_func=projection_func,
                                 num_proj=self.c_params['nb_actions'],
                                 initializer=tf.truncated_normal_initializer(
                                     mean=self.c_params['initial_mean'],
                                     stddev=self.c_params['initial_stddev']))

                self.cm_initial_state = cm_cell.zero_state(dynamic_batch_size,
                                                           dtype=tf.float32)
                probs_and_actions_t, self.cm_final_state = tf.nn.dynamic_rnn(
                    cm_cell,
                    self.state_input_plh,
                    initial_state=self.cm_initial_state)
                self.probs_t, actions_t = tf.split(
                    value=probs_and_actions_t,
                    num_or_size_splits=[self.c_params['nb_actions'], 1],
                    axis=2)
                self.actions_t = tf.cast(actions_t, tf.int32)
                # helper tensor used for inference
                self.action_t = self.actions_t[0, 0, 0]

            c_training_scope = tf.VariableScope(reuse=False, name='c_training')
            with tf.variable_scope(c_training_scope):
                self.c_rewards_plh = tf.placeholder(tf.float32,
                                                    shape=[None, None, 1],
                                                    name="c_rewards_plh")

                baseline = tf.reduce_mean(self.c_rewards_plh)

                batch_size, num_steps = tf.shape(self.actions_t)[0], tf.shape(
                    self.actions_t)[1]
                line_indices = tf.matmul(  # Line indices
                    tf.reshape(tf.range(0, batch_size), [-1, 1]),
                    tf.ones([1, num_steps], dtype=tf.int32))
                column_indices = tf.matmul(  # Column indices
                    tf.ones([batch_size, 1], dtype=tf.int32),
                    tf.reshape(tf.range(0, num_steps), [1, -1]))
                depth_indices = tf.squeeze(self.actions_t, 2)
                stacked_actions = tf.stack(
                    [line_indices, column_indices, depth_indices], 2)

                with tf.control_dependencies([self.probs_t]):
                    log_probs = tf.expand_dims(
                        tf.log(tf.gather_nd(self.probs_t, stacked_actions)), 2)
                    masked_log_probs = log_probs * self.mask_plh
                    self.c_loss = tf.reduce_mean(-tf.reduce_sum(
                        masked_log_probs * (self.c_rewards_plh - baseline), 1))
                    tf.summary.scalar('c_loss',
                                      self.c_loss,
                                      collections=c_summary_collection)

                c_adam = tf.train.AdamOptimizer(self.c_params['lr'])
                self.c_global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ],
                    dtype=tf.int32)
                tf.summary.scalar('c_global_step',
                                  self.c_global_step,
                                  collections=c_summary_collection)
                self.c_train_op = c_adam.minimize(
                    self.c_loss, global_step=self.c_global_step)

            self.all_c_summary_t = tf.summary.merge_all(key=self.C_SUMMARIES)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")
            self.episode_id_sum = tf.summary.scalar('episode_id',
                                                    self.episode_id)
            self.time, self.inc_time_op = capacities.counter("time")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
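A minimal sketch (mine) of how the world-model inputs are assembled in the inputs scope above: the discrete action is one-hot encoded and concatenated to the environment state along the feature axis, which is what the LSTM model of the world consumes.

import numpy as np

env_state_size, nb_actions = 3, 3
state  = np.random.rand(1, 4, env_state_size)        # batch x steps x env_state_size
action = np.array([[[0], [2], [1], [0]]])            # batch x steps x 1

one_hot  = np.eye(nb_actions)[action.squeeze(2)]     # batch x steps x nb_actions
m_inputs = np.concatenate([state, one_hot], axis=2)  # batch x steps x (env_state_size + nb_actions)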
Example #7
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.N0_t = tf.constant(self.N0, tf.float32, name='N_0')
            self.N = tf.Variable(0.,
                                 dtype=tf.float32,
                                 name='N',
                                 trainable=False)
            self.min_eps_t = tf.constant(self.min_eps,
                                         tf.float32,
                                         name='min_eps')

            self.inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name='inputs')

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.q_values = tf.squeeze(
                    capacities.value_f(self.q_params, self.inputs))

            self.action_t = capacities.eps_greedy(self.inputs, self.q_values,
                                                  self.env.action_space.n,
                                                  self.N0, self.min_eps)
            self.q_t = self.q_values[self.action_t]

            with tf.variable_scope('Training'):
                self.reward = tf.placeholder(tf.float32,
                                             shape=[],
                                             name="reward")
                self.next_state = tf.placeholder(
                    tf.float32,
                    shape=[1, self.observation_space.shape[0] + 1],
                    name="nextState")
                self.next_action = tf.placeholder(tf.int32,
                                                  shape=[],
                                                  name="nextAction")

                with tf.variable_scope(q_scope, reuse=True):
                    next_q_values = tf.squeeze(
                        capacities.value_f(self.q_params, self.next_state))
                target_q1 = tf.stop_gradient(self.reward + self.discount *
                                             next_q_values[self.next_action])
                target_q2 = self.reward
                is_done = tf.cast(self.next_state[0, 4], tf.bool)
                target_q = tf.where(is_done, target_q2, target_q1)
                with tf.control_dependencies([target_q]):
                    self.loss = 1 / 2 * tf.square(target_q - self.q_t)

                adam = tf.train.AdamOptimizer(self.lr)
                self.global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ])
                self.train_op = adam.minimize(self.loss,
                                              global_step=self.global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
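capacities.eps_greedy is defined elsewhere in the repository and not shown; a common schedule consistent with the N0 and min_eps arguments it receives would look roughly like the sketch below. This is an assumption about the helper, not its actual code.

import numpy as np

def eps_greedy_action(q_values, n_visits, N0, min_eps, rng=np.random):
    eps = max(min_eps, N0 / (N0 + n_visits))   # exploration rate decays with visits
    if rng.rand() < eps:
        return rng.randint(len(q_values))      # explore
    return int(np.argmax(q_values))            # exploit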
Example #8
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name='inputs')

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.q_values = tf.squeeze(
                    capacities.value_f(self.q_params, self.inputs))

            self.action_t = capacities.eps_greedy(self.inputs, self.q_values,
                                                  self.env.action_space.n,
                                                  self.N0, self.min_eps)
            self.q_t = self.q_values[self.action_t]

            fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
            with tf.variable_scope(fixed_q_scope):
                self.update_fixed_vars_op = capacities.fix_scope(q_scope)

            with tf.variable_scope('ExperienceReplay'):
                self.er_inputs = tf.placeholder(
                    tf.float32,
                    shape=[None, self.observation_space.shape[0] + 1],
                    name="ERInputs")
                self.er_actions = tf.placeholder(tf.int32,
                                                 shape=[None],
                                                 name="ERActions")
                self.er_rewards = tf.placeholder(tf.float32,
                                                 shape=[None],
                                                 name="ERReward")
                self.er_next_states = tf.placeholder(
                    tf.float32,
                    shape=[None, self.observation_space.shape[0] + 1],
                    name="ERNextState")

                with tf.variable_scope(q_scope, reuse=True):
                    er_q_values = capacities.value_f(self.q_params,
                                                     self.er_inputs)
                er_stacked_actions = tf.stack([
                    tf.range(0,
                             tf.shape(self.er_actions)[0]), self.er_actions
                ], 1)
                er_qs = tf.gather_nd(er_q_values, er_stacked_actions)

                with tf.variable_scope(fixed_q_scope, reuse=True):
                    er_fixed_next_q_values = capacities.value_f(
                        self.q_params, self.er_next_states)
                with tf.variable_scope(q_scope, reuse=True):
                    er_next_q_values = capacities.value_f(
                        self.q_params, self.er_next_states)
                er_next_max_action_t = tf.cast(tf.argmax(er_next_q_values, 1),
                                               tf.int32)
                er_next_stacked_actions = tf.stack([
                    tf.range(0,
                             tf.shape(self.er_next_states)[0]),
                    er_next_max_action_t
                ], 1)
                er_next_qs = tf.gather_nd(er_fixed_next_q_values,
                                          er_next_stacked_actions)

                er_target_qs1 = tf.stop_gradient(self.er_rewards +
                                                 self.discount * er_next_qs)
                er_target_qs2 = self.er_rewards
                er_stacked_targets = tf.stack([er_target_qs1, er_target_qs2],
                                              1)
                select_targets = tf.stack([
                    tf.range(0,
                             tf.shape(self.er_next_states)[0]),
                    tf.cast(self.er_next_states[:, -1], tf.int32)
                ], 1)
                er_target_qs = tf.gather_nd(er_stacked_targets, select_targets)

                self.er_loss = 1 / 2 * tf.reduce_sum(
                    tf.square(er_target_qs - er_qs))
                er_adam = tf.train.AdamOptimizer(self.lr)
                self.global_step = tf.Variable(
                    0,
                    trainable=False,
                    name="global_step",
                    collections=[
                        tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                    ])
                self.er_train_op = er_adam.minimize(
                    self.er_loss, global_step=self.global_step)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")
            self.timestep, self.inc_timestep_op = capacities.counter(
                "timestep")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
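A NumPy sketch (mine) of the target built in the ExperienceReplay block above: the next action is selected with the online network (er_next_q_values) but evaluated with the frozen copy kept in sync by update_fixed_vars_op (er_fixed_next_q_values), and terminal transitions fall back to the raw reward.

import numpy as np

online_next_q = np.array([[1.0, 2.0], [0.5, 0.1]])
fixed_next_q  = np.array([[0.9, 1.8], [0.4, 0.2]])
rewards, discount = np.array([1.0, 1.0]), 0.99
done = np.array([0, 1])                          # last column of er_next_states

best_next = np.argmax(online_next_q, axis=1)                    # select with online net
next_qs   = fixed_next_q[np.arange(len(best_next)), best_next]  # evaluate with fixed net
targets   = np.where(done == 1, rewards, rewards + discount * next_qs)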
Example #9
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs_plh = tf.placeholder(tf.int32,
                                             shape=[None],
                                             name="inputs_plh")

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.Qs = tf.get_variable(
                    'Qs',
                    shape=[self.nb_state, self.action_space.n],
                    initializer=tf.constant_initializer(self.initial_q_value),
                    dtype=tf.float32)
                tf.summary.histogram('Qarray', self.Qs)
                self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

            fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
            with tf.variable_scope(fixed_q_scope):
                self.update_fixed_vars_op = capacities.fix_scope(q_scope)

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                if 'UCB' in self.config and self.config['UCB']:
                    self.actions_t, self.probs_t = capacities.tabular_UCB(
                        self.Qs, self.inputs_plh)
                else:
                    self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                        self.inputs_plh, self.q_preds_t, self.nb_state,
                        self.env.action_space.n, self.N0, self.min_eps)
                self.action_t = self.actions_t[0]
                self.q_value_t = self.q_preds_t[0][self.action_t]

            # Experience replay part
            with tf.variable_scope('Learning'):
                with tf.variable_scope(fixed_q_scope, reuse=True):
                    fixed_Qs = tf.get_variable('Qs')

                self.rewards_plh = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="rewards_plh")
                self.next_states_plh = tf.placeholder(tf.int32,
                                                      shape=[None],
                                                      name="next_states_plh")

                # Note that we use the fixed Qs to create the targets
                self.targets_t = capacities.get_q_learning_target(
                    fixed_Qs, self.rewards_plh, self.next_states_plh,
                    self.discount)
                self.loss, self.train_op = capacities.tabular_learning_with_lr(
                    self.lr, self.lr_decay_steps, self.Qs, self.inputs_plh,
                    self.actions_t, self.targets_t)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")
            self.event_count, self.inc_event_count_op = capacities.counter(
                "event_count")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
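capacities.get_q_learning_target is not shown here; since it is fed the frozen table, it presumably computes the standard Q-learning target from it, roughly as in this sketch (assumption mine):

import numpy as np

fixed_Qs = np.array([[0.0, 1.0],
                     [2.0, 0.5]])
rewards, next_states, discount = np.array([1.0]), np.array([1]), 0.99

targets = rewards + discount * fixed_Qs[next_states].max(axis=1)   # r + gamma * max_a Q_fixed[s', a]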
Example #10
    def build_graph(self, graph):
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs_plh = tf.placeholder(tf.int32,
                                             shape=[None],
                                             name="inputs_plh")

            q_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(q_scope):
                self.Qs = tf.get_variable(
                    'Qs',
                    shape=[self.nb_state, self.action_space.n],
                    initializer=tf.constant_initializer(self.initial_q_value),
                    dtype=tf.float32)
                tf.summary.histogram('Qarray', self.Qs)
                self.q_preds_t = tf.gather(self.Qs, self.inputs_plh)

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                if 'UCB' in self.config and self.config['UCB']:
                    self.actions_t, self.probs_t = capacities.tabular_UCB(
                        self.Qs, self.inputs_plh)
                else:
                    self.actions_t, self.probs_t = capacities.tabular_eps_greedy(
                        self.inputs_plh, self.q_preds_t, self.nb_state,
                        self.env.action_space.n, self.N0, self.min_eps)
                self.action_t = self.actions_t[0]
                self.q_value_t = self.q_preds_t[0][self.action_t]

            et_scope = tf.VariableScope(reuse=False, name='EligibilityTraces')
            with tf.variable_scope(et_scope):
                et, update_et_op, self.reset_et_op = capacities.eligibility_traces(
                    self.Qs, self.inputs_plh, self.actions_t, self.discount,
                    self.lambda_value)

            with tf.variable_scope('Learning'):
                self.rewards_plh = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="rewards_plh")
                self.next_states_plh = tf.placeholder(tf.int32,
                                                      shape=[None],
                                                      name="next_states_plh")

                self.targets_t = capacities.get_q_learning_target(
                    self.Qs, self.rewards_plh, self.next_states_plh,
                    self.discount)
                target = self.targets_t[0]
                state_action_pairs = tf.stack(
                    [self.inputs_plh, self.actions_t], 1)
                estimate = tf.gather_nd(self.Qs, state_action_pairs)[0]
                err_estimate = target - estimate

                global_step = tf.Variable(0,
                                          trainable=False,
                                          name="global_step",
                                          collections=[
                                              tf.GraphKeys.GLOBAL_STEP,
                                              tf.GraphKeys.GLOBAL_VARIABLES
                                          ])
                lr = tf.train.exponential_decay(tf.constant(self.lr,
                                                            dtype=tf.float32),
                                                global_step,
                                                self.lr_decay_steps,
                                                0.5,
                                                staircase=True)
                tf.summary.scalar('lr', lr)
                inc_global_step = global_step.assign_add(1)
                with tf.control_dependencies([update_et_op, inc_global_step]):
                    self.loss = tf.reduce_sum(err_estimate * et)
                    self.train_op = tf.assign_add(self.Qs,
                                                  lr * err_estimate * et)

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('score', self.score_plh)
            self.loss_plh = tf.placeholder(tf.float32, shape=[])
            self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.episode_id, self.inc_ep_id_op = capacities.counter(
                "episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score',
                                                  self.pscore_plh)

        return graph
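capacities.eligibility_traces is not shown; a typical accumulating-trace update consistent with the assign_add applied to the whole Q table above would look roughly like this sketch (assumptions mine):

import numpy as np

nb_state, nb_action = 3, 2
Qs = np.zeros((nb_state, nb_action))
et = np.zeros_like(Qs)                       # one trace per state-action pair
discount, lambda_value, lr = 0.99, 0.9, 0.1

s, a, r, s_next = 0, 1, 1.0, 2
et *= discount * lambda_value                # decay all traces
et[s, a] += 1.0                              # accumulate the visited pair
td_error = r + discount * Qs[s_next].max() - Qs[s, a]
Qs += lr * td_error * et                     # same shape of update as the assign_add above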
Example #11
    def build_graph(self, graph):
        np.random.seed(self.random_seed)
        with graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.inputs = tf.placeholder(tf.float32, shape=[None, None, self.policy_params['nb_inputs']], name='inputs')
            input_shape = tf.shape(self.inputs)
            dynamic_batch_size, dynamic_num_steps = input_shape[0], input_shape[1]
            inputs_mat = tf.reshape(self.inputs, [-1, self.policy_params['nb_inputs']])

            policy_scope = tf.VariableScope(reuse=False, name='Policy')
            with tf.variable_scope(policy_scope):
                probs, actions = capacities.policy(self.policy_params, inputs_mat)
                self.probs = tf.reshape(probs, [dynamic_batch_size, dynamic_num_steps, self.policy_params['nb_outputs']])
                self.actions = tf.reshape(actions, [dynamic_batch_size, dynamic_num_steps, 1])
            self.action_t = self.actions[0, 0, 0]

            critic_scope = tf.VariableScope(reuse=False, name='QValues')
            with tf.variable_scope(critic_scope):
                critic_values_mat = capacities.value_f(self.critic_params, inputs_mat)
                self.critic_values = tf.reshape(critic_values_mat, [dynamic_batch_size, dynamic_num_steps, self.critic_params['nb_outputs']])

            fixed_critic_scope = tf.VariableScope(reuse=False, name='FixedQValues')
            with tf.variable_scope(fixed_critic_scope):
                self.update_fixed_vars_op = capacities.fix_scope(critic_scope)

            with tf.variable_scope('Training'):
                self.expected_rewards = tf.placeholder(tf.float32, shape=[None, None, 1], name="reward")
                self.mask_plh = tf.placeholder(tf.float32, shape=[None, None, 1], name="mask_plh")

                batch_size, num_steps = tf.shape(self.actions)[0], tf.shape(self.actions)[1]
                line_indices = tf.matmul( # Line indices
                    tf.reshape(tf.range(0, batch_size), [-1, 1])
                    , tf.ones([1, num_steps], dtype=tf.int32)
                )
                column_indices = tf.matmul( # Column indices
                    tf.ones([batch_size, 1], dtype=tf.int32)
                    , tf.reshape(tf.range(0, num_steps), [1, -1])
                )
                depth_indices = tf.cast(tf.squeeze(self.actions, 2), tf.int32)
                stacked_actions = tf.stack(
                    [line_indices, column_indices, depth_indices], 2
                )
                
                log_probs = tf.expand_dims(tf.log(tf.gather_nd(self.probs, stacked_actions)), 2)
                self.policy_loss = tf.reduce_mean( - tf.reduce_sum((log_probs * (self.expected_rewards - tf.stop_gradient(self.critic_values))) * self.mask_plh, 1))

                adam = tf.train.AdamOptimizer(self.lr)
                self.train_policy_op = adam.minimize(self.policy_loss)

                self.rewards = tf.placeholder(tf.float32, shape=[None, None, 1], name="reward")
                self.next_states = tf.placeholder(tf.float32, shape=[None, None, self.critic_params['nb_inputs']], name="next_states")
                with tf.variable_scope(fixed_critic_scope, reuse=True):
                    next_states_mat = tf.reshape(self.next_states, [-1, self.critic_params['nb_inputs']])
                    next_critic_values_mat = capacities.value_f(self.critic_params, next_states_mat)
                    next_critic_values = tf.reshape(next_critic_values_mat, [dynamic_batch_size, dynamic_num_steps, self.critic_params['nb_outputs']])

                target_critics1 = tf.stop_gradient(self.rewards + self.discount * next_critic_values)
                target_critics2 = self.rewards
                stacked_targets = tf.stack([tf.squeeze(target_critics1, 2), tf.squeeze(target_critics2, 2)], 2)

                batch_size, num_steps = tf.shape(self.next_states)[0], tf.shape(self.next_states)[1]
                line_indices = tf.matmul( # Line indices
                    tf.reshape(tf.range(0, batch_size), [-1, 1])
                    , tf.ones([1, num_steps], dtype=tf.int32)
                )
                column_indices = tf.matmul( # Column indices
                    tf.ones([batch_size, 1], dtype=tf.int32)
                    , tf.reshape(tf.range(0, num_steps), [1, -1])
                )
                depth_indices = tf.cast(self.next_states[:, :, -1], tf.int32)
                select_targets = tf.stack(
                    [line_indices, column_indices, depth_indices], 2
                )

                target_critics = tf.expand_dims(tf.gather_nd(stacked_targets, select_targets), 2)
                self.critic_loss = 1/2 * tf.reduce_sum(tf.square(target_critics - self.critic_values) * self.mask_plh)

                adam = tf.train.AdamOptimizer(self.critic_lr)
                self.global_step = tf.Variable(0, trainable=False, name="global_step", collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
                self.train_critic_op = adam.minimize(self.critic_loss, global_step=self.global_step)

            self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.policy_loss_sum_t = tf.summary.scalar('policy_loss', self.policy_loss_plh)
            self.critic_loss_plh = tf.placeholder(tf.float32, shape=[])
            self.critic_loss_sum_t = tf.summary.scalar('critic_loss', self.critic_loss_plh)
            # self.loss_plh = tf.placeholder(tf.float32, shape=[])
            # self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
            self.all_summary_t = tf.summary.merge_all()

            self.score_plh = tf.placeholder(tf.float32, shape=[])
            self.score_sum_t = tf.summary.scalar('av_score', self.score_plh)

            self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

            # Playing part
            self.pscore_plh = tf.placeholder(tf.float32, shape=[])
            self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

        return graph
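A minimal sketch (mine) of the masked advantage loss built in the Training block above: the critic value serves as a baseline whose gradient is blocked, and padded timesteps are zeroed by the mask before summing over the sequence and averaging over the batch.

import numpy as np

log_probs        = np.log(np.full((2, 3, 1), 0.5))   # batch x steps x 1
expected_rewards = np.ones((2, 3, 1))
critic_values    = np.full((2, 3, 1), 0.4)
mask             = np.array([[[1.], [1.], [0.]],     # first sequence is one step shorter
                             [[1.], [1.], [1.]]])

advantage   = expected_rewards - critic_values        # critic treated as a constant here
policy_loss = np.mean(-np.sum(log_probs * advantage * mask, axis=1))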