Example #1
    def build_graph(self):
        # Build graph
        sg_global_step = graph.GlobalStep()
        sg_network = Network()
        self.actor = sg_network.actor
        self.critic = sg_network.critic

        if da3c_config.config.optimizer == 'Adam':
            sg_actor_optimizer = optimizer.AdamOptimizer(
                da3c_config.config.initial_learning_rate)
            sg_critic_optimizer = optimizer.AdamOptimizer(
                da3c_config.config.initial_learning_rate)
        else:
            sg_learning_rate = da3c_graph.LearningRate(
                sg_global_step, da3c_config.config.initial_learning_rate)
            sg_actor_optimizer = optimizer.RMSPropOptimizer(
                learning_rate=sg_learning_rate,
                decay=da3c_config.config.RMSProp.decay,
                momentum=0.0,
                epsilon=da3c_config.config.RMSProp.epsilon)
            sg_critic_optimizer = optimizer.RMSPropOptimizer(
                learning_rate=sg_learning_rate,
                decay=da3c_config.config.RMSProp.decay,
                momentum=0.0,
                epsilon=da3c_config.config.RMSProp.epsilon)
        sg_actor_gradients = optimizer.Gradients(self.actor.weights,
                                                 optimizer=sg_actor_optimizer)
        sg_critic_gradients = optimizer.Gradients(
            self.critic.weights, optimizer=sg_critic_optimizer)

        if da3c_config.config.use_icm:
            sg_icm_optimizer = optimizer.AdamOptimizer(
                da3c_config.config.icm.lr)
            sg_icm_weights = icm_model.ICM().weights
            sg_icm_gradients = optimizer.Gradients(sg_icm_weights,
                                                   optimizer=sg_icm_optimizer)

            # Expose ICM public API
            self.op_icm_get_weights = self.Op(sg_icm_weights)
            self.op_icm_apply_gradients = self.Op(
                sg_icm_gradients.apply,
                gradients=sg_icm_gradients.ph_gradients)

        sg_initialize = graph.Initialize()

        # Expose public API
        self.op_n_step = self.Op(sg_global_step.n)
        self.op_check_weights = self.Ops(self.actor.weights.check,
                                         self.critic.weights.check)
        self.op_get_weights = self.Ops(self.actor.weights, self.critic.weights)
        self.op_apply_gradients = self.Ops(
            sg_actor_gradients.apply,
            sg_critic_gradients.apply,
            sg_global_step.increment,
            gradients=(sg_actor_gradients.ph_gradients,
                       sg_critic_gradients.ph_gradients),
            increment=sg_global_step.ph_increment)
        self.op_initialize = self.Op(sg_initialize)
Example #2
    def build_graph(self):
        # Build graph
        sg_network = Network()

        sg_loss = loss.DA3CLoss(sg_network.actor, sg_network.critic, da3c_config.config)
        sg_gradients = optimizer.Gradients(sg_network.weights, loss=sg_loss,
                                           norm=da3c_config.config.gradients_norm_clipping)

        if da3c_config.config.use_icm:
            sg_icm_network = icm_model.ICM()
            sg_icm_gradients = optimizer.Gradients(sg_icm_network.weights, loss=sg_icm_network.loss)

            # Expose ICM public API
            self.op_icm_assign_weights = self.Op(sg_icm_network.weights.assign,
                                                 weights=sg_icm_network.weights.ph_weights)

            feeds = dict(state=sg_icm_network.ph_state, probs=sg_icm_network.ph_probs)
            self.op_get_intrinsic_reward = self.Ops(sg_icm_network.rew_out, **feeds)

            feeds.update(dict(action=sg_icm_network.ph_taken))
            self.op_compute_icm_gradients = self.Op(sg_icm_gradients.calculate, **feeds)

        batch_size = tf.to_float(tf.shape(sg_network.ph_state.node)[0])

        summaries = tf.summary.merge([
            tf.summary.scalar('policy_loss', sg_loss.policy_loss / batch_size),
            tf.summary.scalar('value_loss', sg_loss.value_loss / batch_size),
            tf.summary.scalar('entropy', sg_loss.entropy / batch_size),
            tf.summary.scalar('gradients_global_norm', sg_gradients.global_norm),
            tf.summary.scalar('weights_global_norm', sg_network.weights.global_norm)])

        # Expose public API
        self.op_assign_weights = self.Op(sg_network.weights.assign, weights=sg_network.weights.ph_weights)

        feeds = dict(state=sg_network.ph_state, action=sg_loss.ph_action,
                     advantage=sg_loss.ph_advantage, discounted_reward=sg_loss.ph_discounted_reward)

        if da3c_config.config.use_lstm:
            feeds.update(dict(lstm_state=sg_network.ph_lstm_state))
            self.lstm_zero_state = sg_network.lstm_zero_state
            self.op_get_action_value_and_lstm_state = self.Ops(sg_network.actor, sg_network.critic,
                                                               sg_network.lstm_state,
                                                               state=sg_network.ph_state,
                                                               lstm_state=sg_network.ph_lstm_state)
        else:
            self.op_get_action_and_value = self.Ops(sg_network.actor, sg_network.critic,
                                                    state=sg_network.ph_state)

        self.op_compute_gradients_and_summaries = self.Ops(sg_gradients.calculate, summaries, **feeds)
Example #3
    def build_graph(self):
        sg_weights = _ManagerNetwork().weights

        sg_global_step = graph.GlobalStep()
        # self.learning_rate_input = graph.Placeholder(np.float32, shape=(1,), name="manager_lr")
        # tf.placeholder(tf.float32, [], name="manager_lr")
        sg_learning_rate = fun_graph.LearningRate(sg_global_step)

        sg_optimizer = optimizer.RMSPropOptimizer(
            learning_rate=sg_learning_rate,
            decay=cfg.RMSProp.decay,
            momentum=0.0,
            epsilon=cfg.RMSProp.epsilon)

        sg_gradients = optimizer.Gradients(sg_weights, optimizer=sg_optimizer)
        sg_initialize = graph.Initialize()

        # Expose public API
        self.op_n_step = self.Op(sg_global_step.n)
        self.op_get_weights = self.Op(sg_weights)
        self.op_apply_gradients = self.Ops(
            sg_gradients.apply,
            sg_global_step.increment,
            gradients=sg_gradients.ph_gradients,
            increment=sg_global_step.ph_increment)
        self.op_initialize = self.Op(sg_initialize)
Example #4
    def build_graph(self):
        sg_network = Network()

        sg_get_weights_flatten = GetVariablesFlatten(sg_network.weights)
        sg_set_weights_flatten = SetVariablesFlatten(sg_network.weights)

        ph_adv_n = graph.TfNode(tf.placeholder(tf.float32, name='adv_n'))

        sg_probtype = ProbType(trpo_config.config.output.action_size)

        ph_oldprob_np = sg_probtype.ProbVariable()

        sg_logp_n = sg_probtype.Loglikelihood(sg_network.actor)
        sg_oldlogp_n = sg_probtype.Loglikelihood(ph_oldprob_np)

        sg_surr = graph.TfNode(-tf.reduce_mean(tf.exp(sg_logp_n.node - sg_oldlogp_n.node) * ph_adv_n.node))

        sg_sum = tf.reduce_sum(sg_probtype.Kl(graph.TfNode(tf.stop_gradient(sg_network.actor.node)),
                                              sg_network.actor).node)
        sg_factor = tf.cast(tf.shape(sg_network.ph_state.node)[0], tf.float32)
        sg_kl_first_fixed = graph.TfNode(sg_sum / sg_factor)

        sg_kl = graph.TfNode(tf.reduce_mean(sg_probtype.Kl(ph_oldprob_np, sg_network.actor).node))

        sg_fvp = FisherVectorProduct(sg_kl_first_fixed, sg_network.weights)
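        # The Fisher-vector product is taken against the gradient-stopped ("first fixed") KL term;
        # TRPO's conjugate-gradient step typically consumes this product to approximate F^-1 g
        # (general TRPO practice, not spelled out in this snippet).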

        sg_ent = graph.TfNode(tf.reduce_mean(sg_probtype.Entropy(sg_network.actor).node))

        sg_gradients = optimizer.Gradients(sg_network.weights, loss=sg_surr)
        sg_gradients_flatten = GetVariablesFlatten(sg_gradients.calculate)

        self.op_get_weights = self.Op(sg_network.weights)
        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        self.op_compute_gradient = self.Op(sg_gradients_flatten, state=sg_network.ph_state,
                                           sampled_variable=sg_probtype.ph_sampled_variable, adv_n=ph_adv_n,
                                           oldprob_np=ph_oldprob_np)

        self.op_losses = self.Ops(sg_surr, sg_kl, sg_ent, state=sg_network.ph_state,
                                  sampled_variable=sg_probtype.ph_sampled_variable, adv_n=ph_adv_n,
                                  prob_variable=ph_oldprob_np)

        self.op_fisher_vector_product = self.Op(sg_fvp, tangent=sg_fvp.ph_tangent, state=sg_network.ph_state,
                                                sampled_variable=sg_probtype.ph_sampled_variable,
                                                adv_n=ph_adv_n, prob_variable=ph_oldprob_np)

        # PPO clipped surrogate loss
        # likelihood ratio of old and new policy
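        # i.e. the standard clipped surrogate assembled just below:
        #   L_clip(theta) = -E[min(r_theta * A, clip(r_theta, 1 - clip_e, 1 + clip_e) * A)]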
        r_theta = tf.exp(sg_logp_n.node - sg_oldlogp_n.node)
        surr = r_theta * ph_adv_n.node
        clip_e = trpo_config.config.PPO.clip_e
        surr_clipped = tf.clip_by_value(r_theta, 1.0 - clip_e, 1.0 + clip_e) * ph_adv_n.node
        sg_ppo_loss = graph.TfNode(-tf.reduce_mean(tf.minimum(surr, surr_clipped)))

        sg_minimize = graph.TfNode(tf.train.AdamOptimizer(
                learning_rate=trpo_config.config.PPO.learning_rate).minimize(sg_ppo_loss.node))
        self.op_ppo_optimize = self.Op(sg_minimize, state=sg_network.ph_state,
                                       sampled_variable=sg_probtype.ph_sampled_variable, adv_n=ph_adv_n,
                                       oldprob_np=ph_oldprob_np)
Example #5
    def build_graph(self):
        sg_global_step = graph.GlobalStep()
        sg_network = Network()

        sg_get_weights_flatten = graph.GetVariablesFlatten(sg_network.weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(sg_network.weights)

        if config.use_linear_schedule:
            sg_learning_rate = lr_schedule.Linear(sg_global_step, config)
        else:
            sg_learning_rate = config.initial_learning_rate

        if config.optimizer == 'Adam':
            sg_optimizer = optimizer.AdamOptimizer(sg_learning_rate)
        elif config.optimizer == 'RMSProp':
            sg_optimizer = optimizer.RMSPropOptimizer(
                learning_rate=sg_learning_rate,
                decay=config.RMSProp.decay,
                epsilon=config.RMSProp.epsilon)
        else:
            assert False, 'There are 2 valid options for optimizers: Adam | RMSProp'

        sg_gradients_apply = optimizer.Gradients(sg_network.weights,
                                                 optimizer=sg_optimizer)

        sg_average_reward = graph.LinearMovingAverage(
            config.avg_in_num_batches)
        sg_initialize = graph.Initialize()

        # Expose public API
        self.op_n_step = self.Op(sg_global_step.n)
        self.op_score = self.Op(sg_average_reward.average)

        self.op_get_weights_signed = self.Ops(sg_network.weights,
                                              sg_global_step.n)
        self.op_assign_weights = self.Op(sg_network.weights.assign,
                                         weights=sg_network.weights.ph_weights)

        self.op_apply_gradients = self.Ops(
            sg_gradients_apply.apply,
            sg_global_step.increment,
            gradients=sg_gradients_apply.ph_gradients,
            increment=sg_global_step.ph_increment)
        self.op_add_rewards_to_model_score_routine = self.Ops(
            sg_average_reward.add,
            reward_sum=sg_average_reward.ph_sum,
            reward_weight=sg_average_reward.ph_count)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(
            sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        # Gradient combining routines
        self.op_submit_gradients = self.Call(
            graph.get_gradients_apply_routine(config))

        self.op_initialize = self.Op(sg_initialize)
Example #6
    def build_graph(self):
        self.sg_network = _WorkerNetwork()

        sg_loss = fun_graph.A3CLoss(self.sg_network.pi,
                                    self.sg_network.vi,
                                    entropy=False)
        sg_gradients = optimizer.Gradients(self.sg_network.weights,
                                           loss=sg_loss)

        # Expose public API
        self.op_assign_weights = self.Op(
            self.sg_network.weights.assign,
            weights=self.sg_network.weights.ph_weights)
        self.op_compute_gradients = \
            self.Op(sg_gradients.calculate,
                    ph_state=self.sg_network.ph_state,
                    ph_goal=self.sg_network.ph_goal,
                    ph_action=sg_loss.ph_action,
                    ph_value=sg_loss.ph_value,
                    ph_discounted_reward=sg_loss.ph_discounted_reward,
                    ph_initial_lstm_state=self.sg_network.ph_initial_lstm_state,
                    ph_step_size=self.sg_network.ph_step_size)

        self.op_reset_lstm_state = self.Op(
            self.sg_network.lstm_state_out.assign_from_value)
        self.op_assign_lstm_state = self.Op(
            self.sg_network.lstm_state_out.assign_from_ph,
            ph_variable=self.sg_network.lstm_state)
        self.op_get_lstm_state = self.sg_network.lstm_state_out.node

        # without lstm state freezes
        self.op_get_zt = self.Op(self.sg_network.perception,
                                 ph_state=self.sg_network.ph_state)
        self.op_get_action_and_value = self.Ops(
            self.sg_network.pi,
            self.sg_network.vi,
            self.sg_network.lstm_state,
            ph_state=self.sg_network.ph_state,
            ph_goal=self.sg_network.ph_goal,
            ph_initial_lstm_state=self.sg_network.ph_initial_lstm_state,
            ph_step_size=self.sg_network.ph_step_size)
        self.op_get_action = self.Ops(  # use for exploitation
            self.sg_network.pi,
            self.sg_network.lstm_state,
            ph_state=self.sg_network.ph_state,
            ph_goal=self.sg_network.ph_goal,
            ph_initial_lstm_state=self.sg_network.ph_initial_lstm_state,
            ph_step_size=self.sg_network.ph_step_size)

        # with lstm state freezes
        self.op_get_value_zt = self.Ops(
            self.sg_network.perception,
            self.sg_network.vi,
            self.sg_network.lstm_state,
            ph_state=self.sg_network.ph_state,
            ph_initial_lstm_state=self.sg_network.ph_initial_lstm_state,
            ph_step_size=self.sg_network.ph_step_size)
Example #7
    def build_graph(self):
        self.sg_network = _ManagerNetwork()

        sg_loss = fun_graph.CosineLoss(self.sg_network.goal,
                                       self.sg_network.value)
        sg_gradients = optimizer.Gradients(self.sg_network.weights,
                                           loss=sg_loss)

        # Expose public API
        self.op_assign_weights = self.Op(
            self.sg_network.weights.assign,
            ph_weights=self.sg_network.weights.ph_weights)
        self.op_compute_gradients =\
            self.Op(sg_gradients.calculate,
                    ph_perception=self.sg_network.ph_perception,
                    ph_stc_diff_st=sg_loss.ph_stc_diff_st,
                    ph_discounted_reward=sg_loss.ph_discounted_reward,
                    ph_initial_lstm_state=self.sg_network.ph_initial_lstm_state,
                    ph_step_size=self.sg_network.ph_step_size)

        self.op_reset_lstm_state = self.Op(
            self.sg_network.lstm_state_out.assign_from_value)
        self.op_assign_lstm_state = self.Op(
            self.sg_network.lstm_state_out.assign_from_ph,
            ph_variable=self.sg_network.lstm_state)
        self.op_get_lstm_state = self.sg_network.lstm_state_out.node

        # without lstm state freezes
        self.op_get_goal_value_st = self.Ops(
            self.sg_network.goal,
            self.sg_network.value,
            self.sg_network.Mspace,
            self.sg_network.lstm_state,
            ph_perception=self.sg_network.ph_perception,
            ph_initial_lstm_state=self.sg_network.ph_initial_lstm_state,
            ph_step_size=self.sg_network.ph_step_size)
        self.op_get_st = self.Op(self.sg_network.Mspace,
                                 ph_perception=self.sg_network.ph_perception)

        # with lstm state freezes
        self.op_get_goal_st = self.Ops(
            self.sg_network.goal,
            self.sg_network.Mspace,
            self.sg_network.lstm_state,
            ph_perception=self.sg_network.ph_perception,
            ph_initial_lstm_state=self.sg_network.ph_initial_lstm_state,
            ph_step_size=self.sg_network.ph_step_size)
        self.op_get_value = self.Ops(
            self.sg_network.value,
            self.sg_network.lstm_state,
            ph_perception=self.sg_network.ph_perception,
            ph_initial_lstm_state=self.sg_network.ph_initial_lstm_state,
            ph_step_size=self.sg_network.ph_step_size)
Example #8
    def build_graph(self, weights):
        # Build graph
        sg_global_step = graph.GlobalStep()
        sg_update_step = graph.GlobalStep()
        sg_weights = weights

        if dppo_config.config.use_linear_schedule:
            if dppo_config.config.schedule_step == 'update':
                sg_schedule_step = sg_update_step
            elif dppo_config.config.schedule_step == 'environment':
                sg_schedule_step = sg_global_step
            else:
                assert False, 'Valid options for the schedule step are: update OR environment. ' \
                              'You provided the following option: {}'.format(dppo_config.config.schedule_step)
            sg_learning_rate = lr_schedule.Linear(sg_schedule_step, dppo_config.config)
        else:
            sg_learning_rate = dppo_config.config.initial_learning_rate

        sg_optimizer = optimizer.AdamOptimizer(sg_learning_rate, epsilon=dppo_config.config.optimizer.epsilon)
        sg_gradients = optimizer.Gradients(sg_weights, optimizer=sg_optimizer)
        sg_average_reward = graph.LinearMovingAverage(dppo_config.config.avg_in_num_batches)
        sg_initialize = graph.Initialize()

        # Weights get/set for updating the policy
        sg_get_weights_flatten = graph.GetVariablesFlatten(sg_weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(sg_weights)

        # Expose public API
        self.op_n_step = self.Op(sg_global_step.n)
        self.op_upd_step = self.Op(sg_update_step.n)
        self.op_score = self.Op(sg_average_reward.average)

        self.op_inc_global_step = self.Ops(sg_global_step.increment, increment=sg_global_step.ph_increment)
        self.op_inc_global_step_and_average_reward = self.Ops(sg_global_step.increment,
                                                              sg_average_reward.add,
                                                              increment=sg_global_step.ph_increment,
                                                              reward_sum=sg_average_reward.ph_sum,
                                                              reward_weight=sg_average_reward.ph_count)

        self.op_get_weights = self.Op(sg_weights)
        self.op_get_weights_signed = self.Ops(sg_weights, sg_update_step.n)

        self.op_apply_gradients = self.Ops(sg_gradients.apply, sg_update_step.increment,
                                           gradients=sg_gradients.ph_gradients,
                                           increment=sg_update_step.ph_increment)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        # Gradient combining routines
        self.op_submit_gradients = self.Call(graph.get_gradients_apply_routine(dppo_config.config))

        self.op_initialize = self.Op(sg_initialize)
Example #9
    def build_graph(self):
        input = layer.ConfiguredInput(trpo_config.config.input)
        # add one extra feature for timestep
        ph_step = graph.Placeholder(np.float32, shape=[None, 1])
        state = (input.ph_state, ph_step)

        concatenated = graph.Concat([layer.Flatten(input), ph_step], axis=1)

        activation = layer.Activation.get_activation(
            trpo_config.config.activation)
        head = layer.GenericLayers(concatenated, [
            dict(type=layer.Dense, size=size, activation=activation)
            for size in trpo_config.config.hidden_sizes
        ])
        value = layer.Dense(head, 1)

        ph_ytarg_ny = graph.Placeholder(np.float32)
        mse = graph.TfNode(
            tf.reduce_mean(tf.square(ph_ytarg_ny.node - value.node)))

        weights = layer.Weights(input, head, value)

        sg_get_weights_flatten = graph.GetVariablesFlatten(weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(weights)

        l2 = graph.TfNode(1e-3 * tf.add_n([
            tf.reduce_sum(tf.square(v))
            for v in utils.Utils.flatten(weights.node)
        ]))
        loss = graph.TfNode(l2.node + mse.node)

        sg_gradients = optimizer.Gradients(weights, loss=loss)
        sg_gradients_flatten = graph.GetVariablesFlatten(
            sg_gradients.calculate)

        self.op_value = self.Op(value, state=state)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(
            sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        self.op_compute_loss_and_gradient = self.Ops(loss,
                                                     sg_gradients_flatten,
                                                     state=state,
                                                     ytarg_ny=ph_ytarg_ny)

        self.op_losses = self.Ops(loss,
                                  mse,
                                  l2,
                                  state=state,
                                  ytarg_ny=ph_ytarg_ny)
Example #10
    def build_graph(self):
        # Build graph
        sg_global_step = graph.GlobalStep()
        sg_weights = Network().weights
        sg_optimizer = optimizer.AdamOptimizer(pg_config.config.learning_rate)
        sg_gradients = optimizer.Gradients(sg_weights, optimizer=sg_optimizer)
        sg_initialize = graph.Initialize()

        # Expose public API
        self.op_n_step = self.Op(sg_global_step.n)
        self.op_get_weights = self.Op(sg_weights)
        self.op_apply_gradients = self.Ops(sg_gradients.apply,
                                           sg_global_step.increment, gradients=sg_gradients.ph_gradients,
                                           increment=sg_global_step.ph_increment)
        self.op_initialize = self.Op(sg_initialize)
Example #11
    def build_graph(self):
        # Build graph
        sg_network = Network()

        sg_loss = loss.PGLoss(action_size=pg_config.config.output.action_size,
                              network=sg_network)
        sg_gradients = optimizer.Gradients(sg_network.weights, loss=sg_loss)

        # Expose public API
        self.op_assign_weights = self.Op(sg_network.weights.assign,
                                         weights=sg_network.weights.ph_weights)
        self.op_get_action = self.Op(sg_network, state=sg_network.state)
        self.op_compute_gradients = self.Op(sg_gradients.calculate,
                                            state=sg_network.state, action=sg_loss.ph_action,
                                            discounted_reward=sg_loss.ph_discounted_reward)
Example #12
    def build_graph(self):
        sg_network = Network()
        sg_target_network = Network()

        sg_get_action = Actor()

        sg_loss = loss.DQNLoss(sg_network.output, config)
        sg_gradients_calc = optimizer.Gradients(sg_network.weights,
                                                loss=sg_loss)

        sg_update_target_weights = graph.AssignWeights(
            sg_target_network.weights, sg_network.weights).op

        # Expose public API
        self.op_assign_weights = self.Op(sg_network.weights.assign,
                                         weights=sg_network.weights.ph_weights)
        self.op_assign_target_weights = self.Op(
            sg_target_network.weights.assign,
            target_weights=sg_target_network.weights.ph_weights)

        self.op_get_q_value = self.Op(sg_network.output.node,
                                      state=sg_network.ph_state)
        self.op_get_q_target_value = self.Op(
            sg_target_network.output.node,
            next_state=sg_target_network.ph_state)

        self.op_get_action = self.Op(sg_get_action,
                                     local_step=sg_get_action.ph_local_step,
                                     q_value=sg_get_action.ph_q_value)

        sg_initialize = graph.Initialize()

        feeds = dict(state=sg_network.ph_state,
                     reward=sg_loss.ph_reward,
                     action=sg_loss.ph_action,
                     terminal=sg_loss.ph_terminal,
                     q_next_target=sg_loss.ph_q_next_target,
                     q_next=sg_loss.ph_q_next)

        self.op_compute_gradients = self.Op(sg_gradients_calc.calculate,
                                            **feeds)

        self.op_update_target_weights = self.Op(sg_update_target_weights)

        self.op_initialize = self.Op(sg_initialize)
Example #13
    def build_graph(self):
        sg_global_step = graph.GlobalStep()
        sg_network = Network()

        if config.optimizer == 'Adam':
            sg_optimizer = optimizer.AdamOptimizer(
                config.initial_learning_rate)
        elif config.optimizer == 'RMSProp':
            param = {}
            if hasattr(config, 'RMSProp'):
                if hasattr(config.RMSProp, "decay"):
                    param["decay"] = config.RMSProp.decay
                if hasattr(config.RMSProp, "epsilon"):
                    param["epsilon"] = config.RMSProp.epsilon

            sg_optimizer = optimizer.RMSPropOptimizer(
                config.initial_learning_rate, **param)
        else:
            raise NotImplementedError

        sg_gradients_apply = optimizer.Gradients(sg_network.weights,
                                                 optimizer=sg_optimizer)

        sg_initialize = graph.Initialize()

        # Expose public API
        self.op_n_step = self.Op(sg_global_step.n)

        self.op_get_weights = self.Op(sg_network.weights)
        self.op_assign_weights = self.Op(sg_network.weights.assign,
                                         weights=sg_network.weights.ph_weights)

        self.op_apply_gradients = self.Ops(
            sg_gradients_apply.apply,
            sg_global_step.increment,
            gradients=sg_gradients_apply.ph_gradients,
            n_steps=sg_global_step.ph_increment)

        self.op_initialize = self.Op(sg_initialize)
Example #14
    def build_graph(self):
        input_size, = trpo_config.config.input.shape

        # add one extra feature for timestep
        ph_state = graph.Placeholder(np.float32, shape=(None, input_size + 1))

        activation = layer.Activation.get_activation(trpo_config.config.activation)
        descs = [dict(type=layer.Dense, size=size, activation=activation) for size
                 in trpo_config.config.hidden_sizes]
        descs.append(dict(type=layer.Dense, size=1))

        value = layer.GenericLayers(ph_state, descs)

        ph_ytarg_ny = graph.Placeholder(np.float32)
        mse = graph.TfNode(tf.reduce_mean(tf.square(ph_ytarg_ny.node - value.node)))

        weights = layer.Weights(value)

        sg_get_weights_flatten = GetVariablesFlatten(weights)
        sg_set_weights_flatten = SetVariablesFlatten(weights)

        l2 = graph.TfNode(1e-3 * tf.add_n([tf.reduce_sum(tf.square(v)) for v in
                                           utils.Utils.flatten(weights.node)]))
        loss = graph.TfNode(l2.node + mse.node)

        sg_gradients = optimizer.Gradients(weights, loss=loss)
        sg_gradients_flatten = GetVariablesFlatten(sg_gradients.calculate)

        self.op_value = self.Op(value, state=ph_state)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        self.op_compute_loss_and_gradient = self.Ops(loss, sg_gradients_flatten, state=ph_state,
                                                     ytarg_ny=ph_ytarg_ny)

        self.op_losses = self.Ops(loss, mse, l2, state=ph_state, ytarg_ny=ph_ytarg_ny)
Example #15
    def build_graph(self):
        sg_weights = _WorkerNetwork().weights

        sg_global_step = graph.GlobalStep()
        sg_learning_rate = fun_graph.LearningRate(sg_global_step)

        sg_optimizer = optimizer.RMSPropOptimizer(
            learning_rate=sg_learning_rate,
            decay=cfg.RMSProp.decay,
            momentum=0.0,
            epsilon=cfg.RMSProp.epsilon)

        sg_gradients = optimizer.Gradients(sg_weights, optimizer=sg_optimizer)
        sg_initialize = graph.Initialize()

        # Expose public API
        self.op_n_step = self.Op(sg_global_step.n)
        self.op_get_weights = self.Op(sg_weights)
        self.op_apply_gradients = self.Ops(
            sg_gradients.apply,
            sg_global_step.increment,
            gradients=sg_gradients.ph_gradients,
            increment=sg_global_step.ph_increment)
        self.op_initialize = self.Op(sg_initialize)
Example #16
    def build_graph(self, sg_value_net):
        # 'Observed' value of a state = discounted reward
        vf_scale = dppo_config.config.critic_scale

        ph_ytarg_ny = graph.Placeholder(np.float32)
        v1_loss = graph.TfNode(tf.square(sg_value_net.head.node - ph_ytarg_ny.node))
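        # v1_loss is the plain squared error against the target; the branch below optionally adds
        # the clipped variant familiar from PPO implementations and keeps the elementwise maximum
        # of the two before averaging.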

        if dppo_config.config.vf_clipped_loss:
            ph_old_vpred = graph.Placeholder(np.float32)
            clip_e = dppo_config.config.clip_e
            vpredclipped = ph_old_vpred.node + tf.clip_by_value(sg_value_net.head.node - ph_old_vpred.node,
                                                                -clip_e, clip_e)
            v2_loss = graph.TfNode(tf.square(vpredclipped - ph_ytarg_ny.node))
            vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(tf.maximum(v2_loss.node, v1_loss.node)))
        else:
            vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(v1_loss.node))

        if dppo_config.config.l2_coeff is not None:
            l2 = graph.TfNode(dppo_config.config.l2_coeff *
                              tf.add_n([tf.reduce_sum(tf.square(v)) for v in
                                        utils.Utils.flatten(sg_value_net.weights.node)]))

            sg_vf_total_loss = graph.TfNode(l2.node + vf_mse.node)
        else:
            sg_vf_total_loss = vf_mse

        sg_gradients = optimizer.Gradients(sg_value_net.weights, loss=sg_vf_total_loss,
                                           norm=dppo_config.config.gradients_norm_clipping)
        sg_gradients_flatten = graph.GetVariablesFlatten(sg_gradients.calculate)

        # Op to compute value of a state
        if dppo_config.config.use_lstm:
            self.op_value = self.Ops(sg_value_net.head, sg_value_net.lstm_state,
                                     state=sg_value_net.ph_state, lstm_state=sg_value_net.ph_lstm_state)
            self.op_lstm_reset_timestep = self.Op(sg_value_net.lstm_reset_timestep)
        else:
            self.op_value = self.Op(sg_value_net.head, state=sg_value_net.ph_state)

        self.op_get_weights = self.Op(sg_value_net.weights)
        self.op_assign_weights = self.Op(sg_value_net.weights.assign,
                                         weights=sg_value_net.weights.ph_weights)

        sg_get_weights_flatten = graph.GetVariablesFlatten(sg_value_net.weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(sg_value_net.weights)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        feeds = dict(state=sg_value_net.ph_state, ytarg_ny=ph_ytarg_ny)
        if dppo_config.config.use_lstm:
            feeds.update(dict(lstm_state=sg_value_net.ph_lstm_state))
        if dppo_config.config.vf_clipped_loss:
            feeds.update(dict(vpred_old=ph_old_vpred))

        self.op_compute_gradients = self.Op(sg_gradients.calculate, **feeds)
        if dppo_config.config.use_lstm:
            self.op_compute_gradients = self.Ops(sg_gradients.calculate, sg_value_net.lstm_state, **feeds)

        self.op_compute_loss_and_gradient_flatten = self.Ops(sg_vf_total_loss, sg_gradients_flatten, **feeds)

        losses = [sg_vf_total_loss, vf_mse]
        if dppo_config.config.l2_coeff is not None:
            losses.append(l2)
        self.op_losses = self.Ops(*losses, **feeds)

        # Init Op for all weights
        sg_initialize = graph.Initialize()
        self.op_initialize = self.Op(sg_initialize)
Example #17
    def build_graph(self, sg_network):
        if dppo_config.config.use_lstm:
            self.op_get_action = self.Ops(sg_network.head, sg_network.lstm_state,
                                          state=sg_network.ph_state, lstm_state=sg_network.ph_lstm_state)
            self.op_lstm_reset_timestep = self.Op(sg_network.lstm_reset_timestep)
        else:
            self.op_get_action = self.Op(sg_network.head, state=sg_network.ph_state)

        # Advantage node
        ph_adv_n = graph.TfNode(tf.placeholder(tf.float32, name='adv_n'))

        # Contains placeholder for the actual action made by the agent
        sg_probtype = ProbType(dppo_config.config.output.action_size,
                               continuous=dppo_config.config.output.continuous)

        # Placeholder to store action probabilities under the old policy
        ph_oldprob_np = sg_probtype.ProbVariable()

        sg_logp_n = sg_probtype.Loglikelihood(sg_network.head)
        sg_oldlogp_n = sg_probtype.Loglikelihood(ph_oldprob_np)

        # PPO clipped surrogate loss
        # likelihood ratio of old and new policy
        r_theta = tf.exp(sg_logp_n.node - sg_oldlogp_n.node)
        surr = r_theta * ph_adv_n.node
        clip_e = dppo_config.config.clip_e
        surr_clipped = tf.clip_by_value(r_theta, 1.0 - clip_e, 1.0 + clip_e) * ph_adv_n.node
        sg_pol_clip_loss = graph.TfNode(-tf.reduce_mean(tf.minimum(surr, surr_clipped)))

        # PPO entropy loss
        if dppo_config.config.entropy is not None:
            sg_entropy = sg_probtype.Entropy(sg_network.head)
            sg_ent_loss = (-dppo_config.config.entropy) * tf.reduce_mean(sg_entropy.node)
            sg_pol_total_loss = graph.TfNode(sg_pol_clip_loss.node + sg_ent_loss)
        else:
            sg_pol_total_loss = sg_pol_clip_loss

        # Regular gradients
        sg_ppo_clip_gradients = optimizer.Gradients(sg_network.weights, loss=sg_pol_total_loss,
                                                    norm=dppo_config.config.gradients_norm_clipping)
        feeds = dict(state=sg_network.ph_state, action=sg_probtype.ph_sampled_variable,
                     advantage=ph_adv_n, old_prob=ph_oldprob_np)
        if dppo_config.config.use_lstm:
            feeds.update(dict(lstm_state=sg_network.ph_lstm_state))

        self.op_compute_ppo_clip_gradients = self.Op(sg_ppo_clip_gradients.calculate, **feeds)
        if dppo_config.config.use_lstm:
            self.op_compute_ppo_clip_gradients = self.Ops(sg_ppo_clip_gradients.calculate,
                                                          sg_network.lstm_state, **feeds)

        # Weights get/set for updating the policy
        sg_get_weights_flatten = graph.GetVariablesFlatten(sg_network.weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(sg_network.weights)

        self.op_get_weights = self.Op(sg_network.weights)
        self.op_assign_weights = self.Op(sg_network.weights.assign,
                                         weights=sg_network.weights.ph_weights)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        # Init Op for all weights
        sg_initialize = graph.Initialize()
        self.op_initialize = self.Op(sg_initialize)
Example #18
    def build_graph(self):
        # Build graph
        sg_global_step = graph.GlobalStep()
        sg_episode_cnt = graph.GlobalStep()

        sg_actor_weights = ActorNetwork().weights
        sg_critic_weights = CriticNetwork().weights

        sg_actor_target_weights = ActorNetwork().weights
        sg_critic_target_weights = CriticNetwork().weights

        # need to reassign weights from actor & critic to the target networks
        sg_init_actor_target_weights = \
            graph.AssignWeights(sg_actor_target_weights, sg_actor_weights).op
        sg_init_critic_target_weights = \
            graph.AssignWeights(sg_critic_target_weights, sg_critic_weights).op

        sg_update_actor_target_weights = \
            graph.AssignWeights(sg_actor_target_weights, sg_actor_weights, cfg.config.tau).op
        sg_update_critic_target_weights = \
            graph.AssignWeights(sg_critic_target_weights, sg_critic_weights, cfg.config.tau).op

        sg_actor_optimizer = optimizer.AdamOptimizer(
            cfg.config.actor_learning_rate)
        sg_critic_optimizer = optimizer.AdamOptimizer(
            cfg.config.critic_learning_rate)

        sg_actor_gradients = optimizer.Gradients(sg_actor_weights,
                                                 optimizer=sg_actor_optimizer)
        sg_critic_gradients = optimizer.Gradients(
            sg_critic_weights, optimizer=sg_critic_optimizer)

        sg_initialize = graph.Initialize()

        # Expose public API
        self.op_get_weights = self.Ops(sg_actor_weights,
                                       sg_actor_target_weights,
                                       sg_critic_weights,
                                       sg_critic_target_weights)

        self.op_init_target_weights = self.Ops(sg_init_actor_target_weights,
                                               sg_init_critic_target_weights)

        self.op_update_target_weights = self.Ops(
            sg_update_actor_target_weights, sg_update_critic_target_weights)

        self.op_apply_actor_gradients = self.Ops(
            sg_actor_gradients.apply,
            sg_global_step.increment,
            gradients=sg_actor_gradients.ph_gradients,
            increment=sg_global_step.ph_increment)
        self.op_apply_critic_gradients = self.Op(
            sg_critic_gradients.apply,
            gradients=sg_critic_gradients.ph_gradients)

        self.op_n_step = self.Op(sg_global_step.n)
        self.op_inc_step = self.Op(sg_global_step.increment,
                                   increment=sg_global_step.ph_increment)

        self.op_get_episode_cnt = self.Op(sg_episode_cnt.n)
        self.op_inc_episode_cnt = self.Op(
            sg_episode_cnt.increment, increment=sg_episode_cnt.ph_increment)

        self.op_initialize = self.Op(sg_initialize)
Example #19
    def build_graph(self):
        # Build graph
        sg_global_step = graph.GlobalStep()
        sg_network = Network()
        sg_weights = sg_network.weights

        if da3c_config.config.use_linear_schedule:
            sg_learning_rate = lr_schedule.Linear(sg_global_step,
                                                  da3c_config.config)
        else:
            sg_learning_rate = da3c_config.config.initial_learning_rate

        if da3c_config.config.optimizer == 'Adam':
            sg_optimizer = optimizer.AdamOptimizer(sg_learning_rate)
        else:
            sg_optimizer = optimizer.RMSPropOptimizer(
                learning_rate=sg_learning_rate,
                decay=da3c_config.config.RMSProp.decay,
                momentum=0.0,
                epsilon=da3c_config.config.RMSProp.epsilon)
        sg_gradients = optimizer.Gradients(sg_weights, optimizer=sg_optimizer)

        if da3c_config.config.use_icm:
            sg_icm_optimizer = optimizer.AdamOptimizer(
                da3c_config.config.icm.lr)
            sg_icm_weights = icm_model.ICM().weights
            sg_icm_gradients = optimizer.Gradients(sg_icm_weights,
                                                   optimizer=sg_icm_optimizer)

            # Expose ICM public API
            self.op_icm_get_weights = self.Op(sg_icm_weights)
            self.op_icm_apply_gradients = self.Op(
                sg_icm_gradients.apply,
                gradients=sg_icm_gradients.ph_gradients)

        sg_average_reward = graph.LinearMovingAverage(
            da3c_config.config.avg_in_num_batches)
        sg_initialize = graph.Initialize()

        # Expose public API
        self.op_n_step = self.Op(sg_global_step.n)
        self.op_score = self.Op(sg_average_reward.average)

        self.op_check_weights = self.Op(sg_weights.check)
        self.op_get_weights = self.Ops(sg_weights, sg_global_step.n)

        self.op_apply_gradients = self.Ops(
            sg_gradients.apply,
            sg_global_step.increment,
            gradients=sg_gradients.ph_gradients,
            increment=sg_global_step.ph_increment)
        self.op_add_rewards_to_model_score_routine = self.Ops(
            sg_average_reward.add,
            reward_sum=sg_average_reward.ph_sum,
            reward_weight=sg_average_reward.ph_count)

        # Determine the gradient-applying method: fifo (by default), averaging, or delay compensation
        sg_get_weights_flatten = graph.GetVariablesFlatten(sg_weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(sg_weights)
        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(
            sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        self.op_submit_gradients = self.Call(
            graph.get_gradients_apply_routine(da3c_config.config))

        self.op_initialize = self.Op(sg_initialize)
Example #20
    def build_graph(self):
        # Build graph
        sg_network = Network()
        self.actor = sg_network.actor
        self.critic = sg_network.critic

        sg_loss = loss.DA3CLoss(sg_network.actor.head, sg_network.critic.head,
                                da3c_config.config)
        sg_actor_gradients = optimizer.Gradients(
            sg_network.actor.weights,
            loss=graph.TfNode(sg_loss.policy_loss),
            norm=da3c_config.config.gradients_norm_clipping)
        sg_critic_gradients = optimizer.Gradients(
            sg_network.critic.weights,
            loss=graph.TfNode(sg_loss.value_loss),
            norm=da3c_config.config.gradients_norm_clipping)

        if da3c_config.config.use_icm:
            sg_icm_network = icm_model.ICM()
            sg_icm_gradients = optimizer.Gradients(sg_icm_network.weights,
                                                   loss=sg_icm_network.loss)

            # Expose ICM public API
            self.op_icm_assign_weights = self.Op(
                sg_icm_network.weights.assign,
                weights=sg_icm_network.weights.ph_weights)

            feeds = dict(state=sg_icm_network.ph_state,
                         probs=sg_icm_network.ph_probs)
            self.op_get_intrinsic_reward = self.Ops(sg_icm_network.rew_out,
                                                    **feeds)

            feeds.update(dict(action=sg_icm_network.ph_taken))
            self.op_compute_icm_gradients = self.Op(sg_icm_gradients.calculate,
                                                    **feeds)

        summaries = tf.summary.merge([
            tf.summary.scalar('policy_loss', sg_loss.policy_loss),
            tf.summary.scalar('value_loss', sg_loss.value_loss),
            tf.summary.scalar('entropy', sg_loss.entropy),
            tf.summary.scalar('actor_gradients_global_norm',
                              sg_actor_gradients.global_norm),
            tf.summary.scalar('critic_gradients_global_norm',
                              sg_critic_gradients.global_norm),
            tf.summary.scalar('actor_weights_global_norm',
                              sg_network.actor.weights.global_norm),
            tf.summary.scalar('critic_weights_global_norm',
                              sg_network.critic.weights.global_norm)
        ])

        # Expose public API
        self.op_assign_weights = self.Ops(
            sg_network.actor.weights.assign,
            sg_network.critic.weights.assign,
            weights=(sg_network.actor.weights.ph_weights,
                     sg_network.critic.weights.ph_weights))

        feeds = dict(state=sg_network.ph_state,
                     action=sg_loss.ph_action,
                     advantage=sg_loss.ph_advantage,
                     discounted_reward=sg_loss.ph_discounted_reward)

        if da3c_config.config.use_lstm:
            feeds.update(
                dict(lstm_state=(sg_network.actor.ph_lstm_state,
                                 sg_network.critic.ph_lstm_state)))
            self.lstm_zero_state = (sg_network.actor.lstm_zero_state,
                                    sg_network.critic.lstm_zero_state)
            self.op_lstm_reset_timestep = self.Ops(
                sg_network.actor.lstm_reset_timestep,
                sg_network.critic.lstm_reset_timestep)
            self.op_get_action_value_and_lstm_state = \
                self.Ops(sg_network.actor.head, sg_network.critic.head,
                         (sg_network.actor.lstm_state, sg_network.critic.lstm_state),
                         state=sg_network.ph_state,
                         lstm_state=(sg_network.actor.ph_lstm_state, sg_network.critic.ph_lstm_state))
        else:
            self.op_get_action_and_value = self.Ops(sg_network.actor.head,
                                                    sg_network.critic.head,
                                                    state=sg_network.ph_state)

        self.op_compute_gradients_and_summaries = \
            self.Ops((sg_actor_gradients.calculate, sg_critic_gradients.calculate), summaries, **feeds)
Example #21
    def build_graph(self):
        # Build graph
        sg_actor_network = ActorNetwork()
        sg_critic_network = CriticNetwork()
        sg_actor_target_network = ActorNetwork()
        sg_critic_target_network = CriticNetwork()

        ph_action_gradient = graph.Placeholder(np.float32, (None, cfg.config.output.action_size))
        actor_grad_args = dict(loss=sg_actor_network.actor, grad_ys=-ph_action_gradient.node)
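        # grad_ys carries the negated action gradient -dQ/da (presumably produced by
        # op_compute_critic_action_gradients further down), so these actor gradients follow the
        # DDPG-style deterministic policy gradient: chain dQ/da back through the actor weights.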

        if cfg.config.no_ps:
            sg_actor_optimizer = optimizer.AdamOptimizer(cfg.config.actor_learning_rate)
            actor_grad_args.update(dict(optimizer=sg_actor_optimizer))

        sg_actor_gradients = optimizer.Gradients(sg_actor_network.weights, **actor_grad_args)

        sg_critic_loss = loss.DDPGLoss(sg_critic_network, cfg.config)
        critic_grad_args = dict(loss=sg_critic_loss)

        if cfg.config.no_ps:
            sg_critic_optimizer = optimizer.AdamOptimizer(cfg.config.critic_learning_rate)
            critic_grad_args.update(dict(optimizer=sg_critic_optimizer))

        sg_critic_gradients = optimizer.Gradients(sg_critic_network.weights, **critic_grad_args)

        sg_critic_action_gradients = optimizer.Gradients(sg_critic_network.ph_action,
                                                         loss=sg_critic_network.critic)

        # Expose public API
        self.op_assign_actor_weights = self.Op(sg_actor_network.weights.assign,
                                               weights=sg_actor_network.weights.ph_weights)
        self.op_assign_critic_weights = self.Op(sg_critic_network.weights.assign,
                                                weights=sg_critic_network.weights.ph_weights)
        self.op_assign_actor_target_weights = self.Op(sg_actor_target_network.weights.assign,
                                                      weights=sg_actor_target_network.weights.ph_weights)
        self.op_assign_critic_target_weights = self.Op(sg_critic_target_network.weights.assign,
                                                       weights=sg_critic_target_network.weights.ph_weights)

        self.op_get_action = self.Op(sg_actor_network.actor,
                                     state=sg_actor_network.ph_state)
        self.op_get_critic_q = self.Op(sg_critic_network.critic,
                                       state=sg_critic_network.ph_state,
                                       action=sg_critic_network.ph_action)

        self.op_get_actor_target = self.Op(sg_actor_target_network.actor,
                                           state=sg_actor_target_network.ph_state)
        self.op_get_critic_target = self.Op(sg_critic_target_network.critic,
                                            state=sg_critic_target_network.ph_state,
                                            action=sg_critic_target_network.ph_action)

        self.op_compute_actor_gradients = self.Op(sg_actor_gradients.calculate,
                                                  state=sg_actor_network.ph_state,
                                                  grad_ys=ph_action_gradient)

        self.op_compute_critic_gradients = self.Op(sg_critic_gradients.calculate,
                                                   state=sg_critic_network.ph_state,
                                                   action=sg_critic_network.ph_action,
                                                   predicted=sg_critic_loss.ph_predicted)

        self.op_compute_critic_action_gradients = self.Op(sg_critic_action_gradients.calculate,
                                                          state=sg_critic_network.ph_state,
                                                          action=sg_critic_network.ph_action)

        # Integrated with grad computation by log_lvl
        self.op_critic_loss = self.Op(sg_critic_loss,
                                      state=sg_critic_network.ph_state,
                                      action=sg_critic_network.ph_action,
                                      predicted=sg_critic_loss.ph_predicted)
        self.op_compute_norm_actor_gradients = self.Op(sg_actor_gradients.global_norm,
                                                       state=sg_actor_network.ph_state,
                                                       grad_ys=ph_action_gradient)
        self.op_compute_norm_critic_gradients = self.Op(sg_critic_gradients.global_norm,
                                                        state=sg_critic_network.ph_state,
                                                        action=sg_critic_network.ph_action,
                                                        predicted=sg_critic_loss.ph_predicted)
        self.op_compute_norm_critic_action_gradients = self.Op(sg_critic_action_gradients.global_norm,
                                                               state=sg_critic_network.ph_state,
                                                               action=sg_critic_network.ph_action)

        if cfg.config.no_ps:
            sg_actor_weights = sg_actor_network.weights
            sg_critic_weights = sg_critic_network.weights

            sg_actor_target_weights = sg_actor_target_network.weights
            sg_critic_target_weights = sg_critic_target_network.weights

            # need to reassign weights from actor & critic to the target networks
            sg_init_actor_target_weights = \
                graph.AssignWeights(sg_actor_target_weights, sg_actor_weights).op
            sg_init_critic_target_weights = \
                graph.AssignWeights(sg_critic_target_weights, sg_critic_weights).op

            sg_update_actor_target_weights = \
                graph.AssignWeights(sg_actor_target_weights, sg_actor_weights, cfg.config.tau).op
            sg_update_critic_target_weights = \
                graph.AssignWeights(sg_critic_target_weights, sg_critic_weights, cfg.config.tau).op

            self.op_get_weights = self.Ops(sg_actor_weights, sg_actor_target_weights,
                                           sg_critic_weights, sg_critic_target_weights)

            self.op_init_target_weights = self.Ops(sg_init_actor_target_weights,
                                                   sg_init_critic_target_weights)

            self.op_update_target_weights = self.Ops(sg_update_actor_target_weights,
                                                     sg_update_critic_target_weights)

            self.op_apply_actor_gradients = self.Ops(sg_actor_gradients.apply,
                                                     gradients=sg_actor_gradients.ph_gradients)
            self.op_apply_critic_gradients = self.Op(sg_critic_gradients.apply,
                                                     gradients=sg_critic_gradients.ph_gradients)
            sg_initialize = graph.Initialize()
            self.op_initialize = self.Op(sg_initialize)
Example #22
    def build_graph(self):
        # Build graph
        sg_global_step = graph.GlobalStep()
        sg_episode_cnt = graph.GlobalStep()

        sg_actor_weights = ActorNetwork().weights
        sg_critic_weights = CriticNetwork().weights

        sg_actor_target_weights = ActorNetwork().weights
        sg_critic_target_weights = CriticNetwork().weights

        sg_get_weights_flatten = \
            graph.GetVariablesFlatten(graph.Variables(sg_actor_weights, sg_critic_weights))
        sg_set_weights_flatten = \
            graph.SetVariablesFlatten(graph.Variables(sg_actor_weights, sg_critic_weights))

        # need to reassign weights from actor & critic to the target networks
        sg_init_actor_target_weights = \
            graph.AssignWeights(sg_actor_target_weights, sg_actor_weights).op
        sg_init_critic_target_weights = \
            graph.AssignWeights(sg_critic_target_weights, sg_critic_weights).op

        sg_update_actor_target_weights = \
            graph.AssignWeights(sg_actor_target_weights, sg_actor_weights, cfg.config.tau).op
        sg_update_critic_target_weights = \
            graph.AssignWeights(sg_critic_target_weights, sg_critic_weights, cfg.config.tau).op

        sg_actor_optimizer = optimizer.AdamOptimizer(cfg.config.actor_learning_rate)
        sg_critic_optimizer = optimizer.AdamOptimizer(cfg.config.critic_learning_rate)

        sg_actor_gradients = optimizer.Gradients(sg_actor_weights, optimizer=sg_actor_optimizer)
        sg_critic_gradients = optimizer.Gradients(sg_critic_weights, optimizer=sg_critic_optimizer)

        sg_average_reward = graph.LinearMovingAverage(cfg.config.avg_in_num_batches)
        sg_initialize = graph.Initialize()

        # Expose public API
        self.op_get_weights_signed = self.Ops(sg_actor_weights, sg_actor_target_weights,
                                              sg_critic_weights, sg_critic_target_weights, sg_global_step.n)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        self.op_init_target_weights = self.Ops(sg_init_actor_target_weights,
                                               sg_init_critic_target_weights)

        self.op_update_target_weights = self.Ops(sg_update_actor_target_weights,
                                                 sg_update_critic_target_weights)

        self.op_apply_gradients = self.Ops(sg_actor_gradients.apply, sg_critic_gradients.apply,
                                           sg_global_step.increment,
                                           gradients=(sg_actor_gradients.ph_gradients,
                                                      sg_critic_gradients.ph_gradients),
                                           increment=sg_global_step.ph_increment)
        self.op_add_rewards_to_model_score_routine = self.Ops(sg_average_reward.add,
                                                              reward_sum=sg_average_reward.ph_sum,
                                                              reward_weight=sg_average_reward.ph_count)
        self.op_score = self.Op(sg_average_reward.average)

        self.op_n_step = self.Op(sg_global_step.n)
        self.op_inc_step = self.Op(sg_global_step.increment, increment=sg_global_step.ph_increment)

        self.op_get_episode_cnt = self.Op(sg_episode_cnt.n)
        self.op_inc_episode_cnt = self.Op(sg_episode_cnt.increment, increment=sg_episode_cnt.ph_increment)

        self.op_submit_gradients = self.Call(graph.get_gradients_apply_routine(cfg.config))
        self.op_initialize = self.Op(sg_initialize)