Example #1
    def build_graph(self, actor, critic, cfg):
        self.ph_action = graph.Placeholder(np.float32,
                                           shape=(None, actor.action_size),
                                           name="ph_action")
        self.ph_advantage = graph.Placeholder(np.float32,
                                              shape=(None, ),
                                              name="ph_adv")
        self.ph_discounted_reward = graph.Placeholder(np.float32,
                                                      shape=(None, ),
                                                      name="ph_edr")

        mu, sigma2 = actor.node
        sigma2 += tf.constant(1e-8)

        log_std_dev = tf.log(sigma2)
        self.entropy = tf.reduce_mean(
            log_std_dev +
            tf.constant(0.5 * np.log(2. * np.pi * np.e), tf.float32))

        # log-probability of the taken action under the diagonal Gaussian policy
        l2_dist = tf.square(self.ph_action.node - mu)
        sqr_std_dev = tf.constant(2.) * tf.square(sigma2) + tf.constant(1e-6)
        log_prob = -l2_dist / sqr_std_dev - tf.constant(.5) * tf.log(
            tf.constant(2 * np.pi)) - log_std_dev

        # policy loss
        self.policy_loss = -(tf.reduce_mean(
            tf.reduce_sum(log_prob, axis=1) * self.ph_advantage.node) +
                             cfg.entropy_beta * self.entropy)

        # The Critic's learning rate is scaled by the critic_scale parameter
        self.value_loss = cfg.critic_scale * tf.reduce_mean(
            tf.square(self.ph_discounted_reward.node - critic.node))
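For reference, the per-dimension log-density and entropy that a diagonal Gaussian policy loss of this shape relies on can be checked in plain NumPy/scipy. The following is a standalone sketch, not part of the framework above, and it assumes that sigma2 denotes the per-dimension variance (the example itself does not pin that convention down):

import numpy as np
from scipy.stats import norm

mu = np.array([0.3, -1.2])       # policy mean for one sample
sigma2 = np.array([0.5, 2.0])    # per-dimension variance (assumed convention)
action = np.array([0.1, 0.4])    # sampled action

# per-dimension log-density: -(a - mu)^2 / (2 sigma^2) - 0.5 * log(2 pi sigma^2)
log_prob = -(action - mu) ** 2 / (2 * sigma2) - 0.5 * np.log(2 * np.pi * sigma2)
assert np.allclose(log_prob, norm.logpdf(action, mu, np.sqrt(sigma2)))

# per-dimension differential entropy: 0.5 * log(2 pi e sigma^2)
assert np.isclose(0.5 * np.log(2 * np.pi * np.e * 2.0),
                  norm.entropy(loc=0.0, scale=np.sqrt(2.0)))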
Example #2
    def build_graph(self, actor, critic, entropy=True):
        self.ph_action = graph.Placeholder(np.int32, shape=(None, ), name="a")
        self.ph_value = graph.Placeholder(np.float32, shape=(None, ), name="v")
        self.ph_discounted_reward = graph.Placeholder(np.float32,
                                                      shape=(None, ),
                                                      name="r")

        action_one_hot = tf.one_hot(self.ph_action.node, cfg.action_size)

        # avoid NaN by clipping probabilities to a small minimum before taking the log
        log_pi = tf.log(tf.maximum(actor.node, 1e-20))

        # policy entropy
        if entropy:
            entropy_term = -tf.reduce_sum(actor.node * log_pi, axis=1)
            # policy loss (output): the minus sign turns the paper's gradient-ascent
            # objective into a loss for a gradient-descent optimizer
            policy_loss = -tf.reduce_sum(
                tf.reduce_sum(log_pi * action_one_hot, axis=1) *
                (self.ph_discounted_reward.node - self.ph_value.node) +
                entropy_term * cfg.entropy_beta)
        else:
            policy_loss = -tf.reduce_sum(
                tf.reduce_sum(log_pi * action_one_hot, axis=1) *
                (self.ph_discounted_reward.node - self.ph_value.node))

        # value loss (output)
        # (the Critic's learning rate is half of the Actor's; this is the plain
        # L2 term without the 0.5 factor)
        value_loss = tf.reduce_sum(
            tf.square(self.ph_discounted_reward.node - critic.node))

        # gradients of the policy and value losses are summed up
        return policy_loss + value_loss
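A standalone NumPy sketch of the one-hot trick used above: multiplying log_pi by a one-hot mask of the taken actions and summing over the action axis selects log pi(a_t | s_t) for each sample (the probabilities and actions below are illustrative):

import numpy as np

probs = np.array([[0.2, 0.5, 0.3],
                  [0.7, 0.1, 0.2]])           # stand-in for actor.node, batch of 2
actions = np.array([1, 0])                    # taken actions
log_pi = np.log(np.maximum(probs, 1e-20))     # same NaN guard as the example

one_hot = np.eye(probs.shape[1])[actions]
selected = np.sum(log_pi * one_hot, axis=1)
assert np.allclose(selected, log_pi[np.arange(len(actions)), actions])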
Example #3
    def build_graph(self, actor, critic, cfg):
        self.ph_action = graph.Placeholder(np.int32,
                                           shape=(None, ),
                                           name="ph_action")
        self.ph_advantage = graph.Placeholder(np.float32,
                                              shape=(None, ),
                                              name="ph_adv")
        self.ph_discounted_reward = graph.Placeholder(np.float32,
                                                      shape=(None, ),
                                                      name="ph_edr")

        action_one_hot = tf.one_hot(self.ph_action.node, actor.action_size)

        # avoid NaN
        log_pi = tf.log(tf.maximum(actor.node, 1e-20))

        # policy entropy
        self.entropy = -tf.reduce_sum(actor.node * log_pi)

        # policy loss
        self.policy_loss = -(tf.reduce_sum(
            tf.reduce_sum(log_pi * action_one_hot, axis=1) *
            self.ph_advantage.node) + self.entropy * cfg.entropy_beta)

        # value loss
        self.value_loss = tf.reduce_sum(
            tf.square(self.ph_discounted_reward.node - critic.node))

        # gradients of the policy and value losses are summed up
        # (the Critic's learning rate is scaled by the critic_scale parameter)
        return self.policy_loss + cfg.critic_scale * self.value_loss
Example #4
    def build_graph(self):
        super(_WorkerNetwork, self).__init__()

        self.lstm = CustomBasicLSTMCell(cfg.d)  # d=256
        # needs to be wrapped as a layer to retrieve its weights

        self.ph_goal =\
            graph.Placeholder(np.float32, shape=(None, cfg.d), name="ph_goal")
        # self.ph_goal = tf.placeholder(tf.float32, [None, cfg.d], name="ph_goal")

        perception_expanded = graph.Expand(self.perception.node, 0)

        self.ph_step_size = \
            graph.Placeholder(np.float32, shape=(1,), name="ph_w_step_size")
        # tf.placeholder(tf.float32, [1], name="ph_w_step_size")
        self.ph_initial_lstm_state = \
            graph.Placeholder(np.float32, shape=(1, self.lstm.state_size), name="ph_w_lstm_state")
        # tf.placeholder(tf.float32, [1, self.lstm.state_size], name="ph_w_lstm_state")

        lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
            self.lstm,
            perception_expanded,
            initial_state=self.ph_initial_lstm_state,
            sequence_length=self.ph_step_size,
            time_major=False)
        lstm_outputs = tf.reshape(lstm_outputs, [-1, cfg.d])
        sg_lstm_outputs = graph.TfNode(lstm_outputs)

        U = layer.LinearLayer(sg_lstm_outputs,
                              shape=(cfg.d, cfg.action_size * cfg.k),
                              transformation=tf.matmul)
        U_embedding = tf.transpose(tf.reshape(U, [cfg.action_size, cfg.k, -1]))

        w = layer.LinearLayer(self.ph_goal,
                              shape=(cfg.d, cfg.k),
                              transformation=tf.matmul,
                              bias=False)
        w_reshaped = tf.reshape(w.node, [-1, 1, cfg.k])

        self.pi = layer.MatmulLayer(w_reshaped,
                                    U_embedding,
                                    activation=layer.Activation.Softmax)
        self.vi = layer.LinearLayer(sg_lstm_outputs,
                                    shape=(cfg.d, 1),
                                    transformation=tf.matmul)

        self.weights = layer.Weights(
            self.weights, graph.TfNode((self.lstm.matrix, self.lstm.bias)), U,
            w, self.vi)

        self.lstm_state_out =\
            graph.VarAssign(graph.Variable(np.zeros([1, self.lstm.state_size]),
                                           dtype=np.float32, name="lstm_state_out"),
                            np.zeros([1, self.lstm.state_size]))
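The product that produces self.pi multiplies a per-sample goal embedding of shape [batch, 1, k] with the reshaped and transposed action embedding U. The following is a shape-only NumPy sketch; the batch, k, and action_size values are illustrative, and it only mirrors the reshape/transpose/matmul pattern, not the framework's layer semantics:

import numpy as np

batch, k, action_size = 3, 16, 6                  # illustrative sizes
U_flat = np.random.randn(batch, action_size * k)  # stand-in for the U LinearLayer output
w_reshaped = np.random.randn(batch, 1, k)         # stand-in for the reshaped goal embedding

# same pattern as above: reshape to [action_size, k, -1], then a full transpose
U_embedding = np.transpose(U_flat.reshape(action_size, k, -1))  # -> (batch, k, action_size)

pi_logits = np.matmul(w_reshaped, U_embedding)    # -> (batch, 1, action_size)
assert pi_logits.shape == (batch, 1, action_size)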
Example #5
    def build_graph(self, action_size, network):
        self.ph_action = graph.Placeholder(np.int32, (None, ))
        self.ph_discounted_reward = graph.Placeholder(np.float32, (None, 1))

        # make actions that yielded a good advantage (reward over time) more
        # likely, and actions that did not, less likely.

        log_like_op = tf.log(
            tf.reduce_sum(tf.one_hot(self.ph_action.node, action_size) *
                          network.node,
                          axis=[1]))
        return -tf.reduce_sum(log_like_op * self.ph_discounted_reward.node)
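The comment above describes a REINFORCE-style weighting: the log-likelihood of each taken action is scaled by its discounted reward, and minimizing the negated sum shifts probability toward actions that paid off. A standalone NumPy restatement with illustrative values; the reward column is squeezed to a rank-1 vector so the product stays element-wise per sample:

import numpy as np

probs = np.array([[0.2, 0.8],
                  [0.6, 0.4]])                 # stand-in for network.node, batch of 2
actions = np.array([1, 0])                     # taken actions
discounted_reward = np.array([[2.0], [0.1]])   # shape (batch, 1), as in the placeholder

one_hot = np.eye(probs.shape[1])[actions]
log_like = np.log(np.sum(one_hot * probs, axis=1))
loss = -np.sum(log_like * discounted_reward.squeeze(1))
# the well-rewarded first sample dominates the loss, so reducing the loss
# increases the probability assigned to its action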
Example #6
    def build_graph(self):
        input = layer.ConfiguredInput(trpo_config.config.input)
        # add one extra feature for timestep
        ph_step = graph.Placeholder(np.float32, shape=[None, 1])
        state = (input.ph_state, ph_step)

        concatenated = graph.Concat([layer.Flatten(input), ph_step], axis=1)

        activation = layer.Activation.get_activation(
            trpo_config.config.activation)
        head = layer.GenericLayers(concatenated, [
            dict(type=layer.Dense, size=size, activation=activation)
            for size in trpo_config.config.hidden_sizes
        ])
        value = layer.Dense(head, 1)

        ph_ytarg_ny = graph.Placeholder(np.float32)
        mse = graph.TfNode(
            tf.reduce_mean(tf.square(ph_ytarg_ny.node - value.node)))

        weights = layer.Weights(input, head, value)

        sg_get_weights_flatten = graph.GetVariablesFlatten(weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(weights)

        l2 = graph.TfNode(1e-3 * tf.add_n([
            tf.reduce_sum(tf.square(v))
            for v in utils.Utils.flatten(weights.node)
        ]))
        loss = graph.TfNode(l2.node + mse.node)

        sg_gradients = optimizer.Gradients(weights, loss=loss)
        sg_gradients_flatten = graph.GetVariablesFlatten(
            sg_gradients.calculate)

        self.op_value = self.Op(value, state=state)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(
            sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        self.op_compute_loss_and_gradient = self.Ops(loss,
                                                     sg_gradients_flatten,
                                                     state=state,
                                                     ytarg_ny=ph_ytarg_ny)

        self.op_losses = self.Ops(loss,
                                  mse,
                                  l2,
                                  state=state,
                                  ytarg_ny=ph_ytarg_ny)
Example #7
    def build_graph(self, x, batch_size=1, n_units=256):
        self.phs = [
            graph.Placeholder(np.float32, [batch_size, n_units])
            for _ in range(2)
        ]
        self.ph_state = graph.TfNode(tuple(ph.node for ph in self.phs))
        self.ph_state.checked = tuple(ph.checked for ph in self.phs)

        self.zero_state = tuple(
            np.zeros([batch_size, n_units]) for _ in range(2))

        state = tf.contrib.rnn.LSTMStateTuple(*self.ph_state.checked)

        lstm = tf.contrib.rnn.BasicLSTMCell(n_units, state_is_tuple=True)

        outputs, self.state = tf.nn.dynamic_rnn(lstm,
                                                x.node,
                                                initial_state=state,
                                                sequence_length=tf.shape(
                                                    x.node)[1:2],
                                                time_major=False)

        self.state = graph.TfNode(self.state)
        self.weight = graph.TfNode(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                              tf.get_variable_scope().name))
        return outputs
Example #8
    def build_graph(self):
        input = layer.Input(cfg.config.input)
        self.ph_action = graph.Placeholder(
            np.float32, (None, cfg.config.output.action_size))

        sizes = cfg.config.hidden_sizes
        assert len(sizes) > 1, 'You need to provide sizes for at least 2 layers'

        dense_1st = layer.Dense(layer.Flatten(input), sizes[0],
                                layer.Activation.Relu)
        dense_2nd = layer.DoubleDense(dense_1st, self.ph_action, sizes[1],
                                      layer.Activation.Relu)
        layers = [input, dense_1st, dense_2nd]

        net = layer.GenericLayers(dense_2nd, [
            dict(type=layer.Dense, size=size, activation=layer.Activation.Relu)
            for size in sizes[2:]
        ])
        if len(sizes[2:]) > 0:
            layers.append(net)

        self.critic = layer.Dense(net, 1, init_var=3e-3)
        self.ph_state = input.ph_state

        layers.append(self.critic)
        self.weights = layer.Weights(*layers)
Example #9
    def build_graph(self):
        # Build graph
        state = graph.Placeholder(np.float32, shape=(2, ))
        reverse = graph.TfNode(tf.reverse(state.node, [0]))

        # Expose public API
        self.op_get_action = self.Op(reverse, state=state)
Example #10
    def build_graph(self, goal, critic):
        self.ph_stc_diff_st =\
            graph.Placeholder(np.float32, shape=(None, cfg.d), name="ph_stc_diff_st")
        s_diff_normalized = tf.nn.l2_normalize(self.ph_stc_diff_st.node, dim=1)

        cosine_similarity = tf.matmul(s_diff_normalized,
                                      goal.node,
                                      transpose_b=True)
        cosine_similarity = tf.diag_part(cosine_similarity)

        # manager's advantage (R-V): R = ri + cfg.wGAMMA * R; AdvM = R - ViM
        self.ph_discounted_reward =\
            graph.Placeholder(np.float32, shape=(None,), name="ph_m_discounted_reward")
        advantage = self.ph_discounted_reward.node - critic.node

        manager_loss = tf.reduce_sum(advantage * cosine_similarity)
        return manager_loss
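Taking tf.diag_part of the matmul above picks out the row-wise dot products, i.e. the per-sample cosine similarity once both operands are unit-normalized (the manager network in Example #11 l2-normalizes its goal output). A standalone NumPy sketch with illustrative shapes:

import numpy as np

s_diff = np.random.randn(4, 8)
goal = np.random.randn(4, 8)

s_diff_n = s_diff / np.linalg.norm(s_diff, axis=1, keepdims=True)
goal_n = goal / np.linalg.norm(goal, axis=1, keepdims=True)

full = s_diff_n @ goal_n.T                    # [batch, batch] matrix of all pairs
row_wise = np.sum(s_diff_n * goal_n, axis=1)  # per-sample cosine similarity
assert np.allclose(np.diag(full), row_wise)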
Example #11
    def build_graph(self):
        self.ph_perception =\
            graph.Placeholder(np.float32, shape=(None, cfg.d), name="ph_perception")
        # tf.placeholder(tf.float32, shape=[None, cfg.d], name="ph_perception")

        self.Mspace =\
            layer.Dense(self.ph_perception, cfg.d,  # d=256
                        activation=layer.Activation.Relu)
        Mspace_expanded = graph.Expand(self.Mspace, 0)

        self.lstm = DilatedLSTMCell(cfg.d, num_cores=cfg.d)
        # needs to be wrapped as a layer to retrieve its weights

        self.ph_step_size =\
            graph.Placeholder(np.float32, shape=(1,), name="ph_m_step_size")
        # tf.placeholder(tf.float32, [1], name="ph_m_step_size")
        self.ph_initial_lstm_state =\
            graph.Placeholder(np.float32, shape=(1, self.lstm.state_size), name="ph_m_lstm_state")
        # tf.placeholder(tf.float32, [1, self.lstm.state_size], name="ph_m_lstm_state")

        lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
            self.lstm,
            Mspace_expanded,
            initial_state=self.ph_initial_lstm_state,
            sequence_length=self.ph_step_size,
            time_major=False)
        lstm_outputs = tf.reshape(lstm_outputs, [-1, cfg.d])
        sg_lstm_outputs = graph.TfNode(lstm_outputs)

        self.goal = tf.nn.l2_normalize(graph.Flatten(sg_lstm_outputs), dim=1)

        critic = layer.Dense(sg_lstm_outputs, 1)
        self.value = layer.Flatten(critic)

        self.weights = layer.Weights(
            self.Mspace, graph.TfNode((self.lstm.matrix, self.lstm.bias)),
            critic)

        self.lstm_state_out =\
            graph.VarAssign(graph.Variable(np.zeros([1, self.lstm.state_size]),
                                           dtype=np.float32, name="lstm_state_out"),
                            np.zeros([1, self.lstm.state_size]))
Example #12
    def build_graph(self, actor, critic, cfg):
        self.ph_action = graph.Placeholder(np.float32,
                                           shape=(None, actor.action_size),
                                           name="ph_action")
        self.ph_advantage = graph.Placeholder(np.float32,
                                              shape=(None, ),
                                              name="ph_adv")
        self.ph_discounted_reward = graph.Placeholder(np.float32,
                                                      shape=(None, ),
                                                      name="ph_edr")

        mu, sigma2 = actor.node
        sigma2 += tf.constant(1e-8)

        normal_dist = tf.contrib.distributions.Normal(mu, sigma2)
        log_prob = normal_dist.log_prob(self.ph_action.node)

        if cfg.entropy_type == 'Gauss':
            self.entropy = tf.reduce_mean(normal_dist.entropy())
        elif cfg.entropy_type == 'Origin':
            self.entropy = tf.reduce_mean(-0.5 *
                                          (tf.log(2 * np.pi * sigma2) + 1.0))
        else:
            assert False, 'entropy_type must be one of two variants: Gauss or Origin'

        self.policy_loss = -(tf.reduce_mean(
            tf.reduce_sum(log_prob, axis=1) * self.ph_advantage.node) +
                             cfg.entropy_beta * self.entropy)
        if cfg.policy_clip:
            self.policy_loss = tf.clip_by_value(self.policy_loss,
                                                -tf.abs(cfg.policy_clip),
                                                tf.abs(cfg.policy_clip))

        # The Critic's learning rate is scaled by the critic_scale parameter
        self.value_loss = cfg.critic_scale * tf.reduce_mean(
            tf.square(self.ph_discounted_reward.node - critic.node))
        if cfg.critic_clip:
            self.value_loss = tf.clip_by_value(self.value_loss,
                                               -tf.abs(cfg.critic_clip),
                                               tf.abs(cfg.critic_clip))
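The 'Gauss' branch relies on the library's closed-form Normal entropy. For reference, that closed form, and how it relates to the expression evaluated (with a leading minus sign) in the 'Origin' branch, can be checked with a standalone scipy sketch; sigma below is an arbitrary illustrative value, and whether sigma2 above is a standard deviation or a variance is left as in the original:

import numpy as np
from scipy.stats import norm

# closed-form differential entropy of N(mu, sigma): 0.5 * log(2 * pi * e * sigma^2)
sigma = 1.7
closed_form = 0.5 * np.log(2 * np.pi * np.e * sigma ** 2)
assert np.isclose(closed_form, norm.entropy(loc=0.0, scale=sigma))

# the same value rearranged into the 0.5 * (log(2 pi sigma^2) + 1) form
assert np.isclose(closed_form, 0.5 * (np.log(2 * np.pi * sigma ** 2) + 1.0))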
Example #13
    def build_graph(self, x, batch_size, n_units, n_cores):
        lstm = graph.DilatedLSTMCell(n_units, n_cores)

        self.ph_state = graph.Placeholder(np.float32, [batch_size, lstm.state_size])
        self.zero_state = np.zeros([batch_size, lstm.state_size])

        outputs, self.state = tf.nn.dynamic_rnn(lstm, x.node, initial_state=self.ph_state.checked,
                                                sequence_length=tf.shape(x.node)[1:2], time_major=False)

        self.state = graph.TfNode(self.state)
        self.weight = graph.TfNode([lstm.matrix, lstm.bias])
        self.reset_timestep = graph.TfNode(lstm.reset_timestep)
        return outputs
Example #14
    def build_graph(self):
        conv_layer = dict(type=layer.Convolution, activation=layer.Activation.Elu,
                          n_filters=32, filter_size=[3, 3], stride=[2, 2],
                          border=layer.Border.Same)
        input = layer.Input(cfg.config.input, descs=[dict(conv_layer)] * 4)

        shape = [None] + [cfg.config.output.action_size]
        self.ph_probs = graph.Placeholder(np.float32, shape=shape, name='act_probs')
        self.ph_taken = graph.Placeholder(np.int32, shape=(None,), name='act_taken')

        flattened_input = layer.Flatten(input)
        last_size = flattened_input.node.shape.as_list()[-1]

        inverse_inp = graph.Reshape(input, [-1, last_size*2])

        get_first = graph.TfNode(inverse_inp.node[:, :last_size])
        get_second = graph.TfNode(inverse_inp.node[:, last_size:])

        forward_inp = graph.Concat([get_first, self.ph_probs], axis=1)

        fc_size = cfg.config.hidden_sizes[-1]
        inv_fc1 = layer.Dense(inverse_inp, fc_size, layer.Activation.Relu)
        inv_fc2 = layer.Dense(inv_fc1, shape[-1])   # layer.Activation.Softmax

        fwd_fc1 = layer.Dense(forward_inp, fc_size, layer.Activation.Relu)
        fwd_fc2 = layer.Dense(fwd_fc1, last_size)

        inv_loss = graph.SparseSoftmaxCrossEntropyWithLogits(inv_fc2, self.ph_taken).op
        fwd_loss = graph.L2loss(fwd_fc2.node - get_second.node).op

        self.ph_state = input.ph_state  # the batch size should be even for now (states are paired)
        self.rew_out = graph.TfNode(cfg.config.icm.nu * fwd_loss)

        self.loss = graph.TfNode(cfg.config.icm.beta * fwd_loss + (1 - cfg.config.icm.beta) * inv_loss)

        layers = [input, inv_fc1, inv_fc2, fwd_fc1, fwd_fc2]
        self.weights = layer.Weights(*layers)
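The reshape to [-1, last_size*2] above relies on consecutive samples in the batch forming (s_t, s_t+1) pairs, which is also why the batch size needs to be even. A standalone NumPy sketch of that pairing, with an illustrative state size and the pairing convention stated as an assumption:

import numpy as np

last_size = 5
states = np.arange(4 * last_size).reshape(4, last_size)   # 4 consecutive flattened states

paired = states.reshape(-1, last_size * 2)                # -> shape (2, 10)
s_t, s_next = paired[:, :last_size], paired[:, last_size:]
assert np.array_equal(s_t, states[0::2])
assert np.array_equal(s_next, states[1::2])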
Example #15
    def build_graph(self):
        input_size, = trpo_config.config.input.shape

        # add one extra feature for timestep
        ph_state = graph.Placeholder(np.float32, shape=(None, input_size + 1))

        activation = layer.Activation.get_activation(trpo_config.config.activation)
        descs = [dict(type=layer.Dense, size=size, activation=activation) for size
                 in trpo_config.config.hidden_sizes]
        descs.append(dict(type=layer.Dense, size=1))

        value = layer.GenericLayers(ph_state, descs)

        ph_ytarg_ny = graph.Placeholder(np.float32)
        mse = graph.TfNode(tf.reduce_mean(tf.square(ph_ytarg_ny.node - value.node)))

        weights = layer.Weights(value)

        sg_get_weights_flatten = GetVariablesFlatten(weights)
        sg_set_weights_flatten = SetVariablesFlatten(weights)

        l2 = graph.TfNode(1e-3 * tf.add_n([tf.reduce_sum(tf.square(v)) for v in
                                           utils.Utils.flatten(weights.node)]))
        loss = graph.TfNode(l2.node + mse.node)

        sg_gradients = optimizer.Gradients(weights, loss=loss)
        sg_gradients_flatten = GetVariablesFlatten(sg_gradients.calculate)

        self.op_value = self.Op(value, state=ph_state)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        self.op_compute_loss_and_gradient = self.Ops(loss, sg_gradients_flatten, state=ph_state,
                                                     ytarg_ny=ph_ytarg_ny)

        self.op_losses = self.Ops(loss, mse, l2, state=ph_state, ytarg_ny=ph_ytarg_ny)
Example #16
    def build_graph(self, actor, critic, cfg):
        self.ph_action = graph.Placeholder(np.float32,
                                           shape=(None, actor.action_size),
                                           name="ph_action")
        self.ph_advantage = graph.Placeholder(np.float32,
                                              shape=(None, ),
                                              name="ph_adv")
        self.ph_discounted_reward = graph.Placeholder(np.float32,
                                                      shape=(None, ),
                                                      name="ph_edr")

        mu, sigma2 = actor.node
        sigma2 += tf.constant(1e-8)

        # policy entropy
        self.entropy = -tf.reduce_mean(0.5 *
                                       (tf.log(2. * np.pi * sigma2) + 1.))

        # policy loss (calculation)
        b_size = tf.to_float(tf.size(self.ph_action.node) / actor.action_size)
        log_pi = tf.log(sigma2)
        x_prec = tf.exp(-log_pi)
        x_diff = tf.subtract(self.ph_action.node, mu)
        x_power = tf.square(x_diff) * x_prec * -0.5
        gaussian_nll = (tf.reduce_sum(log_pi, axis=1) +
                        b_size * tf.log(2. * np.pi)) / 2. - tf.reduce_sum(
                            x_power, axis=1)

        self.policy_loss = -(
            tf.reduce_mean(gaussian_nll * self.ph_advantage.node) +
            cfg.entropy_beta * self.entropy)

        # value loss
        # (the Critic's learning rate is scaled by the critic_scale parameter)
        self.value_loss = cfg.critic_scale * tf.reduce_mean(
            tf.square(self.ph_discounted_reward.node - critic.node))
Example #17
    def build_graph(self, kl_first_fixed, weights):
        weight_list = list(utils.Utils.flatten(weights.node))
        gradients1 = tf.gradients(kl_first_fixed.node, weight_list)
        ph_tangent = graph.Placeholder(np.float32, shape=(None,))

        gvp = []
        start = 0
        for g in gradients1:
            size = np.prod(g.shape.as_list())
            gvp.append(tf.reduce_sum(tf.reshape(g, [-1]) * ph_tangent.node[start:start + size]))
            start += size

        gradients2 = tf.gradients(gvp, weight_list)
        fvp = tf.concat([tf.reshape(g, [-1]) for g in gradients2], axis=0)

        self.ph_tangent = ph_tangent
        return fvp
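The two tf.gradients calls implement a Hessian-vector product by double backprop: the first pass yields the gradient of the KL, its dot product with the tangent is a scalar, and differentiating that scalar again gives the Hessian applied to the tangent, which TRPO uses as the Fisher-vector product without ever materializing the full matrix. A standalone toy sketch of the same trick on a simple function (not the KL above), using the same TF1-style API as the examples:

import numpy as np
import tensorflow as tf

# f(w) = sum(w^4) has a diagonal Hessian with entries 12 * w^2,
# so the product below is easy to verify by hand
w = tf.Variable([1.0, 2.0, 3.0])
v = tf.constant([1.0, 0.0, -1.0])

f = tf.reduce_sum(w ** 4)
g = tf.gradients(f, [w])[0]        # first backprop: g = 4 w^3
gvp = tf.reduce_sum(g * v)         # scalar g . v
hvp = tf.gradients(gvp, [w])[0]    # second backprop: H v = 12 w^2 * v

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(hvp))           # [12., 0., -108.]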
Example #18
    def build_graph(self, input):
        if hasattr(input, 'shape'):
            input_shape = input.shape
        else:
            input_shape = input.image

        if np.prod(input_shape) == 0:
            input_shape = [1]
        shape = [None] + input_shape + [input.history]
        self.ph_state = graph.Placeholder(np.float32, shape=shape)

        if len(shape) <= 4:
            state_input = self.ph_state.checked
        else:
            # move channels after history
            perm = list(range(len(shape)))
            perm = perm[0:3] + perm[-1:] + perm[3:-1]
            transpose = tf.transpose(self.ph_state.checked, perm=perm)

            # mix history and channels in one dimension
            state_input = tf.reshape(transpose, [-1] + shape[1:3] + [np.prod(shape[3:])])

        return state_input
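When the input carries both a channel and a history dimension, the transpose above moves history next to the channels and the reshape folds the two into a single axis. A standalone NumPy sketch of the same shape manipulation, assuming an illustrative 84x84 RGB input with a history of 4 frames:

import numpy as np

batch, H, W, C, history = 2, 84, 84, 3, 4
state = np.random.randn(batch, H, W, C, history)

perm = list(range(5))
perm = perm[0:3] + perm[-1:] + perm[3:-1]      # -> [0, 1, 2, 4, 3]
transposed = np.transpose(state, perm)         # channels now follow history

mixed = transposed.reshape(-1, H, W, C * history)
assert mixed.shape == (batch, H, W, 12)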
Example #19
    def build_graph(self, sg_value_net):
        # 'Observed' value of a state = discounted reward
        vf_scale = dppo_config.config.critic_scale

        ph_ytarg_ny = graph.Placeholder(np.float32)
        v1_loss = graph.TfNode(tf.square(sg_value_net.head.node - ph_ytarg_ny.node))

        if dppo_config.config.vf_clipped_loss:
            ph_old_vpred = graph.Placeholder(np.float32)
            clip_e = dppo_config.config.clip_e
            vpredclipped = ph_old_vpred.node + tf.clip_by_value(sg_value_net.head.node - ph_old_vpred.node,
                                                                -clip_e, clip_e)
            v2_loss = graph.TfNode(tf.square(vpredclipped - ph_ytarg_ny.node))
            vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(tf.maximum(v2_loss.node, v1_loss.node)))
        else:
            vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(v1_loss.node))

        if dppo_config.config.l2_coeff is not None:
            l2 = graph.TfNode(dppo_config.config.l2_coeff *
                              tf.add_n([tf.reduce_sum(tf.square(v)) for v in
                                        utils.Utils.flatten(sg_value_net.weights.node)]))

            sg_vf_total_loss = graph.TfNode(l2.node + vf_mse.node)
        else:
            sg_vf_total_loss = vf_mse

        sg_gradients = optimizer.Gradients(sg_value_net.weights, loss=sg_vf_total_loss,
                                           norm=dppo_config.config.gradients_norm_clipping)
        sg_gradients_flatten = graph.GetVariablesFlatten(sg_gradients.calculate)

        # Op to compute value of a state
        if dppo_config.config.use_lstm:
            self.op_value = self.Ops(sg_value_net.head, sg_value_net.lstm_state,
                                     state=sg_value_net.ph_state, lstm_state=sg_value_net.ph_lstm_state)
            self.op_lstm_reset_timestep = self.Op(sg_value_net.lstm_reset_timestep)
        else:
            self.op_value = self.Op(sg_value_net.head, state=sg_value_net.ph_state)

        self.op_get_weights = self.Op(sg_value_net.weights)
        self.op_assign_weights = self.Op(sg_value_net.weights.assign,
                                         weights=sg_value_net.weights.ph_weights)

        sg_get_weights_flatten = graph.GetVariablesFlatten(sg_value_net.weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(sg_value_net.weights)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        feeds = dict(state=sg_value_net.ph_state, ytarg_ny=ph_ytarg_ny)
        if dppo_config.config.use_lstm:
            feeds.update(dict(lstm_state=sg_value_net.ph_lstm_state))
        if dppo_config.config.vf_clipped_loss:
            feeds.update(dict(vpred_old=ph_old_vpred))

        self.op_compute_gradients = self.Op(sg_gradients.calculate, **feeds)
        if dppo_config.config.use_lstm:
            self.op_compute_gradients = self.Ops(sg_gradients.calculate, sg_value_net.lstm_state, **feeds)

        self.op_compute_loss_and_gradient_flatten = self.Ops(sg_vf_total_loss, sg_gradients_flatten, **feeds)

        losses = [sg_vf_total_loss, vf_mse]
        if dppo_config.config.l2_coeff is not None:
            losses.append(l2)
        self.op_losses = self.Ops(*losses, **feeds)

        # Init Op for all weights
        sg_initialize = graph.Initialize()
        self.op_initialize = self.Op(sg_initialize)
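The vf_clipped_loss branch is the PPO-style clipped value loss: the new prediction may move at most clip_e away from the old one, and the larger of the clipped and unclipped squared errors is penalized. A standalone NumPy restatement with illustrative numbers:

import numpy as np

clip_e, vf_scale = 0.2, 1.0
vpred = np.array([1.5, 0.2])        # new value predictions
vpred_old = np.array([1.0, 0.3])    # predictions from before the update
ytarg = np.array([1.2, 0.0])        # discounted-reward targets

v1 = (vpred - ytarg) ** 2
vpred_clipped = vpred_old + np.clip(vpred - vpred_old, -clip_e, clip_e)
v2 = (vpred_clipped - ytarg) ** 2
vf_mse = vf_scale * np.mean(np.maximum(v1, v2))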
Example #20
    def build_graph(self):
        # Build graph
        sg_actor_network = ActorNetwork()
        sg_critic_network = CriticNetwork()
        sg_actor_target_network = ActorNetwork()
        sg_critic_target_network = CriticNetwork()

        ph_action_gradient = graph.Placeholder(np.float32, (None, cfg.config.output.action_size))
        actor_grad_args = dict(loss=sg_actor_network.actor, grad_ys=-ph_action_gradient.node)

        if cfg.config.no_ps:
            sg_actor_optimizer = optimizer.AdamOptimizer(cfg.config.actor_learning_rate)
            actor_grad_args.update(dict(optimizer=sg_actor_optimizer))

        sg_actor_gradients = optimizer.Gradients(sg_actor_network.weights, **actor_grad_args)

        sg_critic_loss = loss.DDPGLoss(sg_critic_network, cfg.config)
        critic_grad_args = dict(loss=sg_critic_loss)

        if cfg.config.no_ps:
            sg_critic_optimizer = optimizer.AdamOptimizer(cfg.config.critic_learning_rate)
            critic_grad_args.update(dict(optimizer=sg_critic_optimizer))

        sg_critic_gradients = optimizer.Gradients(sg_critic_network.weights, **critic_grad_args)

        sg_critic_action_gradients = optimizer.Gradients(sg_critic_network.ph_action,
                                                         loss=sg_critic_network.critic)

        # Expose public API
        self.op_assign_actor_weights = self.Op(sg_actor_network.weights.assign,
                                               weights=sg_actor_network.weights.ph_weights)
        self.op_assign_critic_weights = self.Op(sg_critic_network.weights.assign,
                                                weights=sg_critic_network.weights.ph_weights)
        self.op_assign_actor_target_weights = self.Op(sg_actor_target_network.weights.assign,
                                                      weights=sg_actor_target_network.weights.ph_weights)
        self.op_assign_critic_target_weights = self.Op(sg_critic_target_network.weights.assign,
                                                       weights=sg_critic_target_network.weights.ph_weights)

        self.op_get_action = self.Op(sg_actor_network.actor,
                                     state=sg_actor_network.ph_state)
        self.op_get_critic_q = self.Op(sg_critic_network.critic,
                                       state=sg_critic_network.ph_state,
                                       action=sg_critic_network.ph_action)

        self.op_get_actor_target = self.Op(sg_actor_target_network.actor,
                                           state=sg_actor_target_network.ph_state)
        self.op_get_critic_target = self.Op(sg_critic_target_network.critic,
                                            state=sg_critic_target_network.ph_state,
                                            action=sg_critic_target_network.ph_action)

        self.op_compute_actor_gradients = self.Op(sg_actor_gradients.calculate,
                                                  state=sg_actor_network.ph_state,
                                                  grad_ys=ph_action_gradient)

        self.op_compute_critic_gradients = self.Op(sg_critic_gradients.calculate,
                                                   state=sg_critic_network.ph_state,
                                                   action=sg_critic_network.ph_action,
                                                   predicted=sg_critic_loss.ph_predicted)

        self.op_compute_critic_action_gradients = self.Op(sg_critic_action_gradients.calculate,
                                                          state=sg_critic_network.ph_state,
                                                          action=sg_critic_network.ph_action)

        # Integrated with grad computation by log_lvl
        self.op_critic_loss = self.Op(sg_critic_loss,
                                      state=sg_critic_network.ph_state,
                                      action=sg_critic_network.ph_action,
                                      predicted=sg_critic_loss.ph_predicted)
        self.op_compute_norm_actor_gradients = self.Op(sg_actor_gradients.global_norm,
                                                       state=sg_actor_network.ph_state,
                                                       grad_ys=ph_action_gradient)
        self.op_compute_norm_critic_gradients = self.Op(sg_critic_gradients.global_norm,
                                                        state=sg_critic_network.ph_state,
                                                        action=sg_critic_network.ph_action,
                                                        predicted=sg_critic_loss.ph_predicted)
        self.op_compute_norm_critic_action_gradients = self.Op(sg_critic_action_gradients.global_norm,
                                                               state=sg_critic_network.ph_state,
                                                               action=sg_critic_network.ph_action)

        if cfg.config.no_ps:
            sg_actor_weights = sg_actor_network.weights
            sg_critic_weights = sg_critic_network.weights

            sg_actor_target_weights = sg_actor_target_network.weights
            sg_critic_target_weights = sg_critic_target_network.weights

            # target network weights are (re)assigned from the actor & critic networks
            sg_init_actor_target_weights = \
                graph.AssignWeights(sg_actor_target_weights, sg_actor_weights).op
            sg_init_critic_target_weights = \
                graph.AssignWeights(sg_critic_target_weights, sg_critic_weights).op

            sg_update_actor_target_weights = \
                graph.AssignWeights(sg_actor_target_weights, sg_actor_weights, cfg.config.tau).op
            sg_update_critic_target_weights = \
                graph.AssignWeights(sg_critic_target_weights, sg_critic_weights, cfg.config.tau).op

            self.op_get_weights = self.Ops(sg_actor_weights, sg_actor_target_weights,
                                           sg_critic_weights, sg_critic_target_weights)

            self.op_init_target_weights = self.Ops(sg_init_actor_target_weights,
                                                   sg_init_critic_target_weights)

            self.op_update_target_weights = self.Ops(sg_update_actor_target_weights,
                                                     sg_update_critic_target_weights)

            self.op_apply_actor_gradients = self.Ops(sg_actor_gradients.apply,
                                                     gradients=sg_actor_gradients.ph_gradients)
            self.op_apply_critic_gradients = self.Op(sg_critic_gradients.apply,
                                                     gradients=sg_critic_gradients.ph_gradients)
            sg_initialize = graph.Initialize()
            self.op_initialize = self.Op(sg_initialize)
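DDPG conventionally refreshes its target networks with a soft update, theta_target <- tau * theta + (1 - tau) * theta_target. A minimal NumPy sketch of that rule follows; it is an assumption, not something stated in the example, that graph.AssignWeights applies this convention when given the tau argument:

import numpy as np

tau = 0.001
theta = np.array([0.5, -1.0, 2.0])          # online network weights
theta_target = np.array([0.4, -0.8, 1.5])   # target network weights

# soft update: the target slowly tracks the online network
theta_target = tau * theta + (1.0 - tau) * theta_target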