Example #1
    def build_graph(self,
                    weights,
                    loss=None,
                    optimizer=None,
                    norm=False,
                    batch_size=None,
                    grad_ys=None):
        if loss is not None:
            gradients = tf.gradients(loss.node,
                                     list(utils.Utils.flatten(weights.node)),
                                     grad_ys)
            gradients = [
                tf.check_numerics(g, 'gradient_%d' % i)
                for i, g in enumerate(gradients)
            ]
            if batch_size is not None:
                gradients = [g / float(batch_size) for g in gradients]

            # store the gradients' global norm before clipping
            self.global_norm = tf.global_norm(gradients)

            # clip gradients after global norm has been stored
            if norm:
                gradients, _ = tf.clip_by_global_norm(gradients, norm)
            self.calculate = graph.TfNode(
                utils.Utils.reconstruct(gradients, weights.node))
        if optimizer is not None:
            self.ph_gradients = graph.Placeholders(weights)
            self.apply = graph.TfNode(
                optimizer.node.apply_gradients(
                    utils.Utils.izip(self.ph_gradients.checked, weights.node)))
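Example #1 computes gradients, guards them with check_numerics, records their global norm, and optionally clips. The clipping step is plain TensorFlow; a minimal standalone sketch of the same sequence (TF 1.x API, toy loss chosen here for illustration):

import tensorflow as tf

# Toy quadratic loss over one weight vector.
w = tf.Variable([3.0, 4.0])
loss = tf.reduce_sum(tf.square(w))

grads = tf.gradients(loss, [w])
# Fail fast if any gradient contains NaN or Inf.
grads = [tf.check_numerics(g, 'gradient_%d' % i) for i, g in enumerate(grads)]

# Record the pre-clipping norm, then rescale so the joint norm is at most 1.0.
global_norm = tf.global_norm(grads)
clipped, _ = tf.clip_by_global_norm(grads, 1.0, use_norm=global_norm)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    norm, g = sess.run([global_norm, clipped])
    print(norm, g)  # 10.0, [array([0.6, 0.8])] -- gradient 2*w rescaled to unit norm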
Example #2
    def build_graph(self, x, batch_size=1, n_units=256):
        self.phs = [
            graph.Placeholder(np.float32, [batch_size, n_units])
            for _ in range(2)
        ]
        self.ph_state = graph.TfNode(tuple(ph.node for ph in self.phs))
        self.ph_state.checked = tuple(ph.checked for ph in self.phs)

        self.zero_state = tuple(
            np.zeros([batch_size, n_units]) for _ in range(2))

        state = tf.contrib.rnn.LSTMStateTuple(*self.ph_state.checked)

        lstm = tf.contrib.rnn.BasicLSTMCell(n_units, state_is_tuple=True)

        outputs, self.state = tf.nn.dynamic_rnn(lstm,
                                                x.node,
                                                initial_state=state,
                                                sequence_length=tf.shape(
                                                    x.node)[1:2],
                                                time_major=False)

        self.state = graph.TfNode(self.state)
        self.weight = graph.TfNode(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                              tf.get_variable_scope().name))
        return outputs
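Examples #2 and #5 keep the recurrent state outside the graph: the agent feeds the previous state through placeholders and reads the next state back, so one unrolled graph serves arbitrarily long episodes. A self-contained sketch of that loop in raw TF 1.x (sizes are made up):

import numpy as np
import tensorflow as tf

batch_size, n_units = 1, 8
x = tf.placeholder(tf.float32, [batch_size, None, 4])  # [batch, time, features]
ph_c = tf.placeholder(tf.float32, [batch_size, n_units])
ph_h = tf.placeholder(tf.float32, [batch_size, n_units])
initial = tf.contrib.rnn.LSTMStateTuple(ph_c, ph_h)

cell = tf.contrib.rnn.BasicLSTMCell(n_units, state_is_tuple=True)
outputs, next_state = tf.nn.dynamic_rnn(cell, x, initial_state=initial, time_major=False)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    c = h = np.zeros([batch_size, n_units], np.float32)  # the zero_state equivalent
    for chunk in [np.ones([batch_size, 5, 4], np.float32)] * 3:
        # Thread the returned state back in, making the LSTM stateful across calls.
        _, (c, h) = sess.run([outputs, next_state], {x: chunk, ph_c: c, ph_h: h})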
Example #3
    def build_graph(self, *layers):
        weights = [layer.weight.node for layer in layers]
        self.ph_weights = graph.Placeholders(variables=graph.TfNode(weights))
        self.assign = graph.TfNode([tf.assign(variable, value) for variable, value in
                                    utils.Utils.izip(weights, self.ph_weights.checked)])
        self.check = graph.TfNode(tf.group(*[tf.check_numerics(w, 'weight_%d' % i) for i, w in
                                             enumerate(utils.Utils.flatten(weights))]))
        self.global_norm = tf.global_norm(list(utils.Utils.flatten(weights)))
        return weights
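Example #3 builds an assign node fed from placeholders, the usual way to push externally computed weights (e.g. from a parameter server) into a session, with check_numerics guarding the values on the way in. The core pattern in raw TF 1.x:

import numpy as np
import tensorflow as tf

w = tf.Variable(np.zeros((2, 2), np.float32))
ph_w = tf.placeholder(tf.float32, (2, 2))
# Reject NaN/Inf weights before they ever reach the variable.
assign = tf.assign(w, tf.check_numerics(ph_w, 'weight_0'))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign, {ph_w: np.ones((2, 2), np.float32)})
    print(sess.run(tf.global_norm([w])))  # 2.0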
Example #4
    def build_graph(self):
        super(_WorkerNetwork, self).__init__()

        self.lstm = CustomBasicLSTMCell(cfg.d)  # d=256
        # needs to be wrapped as a layer to retrieve its weights

        self.ph_goal =\
            graph.Placeholder(np.float32, shape=(None, cfg.d), name="ph_goal")
        # self.ph_goal = tf.placeholder(tf.float32, [None, cfg.d], name="ph_goal")

        perception_expanded = graph.Expand(self.perception.node, 0)

        self.ph_step_size = \
            graph.Placeholder(np.float32, shape=(1,), name="ph_w_step_size")
        # tf.placeholder(tf.float32, [1], name="ph_w_step_size")
        self.ph_initial_lstm_state = \
            graph.Placeholder(np.float32, shape=(1, self.lstm.state_size), name="ph_w_lstm_state")
        # tf.placeholder(tf.float32, [1, self.lstm.state_size], name="ph_w_lstm_state")

        lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
            self.lstm,
            perception_expanded,
            initial_state=self.ph_initial_lstm_state,
            sequence_length=self.ph_step_size,
            time_major=False)
        lstm_outputs = tf.reshape(lstm_outputs, [-1, cfg.d])
        sg_lstm_outputs = graph.TfNode(lstm_outputs)

        U = layer.LinearLayer(sg_lstm_outputs,
                              shape=(cfg.d, cfg.action_size * cfg.k),
                              transformation=tf.matmul)
        # reshape operates on the layer's output tensor, hence U.node
        U_embedding = tf.transpose(tf.reshape(U.node, [cfg.action_size, cfg.k, -1]))

        w = layer.LinearLayer(self.ph_goal,
                              shape=(cfg.d, cfg.k),
                              transformation=tf.matmul,
                              bias=False)
        w_reshaped = tf.reshape(w.node, [-1, 1, cfg.k])

        self.pi = layer.MatmulLayer(w_reshaped,
                                    U_embedding,
                                    activation=layer.Activation.Softmax)
        self.vi = layer.LinearLayer(sg_lstm_outputs,
                                    shape=(cfg.d, 1),
                                    transformation=tf.matmul)

        self.weights = layer.Weights(
            self.weights, graph.TfNode((self.lstm.matrix, self.lstm.bias)), U,
            w, self.vi)

        self.lstm_state_out =\
            graph.VarAssign(graph.Variable(np.zeros([1, self.lstm.state_size]),
                                           dtype=np.float32, name="lstm_state_out"),
                            np.zeros([1, self.lstm.state_size]))
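The worker head above follows the FeuDal Networks recipe: the LSTM output is projected to a per-state embedding matrix U of shape [k, action_size], the goal is projected to a k-vector w, and the policy is softmax(w U). A shape-annotated sketch of that combination (sizes made up, and reshaped directly to [batch, k, actions] rather than through the example's transpose):

import tensorflow as tf

batch, d, k, actions = 5, 256, 16, 4
lstm_out = tf.placeholder(tf.float32, [batch, d])
goal = tf.placeholder(tf.float32, [batch, d])

U_flat = tf.layers.dense(lstm_out, actions * k)         # [batch, actions * k]
U = tf.reshape(U_flat, [-1, k, actions])                # [batch, k, actions]
w = tf.layers.dense(goal, k, use_bias=False)            # [batch, k]
w_row = tf.reshape(w, [-1, 1, k])                       # [batch, 1, k]
pi = tf.nn.softmax(tf.matmul(w_row, U))                 # [batch, 1, actions]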
Example #5
    def build_graph(self, x, batch_size, n_units, n_cores):
        lstm = graph.DilatedLSTMCell(n_units, n_cores)

        self.ph_state = graph.Placeholder(np.float32, [batch_size, lstm.state_size])
        self.zero_state = np.zeros([batch_size, lstm.state_size])

        outputs, self.state = tf.nn.dynamic_rnn(lstm, x.node, initial_state=self.ph_state.checked,
                                                sequence_length=tf.shape(x.node)[1:2], time_major=False)

        self.state = graph.TfNode(self.state)
        self.weight = graph.TfNode([lstm.matrix, lstm.bias])
        self.reset_timestep = graph.TfNode(lstm.reset_timestep)
        return outputs
Example #6
    def build_graph(self):
        input = layer.ConfiguredInput(trpo_config.config.input)
        # add one extra feature for timestep
        ph_step = graph.Placeholder(np.float32, shape=[None, 1])
        state = (input.ph_state, ph_step)

        concatenated = graph.Concat([layer.Flatten(input), ph_step], axis=1)

        activation = layer.Activation.get_activation(
            trpo_config.config.activation)
        head = layer.GenericLayers(concatenated, [
            dict(type=layer.Dense, size=size, activation=activation)
            for size in trpo_config.config.hidden_sizes
        ])
        value = layer.Dense(head, 1)

        ph_ytarg_ny = graph.Placeholder(np.float32)
        mse = graph.TfNode(
            tf.reduce_mean(tf.square(ph_ytarg_ny.node - value.node)))

        weights = layer.Weights(input, head, value)

        sg_get_weights_flatten = graph.GetVariablesFlatten(weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(weights)

        l2 = graph.TfNode(1e-3 * tf.add_n([
            tf.reduce_sum(tf.square(v))
            for v in utils.Utils.flatten(weights.node)
        ]))
        loss = graph.TfNode(l2.node + mse.node)

        sg_gradients = optimizer.Gradients(weights, loss=loss)
        sg_gradients_flatten = graph.GetVariablesFlatten(
            sg_gradients.calculate)

        self.op_value = self.Op(value, state=state)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(
            sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        self.op_compute_loss_and_gradient = self.Ops(loss,
                                                     sg_gradients_flatten,
                                                     state=state,
                                                     ytarg_ny=ph_ytarg_ny)

        self.op_losses = self.Ops(loss,
                                  mse,
                                  l2,
                                  state=state,
                                  ytarg_ny=ph_ytarg_ny)
Example #7
    def build_graph(self):
        sg_network = Network()

        sg_get_weights_flatten = GetVariablesFlatten(sg_network.weights)
        sg_set_weights_flatten = SetVariablesFlatten(sg_network.weights)

        ph_adv_n = graph.TfNode(tf.placeholder(tf.float32, name='adv_n'))

        sg_probtype = ProbType(trpo_config.config.output.action_size)

        ph_oldprob_np = sg_probtype.ProbVariable()

        sg_logp_n = sg_probtype.Loglikelihood(sg_network.actor)
        sg_oldlogp_n = sg_probtype.Loglikelihood(ph_oldprob_np)

        sg_surr = graph.TfNode(-tf.reduce_mean(tf.exp(sg_logp_n.node - sg_oldlogp_n.node) * ph_adv_n.node))

        sg_sum = tf.reduce_sum(sg_probtype.Kl(graph.TfNode(tf.stop_gradient(sg_network.actor.node)),
                                              sg_network.actor).node)
        sg_factor = tf.cast(tf.shape(sg_network.ph_state.node)[0], tf.float32)
        sg_kl_first_fixed = graph.TfNode(sg_sum / sg_factor)

        sg_kl = graph.TfNode(tf.reduce_mean(sg_probtype.Kl(ph_oldprob_np, sg_network.actor).node))

        sg_fvp = FisherVectorProduct(sg_kl_first_fixed, sg_network.weights)

        sg_ent = graph.TfNode(tf.reduce_mean(sg_probtype.Entropy(sg_network.actor).node))

        sg_gradients = optimizer.Gradients(sg_network.weights, loss=sg_surr)
        sg_gradients_flatten = GetVariablesFlatten(sg_gradients.calculate)

        self.op_get_weights = self.Op(sg_network.weights)
        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        self.op_compute_gradient = self.Op(sg_gradients_flatten, state=sg_network.ph_state,
                                           sampled_variable=sg_probtype.ph_sampled_variable, adv_n=ph_adv_n,
                                           oldprob_np=ph_oldprob_np)

        self.op_losses = self.Ops(sg_surr, sg_kl, sg_ent, state=sg_network.ph_state,
                                  sampled_variable=sg_probtype.ph_sampled_variable, adv_n=ph_adv_n,
                                  prob_variable=ph_oldprob_np)

        self.op_fisher_vector_product = self.Op(sg_fvp, tangent=sg_fvp.ph_tangent, state=sg_network.ph_state,
                                                sampled_variable=sg_probtype.ph_sampled_variable,
                                                adv_n=ph_adv_n, prob_variable=ph_oldprob_np)

        # PPO clipped surrogate loss
        # likelihood ratio of old and new policy
        r_theta = tf.exp(sg_logp_n.node - sg_oldlogp_n.node)
        surr = r_theta * ph_adv_n.node
        clip_e = trpo_config.config.PPO.clip_e
        surr_clipped = tf.clip_by_value(r_theta, 1.0 - clip_e,  1.0 + clip_e) * ph_adv_n.node
        sg_ppo_loss = graph.TfNode(-tf.reduce_mean(tf.minimum(surr, surr_clipped)))

        sg_minimize = graph.TfNode(tf.train.AdamOptimizer(
                learning_rate=trpo_config.config.PPO.learning_rate).minimize(sg_ppo_loss.node))
        self.op_ppo_optimize = self.Op(sg_minimize, state=sg_network.ph_state,
                                       sampled_variable=sg_probtype.ph_sampled_variable, adv_n=ph_adv_n,
                                       oldprob_np=ph_oldprob_np)
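The clipped surrogate at the end of Example #7 is the PPO objective: take the elementwise minimum of the unclipped and clipped ratio terms, so the update gains nothing from pushing the likelihood ratio outside [1 - clip_e, 1 + clip_e]. A numeric sketch:

import tensorflow as tf

logp_n = tf.constant([0.0, -1.0])      # new-policy log-likelihoods
oldlogp_n = tf.constant([-1.0, -1.0])  # old-policy log-likelihoods
adv_n = tf.constant([1.0, 1.0])
clip_e = 0.2

r_theta = tf.exp(logp_n - oldlogp_n)   # ratios: [e, 1.0]
surr = r_theta * adv_n
surr_clipped = tf.clip_by_value(r_theta, 1.0 - clip_e, 1.0 + clip_e) * adv_n
ppo_loss = -tf.reduce_mean(tf.minimum(surr, surr_clipped))

with tf.Session() as sess:
    # The first ratio (~2.72) is clipped to 1.2; the second stays at 1.0.
    print(sess.run(ppo_loss))  # -(1.2 + 1.0) / 2 = -1.1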
Example #8
    def build_graph(self, head, output):
        self.action_size = output.action_size
        self.continuous = True

        self.out = Dense(head, self.action_size, activation=Activation.Tanh, init_var=3e-3)
        self.weight = self.out.weight
        self.scaled_out = graph.TfNode(self.out.node * output.scale)
Example #9
    def build_graph(self, x1, x2, size=1, activation=Activation.Null):
        assert len(x1.node.shape) == 2
        shape1 = (x1.node.shape.as_list()[1], size)
        assert len(x2.node.shape) == 2
        shape2 = (x2.node.shape.as_list()[1], size)

        d = 1.0
        p = np.prod(shape1[:-1])
        if p != 0:
            d = 1.0 / np.sqrt(p)
        initializer = graph.RandomUniformInitializer(minval=-d, maxval=d)
        W1 = graph.Variable(initializer(np.float32, shape1)).node

        d = 1.0
        p = np.prod(shape2[:-1])
        if p != 0:
            d = 1.0 / np.sqrt(p)
        initializer = graph.RandomUniformInitializer(minval=-d, maxval=d)
        W2 = graph.Variable(initializer(np.float32, shape2)).node

        initializer = graph.RandomUniformInitializer()
        b = graph.Variable(initializer(np.float32, shape2[-1:])).node

        activation = activation(tf.matmul(x1.node, W1) + tf.matmul(x2.node, W2) + b)
        self.weight = graph.TfNode((W1, W2, b, activation.weight))
        return activation.node
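Examples #9 and #13 both draw initial weights uniformly from [-1/sqrt(fan_in), 1/sqrt(fan_in)], a standard heuristic that keeps pre-activation variance roughly constant as the layer widens. The same initialization in raw TF 1.x:

import numpy as np
import tensorflow as tf

fan_in, fan_out = 256, 64
d = 1.0 / np.sqrt(fan_in)  # bound shrinks as the input gets wider
W = tf.Variable(tf.random_uniform([fan_in, fan_out], minval=-d, maxval=d))
b = tf.Variable(tf.random_uniform([fan_out], minval=-d, maxval=d))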
Example #10
    def build_graph(self):
        # Build graph
        state = graph.Placeholder(np.float32, shape=(2, ))
        reverse = graph.TfNode(tf.reverse(state.node, [0]))

        # Expose public API
        self.op_get_action = self.Op(reverse, state=state)
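Example #10 is the smallest possible model: one placeholder in, one reversed tensor out. Stripped of the Op wrapper, the equivalent raw TF 1.x round trip:

import tensorflow as tf

state = tf.placeholder(tf.float32, shape=(2,))
reverse = tf.reverse(state, [0])

with tf.Session() as sess:
    print(sess.run(reverse, {state: [1.0, 2.0]}))  # [2.0, 1.0]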
Example #11
    def build_graph(self, x):
        input_dim = x.node.get_shape().as_list()[1]
        logstd = tf.Variable(tf.zeros(input_dim, tf.float32))

        std = tf.tile(tf.reshape(tf.exp(logstd), [1, -1]), (tf.shape(x.node)[0], 1))

        self.weight = graph.TfNode(logstd)
        return tf.concat([x.node, std], axis=1)
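Example #11 appends a learned, state-independent log-standard-deviation to the network output: the usual parameterization of a diagonal-Gaussian policy, where the mean comes from the network and the std is a free variable tiled across the batch. A sketch with made-up sizes:

import tensorflow as tf

means = tf.placeholder(tf.float32, [None, 3])      # per-state action means
logstd = tf.Variable(tf.zeros(3, tf.float32))      # one shared std per action dim
std = tf.tile(tf.reshape(tf.exp(logstd), [1, -1]),
              (tf.shape(means)[0], 1))             # broadcast to [batch, 3]
policy_out = tf.concat([means, std], axis=1)       # [batch, 6]: means, then stds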
Example #12
    def build_graph(self):
        self.ph_perception =\
            graph.Placeholder(np.float32, shape=(None, cfg.d), name="ph_perception")
        # tf.placeholder(tf.float32, shape=[None, cfg.d], name="ph_perception")

        self.Mspace =\
            layer.Dense(self.ph_perception, cfg.d,  # d=256
                        activation=layer.Activation.Relu)
        Mspace_expanded = graph.Expand(self.Mspace, 0)

        self.lstm = DilatedLSTMCell(cfg.d, num_cores=cfg.d)
        # needs to be wrapped as a layer to retrieve its weights

        self.ph_step_size =\
            graph.Placeholder(np.float32, shape=(1,), name="ph_m_step_size")
        # tf.placeholder(tf.float32, [1], name="ph_m_step_size")
        self.ph_initial_lstm_state =\
            graph.Placeholder(np.float32, shape=(1, self.lstm.state_size), name="ph_m_lstm_state")
        # tf.placeholder(tf.float32, [1, self.lstm.state_size], name="ph_m_lstm_state")

        lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
            self.lstm,
            Mspace_expanded,
            initial_state=self.ph_initial_lstm_state,
            sequence_length=self.ph_step_size,
            time_major=False)
        lstm_outputs = tf.reshape(lstm_outputs, [-1, cfg.d])
        sg_lstm_outputs = graph.TfNode(lstm_outputs)

        # l2_normalize takes the flattened tensor itself, hence .node
        self.goal = tf.nn.l2_normalize(graph.Flatten(sg_lstm_outputs).node, dim=1)

        critic = layer.Dense(sg_lstm_outputs, 1)
        self.value = layer.Flatten(critic)

        self.weights = layer.Weights(
            self.Mspace, graph.TfNode((self.lstm.matrix, self.lstm.bias)),
            critic)

        self.lstm_state_out =\
            graph.VarAssign(graph.Variable(np.zeros([1, self.lstm.state_size]),
                                           dtype=np.float32, name="lstm_state_out"),
                            np.zeros([1, self.lstm.state_size]))
Example #13
    def build_graph(self, x, shape, transformation, activation, d=None):
        if d is None:
            d = 1.0
            p = np.prod(shape[:-1])
            if p != 0:
                d = 1.0 / np.sqrt(p)
        initializer = graph.RandomUniformInitializer(minval=-d, maxval=d)
        W = graph.Variable(initializer(np.float32, shape)).node
        b = graph.Variable(initializer(np.float32, shape[-1:])).node
        self.weight = graph.TfNode((W, b))
        return activation(transformation(x, W) + b)
Example #14
    def build_graph(self):
        input_size, = trpo_config.config.input.shape

        # add one extra feature for timestep
        ph_state = graph.Placeholder(np.float32, shape=(None, input_size + 1))

        activation = layer.Activation.get_activation(trpo_config.config.activation)
        descs = [dict(type=layer.Dense, size=size, activation=activation) for size
                 in trpo_config.config.hidden_sizes]
        descs.append(dict(type=layer.Dense, size=1))

        value = layer.GenericLayers(ph_state, descs)

        ph_ytarg_ny = graph.Placeholder(np.float32)
        mse = graph.TfNode(tf.reduce_mean(tf.square(ph_ytarg_ny.node - value.node)))

        weights = layer.Weights(value)

        sg_get_weights_flatten = GetVariablesFlatten(weights)
        sg_set_weights_flatten = SetVariablesFlatten(weights)

        l2 = graph.TfNode(1e-3 * tf.add_n([tf.reduce_sum(tf.square(v)) for v in
                                           utils.Utils.flatten(weights.node)]))
        loss = graph.TfNode(l2.node + mse.node)

        sg_gradients = optimizer.Gradients(weights, loss=loss)
        sg_gradients_flatten = GetVariablesFlatten(sg_gradients.calculate)

        self.op_value = self.Op(value, state=ph_state)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        self.op_compute_loss_and_gradient = self.Ops(loss, sg_gradients_flatten, state=ph_state,
                                                     ytarg_ny=ph_ytarg_ny)

        self.op_losses = self.Ops(loss, mse, l2, state=ph_state, ytarg_ny=ph_ytarg_ny)
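Examples #6 and #14 expose the value network's weights as one flat vector via GetVariablesFlatten/SetVariablesFlatten, which is presumably what the TRPO-style outer optimizer consumes (conjugate gradient and line search operate on a single parameter vector). A sketch of what flattening amounts to:

import tensorflow as tf

w1 = tf.Variable(tf.ones([2, 3]))
w2 = tf.Variable(tf.zeros([3]))
# Concatenate every weight tensor, flattened, into one vector of length 9.
flat = tf.concat([tf.reshape(v, [-1]) for v in [w1, w2]], axis=0)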
Example #15
    def build_graph(self):
        input = layer.ConfiguredInput(config.input)

        hidden = layer.GenericLayers(layer.Flatten(input), [
            dict(type=layer.Dense, size=size, activation=layer.Activation.Tanh)
            for size in config.hidden_sizes
        ])

        weights = [input, hidden]

        if config.dueling_dqn:
            if config.hidden_sizes:
                v_input, a_input = tf.split(hidden.node, [
                    config.hidden_sizes[-1] // 2, config.hidden_sizes[-1] // 2
                ],
                                            axis=1)

                v_input = graph.TfNode(v_input)
                a_input = graph.TfNode(a_input)
            else:
                v_input, a_input = hidden, hidden

            v_output = layer.Dense(v_input, 1)
            a_output = layer.Dense(a_input, config.output.action_size)

            output = v_output.node + a_output.node - tf.reduce_mean(
                a_output.node, axis=1, keep_dims=True)
            output = graph.TfNode(output)

            weights.extend([v_output, a_output])
        else:
            output = layer.Dense(hidden, config.output.action_size)
            weights.append(output)

        self.ph_state = input.ph_state
        self.output = output
        self.weights = layer.Weights(*weights)
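The dueling branch in Example #15 recombines the value and advantage streams as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a); subtracting the mean advantage pins down the otherwise unidentifiable split between V and A. A numeric sketch:

import tensorflow as tf

v = tf.constant([[2.0]])             # V(s), shape [batch, 1]
a = tf.constant([[1.0, 3.0, 5.0]])   # A(s, a), shape [batch, actions]
q = v + a - tf.reduce_mean(a, axis=1, keep_dims=True)

with tf.Session() as sess:
    print(sess.run(q))  # [[0. 2. 4.]] -- the mean advantage (3.0) is removed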
Example #16
    def build_graph(self):
        conv_layer = dict(type=layer.Convolution, activation=layer.Activation.Elu,
                          n_filters=32, filter_size=[3, 3], stride=[2, 2],
                          border=layer.Border.Same)
        input = layer.Input(cfg.config.input, descs=[dict(conv_layer)] * 4)

        shape = [None] + [cfg.config.output.action_size]
        self.ph_probs = graph.Placeholder(np.float32, shape=shape, name='act_probs')
        self.ph_taken = graph.Placeholder(np.int32, shape=(None,), name='act_taken')

        flattened_input = layer.Flatten(input)
        last_size = flattened_input.node.shape.as_list()[-1]

        inverse_inp = graph.Reshape(input, [-1, last_size*2])

        get_first = graph.TfNode(inverse_inp.node[:, :last_size])
        get_second = graph.TfNode(inverse_inp.node[:, last_size:])

        forward_inp = graph.Concat([get_first, self.ph_probs], axis=1)

        fc_size = cfg.config.hidden_sizes[-1]
        inv_fc1 = layer.Dense(inverse_inp, fc_size, layer.Activation.Relu)
        inv_fc2 = layer.Dense(inv_fc1, shape[-1])   # logits; softmax is applied inside the cross-entropy loss below

        fwd_fc1 = layer.Dense(forward_inp, fc_size, layer.Activation.Relu)
        fwd_fc2 = layer.Dense(fwd_fc1, last_size)

        inv_loss = graph.SparseSoftmaxCrossEntropyWithLogits(inv_fc2, self.ph_taken).op
        fwd_loss = graph.L2loss(fwd_fc2.node - get_second.node).op

        self.ph_state = input.ph_state  # batch size should be even for now (states are consumed in consecutive pairs)
        self.rew_out = graph.TfNode(cfg.config.icm.nu * fwd_loss)

        self.loss = graph.TfNode(cfg.config.icm.beta * fwd_loss + (1 - cfg.config.icm.beta) * inv_loss)

        layers = [input, inv_fc1, inv_fc2, fwd_fc1, fwd_fc2]
        self.weights = layer.Weights(*layers)
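Example #16 is the ICM module: an inverse head predicts the taken action from a pair of consecutive state embeddings (cross-entropy loss), a forward head predicts the second embedding from the first plus the action probabilities (L2 loss), beta mixes the two losses, and nu scales the forward error into an intrinsic reward. The loss arithmetic in plain TF 1.x, with stand-in tensors:

import tensorflow as tf

beta, nu = 0.2, 0.01                                  # illustrative coefficients
inv_logits = tf.placeholder(tf.float32, [None, 4])    # inverse head: action logits
action_taken = tf.placeholder(tf.int32, [None])
fwd_pred = tf.placeholder(tf.float32, [None, 288])    # forward head: predicted embedding
next_embed = tf.placeholder(tf.float32, [None, 288])  # actual next-state embedding

inv_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=action_taken, logits=inv_logits))
fwd_loss = tf.nn.l2_loss(fwd_pred - next_embed)

intrinsic_reward = nu * fwd_loss
icm_loss = beta * fwd_loss + (1.0 - beta) * inv_loss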
Example #17
    def build_graph(self):
        # Build graph
        sg_network = Network()
        self.actor = sg_network.actor
        self.critic = sg_network.critic

        sg_loss = loss.DA3CLoss(sg_network.actor.head, sg_network.critic.head,
                                da3c_config.config)
        sg_actor_gradients = optimizer.Gradients(
            sg_network.actor.weights,
            loss=graph.TfNode(sg_loss.policy_loss),
            norm=da3c_config.config.gradients_norm_clipping)
        sg_critic_gradients = optimizer.Gradients(
            sg_network.critic.weights,
            loss=graph.TfNode(sg_loss.value_loss),
            norm=da3c_config.config.gradients_norm_clipping)

        if da3c_config.config.use_icm:
            sg_icm_network = icm_model.ICM()
            sg_icm_gradients = optimizer.Gradients(sg_icm_network.weights,
                                                   loss=sg_icm_network.loss)

            # Expose ICM public API
            self.op_icm_assign_weights = self.Op(
                sg_icm_network.weights.assign,
                weights=sg_icm_network.weights.ph_weights)

            feeds = dict(state=sg_icm_network.ph_state,
                         probs=sg_icm_network.ph_probs)
            self.op_get_intrinsic_reward = self.Ops(sg_icm_network.rew_out,
                                                    **feeds)

            feeds.update(dict(action=sg_icm_network.ph_taken))
            self.op_compute_icm_gradients = self.Op(sg_icm_gradients.calculate,
                                                    **feeds)

        summaries = tf.summary.merge([
            tf.summary.scalar('policy_loss', sg_loss.policy_loss),
            tf.summary.scalar('value_loss', sg_loss.value_loss),
            tf.summary.scalar('entropy', sg_loss.entropy),
            tf.summary.scalar('actor_gradients_global_norm',
                              sg_actor_gradients.global_norm),
            tf.summary.scalar('critic_gradients_global_norm',
                              sg_critic_gradients.global_norm),
            tf.summary.scalar('actor_weights_global_norm',
                              sg_network.actor.weights.global_norm),
            tf.summary.scalar('critic_weights_global_norm',
                              sg_network.critic.weights.global_norm)
        ])

        # Expose public API
        self.op_assign_weights = self.Ops(
            sg_network.actor.weights.assign,
            sg_network.critic.weights.assign,
            weights=(sg_network.actor.weights.ph_weights,
                     sg_network.critic.weights.ph_weights))

        feeds = dict(state=sg_network.ph_state,
                     action=sg_loss.ph_action,
                     advantage=sg_loss.ph_advantage,
                     discounted_reward=sg_loss.ph_discounted_reward)

        if da3c_config.config.use_lstm:
            feeds.update(
                dict(lstm_state=(sg_network.actor.ph_lstm_state,
                                 sg_network.critic.ph_lstm_state)))
            self.lstm_zero_state = (sg_network.actor.lstm_zero_state,
                                    sg_network.critic.lstm_zero_state)
            self.op_lstm_reset_timestep = self.Ops(
                sg_network.actor.lstm_reset_timestep,
                sg_network.critic.lstm_reset_timestep)
            self.op_get_action_value_and_lstm_state = \
                self.Ops(sg_network.actor.head, sg_network.critic.head,
                         (sg_network.actor.lstm_state, sg_network.critic.lstm_state),
                         state=sg_network.ph_state,
                         lstm_state=(sg_network.actor.ph_lstm_state, sg_network.critic.ph_lstm_state))
        else:
            self.op_get_action_and_value = self.Ops(sg_network.actor.head,
                                                    sg_network.critic.head,
                                                    state=sg_network.ph_state)

        self.op_compute_gradients_and_summaries = \
            self.Ops((sg_actor_gradients.calculate, sg_critic_gradients.calculate), summaries, **feeds)
Example #18
    def build_graph(self, sg_value_net):
        # 'Observed' value of a state = discounted reward
        vf_scale = dppo_config.config.critic_scale

        ph_ytarg_ny = graph.Placeholder(np.float32)
        v1_loss = graph.TfNode(tf.square(sg_value_net.head.node - ph_ytarg_ny.node))

        if dppo_config.config.vf_clipped_loss:
            ph_old_vpred = graph.Placeholder(np.float32)
            clip_e = dppo_config.config.clip_e
            vpredclipped = ph_old_vpred.node + tf.clip_by_value(sg_value_net.head.node - ph_old_vpred.node,
                                                                -clip_e, clip_e)
            v2_loss = graph.TfNode(tf.square(vpredclipped - ph_ytarg_ny.node))
            vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(tf.maximum(v2_loss.node, v1_loss.node)))
        else:
            vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(v1_loss.node))

        if dppo_config.config.l2_coeff is not None:
            l2 = graph.TfNode(dppo_config.config.l2_coeff *
                              tf.add_n([tf.reduce_sum(tf.square(v)) for v in
                                        utils.Utils.flatten(sg_value_net.weights.node)]))

            sg_vf_total_loss = graph.TfNode(l2.node + vf_mse.node)
        else:
            sg_vf_total_loss = vf_mse

        sg_gradients = optimizer.Gradients(sg_value_net.weights, loss=sg_vf_total_loss,
                                           norm=dppo_config.config.gradients_norm_clipping)
        sg_gradients_flatten = graph.GetVariablesFlatten(sg_gradients.calculate)

        # Op to compute value of a state
        if dppo_config.config.use_lstm:
            self.op_value = self.Ops(sg_value_net.head, sg_value_net.lstm_state,
                                     state=sg_value_net.ph_state, lstm_state=sg_value_net.ph_lstm_state)
            self.op_lstm_reset_timestep = self.Op(sg_value_net.lstm_reset_timestep)
        else:
            self.op_value = self.Op(sg_value_net.head, state=sg_value_net.ph_state)

        self.op_get_weights = self.Op(sg_value_net.weights)
        self.op_assign_weights = self.Op(sg_value_net.weights.assign,
                                         weights=sg_value_net.weights.ph_weights)

        sg_get_weights_flatten = graph.GetVariablesFlatten(sg_value_net.weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(sg_value_net.weights)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        feeds = dict(state=sg_value_net.ph_state, ytarg_ny=ph_ytarg_ny)
        if dppo_config.config.use_lstm:
            feeds.update(dict(lstm_state=sg_value_net.ph_lstm_state))
        if dppo_config.config.vf_clipped_loss:
            feeds.update(dict(vpred_old=ph_old_vpred))

        self.op_compute_gradients = self.Op(sg_gradients.calculate, **feeds)
        if dppo_config.config.use_lstm:
            self.op_compute_gradients = self.Ops(sg_gradients.calculate, sg_value_net.lstm_state, **feeds)

        self.op_compute_loss_and_gradient_flatten = self.Ops(sg_vf_total_loss, sg_gradients_flatten, **feeds)

        losses = [sg_vf_total_loss, vf_mse]
        if dppo_config.config.l2_coeff is not None:
            losses.append(l2)
        self.op_losses = self.Ops(*losses, **feeds)

        # Init Op for all weights
        sg_initialize = graph.Initialize()
        self.op_initialize = self.Op(sg_initialize)
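The vf_clipped_loss branch in Example #18 mirrors PPO's policy clipping on the critic: the new value prediction is only trusted within clip_e of the old prediction, and the maximum of the clipped and unclipped squared errors is kept, making the loss a pessimistic bound. A numeric sketch:

import tensorflow as tf

clip_e = 0.2
vpred, vpred_old, ytarg = tf.constant(2.0), tf.constant(1.0), tf.constant(1.5)

v1 = tf.square(vpred - ytarg)                             # unclipped error: 0.25
vclipped = vpred_old + tf.clip_by_value(vpred - vpred_old, -clip_e, clip_e)
v2 = tf.square(vclipped - ytarg)                          # clipped error: 0.09
vf_loss = tf.maximum(v1, v2)                              # pessimistic choice

with tf.Session() as sess:
    print(sess.run(vf_loss))  # 0.25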
Example #19
    def build_graph(self, sg_network):
        if dppo_config.config.use_lstm:
            self.op_get_action = self.Ops(sg_network.head, sg_network.lstm_state,
                                          state=sg_network.ph_state, lstm_state=sg_network.ph_lstm_state)
            self.op_lstm_reset_timestep = self.Op(sg_network.lstm_reset_timestep)
        else:
            self.op_get_action = self.Op(sg_network.head, state=sg_network.ph_state)

        # Advantage node
        ph_adv_n = graph.TfNode(tf.placeholder(tf.float32, name='adv_n'))

        # Contains the placeholder for the action actually taken by the agent
        sg_probtype = ProbType(dppo_config.config.output.action_size,
                               continuous=dppo_config.config.output.continuous)

        # Placeholder to store action probabilities under the old policy
        ph_oldprob_np = sg_probtype.ProbVariable()

        sg_logp_n = sg_probtype.Loglikelihood(sg_network.head)
        sg_oldlogp_n = sg_probtype.Loglikelihood(ph_oldprob_np)

        # PPO clipped surrogate loss
        # likelihood ratio of old and new policy
        r_theta = tf.exp(sg_logp_n.node - sg_oldlogp_n.node)
        surr = r_theta * ph_adv_n.node
        clip_e = dppo_config.config.clip_e
        surr_clipped = tf.clip_by_value(r_theta, 1.0 - clip_e, 1.0 + clip_e) * ph_adv_n.node
        sg_pol_clip_loss = graph.TfNode(-tf.reduce_mean(tf.minimum(surr, surr_clipped)))

        # PPO entropy loss
        if dppo_config.config.entropy is not None:
            sg_entropy = sg_probtype.Entropy(sg_network.head)
            sg_ent_loss = (-dppo_config.config.entropy) * tf.reduce_mean(sg_entropy.node)
            sg_pol_total_loss = graph.TfNode(sg_pol_clip_loss.node + sg_ent_loss)
        else:
            sg_pol_total_loss = sg_pol_clip_loss

        # Regular gradients
        sg_ppo_clip_gradients = optimizer.Gradients(sg_network.weights, loss=sg_pol_total_loss,
                                                    norm=dppo_config.config.gradients_norm_clipping)
        feeds = dict(state=sg_network.ph_state, action=sg_probtype.ph_sampled_variable,
                     advantage=ph_adv_n, old_prob=ph_oldprob_np)
        if dppo_config.config.use_lstm:
            feeds.update(dict(lstm_state=sg_network.ph_lstm_state))

        self.op_compute_ppo_clip_gradients = self.Op(sg_ppo_clip_gradients.calculate, **feeds)
        if dppo_config.config.use_lstm:
            self.op_compute_ppo_clip_gradients = self.Ops(sg_ppo_clip_gradients.calculate,
                                                          sg_network.lstm_state, **feeds)

        # Weights get/set for updating the policy
        sg_get_weights_flatten = graph.GetVariablesFlatten(sg_network.weights)
        sg_set_weights_flatten = graph.SetVariablesFlatten(sg_network.weights)

        self.op_get_weights = self.Op(sg_network.weights)
        self.op_assign_weights = self.Op(sg_network.weights.assign,
                                         weights=sg_network.weights.ph_weights)

        self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
        self.op_set_weights_flatten = self.Op(sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

        # Init Op for all weights
        sg_initialize = graph.Initialize()
        self.op_initialize = self.Op(sg_initialize)
Example #20
    def build_graph(self, d):
        self._d = d
        self.ph_sampled_variable = graph.TfNode(tf.placeholder(tf.float32, name='a'))
Example #21
    def build_graph(self, n):
        self._n = n
        self.ph_sampled_variable = graph.TfNode(tf.placeholder(tf.int32, name='a'))
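Examples #20 and #21 are the two ProbType placeholder constructors: a continuous action space feeds float action vectors of dimension d, a discrete one feeds integer action indices out of n. At run time the difference is only the dtype and shape of what gets fed (a sketch, placeholder names chosen here for illustration):

import numpy as np
import tensorflow as tf

ph_continuous = tf.placeholder(tf.float32, name='a_cont')  # sampled action vectors
ph_discrete = tf.placeholder(tf.int32, name='a_disc')      # sampled action indices

feed_continuous = {ph_continuous: np.array([[0.3, -1.2]], np.float32)}
feed_discrete = {ph_discrete: np.array([2], np.int32)}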