    def build_program(self):
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()
        self.next_q_program = fluid.Program()
        self.next_a_program = fluid.Program()

        # program for predicting this agent's action from its own observation
        with fluid.program_guard(self.pred_program):
            obs = layers.data(
                name='obs',
                shape=[self.obs_dim_n[self.agent_index]],
                dtype='float32')
            self.pred_act = self.alg.predict(obs)

        # program for updating the centralized critic with all agents' observations and actions
        with fluid.program_guard(self.learn_program):
            obs_n = [
                layers.data(
                    name='obs' + str(i),
                    shape=[self.obs_dim_n[i]],
                    dtype='float32') for i in range(self.n)
            ]
            act_n = [
                layers.data(
                    name='act' + str(i),
                    shape=[self.act_dim_n[i]],
                    dtype='float32') for i in range(self.n)
            ]
            target_q = layers.data(name='target_q', shape=[], dtype='float32')
            self.critic_cost = self.alg.learn(obs_n, act_n, target_q)

        # program for computing the Q value of the next step (used to build the learning target)
        with fluid.program_guard(self.next_q_program):
            obs_n = [
                layers.data(
                    name='obs' + str(i),
                    shape=[self.obs_dim_n[i]],
                    dtype='float32') for i in range(self.n)
            ]
            act_n = [
                layers.data(
                    name='act' + str(i),
                    shape=[self.act_dim_n[i]],
                    dtype='float32') for i in range(self.n)
            ]
            self.next_Q = self.alg.Q_next(obs_n, act_n)

        # program for computing this agent's action for the next observation
        with fluid.program_guard(self.next_a_program):
            obs = layers.data(
                name='obs',
                shape=[self.obs_dim_n[self.agent_index]],
                dtype='float32')
            self.next_action = self.alg.predict_next(obs)

        # when speedup is enabled, compile the programs with parl.compile
        if self.speedup:
            self.pred_program = parl.compile(self.pred_program)
            self.learn_program = parl.compile(self.learn_program,
                                              self.critic_cost)
            self.next_q_program = parl.compile(self.next_q_program)
            self.next_a_program = parl.compile(self.next_a_program)
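A minimal usage sketch (not part of the original listing) of how the learn_program built above is typically fed and fetched. The fluid_executor attribute and the learn wrapper below are assumptions in the style of a PARL Agent subclass; the feed names are the ones declared via layers.data above.

    def learn(self, obs_n, act_n, target_q):
        # Hedged sketch: feed keys must match the layers.data names declared in
        # learn_program ('obs0'..'obs<n-1>', 'act0'..'act<n-1>', 'target_q').
        feed = {'obs' + str(i): obs_n[i] for i in range(self.n)}
        feed.update({'act' + str(i): act_n[i] for i in range(self.n)})
        feed['target_q'] = target_q
        # self.fluid_executor is an assumption: the executor held by the agent.
        critic_cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.critic_cost])[0]
        return critic_cost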
Example #2
    def test_compiled_restore(self):
        agent = TestAgent(self.alg)
        agent.learn_program = parl.compile(agent.learn_program)
        obs = np.random.random([3, 10]).astype('float32')
        previous_output = agent.predict(obs)
        save_path1 = 'model.ckpt'
        agent.save(save_path1)
        agent.restore(save_path1)

        # a new agent instance
        another_agent = TestAgent(self.alg)
        another_agent.learn_program = parl.compile(another_agent.learn_program)
        another_agent.restore(save_path1)
        current_output = another_agent.predict(obs)
        np.testing.assert_equal(current_output, previous_output)
Example #3
    def build_program(self):
        self.predict_program = fluid.Program()

        with fluid.program_guard(self.predict_program):
            obs = layers.data(name='obs',
                              shape=[self.config['obs_dim']],
                              dtype='float32')
            self.predict_action = self.alg.predict(obs)
        self.predict_program = parl.compile(self.predict_program)
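A hedged sketch (not part of the original example) of running the compiled predict_program above; the fluid_executor attribute and the predict wrapper are assumptions in the style of a PARL Agent subclass.

    def predict(self, obs):
        # Hedged sketch: obs is a float32 batch shaped [batch, obs_dim],
        # matching the 'obs' layers.data declared above.
        act = self.fluid_executor.run(
            self.predict_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.predict_action])[0]
        return act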
Example #4
    def build_program(self):
        self.sample_program = fluid.Program()
        self.predict_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.sample_program):
            obs = layers.data(name='obs',
                              shape=self.obs_shape,
                              dtype='float32')
            self.sample_actions, self.behaviour_logits = self.alg.sample(obs)

        with fluid.program_guard(self.predict_program):
            obs = layers.data(name='obs',
                              shape=self.obs_shape,
                              dtype='float32')
            self.predict_actions = self.alg.predict(obs)

        with fluid.program_guard(self.learn_program):
            obs = layers.data(name='obs',
                              shape=self.obs_shape,
                              dtype='float32')
            actions = layers.data(name='actions', shape=[], dtype='int64')
            behaviour_logits = layers.data(name='behaviour_logits',
                                           shape=[self.act_dim],
                                           dtype='float32')
            rewards = layers.data(name='rewards', shape=[], dtype='float32')
            dones = layers.data(name='dones', shape=[], dtype='float32')
            lr = layers.data(name='lr',
                             shape=[1],
                             dtype='float32',
                             append_batch_size=False)
            entropy_coeff = layers.data(name='entropy_coeff',
                                        shape=[1],
                                        dtype='float32',
                                        append_batch_size=False)

            # py_reader feeds training batches into the learn program without a
            # Python-side feed dict
            self.learn_reader = fluid.layers.create_py_reader_by_data(
                capacity=32,
                feed_list=[
                    obs, actions, behaviour_logits, rewards, dones, lr,
                    entropy_coeff
                ])

            # replace the placeholder variables with tensors read from the py_reader
            obs, actions, behaviour_logits, rewards, dones, lr, entropy_coeff = fluid.layers.read_file(
                self.learn_reader)

            vtrace_loss, kl = self.alg.learn(obs, actions, behaviour_logits,
                                             rewards, dones, lr, entropy_coeff)
            self.learn_outputs = [
                vtrace_loss.total_loss, vtrace_loss.pi_loss,
                vtrace_loss.vf_loss, vtrace_loss.entropy, kl
            ]
        self.learn_program = parl.compile(self.learn_program,
                                          vtrace_loss.total_loss)
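A hedged sketch of driving the py_reader built above during training. The batch_provider argument, the fluid_executor attribute, and the loop structure are assumptions; the reader calls (decorate_tensor_provider, start, reset) and fluid.core.EOFException are the Paddle 1.x py_reader interface.

    def learn(self, batch_provider):
        # Hedged sketch: batch_provider is a callable returning a generator that
        # yields one list of numpy arrays per batch, in the same order as the
        # reader's feed_list (obs, actions, behaviour_logits, rewards, dones,
        # lr, entropy_coeff).
        self.learn_reader.decorate_tensor_provider(batch_provider)
        self.learn_reader.start()
        results = None
        try:
            while True:
                # learn_outputs = [total_loss, pi_loss, vf_loss, entropy, kl]
                results = self.fluid_executor.run(
                    self.learn_program, fetch_list=self.learn_outputs)
        except fluid.core.EOFException:
            self.learn_reader.reset()
        return results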
Example #5
    def build_program(self):
        self.sample_program = fluid.Program()
        self.predict_program = fluid.Program()
        self.value_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.sample_program):
            obs = layers.data(name='obs',
                              shape=self.obs_shape,
                              dtype='float32')
            sample_actions, values = self.alg.sample(obs)
            self.sample_outputs = [sample_actions, values]

        with fluid.program_guard(self.predict_program):
            obs = layers.data(name='obs',
                              shape=self.obs_shape,
                              dtype='float32')
            self.predict_actions = self.alg.predict(obs)

        with fluid.program_guard(self.value_program):
            obs = layers.data(name='obs',
                              shape=self.obs_shape,
                              dtype='float32')
            self.values = self.alg.value(obs)

        with fluid.program_guard(self.learn_program):
            obs = layers.data(name='obs',
                              shape=self.obs_shape,
                              dtype='float32')
            actions = layers.data(name='actions', shape=[], dtype='int64')
            advantages = layers.data(name='advantages',
                                     shape=[],
                                     dtype='float32')
            target_values = layers.data(name='target_values',
                                        shape=[],
                                        dtype='float32')
            lr = layers.data(name='lr',
                             shape=[1],
                             dtype='float32',
                             append_batch_size=False)
            entropy_coeff = layers.data(name='entropy_coeff',
                                        shape=[1],
                                        dtype='float32',
                                        append_batch_size=False)

            total_loss, pi_loss, vf_loss, entropy = self.alg.learn(
                obs, actions, advantages, target_values, lr, entropy_coeff)
            self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy]
        self.learn_program = parl.compile(self.learn_program, total_loss)
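A hedged sketch of feeding the learn_program above; note that lr and entropy_coeff were declared with append_batch_size=False, so they are fed as shape-[1] arrays rather than per-sample columns. The method and executor names are assumptions, and numpy is assumed imported as np, as in the test example earlier in this listing.

    def learn(self, obs, actions, advantages, target_values, lr, entropy_coeff):
        # Hedged sketch: per-sample inputs are batched numpy arrays; lr and
        # entropy_coeff are scalars wrapped into shape-[1] float32 arrays because
        # their layers.data were declared with append_batch_size=False.
        feed = {
            'obs': obs.astype('float32'),
            'actions': actions.astype('int64'),
            'advantages': advantages.astype('float32'),
            'target_values': target_values.astype('float32'),
            'lr': np.array([lr], dtype='float32'),
            'entropy_coeff': np.array([entropy_coeff], dtype='float32'),
        }
        total_loss, pi_loss, vf_loss, entropy = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=self.learn_outputs)
        return total_loss, pi_loss, vf_loss, entropy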
Example #6
    def build_program(self):
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()
        self.supervised_eval_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            obs = layers.data(
                name='obs',
                shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
                dtype='float32')
            self.value = self.alg.predict(obs)

        with fluid.program_guard(self.learn_program):
            obs = layers.data(
                name='obs',
                shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
                dtype='float32')
            action = layers.data(name='act', shape=[1], dtype='int32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs',
                shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
                dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            self.cost = self.alg.learn(obs, action, reward, next_obs, terminal)

        # use parl.compile to distribute data and model to GPUs
        self.learn_program = parl.compile(self.learn_program, loss=self.cost)

        with fluid.program_guard(self.supervised_eval_program):
            obs = layers.data(
                name='obs',
                shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
                dtype='float32')
            action = layers.data(name='act', shape=[1], dtype='int32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs',
                shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
                dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            self.supervised_cost = self.alg.supervised_eval(
                obs, action, reward, next_obs, terminal)
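A hedged sketch of evaluating a batch of transitions with the supervised_eval_program above; unlike learn_program, it is not wrapped with parl.compile, so it runs as a plain program. The method name and fluid_executor attribute are assumptions.

    def supervised_eval(self, obs, act, reward, next_obs, terminal):
        # Hedged sketch: obs/next_obs are float32 arrays shaped
        # [batch, CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]], act is int32
        # shaped [batch, 1], reward is float32 and terminal is bool, matching
        # the layers.data declarations above.
        cost = self.fluid_executor.run(
            self.supervised_eval_program,
            feed={
                'obs': obs.astype('float32'),
                'act': act.astype('int32'),
                'reward': reward.astype('float32'),
                'next_obs': next_obs.astype('float32'),
                'terminal': terminal.astype('bool')
            },
            fetch_list=[self.supervised_cost])[0]
        return cost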