Example no. 1
    def train_controller(self, batch):
        # Compute the discounted return for every episode in the batch
        expected_rewards = []
        for i, episode_rewards in enumerate(batch['rewards']):
            expected_rewards.append(get_expected_rewards(episode_rewards, self.discount))

        # Fit the critic for a fixed number of iterations, then update the fixed (target) variables
        av_critic_loss = []
        for i in range(self.nb_critic_iter):
            _, critic_loss = self.sess.run([self.train_critic_op, self.critic_loss], feed_dict={
                self.inputs: batch['states'],
                self.actions: batch['actions'],
                self.expected_rewards: expected_rewards,
                self.rewards: batch['rewards'],
                self.mask_plh: batch['mask'],
                self.next_states: batch['next_states'],
            })
            av_critic_loss.append(critic_loss)
        self.sess.run(self.update_fixed_vars_op)

        # Single policy-gradient update using the fitted critic
        _, policy_loss = self.sess.run([self.train_policy_op, self.policy_loss], feed_dict={
            self.inputs: batch['states'],
            self.actions: batch['actions'],
            self.expected_rewards: expected_rewards,
            self.rewards: batch['rewards'],
            self.mask_plh: batch['mask'],
            self.next_states: batch['next_states'],
        })

        # Log the losses and bump the episode counter
        summary, _, episode_id = self.sess.run([self.all_summary_t, self.inc_ep_id_op, self.episode_id], feed_dict={
            self.policy_loss_plh: policy_loss,
            self.critic_loss_plh: np.mean(av_critic_loss),
        })
        self.sw.add_summary(summary, episode_id)

        return
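
The helper get_expected_rewards used throughout these examples is not shown. As a point of reference, here is a minimal sketch of what it presumably computes: the discounted return G_t = r_t + discount * G_{t+1} for every timestep, walking the episode backwards. The signature and the default discount value are assumptions (Example no. 2 calls it with a single argument).

import numpy as np

def get_expected_rewards(episode_rewards, discount=0.99):
    # Sketch only: discounted return for every timestep of one episode,
    # computed backwards so each entry is r_t + discount * G_{t+1}.
    # The default discount is an assumption, not taken from the original code.
    returns = np.zeros(len(episode_rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(episode_rewards))):
        running = episode_rewards[t] + discount * running
        returns[t] = running
    return returns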
Example no. 2
    def learn_from_episode(self, env, render):
        obs = env.reset()
        # Choose the first action up front so every stored transition also has its next action
        act, _ = self.act(obs)

        score = 0
        # Episode history as a structured array; states carry one extra
        # component used as a terminal flag (0 = running, 1 = done)
        historyType = np.dtype([
            ('states', 'float32', (env.observation_space.shape[0] + 1, )),
            ('actions', 'int32', (1, )),
            ('rewards', 'float32'),
            ('next_states', 'float32', (env.observation_space.shape[0] + 1, )),
            ('next_actions', 'int32'),
        ])
        history = np.array([], dtype=historyType)
        done = False

        while True:
            if render:
                env.render()

            next_obs, reward, done, info = env.step(act)
            next_act, _ = self.act(next_obs)

            # Store the transition; the appended 0/1 is the terminal flag
            memory = np.array(
                [(np.concatenate((obs, [0])), [act], reward,
                  np.concatenate((next_obs, [1 if done else 0])), next_act)],
                dtype=historyType)
            history = np.append(history, memory)

            score += reward
            obs = next_obs
            act = next_act
            if done:
                break

        # Learn from the whole episode in one update: policy and Q losses share the
        # same feed, with discounted returns standing in for the raw rewards
        _, policy_loss, q_loss, loss = self.sess.run(
            [self.train_op, self.policy_loss, self.q_loss, self.loss],
            feed_dict={
                self.inputs: history['states'],
                self.actions: history['actions'],
                self.rewards: get_expected_rewards(history['rewards']),
                self.next_states: history['next_states'],
                self.next_actions: history['next_actions'],
            })
        summary, _, episode_id = self.sess.run(
            [self.all_summary_t, self.inc_ep_id_op, self.episode_id],
            feed_dict={
                self.score_plh: score,
                self.policy_loss_plh: policy_loss,
                self.q_loss_plh: q_loss,
                self.loss_plh: loss,
            })
        self.sw.add_summary(summary, episode_id)

        return
Example no. 3
    def learn_from_episode(self, env, render=False):
        t = 0
        score = 0
        av_loss = []
        # Tabular history: state ids, actions, rewards and the bootstrap estimates
        historyType = np.dtype([
            ('states', 'int32', ()),
            ('actions', 'int32', ()),
            ('rewards', 'float32'),
            ('estimates', 'float32'),
        ])
        history = np.array([], dtype=historyType)
        done = False

        obs = env.reset()
        act, state_id, estimate = self.act(obs)
        while not done:
            if render:
                env.render()

            next_obs, reward, done, info = env.step(act)
            next_act, next_state_id, next_estimate = self.act(next_obs, done)

            # Store the transition together with the bootstrap estimate of the next state
            memory = np.array([(state_id, act, reward, next_estimate)], dtype=historyType)
            history = np.append(history, memory)
            if t >= self.n_step - 1:
                # Once a full n-step window is available, update the oldest state in it.
                # It is a lot faster to compute the n-step targets directly in Python
                targets = capacities.get_n_step_expected_rewards(
                    history['rewards'][-self.n_step:], history['estimates'][-self.n_step:],
                    self.discount, self.n_step)
                _, loss = self.sess.run([self.train_op, self.loss], feed_dict={
                    self.inputs_plh: [history['states'][-self.n_step]],
                    self.actions_t: [history['actions'][-self.n_step]],
                    self.targets_t: [targets[0]],
                })
                av_loss.append(loss)

            t += 1
            score += reward
            obs = next_obs
            state_id = next_state_id
            act = next_act

        # Flush the tail of the episode: the last states never saw a full n-step
        # window, so they are updated with plain discounted returns instead
        if self.n_step > 1:
            min_step = min(self.n_step, len(history))
            targets = capacities.get_expected_rewards(history['rewards'][-min_step:], self.discount)
            _, loss = self.sess.run([self.train_op, self.loss], feed_dict={
                self.inputs_plh: history['states'][-min_step:],
                self.actions_t: history['actions'][-min_step:],
                self.targets_t: targets,
            })
            av_loss.append(loss)

        summary, _, episode_id = self.sess.run([self.all_summary_t, self.inc_ep_id_op, self.episode_id], feed_dict={
            self.score_plh: score,
            self.loss_plh: np.mean(av_loss),
        })
        self.sw.add_summary(summary, episode_id)
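
Example no. 3 depends on capacities.get_n_step_expected_rewards, which is not included here. The sketch below shows one plausible way to build n-step SARSA targets from a window of rewards and bootstrap estimates; the exact behaviour of the real helper is an assumption, and the snippet above only consumes targets[0].

def get_n_step_expected_rewards(rewards, estimates, discount, n_step):
    # Sketch of an n-step SARSA target (not the actual capacities implementation).
    # estimates[i] is assumed to be the bootstrap value of the state reached
    # after transition i, so the target for position t truncates after m steps
    # and bootstraps from estimates[t + m - 1].
    targets = []
    for t in range(len(rewards)):
        m = min(n_step, len(rewards) - t)
        target = sum(discount ** k * rewards[t + k] for k in range(m))
        target += discount ** m * estimates[t + m - 1]
        targets.append(target)
    return targets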
Example no. 4
    def train_controller(self, batch):
        # Replace the raw rewards with discounted returns, episode by episode
        for i, episode_rewards in enumerate(batch['rewards']):
            batch['rewards'][i] = get_expected_rewards(episode_rewards, self.discount)

        # One policy-gradient update over the whole (padded) batch
        _, loss = self.sess.run([self.train_op, self.loss], feed_dict={
            self.inputs: batch['states'],
            self.actions: batch['actions'],
            self.rewards: batch['rewards'],
            self.mask_plh: batch['mask'],
        })

        # Log the loss for TensorBoard
        summary, episode_id = self.sess.run([self.loss_sum_t, self.episode_id], feed_dict={
            self.loss_plh: np.mean(loss),
        })
        self.sw.add_summary(summary, episode_id)

        return
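
The batch-based controllers (Examples no. 1, 4 and 5) all feed a batch['mask'] alongside padded states, actions and rewards, but none of the snippets show how that batch is assembled. The sketch below is one plausible way to pad variable-length episodes and build the matching 0/1 mask; every name in it is illustrative rather than taken from the original code.

import numpy as np

def make_padded_batch(episodes):
    # Illustrative only: pad variable-length episodes to a common length and
    # build a mask that is 1 on real timesteps and 0 on padding.
    # `episodes` is assumed to be a list of dicts with per-timestep
    # 'states', 'actions' and 'rewards' arrays.
    max_len = max(len(ep['rewards']) for ep in episodes)
    state_dim = len(episodes[0]['states'][0])

    batch = {
        'states': np.zeros((len(episodes), max_len, state_dim), dtype=np.float32),
        'actions': np.zeros((len(episodes), max_len), dtype=np.int32),
        'rewards': np.zeros((len(episodes), max_len), dtype=np.float32),
        'mask': np.zeros((len(episodes), max_len), dtype=np.float32),
    }
    for i, ep in enumerate(episodes):
        T = len(ep['rewards'])
        batch['states'][i, :T] = ep['states']
        batch['actions'][i, :T] = ep['actions']
        batch['rewards'][i, :T] = ep['rewards']
        batch['mask'][i, :T] = 1.0
    return batch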
Example no. 5
    def train_controller(self, batch):
        # Replace the raw rewards with discounted returns, episode by episode
        for i, episode_rewards in enumerate(batch['rewards']):
            batch['rewards'][i] = get_expected_rewards(episode_rewards,
                                                       self.discount)

        # Single controller update; also advances the time counter used to index summaries
        _, c_sum, time, _ = self.sess.run(
            [
                self.c_train_op, self.all_c_summary_t, self.time,
                self.inc_time_op
            ],
            feed_dict={
                self.state_input_plh: batch['states'],
                self.actions_t: batch['actions'],
                self.c_rewards_plh: batch['rewards'],
                self.mask_plh: batch['mask']
            })

        self.sw.add_summary(c_sum, time)

        return