Example #1
# Imports needed by this example. CartPole (the Bonsai simulator wrapper), Net,
# HIDDEN_SIZE, BATCH_SIZE and PERCENTILE are assumed to be defined elsewhere in
# the same project.
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter  # or: from tensorboardX import SummaryWriter


class CartPoleTraining:
    """Train CartPole with the cross-entropy method, based on code from the book
    'Deep Reinforcement Learning Hands-On'.
    """
    Episode = namedtuple('Episode', field_names=['reward', 'steps'])
    EpisodeStep = namedtuple('EpisodeStep',
                             field_names=['observation', 'action'])

    def __init__(self) -> None:
        self.cartpole = CartPole()

    def iterate_batches(self, net, batch_size):
        batch = []
        episode_reward = 0.0
        episode_steps = []

        # start the first episode and convert the Bonsai state into a
        # gym-style observation vector
        self.cartpole.episode_start()
        state = self.cartpole.get_state()
        obs = self.cartpole.state_to_gym(state)

        sm = nn.Softmax(dim=1)
        while True:
            # run the policy network on the current observation and sample an
            # action from the resulting probability distribution
            obs_v = torch.FloatTensor([obs])
            act_probs_v = sm(net(obs_v))
            act_probs = act_probs_v.data.numpy()[0]
            action = np.random.choice(len(act_probs), p=act_probs)

            # convert the gym-style action index into a Bonsai action
            bonsai_action = self.cartpole.gym_to_action(action)

            # advance the simulation one step, then read back the done flag,
            # the reward and the next observation
            self.cartpole.episode_step(bonsai_action)

            is_done = self.cartpole.halted()
            reward = self.cartpole.get_last_reward()
            next_obs = self.cartpole.state_to_gym(self.cartpole.get_state())

            episode_reward += reward
            step = self.EpisodeStep(observation=obs, action=action)
            episode_steps.append(step)
            if is_done:
                # episode finished: store it, reset the accumulators and
                # restart the simulation for the next episode
                e = self.Episode(reward=episode_reward, steps=episode_steps)
                batch.append(e)
                episode_reward = 0.0
                episode_steps = []
                self.cartpole.episode_finish("")

                self.cartpole.episode_start()
                state = self.cartpole.get_state()
                next_obs = self.cartpole.state_to_gym(state)

                # hand a full batch of episodes to the training loop
                if len(batch) == batch_size:
                    yield batch
                    batch = []
            obs = next_obs

    def filter_batch(self, batch, percentile):
        """Keep only the 'elite' episodes whose total reward is at or above the
        given percentile, and flatten their steps into training tensors."""
        rewards = list(map(lambda s: s.reward, batch))
        reward_bound = np.percentile(rewards, percentile)
        reward_mean = float(np.mean(rewards))

        train_obs = []
        train_act = []
        for reward, steps in batch:
            if reward < reward_bound:
                continue
            train_obs.extend(map(lambda step: step.observation, steps))
            train_act.extend(map(lambda step: step.action, steps))

        train_obs_v = torch.FloatTensor(train_obs)
        train_act_v = torch.LongTensor(train_act)
        return train_obs_v, train_act_v, reward_bound, reward_mean

    def train(self):
        obs_size = self.cartpole._env.unwrapped.observation_space.shape[0]
        n_actions = self.cartpole._env.unwrapped.action_space.n

        net = Net(obs_size, HIDDEN_SIZE, n_actions)
        objective = nn.CrossEntropyLoss()
        optimizer = optim.Adam(params=net.parameters(), lr=0.01)
        writer = SummaryWriter(comment="-cartpole")

        for iter_no, batch in enumerate(self.iterate_batches(net, BATCH_SIZE)):
            # train the network only on the elite episodes kept by filter_batch
            obs_v, acts_v, reward_b, reward_m = self.filter_batch(
                batch, PERCENTILE)
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()

            print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" %
                  (iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            if reward_m > 199:
                # a mean reward above 199 means episodes are running for
                # (nearly) the full 200-step CartPole limit
                print("Solved!")
                break
        writer.close()
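
The example above references a policy network Net and the constants HIDDEN_SIZE, BATCH_SIZE and PERCENTILE without showing them. A minimal sketch of what they could look like, following the single-hidden-layer network used in 'Deep Reinforcement Learning Hands-On' (the layer sizes and constant values here are assumptions for this snippet, not part of the original example):

import torch.nn as nn

HIDDEN_SIZE = 128   # hidden-layer width (assumed)
BATCH_SIZE = 16     # episodes per training batch (assumed)
PERCENTILE = 70     # elite-episode cutoff used by filter_batch (assumed)


class Net(nn.Module):
    """Single-hidden-layer policy network producing raw action scores."""

    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )

    def forward(self, x):
        return self.net(x)


# With these pieces in place, training would be started with:
# CartPoleTraining().train()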
Example #2
    # cartpole, agent and writer are assumed to have been created earlier in
    # the surrounding script: a CartPole simulator wrapper, an agent exposing
    # act(state), and a SummaryWriter
    episode_count = 100

    try:
        for i in range(episode_count):
            # start a new episode and get the initial state
            cartpole.episode_start()
            state = cartpole.get_state()
            cum_reward = 0

            while True:
                # get the action from the agent (based on the current state)
                action = agent.act(state)

                # do the next step of the simulation and get the new state
                cartpole.episode_step(action)
                state = cartpole.get_state()

                # get the last reward and add it to the episode reward
                reward = cartpole.get_last_reward()
                cum_reward += reward

                if cartpole.halted():
                    # log the episode reward and end this episode
                    writer.add_scalar("reward", cum_reward, i)
                    writer.flush()
                    break
            cartpole.episode_finish("")

        writer.close()
    except KeyboardInterrupt:
        print("Stopped")