Example 1: evaluate()
    def evaluate(self, max_action=50, max_episode=12):
        total_steps = 0
        self.callbacks.on_agent_begin()
        for episode_number in easy_range(1, max_episode):
            episode_reward = 0
            state = self.environment.reset()
            state = self.processor.process_state(state)
            self.callbacks.on_episode_begin(**{
                'episode_number': episode_number,
                'state': state
            })

            goal_state = self.final_state()

            for action_number in easy_range(1, max_action):
                action = self.get_action(state, goal_state)
                self.callbacks.on_action_begin(**{
                    'episode_number': episode_number,
                    'action_number': action_number,
                    'state': state,
                    'action': action
                })

                # convert the agent's action into an environment step when the
                # environment exposes a converter; otherwise pass the action through
                if hasattr(self.environment, 'get_step'):
                    step = self.environment.get_step(action, 'continuous')
                else:
                    step = action
                next_state, reward, terminal, _ = self.environment.step(step)
                next_state = self.processor.process_state(next_state)
                if action_number >= max_action:
                    # action budget exhausted: end the episode
                    terminal = True

                self.callbacks.on_action_end(**{
                    'episode_number': episode_number,
                    'action_number': action_number,
                    'state': state,
                    'action': action,
                    'reward': reward,
                    'terminal': terminal,
                    'next_state': next_state
                })

                episode_reward += reward
                state = deepcopy(next_state)
                total_steps += 1

                if terminal:
                    self.callbacks.on_episode_end(**{
                        'episode_number': episode_number,
                        'action_number': action_number,
                        'episode_reward': episode_reward
                    })

                    gc.collect()
                    break

        self.environment.close()
        self.callbacks.on_agent_end(**{
            'total_steps': total_steps
        })
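All four examples iterate with easy_range(1, max_episode) and easy_range(1, max_action), and they test the counter with action_number >= max_action, which points at a 1-based, inclusive counting helper. easy_range itself is not part of the snippets; a minimal sketch consistent with that usage (the default arguments are assumptions) might be:

def easy_range(start=1, end=None):
    # hypothetical stand-in for the easy_range used above: yields start, start + 1, ...
    # up to and including end, or counts forever when end is None
    counter = start
    while end is None or counter <= end:
        yield counter
        counter += 1

Under this definition, action_number >= max_action becomes true on the final iteration, so terminal is forced and on_episode_end is always invoked before the inner loop ends.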
Example 2: train()
    def train(self, batch_size=32, max_action=200, max_episode=12000, warmup=120000):
        total_steps = 0
        self.callbacks.on_agent_begin(**{
            'agent_headers': ['episode_number', 'action_number', 'episode_reward'],
            'network_headers': ['loss']
        })
        for episode_number in easy_range(1, max_episode):
            episode_reward = 0
            state = self.environment.reset()
            state = self.processor.process_state(state)
            self.callbacks.on_episode_begin(**{
                'episode_number': episode_number,
                'state': state
            })

            for action_number in easy_range(1, max_action):
                action = self.get_action(state, self.train_policy)
                self.callbacks.on_action_begin(**{
                    'episode_number': episode_number,
                    'action_number': action_number,
                    'state': state,
                    'action': action
                })
                step = self.get_step(action)
                next_state, reward, terminal, _ = self.environment.step(step)
                if not terminal:
                    # answer with a random legal move and subtract whatever reward
                    # that reply collects from the agent's reward
                    reply = random.choice(list(self.environment.generate_legal_moves()))
                    _, r, _, _ = self.environment.step(reply)
                    reward -= r
                next_state = self.processor.process_state(next_state)
                if action_number >= max_action:
                    terminal = True

                self.callbacks.on_action_end(**{
                    'episode_number': episode_number,
                    'action_number': action_number,
                    'state': state,
                    'action': action,
                    'reward': reward,
                    'terminal': terminal,
                    'next_state': next_state
                })

                # clipped_reward = np.clip(reward - 0.25, -1, 1)
                self.memory.remember((state, action, reward, next_state, terminal))

                if total_steps > warmup:
                    self.train_policy.decay()
                    if total_steps % batch_size == 0:
                        self.callbacks.on_replay_begin()
                        mini_batch = self.memory.sample()
                        batch = self.processor.process_batch(mini_batch)
                        loss = self.model.train(batch)

                        self.callbacks.on_replay_end(**{
                            'loss': loss
                        })

                episode_reward += reward
                state = copy.deepcopy(next_state)
                total_steps += 1

                if terminal or self.environment.is_game_over():
                    self.callbacks.on_episode_end(**{
                        'episode_number': episode_number,
                        'action_number': action_number,
                        'episode_reward': episode_reward
                    })
                    gc.collect()
                    break

        self.environment.close()
        self.callbacks.on_agent_end(**{
            'total_steps': total_steps
        })
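Example 2 writes transitions with self.memory.remember((state, action, reward, next_state, terminal)) and draws training data with self.memory.sample(); the buffer itself is outside the snippet. A minimal ring-buffer sketch compatible with those two calls (class name, capacity, and sample size are assumptions) could be:

import random
from collections import deque


class ReplayMemory:
    # hypothetical fixed-size replay buffer matching the remember()/sample() calls above
    def __init__(self, capacity=100000, batch_size=32):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are evicted first
        self.batch_size = batch_size

    def remember(self, transition):
        # transition is the (state, action, reward, next_state, terminal) tuple
        self.buffer.append(transition)

    def sample(self):
        # uniform random mini-batch; assumes the buffer already holds batch_size items
        return random.sample(list(self.buffer), self.batch_size)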
Example 3: train()
    def train(self, batch_size=32, max_action=50, max_episode=120, warmup=0, replay_interval=4, update_interval=1, test_interval=1000):
        total_steps = 0
        self.callbacks.on_agent_begin(**{
            'agent_headers': ['episode_number', 'action_number', 'episode_reward'],
            'network_headers': ['actor_loss', 'critic_loss', 'critic_extra_loss']
        })
        for episode_number in easy_range(1, max_episode):
            episode_reward = 0
            state = self.environment.reset()
            state = self.processor.process_state(state)
            self.callbacks.on_episode_begin(**{
                'episode_number': episode_number,
                'state': state
            })

            goal_state = self.final_state()

            for action_number in easy_range(1, max_action):
                action = self.get_action(state, goal_state)
                self.callbacks.on_action_begin(**{
                    'episode_number': episode_number,
                    'action_number': action_number,
                    'state': state,
                    'action': action
                })

                if hasattr(self.environment, 'get_step'):
                    step = self.environment.get_step(action, 'continuous')
                else:
                    step = action
                next_state, reward, terminal, _ = self.environment.step(step)
                next_state = self.processor.process_state(next_state)
                if action_number >= max_action:
                    terminal = True

                self.callbacks.on_action_end(**{
                    'episode_number': episode_number,
                    'action_number': action_number,
                    'state': state,
                    'action': action,
                    'reward': reward,
                    'terminal': terminal,
                    'next_state': next_state
                })

                # when HER is enabled, condition the stored transition on the goal by
                # stacking goal_state onto the observation along the channel axis;
                # the shaped reward is clipped to [-1, 1] before being remembered
                processed_state = np.concatenate((state, goal_state), axis=2) if self.her else state
                clipped_reward = np.clip(reward - 0.25, -1, 1)
                processed_next_state = np.concatenate((next_state, goal_state), axis=2) if self.her else next_state
                self.memory.remember(processed_state, action, clipped_reward, processed_next_state, terminal)

                if total_steps > warmup:
                    self.random_process.decay()
                    if total_steps % replay_interval == 0:
                        self.callbacks.on_replay_begin()
                        mini_batch = self.memory.sample()
                        batch = self.processor.process_batch(mini_batch)
                        # the second argument evaluates to True on every
                        # update_interval-th replay step
                        loss = self.model.train(
                            batch,
                            ((total_steps - warmup) // replay_interval) % update_interval == 0)
                        self.callbacks.on_replay_end(**{
                            'loss': loss
                        })

                episode_reward += reward
                state = deepcopy(next_state)
                total_steps += 1

                if terminal:
                    self.callbacks.on_episode_end(**{
                        'episode_number': episode_number,
                        'action_number': action_number,
                        'episode_reward': episode_reward
                    })
                    gc.collect()
                    break

        self.environment.close()
        self.callbacks.on_agent_end(**{
            'total_steps': total_steps
        })
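Examples 3 and 4 concatenate state and goal_state along axis 2 when self.her is set and clip the shaped reward to [-1, 1] before storing it. A small standalone illustration of that preprocessing, assuming image-like observations of shape (height, width, channels):

import numpy as np

# assumed observation shapes: 84x84 single-channel frames (illustrative only)
state = np.zeros((84, 84, 1), dtype=np.float32)
goal_state = np.ones((84, 84, 1), dtype=np.float32)

# goal-conditioned input: the goal becomes extra channels of the observation
processed_state = np.concatenate((state, goal_state), axis=2)
print(processed_state.shape)  # (84, 84, 2)

# shaped, clipped reward exactly as in the snippets above
reward = 1.0
clipped_reward = np.clip(reward - 0.25, -1, 1)
print(clipped_reward)  # 0.75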
Example 4: train()
    def train(self,
              batch_size=32,
              max_action=200,
              max_episode=12000,
              warmup=120000):
        total_steps = 0
        self.callbacks.on_agent_begin(
            **{
                'agent_headers':
                ['episode_number', 'action_number', 'episode_reward'],
                'network_headers': ['loss']
            })
        for episode_number in easy_range(1, max_episode):
            episode_reward = 0
            state = self.environment.reset()
            state = self.processor.process_state(state)
            self.callbacks.on_episode_begin(**{
                'episode_number': episode_number,
                'state': state
            })

            goal_state = self.final_state()

            for action_number in easy_range(1, max_action):
                action = self.get_action(state, goal_state, self.train_policy)
                self.callbacks.on_action_begin(
                    **{
                        'episode_number': episode_number,
                        'action_number': action_number,
                        'state': state,
                        'action': action
                    })

                step = self.get_step(action, 'discrete', self.action_number)
                next_state, reward, terminal, _ = self.environment.step(step)
                next_state = self.processor.process_state(next_state)
                if action_number >= max_action:
                    terminal = True

                self.callbacks.on_action_end(
                    **{
                        'episode_number': episode_number,
                        'action_number': action_number,
                        'state': state,
                        'action': action,
                        'reward': reward,
                        'terminal': terminal,
                        'next_state': next_state
                    })

                processed_state = np.concatenate(
                    (state, goal_state), axis=2) if self.her else state
                clipped_reward = np.clip(reward - 0.25, -1, 1)
                processed_next_state = np.concatenate(
                    (next_state, goal_state), axis=2) if self.her else next_state
                self.memory.remember((processed_state, action, clipped_reward,
                                      processed_next_state, terminal))

                if total_steps > warmup:
                    self.train_policy.decay()
                    if total_steps % batch_size == 0:
                        self.callbacks.on_replay_begin()
                        mini_batch = self.memory.sample()
                        batch = self.processor.process_batch(mini_batch)
                        loss = self.model.train(batch)
                        self.callbacks.on_replay_end(**{'loss': loss})

                episode_reward += reward
                state = copy.deepcopy(next_state)
                total_steps += 1

                if terminal:
                    self.callbacks.on_episode_end(
                        **{
                            'episode_number': episode_number,
                            'action_number': action_number,
                            'episode_reward': episode_reward
                        })
                    gc.collect()
                    break

        self.environment.close()
        self.callbacks.on_agent_end(**{'total_steps': total_steps})
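Every example drives the same hook set on self.callbacks with keyword arguments: on_agent_begin, on_episode_begin, on_action_begin, on_action_end, on_replay_begin, on_replay_end, on_episode_end and on_agent_end. The callbacks object is not shown anywhere above; a minimal no-op base class that accepts those calls (only the method names come from the snippets, everything else is assumed) might look like:

class Callbacks:
    # hypothetical no-op hook container; subclass and override the hooks you need
    def on_agent_begin(self, **kwargs):
        pass

    def on_agent_end(self, **kwargs):
        pass

    def on_episode_begin(self, **kwargs):
        pass

    def on_episode_end(self, **kwargs):
        pass

    def on_action_begin(self, **kwargs):
        pass

    def on_action_end(self, **kwargs):
        pass

    def on_replay_begin(self, **kwargs):
        pass

    def on_replay_end(self, **kwargs):
        pass

Note that calling a hook with **{...} is equivalent to passing the keyword arguments directly; the dictionary form simply keeps the hook payloads easy to extend.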