class MinimaxDeepQNetworkAgent(AgentBase):
    """ Represents a two-snake agent powered by minimax DQN with experience replay. """

    def __init__(self, model_1, model_2, num_last_frames=4, memory_size=1000):
        """
        Create a new DQN-based agent.

        Args:
            model_1: a compiled DQN model for snake 1.
            model_2: a compiled DQN model for snake 2.
            num_last_frames (int): the number of last frames the agent will consider.
            memory_size (int): memory size limit for experience replay (-1 for unlimited).
        """
        assert model_1.input_shape[
            1] == num_last_frames, 'Model input shape should be (num_frames, grid_size, grid_size)'
        assert len(
            model_1.output_shape
        ) == 2, 'Model output shape should be (num_samples, num_actions)'
        assert model_2.input_shape[
            1] == num_last_frames, 'Model input shape should be (num_frames, grid_size, grid_size)'
        assert len(
            model_2.output_shape
        ) == 2, 'Model output shape should be (num_samples, num_actions)'

        self.model_1 = model_1
        self.model_2 = model_2
        self.num_last_frames = num_last_frames
        self.memory_1 = ExperienceReplay(
            (num_last_frames, ) + model_1.input_shape[-2:],
            model_1.output_shape[-1] // 3, memory_size)
        self.memory_2 = ExperienceReplay(
            (num_last_frames, ) + model_2.input_shape[-2:],
            model_2.output_shape[-1] // 3, memory_size)
        self.frames = None

    def begin_episode(self):
        """ Reset the agent for a new episode. """
        self.frames = None

    def get_last_frames(self, observation):
        """
        Get the pixels of the last `num_last_frames` observations, the current frame being the last.

        Args:
            observation: observation at the current timestep.

        Returns:
            Observations for the last `num_last_frames` frames.
        """
        frame = observation
        if self.frames is None:
            self.frames = collections.deque([frame] * self.num_last_frames)
        else:
            self.frames.append(frame)
            self.frames.popleft()
        return np.expand_dims(self.frames, 0).astype(np.float32) / 16

    def train(self,
              env,
              num_episodes=1000,
              batch_size=50,
              discount_factor=0.9,
              checkpoint_freq=None,
              exploration_range=(1.0, 0.1),
              exploration_phase_size=0.5):
        """
        Train the agent to perform well in the given Snake environment.

        Args:
            env:
                an instance of Snake environment.
            num_episodes (int):
                the number of episodes to run during the training.
            batch_size (int):
                the size of the learning sample for experience replay.
            discount_factor (float):
                discount factor (gamma) for computing the value function.
            checkpoint_freq (int):
                the number of episodes after which a new model checkpoint will be created.
            exploration_range (tuple):
                a (max, min) range specifying how the exploration rate should decay over time.
            exploration_phase_size (float):
                the percentage of the training process at which
                the exploration rate should reach its minimum.
        """

        # Calculate the constant exploration decay speed for each episode.
        max_exploration_rate, min_exploration_rate = exploration_range
        exploration_decay = ((max_exploration_rate - min_exploration_rate) /
                             (num_episodes * exploration_phase_size))
        exploration_rate = max_exploration_rate

        for episode in range(num_episodes):
            # Reset the environment for the new episode.
            timestep = env.new_episode()
            self.begin_episode()
            game_over = False
            loss_1 = 0.0
            loss_2 = 0.0
            alive_1 = True
            alive_2 = True
            # Observe the initial state.
            state = self.get_last_frames(timestep.observation)

            while not game_over:
                if np.random.random() < exploration_rate:
                    # Explore: take a random action.
                    action = (np.random.randint(env.num_actions),
                              np.random.randint(env.num_actions))
                else:
                    # Exploit: take the best known action for this state.
                    q1 = self.model_1.predict(state)
                    q2 = self.model_2.predict(state)
                    q1 = q1.reshape((env.num_actions, env.num_actions))
                    q2 = q2.reshape((env.num_actions, env.num_actions))
                    if alive_1 and alive_2:
                        action = (np.argmax(np.min(q1, axis=1)),
                                  np.argmax(np.min(q2, axis=1)))
                    elif alive_1:
                        action = (np.argmax(np.min(q1, axis=1)),
                                  np.argmin(np.max(q1, axis=0)))
                    elif alive_2:
                        action = (np.argmin(np.max(q2, axis=0)),
                                  np.argmax(np.min(q2, axis=1)))

                # Act on the environment.
                env.choose_action(action)
                timestep = env.timestep()

                # Remember a new piece of experience.
                reward_1, reward_2 = timestep.reward_1, timestep.reward_2
                state_next = self.get_last_frames(timestep.observation)
                game_over = timestep.is_episode_end

                experience_item_1 = [
                    state, action[0], action[1], reward_1, state_next,
                    game_over
                ]
                experience_item_2 = [
                    state, action[1], action[0], reward_2, state_next,
                    game_over
                ]
                self.memory_1.multi_remember(*experience_item_1)
                self.memory_2.multi_remember(*experience_item_2)
                state = state_next

                # Sample a random batch from experience.

                if alive_1:
                    batch = self.memory_1.get_multi_batch(
                        model=self.model_1,
                        batch_size=batch_size,
                        discount_factor=discount_factor)
                    # Learn on the batch.
                    if batch:
                        inputs, targets = batch
                        loss_1 += float(
                            self.model_1.train_on_batch(inputs, targets))

                # Sample a random batch from experience.
                if alive_2:
                    batch = self.memory_2.get_multi_batch(
                        model=self.model_2,
                        batch_size=batch_size,
                        discount_factor=discount_factor)
                    # Learn on the batch.
                    if batch:
                        inputs, targets = batch
                        loss_2 += float(
                            self.model_2.train_on_batch(inputs, targets))

                alive_1 = timestep.alive_1
                alive_2 = timestep.alive_2

            if checkpoint_freq and (episode % checkpoint_freq) == 0:
                self.model_1.save(f'dqn-mm1-{episode:08d}.model')
                self.model_2.save(f'dqn-mm2-{episode:08d}.model')

            if exploration_rate > min_exploration_rate:
                exploration_rate -= exploration_decay

            summary = 'Episode {:5d}/{:5d} | Loss {:8.4f}, {:8.4f} | Exploration {:.2f} | ' + \
                      'Fruits {:2d}, {:2d} | Timesteps {:4d} | Total Reward {:4d}, {:4d}'
            print(
                summary.format(episode + 1, num_episodes, loss_1, loss_2,
                               exploration_rate, env.stats.fruits_eaten_1,
                               env.stats.fruits_eaten_2,
                               env.stats.timesteps_survived,
                               env.stats.sum_episode_rewards_1,
                               env.stats.sum_episode_rewards_2))

        self.model_1.save('dqn-mm1-final.model')
        self.model_2.save('dqn-mm2-final.model')

    def act(self, observation, reward, alive_1=True, alive_2=True):
        """
        Choose the next actions for both snakes to take.

        Args:
            observation: observable state for the current timestep.
            reward: reward received at the beginning of the current timestep.
            alive_1 (bool): whether snake 1 is still alive.
            alive_2 (bool): whether snake 2 is still alive.

        Returns:
            A tuple with the action indices for snake 1 and snake 2.
        """
        state = self.get_last_frames(observation)
        q1 = self.model_1.predict(state).reshape(3, 3)
        q2 = self.model_2.predict(state).reshape(3, 3)

        if alive_1 and alive_2:
            return (np.argmax(np.min(q1, axis=1)), np.argmax(np.min(q2,
                                                                    axis=1)))
        elif alive_1:
            return (np.argmax(np.min(q1, axis=1)), np.argmin(np.max(q1,
                                                                    axis=0)))
        elif alive_2:
            # Use the same convention as in train(): snake 2 maximises its
            # worst case over rows of q2, snake 1 minimises snake 2's best reply.
            return (np.argmin(np.max(q2, axis=0)), np.argmax(np.min(q2, axis=1)))
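For intuition, here is a minimal sketch of the minimax selection rule used in train() and act() above, with a made-up 3x3 Q-matrix: each snake assumes an adversarial opponent and picks the action whose worst-case value over the opponent's replies is highest.

import numpy as np

# Made-up Q-matrix for snake 1: rows index snake 1's action,
# columns index snake 2's reply (three relative actions each).
q1 = np.array([[ 1.0, -2.0,  0.5],
               [ 0.2,  0.1,  0.3],
               [-1.0,  4.0, -3.0]])

worst_case = np.min(q1, axis=1)   # worst outcome of each own action: [-2.0, 0.1, -3.0]
action_1 = np.argmax(worst_case)  # maximin choice: action 1
print(worst_case, action_1)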
class DeepQNetworkAgent(AgentBase):
    """ Represents a Snake agent powered by DQN with experience replay. """

    def __init__(self, model, num_last_frames=4, memory_size=1000, attention=1):
        """
        Create a new DQN-based agent.

        Args:
            model: a compiled DQN model.
            num_last_frames (int): the number of last frames the agent will consider.
            memory_size (int): memory size limit for experience replay (-1 for unlimited).
            attention (int): attention mode flag forwarded to ExperienceReplay
                (-1 disables the extra attention inputs).
        """
        #assert model.input_shape[0][1] == num_last_frames, 'Model input shape should be (num_frames, grid_size, grid_size)'
        #assert len(model.output_shape) == 2, 'Model output shape should be (num_samples, num_actions)'

        self.model = model
        self.num_last_frames = num_last_frames
        if attention != -1:
            self.memory = ExperienceReplay((num_last_frames,) + model.input_shape[0][-2:], model.output_shape[-1], memory_size, attention)
        else:
            self.memory = ExperienceReplay((num_last_frames,) + model.input_shape[-2:], model.output_shape[-1], memory_size, attention)
        self.frames = None
        self.attention = attention

    def begin_episode(self):
        """ Reset the agent for a new episode. """
        self.frames = None

    def get_last_frames(self, observation):
        """
        Get the pixels of the last `num_last_frames` observations, the current frame being the last.
        
        Args:
            observation: observation at the current timestep. 

        Returns:
            Observations for the last `num_last_frames` frames.
        """
        frame = observation
        if self.frames is None:
            self.frames = collections.deque([frame] * self.num_last_frames)
        else:
            self.frames.append(frame)
            self.frames.popleft()
        return np.expand_dims(self.frames, 0)

    def train(self, env, num_episodes=1000, batch_size=50, discount_factor=0.9, checkpoint_freq=None,
              exploration_range=(1.0, 0.1), exploration_phase_size=0.5):
        """
        Train the agent to perform well in the given Snake environment.
        
        Args:
            env:
                an instance of Snake environment.
            num_episodes (int):
                the number of episodes to run during the training.
            batch_size (int):
                the size of the learning sample for experience replay.
            discount_factor (float):
                discount factor (gamma) for computing the value function.
            checkpoint_freq (int):
                the number of episodes after which a new model checkpoint will be created.
            exploration_range (tuple):
                a (max, min) range specifying how the exploration rate should decay over time. 
            exploration_phase_size (float):
                the percentage of the training process at which
                the exploration rate should reach its minimum.
        """

        # Calculate the constant exploration decay speed for each episode.
        max_exploration_rate, min_exploration_rate = exploration_range
        exploration_decay = ((max_exploration_rate - min_exploration_rate) / (num_episodes * exploration_phase_size))
        exploration_rate = max_exploration_rate

        for episode in range(num_episodes):
            # Reset the environment for the new episode.
            timestep, position = env.new_episode()
            self.begin_episode()
            game_over = False
            loss = 0.0

            # Observe the initial state.
            state = self.get_last_frames(timestep.observation)

            while not game_over:
                if np.random.random() < exploration_rate:
                    # Explore: take a random action.
                    action = np.random.randint(env.num_actions)
                else:
                    # Exploit: take the best known action for this state.
                    s = np.array([(position[0], position[1])])
                    if self.attention == 0:
                        q = self.model.predict([state, state, s])
                    elif self.attention > 0:
                        q = self.model.predict([state, state])
                    else:
                        q = self.model.predict(state)
                    action = np.argmax(q[0])

                # Act on the environment.
                env.choose_action(action)
                timestep, position_next = env.timestep()

                # Remember a new piece of experience.
                reward = timestep.reward
                state_next = self.get_last_frames(timestep.observation)
                game_over = timestep.is_episode_end
                experience_item = [state, position, action, reward, state_next, position_next, game_over]
                self.memory.remember(*experience_item)
                state = state_next
                position = position_next

                # Sample a random batch from experience.
                batch = self.memory.get_batch(
                    model=self.model,
                    batch_size=batch_size,
                    discount_factor=discount_factor
                )
                # Learn on the batch.
                if batch:
                    inputs, s, targets = batch
                    #print(episode)
                    #print(inputs)
                    #print(targets)
                    if self.attention == 0:
                        loss += float(self.model.train_on_batch([inputs, inputs, s], targets))
                    elif self.attention > 0:
                        loss += float(self.model.train_on_batch([inputs, inputs], targets))
                    else:
                        loss += float(self.model.train_on_batch(inputs, targets))


            if checkpoint_freq and (episode % checkpoint_freq) == 0:
                #self.model.save(f'dqn-{episode:08d}.model')
                self.model.save('dqn-' + str(episode) + '.model')

            if exploration_rate > min_exploration_rate:
                exploration_rate -= exploration_decay

            summary = 'Episode {:5d}/{:5d} | Loss {:8.4f} | Exploration {:.2f} | ' + \
                      'Fruits {:2d} | Timesteps {:4d} | Total Reward {:4d}'
            print(summary.format(
                episode + 1, num_episodes, loss, exploration_rate,
                env.stats.fruits_eaten, env.stats.timesteps_survived, env.stats.sum_episode_rewards,
            ))
            print('Episode')

        self.model.save('dqn-final.model')

    def act(self, observation, position, reward, attention=1):
        """
        Choose the next action to take.

        Args:
            observation: observable state for the current timestep.
            position: (x, y) position returned by the environment for the current timestep.
            reward: reward received at the beginning of the current timestep.
            attention (int): attention mode; selects which inputs are fed to the model.

        Returns:
            The index of the action to take next.
        """
        state = self.get_last_frames(observation)
        s = np.array([(position[0], position[1])])
        if attention > 0:
            q = self.model.predict([state, state])[0]
        elif attention == 0:
            q = self.model.predict([state, state, s])[0]
        else:
            q = self.model.predict(state)[0]
        return np.argmax(q)

    def visualize(self, observation, position, reward, attention=1, visualize=None):
        """
        Visualize the model's attention for the current state.

        Args:
            observation: observable state for the current timestep.
            position: (x, y) position returned by the environment for the current timestep.
            reward: reward received at the beginning of the current timestep.
            attention (int): attention mode; selects which inputs are fed to the model.
            visualize: callable that renders the visualization for the given model inputs.

        Returns:
            The visualization, and additionally the index of the best action
            when attention is not positive.
        """
        state = self.get_last_frames(observation)
        s = np.array([(position[0], position[1])])
        if attention > 0:
            q = self.model.predict([state, state])[0]
            return visualize([state])
        else:
            q = self.model.predict([state, state, s])[0]
            return visualize([state, s]), np.argmax(q)
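As a worked example of the exploration schedule computed at the top of train() (the same linear decay also appears in the minimax and PyTorch agents), here is a small sketch with illustrative numbers: 1000 episodes, an exploration range of (1.0, 0.1) and a phase size of 0.5, so the rate drops by a constant amount each episode and bottoms out halfway through training.

max_rate, min_rate = 1.0, 0.1          # exploration_range
num_episodes, phase_size = 1000, 0.5   # illustrative values

decay = (max_rate - min_rate) / (num_episodes * phase_size)  # 0.0018 per episode

rate = max_rate
schedule = []
for episode in range(num_episodes):
    schedule.append(rate)
    if rate > min_rate:
        rate -= decay

# Roughly: 1.0 at episode 0, ~0.55 at episode 250, ~0.1 from episode 500 onwards.
print(schedule[0], schedule[250], schedule[500], schedule[-1])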
class DeepQNetworkAgent(AgentBase):
    """ Represents a Snake agent powered by DQN with experience replay. """
    def __init__(self,
                 model,
                 env_shape,
                 num_actions,
                 num_last_frames=4,
                 memory_size=1000):
        """
        Create a new DQN-based agent.

        Args:
            model: a DQN model.
            env_shape (int, int): shape of the environment.
            num_actions (int): number of actions.
            num_last_frames (int): the number of last frames the agent will consider.
            memory_size (int): memory size limit for experience replay (-1 for unlimited).
        """
        self.model = model
        self.loss_fn = nn.MSELoss()
        self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.001)

        self.num_last_frames = num_last_frames
        self.memory = ExperienceReplay((num_last_frames, ) + env_shape,
                                       num_actions, memory_size)
        self.frames = None

    def begin_episode(self):
        """ Reset the agent for a new episode. """
        self.frames = None

    def get_last_frames(self, observation):
        """
        Get the pixels of the last `num_last_frames` observations, the current frame being the last.

        Args:
            observation: observation at the current timestep.

        Returns:
            Observations for the last `num_last_frames` frames.
        """
        frame = observation
        if self.frames is None:
            self.frames = collections.deque([frame] * self.num_last_frames)
        else:
            self.frames.append(frame)
            self.frames.popleft()
        return np.expand_dims(self.frames, 0)

    def train(
            self,
            env,
            num_episodes=1000,
            batch_size=50,
            discount_factor=0.9,
            checkpoint_freq=None,
            exploration_range=(1.0, 0.1),
            exploration_phase_size=0.5,
    ):
        """
        Train the agent to perform well in the given Snake environment.

        Args:
            env:
                an instance of Snake environment.
            num_episodes (int):
                the number of episodes to run during the training.
            batch_size (int):
                the size of the learning sample for experience replay.
            discount_factor (float):
                discount factor (gamma) for computing the value function.
            checkpoint_freq (int):
                the number of episodes after which a new model checkpoint will be created.
            exploration_range (tuple):
                a (max, min) range specifying how the exploration rate should decay over time.
            exploration_phase_size (float):
                the percentage of the training process at which
                the exploration rate should reach its minimum.
        """

        # Calculate the constant exploration decay speed for each episode.
        max_exploration_rate, min_exploration_rate = exploration_range
        exploration_decay = (max_exploration_rate - min_exploration_rate) / (
            num_episodes * exploration_phase_size)
        exploration_rate = max_exploration_rate

        for episode in range(num_episodes):
            # Reset the environment for the new episode.
            timestep = env.new_episode()
            self.begin_episode()
            game_over = False
            loss = 0.0

            # Observe the initial state.
            state = self.get_last_frames(timestep.observation)
            while not game_over:
                if np.random.random() < exploration_rate:
                    # Explore: take a random action.
                    action = np.random.randint(env.num_actions)
                else:
                    # Exploit: take the best known action for this state.
                    q = self.model(torch.Tensor(state))
                    action = np.argmax(q[0].detach()).item()

                # Act on the environment.
                env.choose_action(action)
                timestep = env.timestep()

                # Remember a new piece of experience.
                reward = timestep.reward
                state_next = self.get_last_frames(timestep.observation)
                game_over = timestep.is_episode_end
                experience_item = [
                    state, action, reward, state_next, game_over
                ]
                self.memory.remember(*experience_item)
                state = state_next

                # Sample a random batch from experience.
                batch = self.memory.get_batch(
                    model=self.model,
                    batch_size=batch_size,
                    discount_factor=discount_factor,
                )
                # Learn on the batch.
                if batch:
                    inputs, targets = batch
                    self.optimizer.zero_grad()
                    predictions = self.model(torch.Tensor(inputs))
                    batch_loss = self.loss_fn(predictions,
                                              torch.Tensor(targets))
                    # Accumulate a plain float for logging so the graph is not retained.
                    loss += batch_loss.item()
                    # Backpropagation
                    batch_loss.backward()
                    self.optimizer.step()

            if checkpoint_freq and (episode % checkpoint_freq) == 0:
                torch.save(self.model, f"dqn-{episode:08d}.model")

            if exploration_rate > min_exploration_rate:
                exploration_rate -= exploration_decay

            summary = (
                "Episode {:5d}/{:5d} | Loss {:8.4f} | Exploration {:.2f} | " +
                "Fruits {:2d} | Timesteps {:4d} | Total Reward {:4d}")
            print(
                summary.format(
                    episode + 1,
                    num_episodes,
                    loss,
                    exploration_rate,
                    env.stats.fruits_eaten,
                    env.stats.timesteps_survived,
                    env.stats.sum_episode_rewards,
                ))

        torch.save(self.model, "dqn-final.model")

    def act(self, observation, reward):
        """
        Choose the next action to take.

        Args:
            observation: observable state for the current timestep.
            reward: reward received at the beginning of the current timestep.

        Returns:
            The index of the action to take next.
        """
        state = self.get_last_frames(observation)
        with torch.no_grad():
            q = self.model(torch.Tensor(state))
        action = np.argmax(q[0]).item()
        return action
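The ExperienceReplay class itself is not shown in these examples. For reference, here is a minimal numpy sketch of the one-step Q-learning targets that get_batch is presumed to build from (state, action, reward, next_state, done) samples, using the discount_factor passed into train(). The function and argument names are illustrative, not the project's API.

import numpy as np

def build_targets(predict_fn, states, actions, rewards, states_next, dones,
                  discount_factor=0.9):
    # Keep the model's current predictions for actions that were not taken,
    # and overwrite the taken action with r + gamma * max_a' Q(s', a')
    # (or just r when the episode ended).
    targets = predict_fn(states)                  # shape (batch, num_actions)
    q_next = predict_fn(states_next).max(axis=1)  # best next-state value
    returns = rewards + discount_factor * q_next * (1.0 - dones)
    targets[np.arange(len(actions)), actions] = returns
    return targets

# Tiny check with a dummy predictor that always returns zeros for 2 actions:
# the taken actions receive 1.0, 0.0 and -1.0, all other entries stay 0.
dummy = lambda s: np.zeros((len(s), 2))
print(build_targets(dummy, np.zeros((3, 1)), np.array([0, 1, 0]),
                    np.array([1.0, 0.0, -1.0]), np.zeros((3, 1)),
                    np.array([0.0, 0.0, 1.0])))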
class DeepQNetworkAgent(AgentBase):
    """ Represents a Snake agent powered by DQN with experience replay. """
    def __init__(self, model, num_last_frames=4, memory_size=1000, output="."):
        """
        Create a new DQN-based agent.
        
        Args:
            model: a compiled DQN model.
            num_last_frames (int): the number of last frames the agent will consider.
            memory_size (int): memory size limit for experience replay (-1 for unlimited). 
            output (str): folder path to output model files.
        """
        assert model[0].input_shape[
            1] == num_last_frames, 'Model input shape should be (num_frames, grid_size, grid_size)'
        assert len(
            model[0].output_shape
        ) == 2, 'Model output shape should be (num_samples, num_actions)'

        self.model = model
        self.num_last_frames = num_last_frames
        self.memory = ExperienceReplay(
            (num_last_frames, ) + model[0].input_shape[-2:],
            model[0].output_shape[-1], memory_size)
        self.frames = None
        self.output = output
        self.num_frames = 0
        self.num_trained_frames = 0

    def begin_episode(self):
        """ Reset the agent for a new episode. """
        self.frames = None

    def get_last_frames(self, observation):
        """
        Get the pixels of the last `num_last_frames` observations, the current frame being the last.
        
        Args:
            observation: observation at the current timestep. 

        Returns:
            Observations for the last `num_last_frames` frames.
        """
        frame = observation
        if self.frames is None:
            self.frames = collections.deque([frame] * self.num_last_frames)
        else:
            self.frames.append(frame)
            self.frames.popleft()
        return np.expand_dims(self.frames, 0)

    def train(self,
              env,
              num_episodes=1000,
              batch_size=50,
              discount_factor=0.9,
              checkpoint_freq=None,
              method='dqn',
              multi_step='False'):
        """
        Train the agent to perform well in the given Snake environment.
        
        Args:
            env:
                an instance of Snake environment.
            num_episodes (int):
                the number of episodes to run during the training.
            batch_size (int):
                the size of the learning sample for experience replay.
            discount_factor (float):
                discount factor (gamma) for computing the value function.
            checkpoint_freq (int):
                the number of episodes after which a new model checkpoint will be created.
            method (str):
                learning method: 'dqn' (always updates model[0]) or 'ddqn'
                (randomly picks one of the two models to update each episode).
            multi_step:
                multi-step flag forwarded to the experience replay buffer.
        """
        timestamp = time.strftime('%Y%m%d-%H%M%S')

        episode = 0
        while episode != num_episodes:
            episode += 1
            exploration_rate = 1 - 0.00009 * episode if episode < 10000 else (
                10 / np.sqrt(episode))

            # Reset the environment for the new episode.
            timestep = env.new_episode()
            self.begin_episode()
            game_over = False
            loss = 0.0
            model_to_udate = np.random.randint(0, 2) if method == 'ddqn' else 0

            # Observe the initial state.
            state = self.get_last_frames(timestep.observation)

            while not game_over:
                if np.random.random() < exploration_rate:
                    # Explore: take a random action.
                    action = np.random.randint(env.num_actions)
                else:
                    # Exploit: take the best known action for this state.
                    q = self.model[model_to_udate].predict(state)
                    action = np.argmax(q[0])

                # Act on the environment.
                env.choose_action(action)
                timestep = env.timestep()

                # Remember a new piece of experience.
                reward = timestep.reward
                state_next = self.get_last_frames(timestep.observation)

                if np.random.random() < exploration_rate:
                    # Explore: take a random action.
                    action_next = np.random.randint(env.num_actions)
                else:
                    # Exploit: take the best known action for this state.
                    q = self.model[model_to_udate].predict(state_next)
                    action_next = np.argmax(q[0])

                game_over = timestep.is_episode_end
                experience_item = [
                    state, action, reward, state_next, action_next, game_over
                ]
                self.memory.remember(*experience_item)
                state = state_next

                # Sample a random batch from experience.
                batch = self.memory.get_batch(
                    model=self.model,
                    batch_size=batch_size,
                    exploration_rate=exploration_rate,
                    discount_factor=discount_factor,
                    method=method,
                    model_to_udate=model_to_udate,
                    multi_step=multi_step)

                # Learn on the batch.
                if batch:
                    inputs, targets = batch
                    self.num_trained_frames += targets.size
                    loss += float(self.model[model_to_udate].train_on_batch(
                        inputs, targets))

                if Config.PRIORITIZED_REPLAY:
                    # Sample a random batch from experience.
                    batch = self.memory.get_batch(
                        model=self.model,
                        batch_size=batch_size,
                        exploration_rate=exploration_rate,
                        discount_factor=discount_factor,
                        method=method,
                        model_to_udate=model_to_udate,
                        multi_step=multi_step,
                        get_latest_replay=True)

                    # Learn on the batch.
                    if batch:
                        inputs, targets = batch
                        self.num_trained_frames += targets.size
                        replay_loss = float(
                            self.model[model_to_udate].train_on_batch(
                                inputs, targets))
                        input_loss = np.minimum(10, int(replay_loss))
                        self.memory.remember_prioritized_ratio(
                            np.ceil(
                                np.power(input_loss + 1,
                                         Config.PRIORITIZED_RATING)))

                        with open(f'{self.output}/training-loss.txt',
                                  'a') as f:
                            with redirect_stdout(f):
                                print(episode, self.num_frames, replay_loss)

            if checkpoint_freq and (episode % checkpoint_freq) == 0:
                self.model[0].save(f'{self.output}/dqn-{episode:08d}.model')
                self.evaluate(env,
                              trained_episode=episode,
                              num_test_episode=15)

            self.num_frames += env.stats.timesteps_survived

            summary = 'Episode {:5d}/{:5d} | Loss {:8.4f} | Exploration {:.3f} | ' + \
                      'Fruits {:2d} | Timesteps {:4d} | Reward {:4d} | ' + \
                      'Memory {:6d} | Total Timesteps {:6d} | Trained Frames {:11d}'

            print(
                summary.format(episode, num_episodes, loss,
                               exploration_rate, env.stats.fruits_eaten,
                               env.stats.timesteps_survived,
                               env.stats.sum_episode_rewards,
                               len(self.memory.memory), self.num_frames,
                               self.num_trained_frames))
            with open(f'{self.output}/training-log.txt', 'a') as f:
                with redirect_stdout(f):
                    print(
                        summary.format(episode, num_episodes, loss,
                                       exploration_rate,
                                       env.stats.fruits_eaten,
                                       env.stats.timesteps_survived,
                                       env.stats.sum_episode_rewards,
                                       len(self.memory.memory),
                                       self.num_frames,
                                       self.num_trained_frames))

        self.model[0].save(f'{self.output}/dqn-final.model')
        self.evaluate(env, trained_episode=episode, num_test_episode=15)
        print('Training End - saved to ' + str(self.output))

    def act(self, observation, reward):
        """
        Choose the next action to take.
        
        Args:
            observation: observable state for the current timestep. 
            reward: reward received at the beginning of the current timestep.

        Returns:
            The index of the action to take next.
        """
        state = self.get_last_frames(observation)
        q = self.model[0].predict(state)[0]
        return np.argmax(q)

    def evaluate(self, env, trained_episode, num_test_episode):
        """
        Play a set of episodes using the specified Snake agent.
        Use the non-interactive command-line interface and print the summary statistics afterwards.
        
        Args:
            env: an instance of Snake environment.
            trained_episode (int): trained episodes.
            num_test_episode (int): the number of episodes to run.
        """

        fruit_stats = []
        timestep_stats = []
        reward_stats = []

        print()
        print('Playing:')

        for episode in range(num_test_episode):
            timestep = env.new_episode()
            self.begin_episode()
            game_over = False

            while not game_over:
                action = self.act(timestep.observation, timestep.reward)
                env.choose_action(action)
                timestep = env.timestep()
                game_over = timestep.is_episode_end

            fruit_stats.append(env.stats.fruits_eaten)
            timestep_stats.append(env.stats.timesteps_survived)
            reward_stats.append(env.stats.sum_episode_rewards)

            summary = 'Episode {:3d} / {:3d} | Timesteps {:4d} | Fruits {:2d} | Reward {:3d}'
            print(summary.format(episode + 1, num_test_episode,
                                 env.stats.timesteps_survived,
                                 env.stats.fruits_eaten,
                                 env.stats.sum_episode_rewards))

        print('Fruits eaten {:.1f} +/- stddev {:.1f}'.format(
            np.mean(fruit_stats), np.std(fruit_stats)))
        print('Reward {:.1f} +/- stddev {:.1f}'.format(np.mean(reward_stats),
                                                       np.std(reward_stats)))
        print()

        with open(f'{self.output}/training-stat.txt', 'a') as f:
            with redirect_stdout(f):
                summary = 'Episode {:7d} | Average Timesteps {:4.0f} | Average Fruits {:.1f} | Average Reward {:.1f}'
                print(
                    summary.format(trained_episode, np.mean(timestep_stats),
                                   np.mean(fruit_stats),
                                   np.mean(reward_stats)))
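Unlike the other agents, this example hard-codes its exploration schedule inside train(). For reference, a short sketch of that schedule: linear decay over the first 10,000 episodes, then a slow inverse-square-root tail.

import numpy as np

def exploration_rate(episode):
    # Mirrors the schedule in train() above.
    return 1 - 0.00009 * episode if episode < 10000 else 10 / np.sqrt(episode)

for episode in (1, 5000, 9999, 10000, 40000, 250000):
    print(episode, round(exploration_rate(episode), 3))
# 1 -> 1.0, 5000 -> 0.55, 9999 -> 0.1, 10000 -> 0.1, 40000 -> 0.05, 250000 -> 0.02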