Example #1
import numpy as np
import torch
from PIL import Image

# ReplayBuffer, OUActionNoise, ActorNet, CriticNet and RLPlots are project-local
# helpers that are not shown in this example.


class DDPGAgent(object):
    def __init__(self,
                 alpha,
                 beta,
                 tau,
                 gamma,
                 state_space,
                 l1_size,
                 l2_size,
                 l3_size,
                 l4_size,
                 action_space,
                 env,
                 brain_name,
                 multibrain,
                 version,
                 mem_capacity=1e6,
                 batch_size=128,
                 multiagent=False,
                 n_agents=None,
                 eval=False):

        # Initialize memory
        self.batch_size = batch_size
        self.memory = ReplayBuffer(mem_capacity)

        # Initialize noise
        # In case of a multiagent environment, create a separate noise object for each agent
        self.noise = [OUActionNoise(np.zeros(action_space)) for i in range(n_agents)] if multiagent else \
                    OUActionNoise(np.zeros(action_space))

        # Setup device used for torch computations
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')

        # Create actor critic and target networks
        self.actor = ActorNet(alpha,
                              state_space,
                              l1_size,
                              l2_size,
                              l3_size,
                              l4_size,
                              action_space,
                              name='actor_' + version + '_ddpg_model').to(
                                  self.device)
        self.target_actor = ActorNet(alpha, state_space, l1_size, l2_size,
                                     l3_size, l4_size,
                                     action_space).to(self.device)

        self.critic = CriticNet(beta,
                                state_space,
                                l1_size,
                                l2_size,
                                l3_size,
                                l4_size,
                                action_space,
                                name='critic_' + version + '_ddpg_model').to(
                                    self.device)
        self.target_critic = CriticNet(beta, state_space, l1_size, l2_size,
                                       l3_size, l4_size,
                                       action_space).to(self.device)

        # Initialize target nets to be identical to actor and critic networks
        self.init_networks()

        # Target networks are set to eval mode, since they are never trained directly
        # but only updated through update_target_networks()
        self.target_actor.eval()
        self.target_critic.eval()

        # Set global parameters
        self.gamma = gamma
        self.env = env
        self.tau = tau
        self.eval = eval
        self.state_space = state_space
        self.action_space = action_space
        self.multiagent = multiagent
        self.multibrain = multibrain
        self.brain_name = brain_name
        self.n_agents = n_agents if self.multiagent else None

        # Initialize plotter for showing live training graphs and saving them
        self.plotter = RLPlots('ddpg_training')

    # Makes target network params identical to actor and critic network params
    def init_networks(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    # Saves models
    def save_models(self):
        self.actor.save_model()
        self.critic.save_model()

    # Exports an onnx model of the actor network
    def save_onnx_model(self):
        # Create a dummy input to pass through the model and export the current actor model
        dummy_input = torch.randn(1, 1, 1, self.state_space).to(self.device)
        torch.onnx.export(self.actor,
                          dummy_input,
                          './models/ddpg_model.onnx',
                          verbose=True,
                          input_names=['vector_observation'],
                          output_names=['action'])
        print('ONNX model saved to ./models/ddpg_model.onnx')

    # Passes the state through the actor network to get actions
    def choose_action(self, state):
        # Ensure the actor is set to eval mode, as we do not want to train when taking actions
        self.actor.eval()

        # Convert state to tensor and send to device
        state = torch.tensor(state).float().to(self.device)

        # Pass state through actor network to get actions
        actions = self.actor(state)

        # In case of multiagent environment, stack a noise vector
        # to have the same shape as the actions matrix (n_agents x num_actions)
        if self.multiagent:
            noise = np.vstack(tuple(ounoise() for ounoise in self.noise))
            noise = torch.tensor(noise).float().to(self.device)
        else:
            noise = torch.tensor(self.noise()).float().to(self.device)

        # Add noise to actions to get final mu action values
        actions = actions + noise

        # Send to cpu, detach from the computation graph and convert to a NumPy array
        # for the environment
        return actions.cpu().detach().numpy()

    # Stores transitions (single or multiple depending on multiagent environment)
    def store_transitions(self, state, action, reward, state_, done):

        if self.multiagent:
            for i in range(self.n_agents):
                self.memory.add_transition(state[i], action[i], reward[i],
                                           state_[i], int(done[i]))
        else:
            self.memory.add_transition(state, action, reward, state_,
                                       int(done))

    # This function samples a batch of transitions from memory, and
    # puts them onto the device defined by self.device
    def sample_transitions(self):
        batch = self.memory.sample_batch(self.batch_size)

        # 'reward' and 'done' are unsqueezed to get the right dimensions, (batch_size, 1) instead of (batch_size,)
        state = torch.tensor(batch.state).float().to(self.device)
        action = torch.tensor(batch.action).float().to(self.device)
        reward = torch.tensor(batch.reward).unsqueeze(dim=1).float().to(
            self.device)
        state_ = torch.tensor(batch.state_).float().to(self.device)
        done = torch.tensor(batch.done).unsqueeze(dim=1).float().to(
            self.device)

        return state, action, reward, state_, done

    def learn(self):
        # Only start learning when there are enough transitions stored for a full batch
        if self.memory.pointer < self.batch_size:
            return

        # Sample random batch of transitions
        state, action, reward, state_, done = self.sample_transitions()

        # We first handle the critic update, then the actor update
        # To update the critic, set the actor to eval and the critic to train mode
        self.actor.eval()
        self.critic.train()

        # Order of operation is:
        # 1) zero_grad > 2) forward pass > 3) loss > 4) backward pass > 5) optimizer step

        # 1) Zero grad on Critic
        self.critic.optim.zero_grad()

        # 2) Forward pass
        # Calculate critic predicted and target values
        critic_pred = self.critic(state, action)
        critic_target = reward + self.gamma * self.target_critic(
            state_, self.target_actor(state_)) * (1 - done)

        # 3) Calculate loss
        # Calculate the critic loss between predicted and target values
        critic_loss = self.critic.loss(critic_pred, critic_target)

        # 4) Backward pass
        critic_loss.backward()

        # 5) Optimizer step
        self.critic.optim.step()

        # Now switch train and eval mode again on actor and critic to
        # do the actor update
        self.actor.train()
        self.critic.eval()

        # 1) Zero grad on actor
        self.actor.optim.zero_grad()

        # 2) Forward pass
        # Pass the states through the actor to get the policy actions mu
        mu = self.actor(state)

        # 3) Calculate loss
        actor_loss = torch.mean(-self.critic(state, mu))

        # 4) Backward pass
        actor_loss.backward()

        # 5) Optimizer step
        self.actor.optim.step()

        # Set actor back to eval mode
        self.actor.eval()

        # Update the target networks
        self.update_target_networks()

    # Steps through the environment with the given actions and returns the next state, reward, done flag
    # and whether the maximum number of steps has been reached. Also returns a visual observation for saving gifs of episodes
    def env_step(self, actions):
        vis_obs = None

        if self.multibrain:
            # In case of multibrain, we provide the actions as a dictionary, with the brain
            # name being the key, and actions as the value
            actions = {
                self.brain_name[0]: actions[1:],
                self.brain_name[1]: actions[0]
            }
            env_info = self.env.step(actions)
            env_info_b0 = env_info[self.brain_name[1]]
            env_info_b1 = env_info[self.brain_name[0]]

            # Stack and concatenate the next state, rewards, done and max reached flags for the different brains
            state_ = np.vstack((env_info_b0.vector_observations,
                                env_info_b1.vector_observations))
            rewards = env_info_b0.rewards + env_info_b1.rewards
            done = env_info_b0.local_done + env_info_b1.local_done
            max_reached = env_info_b0.max_reached + env_info_b1.max_reached

            # Collect visual observation from 'WalkerVis' brain
            vis_obs = env_info_b0.visual_observations[0][0]
        else:
            # In case of single brain, simply collect next state, rewards, done and max reached flags
            env_info = self.env.step(actions)[self.brain_name]
            state_ = env_info.vector_observations
            rewards = env_info.rewards
            done = env_info.local_done
            max_reached = env_info.max_reached

        return state_, rewards, done, max_reached, vis_obs

    # Trains in either a multi-agent or single-agent environment
    # self.eval controls whether the agent only plays the game or is actually trained
    def train(self, num_eps):
        if self.eval:
            self.actor.eval()
            self.critic.eval()

        if self.multiagent:
            self.train_multi(num_eps)
        else:
            self.train_single(num_eps)

    # Training loop for environment with single agent
    def train_single(self, num_eps):

        # Keep track of scores
        scores = []
        avg_ph_scores = []

        # Save interval
        save_interval = 100

        # Plot interval
        plot_interval = 200

        # The minimum score to save a gif
        gif_score_threshold = 100

        # Play number of episodes
        for ep_num in range(num_eps):

            done = False
            state = self.env.reset()[self.brain_name].vector_observations

            ep_score = 0

            # Initialize frame list for saving gifs
            frames = []

            # Keep playing until done
            while not done:
                # Pick action using actor network
                actions = self.choose_action(state)
                # Take action and observe next state and reward
                state_, rewards, done, _, vis_obs = self.env_step(actions)
                # Store transition into memory
                self.store_transitions(state, actions, rewards, state_, done)
                # Sample batch of transitions and train networks (if eval mode is off)
                if not self.eval:
                    self.learn()
                else:
                    frames.append(vis_obs)

                ep_score += rewards

                # Set next state to now be the current state
                state = state_

            scores.append(ep_score)
            avg_ph_scores.append(np.mean(scores[-100:]))

            print(
                f'Episode: {ep_num}\n\tScore: {ep_score}\n\tAvg past 100 score: {avg_ph_scores[-1]}'
            )

            # Reset noise each episode
            self.reset_noise()

            # Save models every save_interval episodes
            if ep_num % save_interval == 0:
                self.save_models()

            # Plot average rewards every plot_interval episodes
            if ep_num % plot_interval == 0:
                self.plot_rewards(avg_ph_scores)

            # In eval mode, record a gif when the episode score exceeds the current
            # threshold; the threshold is then raised to the new best score
            if self.eval and ep_score > gif_score_threshold:
                self.save_gif(frames)
                gif_score_threshold = ep_score

        # Save the final plot
        self.plot_rewards(avg_ph_scores, save=True)

        # Close environment
        self.env.close()

    # Training loop for environment with multiple agents
    def train_multi(self, num_eps):

        # Keep track of scores
        scores = []
        avg_ph_scores = []

        # Save interval
        save_interval = 100

        # Plot interval
        plot_interval = 250

        # The minimum score to save a gif
        gif_score_threshold = 100

        # Play number of episodes
        for ep_num in range(num_eps):
            done = [False for i in range(self.n_agents)]
            # In case of multibrain environments, stack the initial state vectors of the brains
            if self.multibrain:
                env_init = self.env.reset()
                state_one = env_init[self.brain_name[0]].vector_observations
                state = env_init[self.brain_name[1]].vector_observations
                state = np.vstack((state, state_one))
            else:
                state = self.env.reset()[self.brain_name].vector_observations

            ep_score = 0

            # Initialize frame list for saving gifs
            frames = []

            # Keep playing until one of the agents is done
            while True not in done:
                # Pick action using actor network
                actions = self.choose_action(state)
                # Take action and observe next state and reward
                state_, rewards, done, _, vis_obs = self.env_step(actions)
                # Store transition into memory
                self.store_transitions(state, actions, rewards, state_, done)
                # Sample batch of transitions and train networks (if eval mode is off)
                if not self.eval:
                    self.learn()
                else:
                    frames.append(vis_obs)

                ep_score += np.mean(rewards)

                # Set next state to now be the current state
                state = state_

            scores.append(ep_score)
            avg_ph_scores.append(np.mean(scores[-100:]))

            print(
                f'Episode: {ep_num}\n\tScore: {ep_score}\n\tAvg past 100 score: {avg_ph_scores[-1]}'
            )

            # Reset noise each episode
            self.reset_noise()

            # Save models every save_interval episodes (skipped in eval mode)
            if ep_num % save_interval == 0 and not self.eval:
                self.save_models()

            # Plot average rewards every plot_interval episodes
            if ep_num % plot_interval == 0:
                self.plot_rewards(avg_ph_scores)

            # In eval mode, record a gif when the episode score exceeds the current
            # threshold; the threshold is then raised to the new best score
            if self.eval and ep_score > gif_score_threshold:
                self.save_gif(frames)
                gif_score_threshold = ep_score

        # Save the final plot
        self.plot_rewards(avg_ph_scores, save=True)

        # Close environment
        self.env.close()

    # This function updates the target networks according to the DDPG algorithm
    # theta^q' = tau * theta^q + (1-tau) * theta^q'
    # theta^mu' = tau * theta^mu + (1-tau) * theta^mu'
    # Where q and q' are the critic and target critic networks
    # and mu and mu' are the actor and target actor networks respectively
    def update_target_networks(self):
        # Load all four state dictionaries for the networks
        actor_params = self.actor.state_dict()
        target_actor_params = self.target_actor.state_dict()

        critic_params = self.critic.state_dict()
        target_critic_params = self.target_critic.state_dict()

        # Build a new state dict with a dictionary comprehension and load it into the target network
        self.target_actor.load_state_dict({
            key: (self.tau * params) +
            (1 - self.tau) * target_actor_params[key].clone()
            for key, params in actor_params.items()
        })

        self.target_critic.load_state_dict({
            key: (self.tau * params) +
            (1 - self.tau) * target_critic_params[key].clone()
            for key, params in critic_params.items()
        })

    # Resets the noise
    def reset_noise(self):
        if self.multiagent:
            for noise in self.noise:
                noise.reset()
        else:
            self.noise.reset()

    def plot_rewards(self, scores, save=False):
        self.plotter.plot_rewards(scores, save)

    # Save gif of an episode given a list of frames
    def save_gif(self, frames):
        print('Saving gif')
        frames = [
            Image.fromarray((frame * 255).astype(dtype=np.uint8))
            for frame in frames
        ]
        frames[0].save('episode.gif',
                       format='GIF',
                       append_images=frames[1:],
                       save_all=True,
                       duration=100,
                       loop=0)
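
Both examples rely on helper classes that are not shown (OUActionNoise, ReplayBuffer, ActorNet, CriticNet, RLPlots). The sketch below reconstructs the two simplest of them purely from the calls made above: a callable noise object with reset(), and a buffer exposing pointer, add_transition() and sample_batch() whose result has state, action, reward, state_ and done fields. The Ornstein-Uhlenbeck constants and all other internals are assumptions, not taken from the original project.

import random
from collections import deque, namedtuple

import numpy as np

Transition = namedtuple('Transition', ['state', 'action', 'reward', 'state_', 'done'])


class OUActionNoise(object):
    # Ornstein-Uhlenbeck noise process; theta, sigma and dt are assumed defaults
    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = self.theta * (self.mu - self.x_prev) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = self.x_prev + dx
        return self.x_prev

    def reset(self):
        self.x_prev = np.zeros_like(self.mu)


class ReplayBuffer(object):
    # Fixed-capacity transition store with the interface the agents above expect
    def __init__(self, capacity):
        self.buffer = deque(maxlen=int(capacity))
        self.pointer = 0  # number of transitions stored so far

    def add_transition(self, state, action, reward, state_, done):
        self.buffer.append(Transition(state, action, reward, state_, done))
        self.pointer += 1

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # Regroup by field so batch.state, batch.action, ... are arrays of shape (batch_size, ...)
        return Transition(*map(np.array, zip(*batch)))
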
Example #2
import numpy as np
import torch

# ReplayBuffer, OUActionNoise, ActorNet, CriticNet and RLPlots are project-local
# helpers that are not shown in this example.


class DDPGAgent(object):
    def __init__(self, alpha, beta, input_dims, tau, env, brain_name, gamma=.99,
                 n_actions=2, mem_capacity=1e6, layer1_size=400,
                 layer2_size=300, batch_size=64, multiagent=False,
                 n_agents=None, game_name='Rollerball'):

        # Initialize memory
        self.batch_size = batch_size
        self.memory = ReplayBuffer(mem_capacity)
        
        # Initialize noise
        self.noise = OUActionNoise(np.zeros(n_actions))

        # Setup device used for torch computations
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        # Create actor critic and target networks
        self.actor = ActorNet(alpha, input_dims, layer1_size, layer2_size, n_actions, name='actor_' + game_name + '_ddpg_model').to(self.device)
        self.target_actor = ActorNet(alpha, input_dims, layer1_size, layer2_size, n_actions).to(self.device)

        self.critic = CriticNet(beta, input_dims, layer1_size, layer2_size, n_actions, name='critic_' + game_name + '_ddpg_model').to(self.device)
        self.target_critic = CriticNet(beta, input_dims, layer1_size, layer2_size, n_actions).to(self.device)
        
        # Initialize target nets to be identical to actor and critic networks
        self.init_networks()

        # Target networks are set to eval mode, since they are never trained directly
        # but only updated through update_target_networks()
        self.target_actor.eval()
        self.target_critic.eval()

        # Set global parameters
        self.gamma = gamma
        self.env = env
        self.tau = tau
        self.state_space = input_dims
        self.action_space = n_actions
        self.multiagent = multiagent
        self.brain_name = brain_name
        if self.multiagent:
            self.n_agents = n_agents

        # Plotter object for showing live training graphs and saving them
        self.plotter = RLPlots('ddpg_training')

    # Makes target network params identical to actor and critic network params
    def init_networks(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    # Saves the actor and critic models
    def save_models(self):
        self.actor.save_model()
        self.critic.save_model()

    # Exports an ONNX model of the actor network
    def save_onnx_model(self):
        # Create a dummy input to pass through the model and export the current actor model
        dummy_input = torch.randn(1, self.state_space).to(self.device)
        torch.onnx.export(self.actor, dummy_input, './models/ddpg_onnx_model.onnx', verbose=True)

    # Passes the state through the actor network to get actions
    def choose_action(self, state):
        # Ensure the actor is set to eval mode, as we do not want to train when taking actions
        self.actor.eval()

        # Convert state to tensor and send to device
        state = torch.tensor(state).float().to(self.device)

        # Pass state through actor network to get actions
        actions = self.actor(state)

        # In case of a multiagent environment, tile the noise vector
        # so it has the same shape as the actions matrix (n_agents x num_actions)
        if self.multiagent:
            noise = np.tile(self.noise(), (self.n_agents, 1))
            noise = torch.tensor(noise).float().to(self.device)
        else:
            noise = torch.tensor(self.noise()).float().to(self.device)

        # Add noise to actions to get final mu action values
        actions = actions + noise

        # Send to cpu, detach from the computation graph and convert to a list
        # for the environment
        return actions.cpu().detach().tolist()

    # Stores transitions (single or multiple depending on multiagent environment)
    def store_transitions(self, state, action, reward, state_, done):
        if self.multiagent:
            for i in range(self.n_agents):
                self.memory.add_transition(state[i], action[i], reward[i], state_[i], int(done[i]))
        else:
            self.memory.add_transition(state, action, reward, state_, int(done))

    # This function samples a batch of transitions from memory, and 
    # puts them onto the device defined by self.device
    def sample_transitions(self):
        batch = self.memory.sample_batch(self.batch_size)

        # 'reward' and 'done' are unsqueezed to get the right dimensions, (batch_size, 1) instead of (batch_size,)
        state = torch.tensor(batch.state).float().to(self.device)
        action = torch.tensor(batch.action).float().to(self.device)
        reward = torch.tensor(batch.reward).unsqueeze(dim=1).float().to(self.device)
        state_ = torch.tensor(batch.state_).float().to(self.device)
        done = torch.tensor(batch.done).unsqueeze(dim=1).float().to(self.device)

        return state, action, reward, state_, done

    def learn(self):
        # Only start learning when there are enough transitions stored for a full batch
        if self.memory.pointer < self.batch_size:
            return

        # Sample random batch of transitions
        state, action, reward, state_, done = self.sample_transitions()

        # We first handle the critic update, then the actor update
        # To update the critic, set the actor to eval and the critic to train mode
        self.actor.eval()
        self.critic.train()

        # Order of operation is: 
        # 1) zero_grad > 2) forward pass > 3) loss > 4) backward pass > 5) optimizer step
        
        # 1) Zero grad on Critic
        self.critic.optim.zero_grad()

        # 2) Forward pass 
        # Calculate critic predicted and target values
        critic_pred = self.critic(state, action)
        critic_target = reward + self.gamma * self.target_critic(state_, self.target_actor(state_)) * (1 - done)
        
        # 3) Calculate loss
        # Calculate the critic loss between predicted and target values
        critic_loss = self.critic.loss(critic_pred, critic_target)
        
        # 4) Backward pass
        critic_loss.backward()
        
        # 5) Optimizer step
        self.critic.optim.step()

        # Now switch train and eval mode again on actor and critic to
        # do the actor update
        self.actor.train()
        self.critic.eval()

        # 1) Zero grad on actor
        self.actor.optim.zero_grad()
        
        # 2) Forward pass
        # Pass the states through the actor to get the policy actions mu
        mu = self.actor(state)
        
        # 3) Calculate loss
        actor_loss = torch.mean(-self.critic(state, mu))
        
        # 4) Backward pass
        actor_loss.backward()
        
        # 5) Optimizer step
        self.actor.optim.step()

        # Set actor back to eval mode
        self.actor.eval()

        # Update the target networks
        self.update_target_networks()

    # Steps through the environment with the given actions and returns the next state, reward, done flag
    # and whether the maximum number of steps has been reached
    def env_step(self, actions):
        env_info = self.env.step(actions)[self.brain_name]
        state_ = env_info.vector_observations
        rewards = env_info.rewards
        done = env_info.local_done
        max_reached = env_info.max_reached

        return state_, rewards, done, max_reached

    # Trains in either a multi-agent or single-agent environment
    # eval controls whether the agent only plays the game or is actually trained
    def train(self, num_eps, eval=False):
        if eval:
            self.actor.eval()
            self.critic.eval()

        if self.multiagent:
            self.train_multi(num_eps, eval)
        else:
            self.train_single(num_eps, eval)

    # Training loop for environment with single agent
    def train_single(self, num_eps, eval):
        
        # Keep track of scores
        scores = []
        avg_ph_scores = []

        # Save interval
        save_interval = 100

        # Plot interval
        plot_interval = 25

        # Play number of episodes
        for ep_num in range(num_eps):
            done = False
            state = self.env.reset()[self.brain_name].vector_observations

            score = 0

            # Keep playing until done
            while not done:
                # Pick action using actor network
                actions = self.choose_action(state)
                # Take action and observe next state and reward
                state_, rewards, done, _ = self.env_step(actions)
                # Store transition into memory
                self.store_transitions(state, actions, rewards, state_, done)
                # Sample batch of transitions and train networks (if eval mode is off)
                if not eval:
                    self.learn()

                if self.multiagent:
                    score += np.mean(rewards)
                else:
                    score += rewards

                state = state_

            scores.append(score)
            avg_ph_scores.append(np.mean(scores[-100:]))

            print(f'Episode: {ep_num}\n\tScore: {score}\n\tAvg past 100 score: {avg_ph_scores[-1]}')

            # Save models every save_interval episodes
            if ep_num % save_interval == 0:
                self.save_models()

            # Plot average rewards every plot_interval episodes
            if ep_num % plot_interval == 0:
                self.plot_rewards(avg_ph_scores)
        
        self.plot_rewards(avg_ph_scores, save=True)

        self.env.close()

    # Training loop for environment with multiple agents
    def train_multi(self, num_eps, eval):
        
        # Keep track of scores
        scores = []
        avg_ph_scores = []

        # Save interval
        save_interval = 100

        # Plot interval
        plot_interval = 20

        # Play number of episodes
        for ep_num in range(num_eps):
            done = [False for i in range(self.n_agents)]

            state = self.env.reset()[self.brain_name].vector_observations

            score = 0

            # Keep playing until one of the agents is done
            while True not in done:
                # Pick action using actor network
                actions = self.choose_action(state)
                # Take action and observe next state and reward
                state_, rewards, done, _ = self.env_step(actions)
                # Store transition into memory
                self.store_transitions(state, actions, rewards, state_, done)
                # Sample batch of transitions and train networks (if eval mode is off)
                if not eval:
                    self.learn()
                
                score += np.mean(rewards)

                state = state_

            scores.append(score)
            avg_ph_scores.append(np.mean(scores[-100:]))

            print(f'Episode: {ep_num}\n\tScore: {score}\n\tAvg past 100 score: {avg_ph_scores[-1]}')

            # Save models every save_interval episodes
            if ep_num % save_interval == 0:
                self.save_models()

            # Plot average rewards every plot_interval episodes
            if ep_num % plot_interval == 0:
                self.plot_rewards(avg_ph_scores)
        
        # Save the final plot
        self.plot_rewards(avg_ph_scores, save=True)

        # Close environment
        self.env.close()

    # This function updates the target networks according to the DDPG algorithm
    # theta^q' = tau * theta^q + (1-tau) * theta^q'
    # theta^mu' = tau * theta^mu + (1-tau) * theta^mu'
    # Where q and q' are the critic and target critic networks
    # and mu and mu' are the actor and target actor networks respectively
    def update_target_networks(self):
        # Load all four state dictionaries for the networks
        actor_params = self.actor.state_dict()
        target_actor_params = self.target_actor.state_dict()

        critic_params = self.critic.state_dict()
        target_critic_params = self.target_critic.state_dict()

        # Build a new state dict with a dictionary comprehension and load it into the target network
        self.target_actor.load_state_dict({ key:
            (self.tau * params) + (1 - self.tau) * target_actor_params[key].clone()
            for key, params in actor_params.items()
        })

        self.target_critic.load_state_dict({ key:
            (self.tau * params) + (1 - self.tau) * target_critic_params[key].clone()
            for key, params in critic_params.items()
        })

    # For plotting learning progress
    def plot_rewards(self, scores, save=False):
        self.plotter.plot_rewards(scores, save)
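
ActorNet and CriticNet are likewise not shown. The following is a minimal sketch of what they might look like for Example #2, inferred only from how they are called above (constructor arguments, the .optim and .loss attributes, save_model(), and the forward signatures); the layer layout, activations and save paths are assumptions, not the original implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class ActorNet(nn.Module):
    # Deterministic policy network: state -> action in [-1, 1]
    def __init__(self, alpha, input_dims, layer1_size, layer2_size, n_actions, name='actor_ddpg_model'):
        super(ActorNet, self).__init__()
        self.name = name
        self.fc1 = nn.Linear(input_dims, layer1_size)
        self.fc2 = nn.Linear(layer1_size, layer2_size)
        self.mu = nn.Linear(layer2_size, n_actions)
        self.optim = optim.Adam(self.parameters(), lr=alpha)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.mu(x))   # bound actions to [-1, 1]

    def save_model(self):
        # Save path assumed to match the ./models/ directory used by save_onnx_model
        torch.save(self.state_dict(), './models/' + self.name + '.pt')


class CriticNet(nn.Module):
    # Q-network: (state, action) -> scalar value
    def __init__(self, beta, input_dims, layer1_size, layer2_size, n_actions, name='critic_ddpg_model'):
        super(CriticNet, self).__init__()
        self.name = name
        self.fc1 = nn.Linear(input_dims, layer1_size)
        self.fc2 = nn.Linear(layer1_size + n_actions, layer2_size)  # action enters at the second layer
        self.q = nn.Linear(layer2_size, 1)
        self.optim = optim.Adam(self.parameters(), lr=beta)
        self.loss = nn.MSELoss()

    def forward(self, state, action):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(torch.cat([x, action], dim=1)))
        return self.q(x)

    def save_model(self):
        torch.save(self.state_dict(), './models/' + self.name + '.pt')

With those pieces in place, a driver for Example #2 might look like the snippet below. It assumes the older Unity ML-Agents Python API (env.reset() and env.step() return info objects keyed by brain name, which is what the class expects); the import path, environment binary and every hyperparameter value are illustrative assumptions only.

from mlagents.envs import UnityEnvironment  # import path assumed; older ML-Agents releases used this module

env = UnityEnvironment(file_name='./envs/Rollerball')  # hypothetical environment build
brain_name = env.brain_names[0]

agent = DDPGAgent(alpha=1e-4,        # actor learning rate
                  beta=1e-3,         # critic learning rate
                  input_dims=8,      # placeholder: size of the vector observation
                  tau=1e-3,          # soft target-update rate
                  env=env,
                  brain_name=brain_name,
                  n_actions=2,
                  batch_size=64)

agent.train(num_eps=500)             # pass eval=True to only run the learned policy without training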