class MaddpgAgent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):

        self.agents = [
            Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=random_seed),
            Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=random_seed)
        ]

        self.seed = random.seed(random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # self.soft_update(self.critic_local, self.critic_target, 1)
        # self.soft_update(self.actor_local, self.actor_target, 1)

    def act(self, states, add_noise=True):
        actions = [
            agent.act(state, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):

        # Shared replay buffer
        for i, _ in enumerate(self.agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:

            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        for agent in self.agents:
            agent.learn(experiences, gamma)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def save_checkpoint(self):
        for i, agent in enumerate(self.agents):
            agent.save_checkpoint(i)
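
# A minimal usage sketch for the MaddpgAgent class above (not part of the original
# source). The environment interface is an assumption: env.reset() returns one
# observation per agent, and env.step(actions) returns per-agent next observations,
# rewards and done flags, as in the two-player Tennis-style tasks this pattern targets.
def run_episode(env, maddpg_agent, max_steps=1000):
    states = env.reset()
    maddpg_agent.reset()
    scores = [0.0] * len(maddpg_agent.agents)
    for _ in range(max_steps):
        actions = maddpg_agent.act(states, add_noise=True)
        next_states, rewards, dones = env.step(actions)
        maddpg_agent.step(states, actions, rewards, next_states, dones)
        scores = [s + r for s, r in zip(scores, rewards)]
        states = next_states
        if any(dones):
            break
    return max(scores)
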
    def test_len(self):
        """Simple test for length."""
        for b in sample(range(100), 3):
            for bs in sample(range(100), 3):
                rb = ReplayBuffer(buffer_size=b, batch_size=bs)
                # len at beginning is 0
                self.assertEqual(len(rb), 0)
                # after adding 1 element, length is 1
                rb.add(state=1, action=1, reward=1, next_state=1, done=1)
                self.assertEqual(len(rb), 1)
class Maddpg():
    '''MADDPG Agent : Interacts with and learns from the environment'''
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Instantiate multiple agents
        self.agents = [
            Agent(state_size, action_size, random_seed, num_agents)
            for i in range(num_agents)
        ]

        # Instantiate Memory replay Buffer (shared between agents)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def reset(self):
        '''reset agents'''
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise):
        '''Return the action to perform for each agent (per its policy)'''
        return [
            agent.act(state, noise)
            for agent, state in zip(self.agents, states)
        ]

    def step(self, states, actions, rewards, next_states, dones,
             num_current_episode):
        '''Save experience in replay memory, and use random sample from buffer to learn'''
        self.memory.add(encode(states), encode(actions), rewards,
                        encode(next_states), dones)

        # If enough samples in the replay memory and if it is time to update
        if (len(self.memory) > BATCH_SIZE) and (num_current_episode %
                                                UPDATE_EVERY_NB_EPISODE == 0):

            # Note: this code only expects 2 agents
            assert (len(self.agents) == 2)

            # Learn several times in a row within the same episode
            for i in range(MULTIPLE_LEARN_PER_UPDATE):
                # Sample a batch of experience from the replay buffer
                experiences = self.memory.sample()
                # Update Agent #0
                self.maddpg_learn(experiences, own_idx=0, other_idx=1)
                # Sample another batch of experience from the replay buffer
                experiences = self.memory.sample()
                # Update Agent #1
                self.maddpg_learn(experiences, own_idx=1, other_idx=0)

    def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA):
        states, actions, rewards, next_states, dones = experiences
        # Extract the OWN agent's states, actions and next_states from the batch
        own_states = decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx,
                             actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx,
                                 next_states)
        # Extract the OTHER agent's states, actions and next_states from the batch
        other_states = decode(self.state_size, self.num_agents, other_idx,
                              states)
        other_actions = decode(self.action_size, self.num_agents, other_idx,
                               actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx,
                                   next_states)
        # Concatenate both agent information (own agent first, other agent in second position)
        all_states = torch.cat((own_states, other_states), dim=1).to(device)
        all_actions = torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((own_next_states, other_next_states),
                                    dim=1).to(device)

        agent = self.agents[own_idx]

        # Update Critic
        # Get predicted next-state actions and Q values from target models
        all_next_actions = torch.cat(
            (agent.actor_target(own_next_states),
             agent.actor_target(other_next_states)),
            dim=1).to(device)
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # Update Actor
        # Compute actor loss
        all_actions_pred = torch.cat(
            (agent.actor_local(own_states),
             agent.actor_local(other_states).detach()),
            dim=1).to(device)
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # Update target networks
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)

    def checkpoints(self):
        '''Save checkpoints for all Agents'''
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'model_dir/checkpoint_actor_local_' + str(
                idx) + '.pth'
            critic_local_filename = 'model_dir/checkpoint_critic_local_' + str(
                idx) + '.pth'
            actor_target_filename = 'model_dir/checkpoint_actor_target_' + str(
                idx) + '.pth'
            critic_target_filename = 'model_dir/checkpoint_critic_target_' + str(
                idx) + '.pth'
            torch.save(agent.actor_local.state_dict(), actor_local_filename)
            torch.save(agent.critic_local.state_dict(), critic_local_filename)
            torch.save(agent.actor_target.state_dict(), actor_target_filename)
            torch.save(agent.critic_target.state_dict(),
                       critic_target_filename)
class Maddpg():
    """MADDPG Agent : Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize a MADDPG Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        
        super(Maddpg, self).__init__()
        
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        
        # Instantiate multiple agents
        self.agents = [ Agent(state_size,action_size, random_seed, num_agents) 
                       for i in range(num_agents) ]
        
        # Instantiate Memory replay Buffer (shared between agents)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
                  
    def reset(self):
        """Reset all the agents"""
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise):
        """Return action to perform for each agents (per policy)"""        
        return [ agent.act(state, noise) for agent, state in zip(self.agents, states) ]
                
    
    def step(self, states, actions, rewards, next_states, dones, num_current_episode):
        """ # Save experience in replay memory, and use random sample from buffer to learn"""
 
        self.memory.add(encode(states), 
                        encode(actions), 
                        rewards,
                        encode(next_states),
                        dones)

        # If enough samples in the replay memory and if it is time to update
        if (len(self.memory) > BATCH_SIZE) and (num_current_episode % UPDATE_EVERY_NB_EPISODE ==0) :
            
            # Note: this code only expects 2 agents
            assert(len(self.agents)==2)
            
            # Learn several times in a row within the same episode
            for i in range(MULTIPLE_LEARN_PER_UPDATE):
                # Sample a batch of experience from the replay buffer 
                experiences = self.memory.sample()   
                # Update Agent #0
                self.maddpg_learn(experiences, own_idx=0, other_idx=1)
                # Sample another batch of experience from the replay buffer 
                experiences = self.memory.sample()   
                # Update Agent #1
                self.maddpg_learn(experiences, own_idx=1, other_idx=0)
                
    
    def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA):
        """
        Update the policy of the MADDPG "own" agent. The actors only have access to their
        own agent's information, whereas the critics have access to all agents' information.
        
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> action
            critic_target(all_states, all_actions) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            own_idx (int) : index of the own agent to update in self.agents
            other_idx (int) : index of the other agent in self.agents
            gamma (float): discount factor
        """
        
        states, actions, rewards, next_states, dones = experiences
               
        # Extract the OWN agent's states, actions and next_states from the batch
        own_states =  decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states) 
                
        # Extract the OTHER agent's states, actions and next_states from the batch
        other_states =  decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size, self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)
        
        # Concatenate both agent information (own agent first, other agent in second position)
        all_states=torch.cat((own_states, other_states), dim=1).to(device)
        all_actions=torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states=torch.cat((own_next_states, other_next_states), dim=1).to(device)
   
        agent = self.agents[own_idx]
        
            
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models        
        all_next_actions = torch.cat((agent.actor_target(own_next_states), agent.actor_target(other_next_states)),
                                     dim=1).to(device)
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)
        
        
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        
        # Compute critic loss
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        all_actions_pred = torch.cat((agent.actor_local(own_states), agent.actor_local(other_states).detach()),
                                     dim = 1).to(device)      
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()
        
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()        
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)                   
    
    
                        
    def checkpoints(self):
        """Save checkpoints for all Agents"""
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'models/checkpoint_actor_local_' + str(idx) + '.pth'
            critic_local_filename = 'models/checkpoint_critic_local_' + str(idx) + '.pth'           
            actor_target_filename = 'models/checkpoint_actor_target_' + str(idx) + '.pth'
            critic_target_filename = 'models/checkpoint_critic_target_' + str(idx) + '.pth'            
            torch.save(agent.actor_local.state_dict(), actor_local_filename) 
            torch.save(agent.critic_local.state_dict(), critic_local_filename)             
            torch.save(agent.actor_target.state_dict(), actor_target_filename) 
            torch.save(agent.critic_target.state_dict(), critic_target_filename)
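
# Hypothetical versions of the encode()/decode() helpers used by Maddpg.step() and
# Maddpg.maddpg_learn() above; the real implementations are not shown in this listing,
# so the shapes below are assumptions. encode() flattens the per-agent arrays into one
# row before storage in the shared buffer, and decode() slices one agent's chunk back
# out of a sampled batch.
import numpy as np

def encode(per_agent_arrays):
    # e.g. two (24,) observations -> one (48,) row
    return np.concatenate([np.asarray(a).ravel() for a in per_agent_arrays])

def decode(size, num_agents, agent_idx, batch):
    # batch: (batch_size, num_agents * size) -> (batch_size, size) for agent_idx
    start = agent_idx * size
    return batch[:, start:start + size]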
Example #5
class DDPG():
    """Reinforcement Learning agent , learning using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.08
        self.exploration_sigma = 0.15
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor 0.99
        self.tau = 0.001  # for soft update of target parameters 0.01

        # Score tracker and learning parameters
        self.total_reward = None
        self.count = 0
        self.score = 0
        self.best_score = -np.inf
        self.last_state = None

    def reset_episode(self):

        self.total_reward = None
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        # Accumulate the episode reward (total_reward is None at the start of an episode)
        if self.total_reward is not None:
            self.total_reward += reward
        else:
            self.total_reward = reward

        self.count += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(states)[0]
        # add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of reward tuples."""

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted actions of next-state  and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # track best score
        self.score = self.total_reward / float(
            self.count) if self.count else -np.inf
        if self.best_score < self.score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
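
# A minimal Ornstein-Uhlenbeck noise process matching the interface the DDPG class
# above expects (OUNoise(size, mu, theta, sigma) with reset() and sample()). This is
# a sketch of the usual helper, not the original implementation.
import copy
import numpy as np

class OUNoise:
    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process from its long-running mean at the start of an episode.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): the noise drifts back toward mu.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state
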
def train_agent(args, param):
    """

    Args:
    """
    use_gym = False
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.locexp) + "/" + str(args.env_name) + '-agent-' + str(
        args.policy)
    pathname += "_batch_size_" + str(args.batch_size)
    pathname += '_update_freq: ' + str(
        args.target_update_freq) + "num_q_target_" + str(
            args.num_q_target) + "_seed_" + str(args.seed)
    pathname += "_actor_300_200"
    text = "Star_training target_update_freq: {}  num_q_target: {}  use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file(pathname, text)
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)

    if use_gym:
        env = gym.make(args.env_name)
        env.seed(args.seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        args.max_episode_steps = env._max_episode_steps
    else:
        size = 84
        env = suite.make(
            args.env_name,
            has_renderer=False,
            use_camera_obs=True,
            ignore_done=True,
            has_offscreen_renderer=True,
            camera_height=size,
            camera_width=size,
            render_collision_mesh=False,
            render_visual_mesh=True,
            camera_name='agentview',
            use_object_obs=False,
            camera_depth=True,
            reward_shaping=True,
        )
    state_dim = 200
    print("State dim, ", state_dim)
    action_dim = env.dof
    print("action_dim ", action_dim)
    max_action = 1
    args.max_episode_steps = 200

    if args.policy == "TD3_ad":
        policy = TD31v1(state_dim, action_dim, max_action, args)
    elif args.policy == "DDPG":
        policy = DDPG(state_dim, action_dim, max_action, args)

    file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name)
    obs_shape = (3, 84, 84)
    action_shape = (action_dim, )
    print("obs", obs_shape)
    print("act", action_shape)
    replay_buffer = ReplayBuffer(obs_shape, action_shape,
                                 int(args.buffer_size), args.device)
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean,
                                  total_timesteps)
                writer.flush()
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(
                    total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {:.2f}  Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window),
                    time_format(time.time() - t0))

                print(text)
                write_into_file(pathname, text)
            # We evaluate the episode and we save the policy
            if total_timesteps > args.start_timesteps:
                policy.train(replay_buffer, writer, 200)
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(
                    evaluate_policy(policy, writer, total_timesteps, args,
                                    env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # When the training step is done, we reset the state of the environment
            if use_gym:
                obs = env.reset()
            else:
                state = env.reset()
                obs, state_buffer = stacked_frames(state, size, args, policy)

            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before 10000 timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            if use_gym:
                action = env.action_space.sample()
            else:
                action = np.random.randn(env.dof)
        else:  # After 10000 timesteps, we switch to the model
            if use_gym:
                action = policy.select_action(np.array(obs))
                # If the explore_noise parameter is not 0, we add noise to the action and we clip it
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise,
                        size=env.action_space.shape[0])).clip(
                            env.action_space.low, env.action_space.high)
            else:
                action = (policy.select_action(np.array(obs)) +
                          np.random.normal(
                              0, max_action * args.expl_noise,
                              size=action_dim)).clip(-max_action, max_action)

        if total_timesteps % args.target_update_freq == 0:
            if args.policy == "TD3_ad":
                policy.hardupdate()
        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        done = float(done)
        if not use_gym:
            new_obs, state_buffer = create_next_obs(new_obs, size, args,
                                                    state_buffer, policy)
        # We check if the episode is done
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(
            done)
        if not use_gym:
            if episode_timesteps + 1 == args.max_episode_steps:
                done = True
        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer next_obs ", obs.shape)
            print("add to bufferobs ", new_obs.shape)
        replay_buffer.add(obs, action, reward, new_obs, done, done_bool)
        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 0)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # We add the last policy evaluation to our list of evaluations and we save our model
    evaluations.append(
        evaluate_policy(policy, writer, total_timesteps, args, env))
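
# Hypothetical stand-ins for the small logging helpers train_agent() relies on;
# write_into_file and time_format are not defined in this listing, so the signatures
# below are inferred from the call sites and are assumptions only.
def write_into_file(pathname, text):
    # Append one log line per call to a plain-text file named after the run.
    with open(pathname + ".txt", "a") as f:
        f.write(text + "\n")

def time_format(seconds):
    # Render elapsed seconds as H:MM:SS for the progress printout.
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    return "{:d}:{:02d}:{:02d}".format(h, m, s)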
Example #7
class MADDPG():
    def __init__(self, state_size, action_size, n_agents, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.seed = random.seed(seed)

        # Actor-Critic agents
        self.ActorCriticAgents = [
            Agent(state_size, action_size, n_agents, seed)
            for _ in range(n_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE,
                                   seed)

    def OUNoise_reset(self):
        for agent in self.ActorCriticAgents:
            agent.exploration_noise.reset()

    def act(self, state):
        actions = []
        for i, agent in enumerate(self.ActorCriticAgents):
            agent_action = agent.act(state[i])
            actions.append(agent_action[0])
        return np.stack(actions, axis=0)

    def step(self, ep, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > BATCH_SIZE:
            for i in range(self.n_agents):
                self.learn(i)

    def learn(self, agent_index):
        states, actions, rewards, next_states, dones = self.memory.sample()

        target_next_actions = torch.from_numpy(
            np.zeros(shape=actions.shape)).float().to(device)
        for idx, agent in enumerate(self.ActorCriticAgents):
            current_states = states[:, idx]
            target_next_actions[:, idx, :] = agent.actor_target(current_states)

        target_next_actions = torch.reshape(target_next_actions,
                                            shape=(BATCH_SIZE, -1))

        current_agent_states = states[:, agent_index, :]
        current_agent_actions = actions[:, agent_index, :]
        current_agent_rewards = torch.reshape(rewards[:, agent_index],
                                              shape=(BATCH_SIZE, 1))
        current_agent_dones = torch.reshape(dones[:, agent_index],
                                            shape=(BATCH_SIZE, 1))

        action_preds = actions.clone()
        action_preds[:, agent_index, :] = self.ActorCriticAgents[
            agent_index].actor_local(current_agent_states)
        action_preds = torch.reshape(action_preds, shape=(BATCH_SIZE, -1))

        self.ActorCriticAgents[agent_index].update(
            states, current_agent_states, actions, current_agent_actions,
            target_next_actions, rewards, current_agent_rewards, next_states,
            dones, current_agent_dones, action_preds)

    def save_checkpoint(self):
        for i in range(self.n_agents):
            torch.save(self.ActorCriticAgents[i].actor_local.state_dict(),
                       f'actor_checkpoint{i}.pth')
            torch.save(self.ActorCriticAgents[i].critic_local.state_dict(),
                       f'critic_checkpoint{i}.pth')
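
# Shape assumptions implied by MADDPG.learn() above (not stated in the original):
# the shared ReplayBuffer.sample() must return tensors that keep an explicit agent
# axis, so that states[:, idx], actions[:, agent_index, :] and rewards[:, agent_index]
# slice out a single agent's batch. A small illustrative check, with hypothetical
# sizes for a 2-agent Tennis-style task:
def check_sample_shapes(batch, n_agents=2, state_size=24, action_size=2):
    states, actions, rewards, next_states, dones = batch
    assert states.shape[1:] == (n_agents, state_size)
    assert next_states.shape[1:] == (n_agents, state_size)
    assert actions.shape[1:] == (n_agents, action_size)
    assert rewards.shape[1] == n_agents and dones.shape[1] == n_agents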
Example #8
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, num_agents, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        #self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, ALPHA, BETA, ANNEAL_OVER)

        # Tensorboard interface
        self.writer = SummaryWriter(comment="-ddpg-no-pri")
        self.tb_tracker = TBMeanTracker(self.writer, batch_size=10)
        self.step_t = 0

    def step(self, state, action, reward, next_state, done, timestamp):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        
        #for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
        self.memory.add(state, action, reward, next_state, done)

        # Learn at defined interval, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestamp % self.num_agents == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
                self.step_t += 1

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        #states, actions, rewards, next_states, dones, idxs, weights = experiences
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()

        critic_loss.backward()
        
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # update priorities
        # updates = torch.abs(Q_expected - Q_targets).cpu().data.squeeze(1).numpy()
        # self.memory.update_priorities(idxs, updates)

        self.tb_tracker.track("loss_critic", critic_loss.to("cpu"), self.step_t)

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.tb_tracker.track("loss_actor", actor_loss.to("cpu"), self.step_t)

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        self.epsilon -= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
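
# The Agent class above references module-level hyperparameters that are not shown in
# this listing. The values below are typical choices for this kind of DDPG agent and
# are given purely as an illustration; the original constants may differ.
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # critic L2 weight decay
EPSILON = 1.0            # initial scaling of the exploration noise
EPSILON_DECAY = 1e-6     # noise scaling decay per learning step
LEARN_NUM = 10           # learning passes per update step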
Example #9
class DQNAgent_Vanila_simple(agent):
    def __init__(self, model, opt, learning=True):
        super().__init__()
        self.memory = ReplayBuffer(3000)
        self.previous_state = None
        self.previous_action = None
        self.previous_legal_actions = None
        self.step = 0
        self.model = model
        self.opt = opt
        self.loss = 0
        self.batch_size = 10
        self.test_q = 0
        self.max_tile = 0
        #self.test_q = 0
        self.epsilon_schedule = LinearSchedule(1000000,
                                               initial_p=0.99,
                                               final_p=0.01)
        self.learning = learning

    def should_explore(self):
        self.epsilon = self.epsilon_schedule.value(self.step)
        return random.random() < self.epsilon

    def action(self):
        if self.learning:
            self.step += 1

        legalActions = self.legal_actions(deepcopy(self.gb.board))
        if len(legalActions) == 0:
            print("Warning: no legal actions available")
        board = deepcopy(self.gb.board)
        board = oneHotMap(board)

        if self.learning and self.should_explore():
            q_values = None
            action = random.choice(legalActions)
            choice = self.actions[action]
        else:
            #mark
            state = torch.from_numpy(board).type(
                torch.FloatTensor).cuda().view(-1, 17, 4, 4)
            action, q_values = self.predict(state, legalActions)
            choice = self.actions[action]
        if self.learning:
            reward = self.gb.currentReward
            if reward != 0:
                reward = np.log2(reward)
            if (self.previous_state is not None
                    and self.previous_action is not None):
                self.memory.add(self.previous_state, self.previous_action,
                                self.previous_legal_actions, reward,
                                legalActions, board, 0)

        self.previous_state = board
        self.previous_action = action
        self.previous_legal_actions = legalActions

        if self.learning:
            self.update()
        return choice

    def enableLearning(self):
        self.model.train()
        self.learning = True
        self.max_tile = 0
        self.reset()

    def disableLearning(self):
        self.model.eval()
        self.learning = False

    def end_episode(self):
        if not self.learning:
            m = np.max(self.gb.board)
            if m > self.max_tile:
                self.max_tile = m
            return
        #print(self.gb.board)

        board = deepcopy(self.gb.board)
        board = oneHotMap(board)

        #legalActions = self.legal_actions(deepcopy(self.gb.board))
        #print(legalActions)
        self.memory.add(self.previous_state, self.previous_action,
                        self.previous_legal_actions, self.gb.currentReward, [],
                        board, 1)
        self.reset()

    def reset(self):

        self.previous_state = None
        self.previous_action = None
        self.previous_legal_actions = None

    def update(self):
        if self.step < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)
        (states, actions, legal_actions, reward, next_legal_actions,
         next_states, is_terminal) = batch

        terminal = torch.tensor(is_terminal).type(torch.cuda.FloatTensor)
        reward = torch.tensor(reward).type(torch.cuda.FloatTensor)
        states = torch.from_numpy(states).type(torch.FloatTensor).cuda().view(
            -1, 17, 4, 4)
        next_states = torch.from_numpy(next_states).type(
            torch.FloatTensor).cuda().view(-1, 17, 4, 4)
        # Current Q Values

        _, q_values = self.predict_batch(states)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        #print(actions)
        #print(q_values)

        q_values = q_values[batch_index, actions]
        #print(q_values)
        # Calculate target
        q_actions_next, q_values_next = self.predict_batch(
            next_states, legalActions=next_legal_actions)
        #print(q_values_next)
        q_max = q_values_next.max(1)[0].detach()

        q_max = (1 - terminal) * q_max
        # if sum(terminal == 1) > 0:
        #     print(reward)
        #     print( (terminal == 1).nonzero())
        #     print(terminal)
        #     print(next_legal_actions)
        #     print(q_max)
        #     input()
        q_target = reward + 0.99 * q_max
        self.opt.zero_grad()
        loss = self.model.loss_function(q_target, q_values)

        loss.backward()

        self.opt.step()

        #train_loss = loss_vae.item() + loss_dqn.item()

        self.loss += loss.item() / len(states)

    def predict_batch(self, input, legalActions=None):
        #print(legalActions)

        q_values = self.model(input)
        if legalActions is None:
            values, q_actions = q_values.max(1)
        else:
            isNotlegal = True

            # print(legalActions)
            # print(q_values)
            q_values_true = torch.full((self.batch_size, 4), -1e8).cuda()
            for i, action in enumerate(legalActions):
                q_values_true[i, action] = q_values[i, action]
            values, q_actions = q_values_true.max(1)
            q_values = q_values_true
            #print(q_values_true)
            '''
            while isNotlegal:
                isNotlegal = False
                values, q_actions = q_values.max(1)
                #print(q_values)
                #print(values)
                #print(q_actions)


                for i, action in enumerate(q_actions):
                    #print(legalActions[i])
                    if len(legalActions[i]) == 0:
                        continue

                    if action.item() not in legalActions[i]:
                        isNotlegal = True
                        # print(i)
                        # print(action.item())
                        # print(q_values)
                        q_values[i, action] = -1
                #         print(q_values)
                # print("*********************")
            '''
        return q_actions, q_values

    def predict(self, input, legalActions):
        q_values = self.model(input)
        for action in range(4):
            if action not in legalActions:
                q_values[0, action] = -100000000

        action = torch.argmax(q_values)
        if int(action.item()) not in legalActions:
            print(legalActions, q_values, action)
            print("!!!!!!!!!!!!!!!!!!!!!!!!!")
        return action.item(), q_values

    def legal_actions(self, copy_gb):
        legalActions = []
        for i in range(4):
            try_gb = gameboard(4, deepcopy(copy_gb))
            changed = try_gb.takeAction(self.actions[i])
            if changed:
                legalActions.append(i)
        return legalActions
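
# Hypothetical oneHotMap() helper assumed by the 2048 agent above: it one-hot encodes
# each cell of the 4x4 board into 17 channels (empty plus tile exponents 1..16),
# matching the (-1, 17, 4, 4) reshape used before the network. This is a sketch, not
# the original implementation.
import numpy as np

def oneHotMap(board):
    board = np.asarray(board)
    encoded = np.zeros((17, 4, 4), dtype=np.float32)
    for r in range(4):
        for c in range(4):
            v = int(board[r, c])
            channel = 0 if v == 0 else int(np.log2(v))
            encoded[channel, r, c] = 1.0
    return encoded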

Example #10
class MADDPG():
    """Interacts with and learns from the environment."""
    def __init__(self, config):
        """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
        self.state_size = config.state_size
        self.action_size = config.action_size
        self.seed = random.seed(config.random_seed)
        self.config = config
        self.t_step = 0
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 config.random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  config.random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   config.random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    config.random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config.lr_critic,
                                           weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(self.action_size, config.random_seed)

        # ----------------------- initialize target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

        # Replay memory: shared across all agents, or created per agent via the config
        if config.shared_replay_buffer:
            self.memory = config.memory
        else:
            self.memory = config.memory_fn()

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.config.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.config.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.config.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
    Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
    where:
        actor_target(state) -> action
        critic_target(state, action) -> Q-value

    Params
    ======
        experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
        gamma (float): discount factor
    """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target

    Params
    ======
        local_model: PyTorch model (weights will be copied from)
        target_model: PyTorch model (weights will be copied to)
        tau (float): interpolation parameter 
    """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
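
# A sketch of the config object the MADDPG class above expects. The field names are
# taken from the attribute accesses in __init__/step/learn; the values here are
# illustrative assumptions only.
class Config:
    def __init__(self):
        self.state_size = 24
        self.action_size = 2
        self.random_seed = 0
        self.lr_actor = 1e-4
        self.lr_critic = 1e-3
        self.weight_decay = 0.0
        self.buffer_size = int(1e6)
        self.batch_size = 128
        self.update_every = 2
        self.gamma = 0.99
        self.tau = 1e-3
        self.shared_replay_buffer = True
        self.memory = None      # shared ReplayBuffer instance when shared_replay_buffer is True
        self.memory_fn = None   # factory returning a per-agent ReplayBuffer otherwise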
Example #11
def train(args, repeat_opt):
    """

    Args:
        param1(TD3): policy
        param2(Buffer):
        param3(openai env):
    """
    use_gym = False
    # use the repeat index as the seed for repeated seed experiments
    args.seed = repeat_opt
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    #args.repeat_opt = repeat_opt
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = 'env-' + str(args.env_name) + '_update_freq: ' + str(
        args.target_update_freq) + "num_q_target_" + str(
            args.num_q_target) + "_seed_" + str(args.seed)
    text = "Star_training target_update_freq: {}  num_q_target: {}  use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file('search-' + pathname, text)
    arg_text = str(args)
    write_into_file('search-' + pathname, arg_text)
    # tensorboard_name = 'runs' + str(dt_string) + '/' + pathname + "-Dueling"
    tensorboard_name = 'runs/' + pathname
    writer = SummaryWriter(tensorboard_name)
    env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86_64',
                           no_graphics=True)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    # size of each action
    action_dim = brain.vector_action_space_size
    states = env_info.vector_observations
    state_dim = states.shape[1]
    max_action = 1
    policy = TD31v1(state_dim, action_dim, max_action, args)
    replay_buffer = ReplayBuffer()
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    file_name = "%s_%s_%s" % ("TD3", args.env_name, str(args.seed))
    print("---------------------------------------")
    print("Settings: %s" % (file_name))
    print("---------------------------------------")
    # We start the main loop over 500,000 timesteps
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            #env.seed(random.randint(0, 100))
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean,
                                  total_timesteps)
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(
                    total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {}  Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window),
                    time_format(time.time() - t0))

                print(text)
                write_into_file('search-' + pathname, text)
                policy.train(replay_buffer, writer, episode_timesteps)
            # We evaluate the episode and we save the policy
            if timesteps_since_eval >= args.eval_freq:
                policy.save("%s" % (file_name), directory="./pytorch_models")
                timesteps_since_eval %= args.eval_freq
                #evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, episode_num))
                save_model = file_name + '-{}'.format(episode_num)
                policy.save(save_model, directory="./pytorch_models")
                np.save("./results/%s" % (file_name), evaluations)
            # When the training step is done, we reset the state of the environment
            env_info = env.reset(train_mode=True)[brain_name]
            obs = env_info.vector_observations[0]

            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before 10000 timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            action = np.random.randn(brain.vector_action_space_size)
        else:
            action = policy.select_action(np.array(obs))
            # If the explore_noise parameter is not 0, we add noise to the action and we clip it
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=action_dim)).clip(-1, 1)
            else:
                action = (policy.select_action(np.array(obs)) +
                          np.random.normal(
                              0, max_action * args.expl_noise,
                              size=action_dim)).clip(-max_action, max_action)

        if total_timesteps % args.target_update_freq == 0:
            policy.hardupdate()
        # The agent performs the action in the environment, then reaches the next state and receives the reward
        env_info = env.step(action)[
            brain_name]  # send the action to the environment
        new_obs = env_info.vector_observations[
            0]  # get next state (for each agent)
        reward = env_info.rewards[0]  # get reward (for each agent)
        done = env_info.local_done[0]
        # We check if the episode is done
        #done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(
            done)
        # We increase the total reward
        reward = reward * args.reward_scalling

        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        replay_buffer.add((obs, new_obs, action, reward, done_bool))
        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # At the end of training, we save the final model and the collected evaluations
    if args.save_model:
        policy.save("%s" % (file_name), directory="./pytorch_models")
    np.save("./results/%s" % (file_name), evaluations)
Example #12
def ddqn_train(model_name,
               load_model=False,
               model_filename=None,
               optimizer_filename=None):
    print("DDQN -- Training")

    env = make('hungry_geese')
    trainer = env.train(
        ['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = DDQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)

    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)

    start_episode = 0
    end_episode = 50000
    epochs = 32
    batch_size = 128

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(
                env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)

            obs_dict = next_obs_dict
            prev_direction = direction

            ep_reward += reward

        if len(buffer) >= batch_size:
            for _ in range(epochs):
                states, actions, rewards, next_states, dones = buffer.get_samples(
                    batch_size)
                agent.fit(states, actions, rewards, next_states, dones)

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if episode % 10 == 0:
            agent.update_target_network()

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' +
                  str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state, epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(
                        env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ddqn_' + model_name + '_' +
                                     str(episode) + '.h5')
            agent.save_optimizer_weights('models/ddqn_' + model_name + '_' +
                                         str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ddqn_' + model_name + '_' +
                             str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ddqn_' + model_name + '_' +
                                 str(end_episode) + '_optimizer.npy')

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             training_rewards)
    plt.title('Reward')
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
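ddqn_train above depends on an EpsilonGreedyStrategy(start, end, decay) that is defined elsewhere; a common exponential-decay implementation, given here purely as an assumption about its shape, is:
# Assumed implementation of EpsilonGreedyStrategy; the class used above is not shown in this example.
import math


class EpsilonGreedyStrategy:
    def __init__(self, start, end, decay):
        self.start = start
        self.end = end
        self.decay = decay

    def get_epsilon(self, current_step):
        # Exponential decay from `start` towards `end` as current_step grows
        return self.end + (self.start - self.end) * math.exp(-self.decay * current_step)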
Example #13
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, network):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.network = network

        # Q-Network
        if self.network == "duel":
            self.qnetwork_local = DuelingDQN(state_size, action_size,
                                             seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size,
                                              seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=LR)

        else:
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size,
                                       seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, count):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, count)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, count):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Select the best next actions (Double DQN selects them with the online/local network)

        if self.network in ("double", "duel"):
            Q_L = self.qnetwork_local(next_states).detach()
            _, actions_prime = Q_L.max(1)
        else:
            # Plain DQN: select the greedy next actions with the target network
            _, actions_prime = self.qnetwork_target(next_states).detach().max(1)

        # Evaluate the chosen next actions with the frozen target network

        Q_targets_next = self.qnetwork_target(next_states).detach()
        Q_targets_next_s_a_prime = Q_targets_next.gather(
            1, actions_prime.unsqueeze(1))

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next_s_a_prime * (1 - dones))

        # Get expected Q values from the local model for the actions actually taken
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.smooth_l1_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        #if count >= TARGET_UPDATE:
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
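The agent above switches between DQN and DuelingDQN networks that are defined elsewhere; the sketch below shows one plausible dueling architecture (layer sizes are assumptions) to make the network == "duel" branch concrete.
# Hedged sketch of a dueling Q-network; the DuelingDQN referenced above may differ in depth and sizes.
import torch
import torch.nn as nn


class DuelingDQNSketch(nn.Module):
    def __init__(self, state_size, action_size, seed, hidden=64):
        super().__init__()
        torch.manual_seed(seed)
        self.feature = nn.Sequential(nn.Linear(state_size, hidden), nn.ReLU())
        self.value = nn.Linear(hidden, 1)                 # state-value stream V(s)
        self.advantage = nn.Linear(hidden, action_size)   # advantage stream A(s, a)

    def forward(self, state):
        x = self.feature(state)
        value = self.value(x)
        advantage = self.advantage(x)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
        return value + advantage - advantage.mean(dim=1, keepdim=True)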
Example #14
def train(env, env_eval, model, total_steps, view, criterion, optimizer,
          savedir, param_save_dir):
    try:
        os.mkdir("./params")
        print("Directory params Created")
    except FileExistsError:
        print("Directory params already exists")
    model_dir = "./params/{}".format(param_save_dir)
    try:
        os.mkdir(model_dir)
        print("Directory ", model_dir, " Created")
    except FileExistsError:
        print("Directory ", model_dir, " already exists")

    target_model = VanillaDQNCUDA(n_actions=env.action_space.n).to("cuda")
    memory = ReplayBuffer(MEM_SIZE)
    done = True
    episode = 0
    log_steps = 0
    rewards_history = []
    for step in range(1, total_steps + 1):
        try:
            if step % SAVE_FREQ == 0:
                save_model(model, step, savedir)
            if done:
                if episode > 0:
                    for i, experience in enumerate(trajectory):
                        obs, action, reward, next_obs, done = experience
                        memory.add(obs, action, reward, next_obs, done)
                    if log_steps >= LOG_EVERY:
                        log_steps = 0
                        episode_steps = step - episode_start_step
                        print("Episode: {} | Steps: {}/{} | Return: {}".format(
                            episode, episode_steps, step, episode_return))
                trajectory = []
                episode_start_step = step
                obs = np.array(env.reset())
                obs = obs.transpose((2, 0, 1))
                episode += 1
                episode_return = 0.0
                epsilon = update_epsilon(step)
            else:
                obs = next_obs
            action = agent_act(env, model, obs, epsilon)
            next_obs, reward, done, _ = env.step(action)
            next_obs = np.array(next_obs)
            next_obs = next_obs.transpose((2, 0, 1))
            episode_return += reward
            trajectory.append((obs, action, reward, next_obs, done))

            if step >= EXPLORE_STEPS and step % UPDATE_EVERY == 0:
                if step % TARGET_UPDATE_EVERY == 0:
                    target_model.load_state_dict(model.state_dict())
                batch = memory.sample(BATCH_SIZE)
                optimize(model,
                         target_model,
                         batch,
                         num_actions=env.action_space.n,
                         criterion=criterion,
                         optimizer=optimizer)
            if step >= EXPLORE_STEPS and step % EVAL_EVERY == 0:
                episode_return_avg = evaluate(env_eval, model, view=view)
                print("Episode: {} | Steps: {} | Evaluation Return Avg: {}".
                      format(
                          episode,
                          step,
                          episode_return_avg,
                      ))
                rewards_history.append(episode_return_avg)
            log_steps += 1
        except KeyboardInterrupt:
            del trajectory[:]
            del rewards_history[:]
            break
    pickle.dump([rewards_history],
                open(model_dir + '/' + "model_test_rewards.p", "wb+"))
    env.close()
    env_eval.close()
    torch.cuda.empty_cache()
    return rewards_history
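The loop above calls helpers such as update_epsilon and agent_act that live outside this snippet; the versions below are assumptions showing one way they could be written (linear epsilon annealing and epsilon-greedy selection on the online network).
# Assumed helpers for the training loop above; the real update_epsilon/agent_act may differ.
import random

import torch

EPS_START, EPS_END, EPS_DECAY_STEPS = 1.0, 0.05, 100000


def update_epsilon(step):
    # Linear annealing from EPS_START to EPS_END over EPS_DECAY_STEPS steps
    fraction = min(step / EPS_DECAY_STEPS, 1.0)
    return EPS_START + fraction * (EPS_END - EPS_START)


def agent_act(env, model, obs, epsilon):
    # Epsilon-greedy action selection using the online network
    if random.random() < epsilon:
        return env.action_space.sample()
    with torch.no_grad():
        obs_t = torch.as_tensor(obs, dtype=torch.float32, device="cuda").unsqueeze(0)
        return int(model(obs_t).argmax(dim=1).item())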
Example #15
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Agent-Network
        ## TODO: Initialize your action network here
        "*** YOUR CODE HERE ***"
        self.network = AgentNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=LR)
        self.network.train()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.0, get_prob=False):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        if get_prob:
            return action_values.cpu().data.numpy()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def discount_rewards(self, rewards, gamma=0.99):
        r = np.array([gamma**i * rewards[i] for i in range(len(rewards))])
        # Reverse the array direction for cumsum and then
        # revert back to the original order
        r = r[::-1].cumsum()[::-1]
        return r - r.mean()

    def learn(self, experiences, gamma=GAMMA):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        rewards = self.discount_rewards(rewards)

        ## TODO: compute and minimize the loss using REINFORCE
        "*** YOUR CODE HERE ***"
        self.optimizer.zero_grad()
        state_tensor = torch.FloatTensor(states)
        reward_tensor = torch.FloatTensor(rewards)
        action_tensor = torch.LongTensor(actions)

        # Calculate loss
        logprob = torch.log(self.network(state_tensor))
        selected_logprobs = reward_tensor * logprob[
            np.arange(len(action_tensor)), action_tensor]
        loss = -selected_logprobs.mean()

        # Calculate gradients
        loss.backward()
        # Apply gradients
        self.optimizer.step()
Example #16
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_episodes, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            num_episodes (int): number of training epochs
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.anneal_beta = (1. - BETA) / num_episodes

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA, BETA)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.t_learning_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def update_weights(self):
        self.memory.anneal_beta(self.anneal_beta)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, idxs, weights = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # update priorities
        updates = torch.abs(Q_expected - Q_targets).cpu().data.squeeze(1).numpy()
        self.memory.update_priorities(idxs, updates)

        # Compute the element-wise loss so the importance-sampling weights can be applied per sample
        loss = F.l1_loss(Q_expected, Q_targets, reduction='none')

        # Minimize the importance-weighted loss
        self.optimizer.zero_grad()
        (loss * weights).mean().backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.t_learning_step += 1
        if self.t_learning_step % UPDATE_TARGET_STEPS == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            # PyTorch copy: destination.data.copy(source.data)
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
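The agent above expects a prioritized ReplayBuffer whose sample() also returns indices and importance-sampling weights, together with update_priorities() and anneal_beta(); the compact proportional-priority sketch below is only an assumption about that interface, not the buffer actually used.
# Hedged sketch of a proportional prioritized replay buffer matching the calls made above.
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class PrioritizedReplayBufferSketch:
    def __init__(self, action_size, buffer_size, batch_size, seed, alpha, beta):
        self.memory = deque(maxlen=buffer_size)
        self.priorities = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.alpha, self.beta = alpha, beta
        np.random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))
        # New transitions get the current maximum priority so they are replayed at least once
        self.priorities.append(max(self.priorities, default=1.0))

    def anneal_beta(self, increment):
        self.beta = min(1.0, self.beta + increment)

    def update_priorities(self, idxs, td_errors, eps=1e-5):
        for idx, err in zip(idxs, td_errors):
            self.priorities[idx] = abs(err) + eps

    def sample(self):
        probs = np.array(self.priorities) ** self.alpha
        probs /= probs.sum()
        idxs = np.random.choice(len(self.memory), self.batch_size, p=probs)
        # Importance-sampling weights, normalized by their maximum for stability
        weights = (len(self.memory) * probs[idxs]) ** (-self.beta)
        weights /= weights.max()
        batch = [self.memory[i] for i in idxs]

        def to_t(xs, dt):
            return torch.tensor(np.vstack(xs), dtype=dt)

        states = to_t([e.state for e in batch], torch.float32)
        actions = to_t([e.action for e in batch], torch.long)
        rewards = to_t([e.reward for e in batch], torch.float32)
        next_states = to_t([e.next_state for e in batch], torch.float32)
        dones = to_t([e.done for e in batch], torch.float32)
        w = torch.tensor(weights.reshape(-1, 1), dtype=torch.float32)
        return states, actions, rewards, next_states, dones, idxs, w

    def __len__(self):
        return len(self.memory)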
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, apply_dueling=False, apply_double=False):
        """
        Initialize a Unity agent object.
        :param state_size: (int) dimension of each state
        :param action_size: (int) dimension of each action
        :param seed: (int) random seed
        """
        assert self._true_xor(apply_dueling, apply_double), \
            "Choose one between dueling networks or DDQN"

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.apply_dueling = apply_dueling
        self.apply_double = apply_double

        # Q-Network
        self.q_net_target = QNetwork(state_size, action_size, seed, apply_dueling=apply_dueling).to(device)
        self.q_net_local = QNetwork(state_size, action_size, seed, apply_dueling=apply_dueling).to(device)
        self.opt = optim.Adam(self.q_net_local.parameters(), lr=LR)

        # Replay memory
        self.memory_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    @staticmethod
    def _true_xor(*args):
        return sum(args) == 1

    def step(self, state, action, reward, next_state, done):
        """
        Save experience in replay memory buffer for future experience replay
        :param state: The current state of the agent
        :param action: The action that the agent has taken in given state
        :param reward: The reward associated with the state action combination
        :param next_state: The resulting state after taking action in previous state
        :param done: (bool) Has the terminal state been reached?
        :return: None
        """
        self.memory_buffer.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_CYCLE
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn from it
            if BATCH_SIZE < len(self.memory_buffer):
                experiences = self.memory_buffer.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """
        Returns actions for given state as per current policy.
        :param state: (array_like) current state
        :param eps: (float) epsilon, for epsilon-greedy action selection
        :return: (int) The index of the action to be taken by the agent
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.q_net_local.eval()
        with torch.no_grad():  # Do not perform a forward pass in this context
            action_values = self.q_net_local(state)
        self.q_net_local.train()

        # Epsilon-greedy action selection
        greed_p = random.random()

        return np.argmax(action_values.cpu().data.numpy()) if greed_p > eps else \
            random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """
        Update value parameters using given batch of experience tuples.
        :param experiences: (Tuple[torch.Tensor]) tuple of (s, a, r, s', done) tuples
        :param gamma: (float) discount factor
        :return:
        """
        states, actions, rewards, next_states, done_signals = experiences

        if not self.apply_double:
            # Get the max predicted Q values for the next states from the target model
            Q_targets_next = self.q_net_target(next_states).detach().max(1)[0].unsqueeze(1)
        else:
            # Double DQN: select the best next actions with the local (online) network ...
            indices = torch.argmax(self.q_net_local(next_states).detach(), 1)  # indices of the selected next actions
            # ... then evaluate those actions with the target network's parameters
            Q_targets_next = self.q_net_target(next_states).detach().gather(1, indices.unsqueeze(1))

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - done_signals))

        # Get expected Q values from local model (being trained)
        # x.gather(1, actions) returns a tensor which results from the concatenation of the input tensor values along
        # the given dimensions (here the dim indexes are the taken actions indices)
        Q_expected = self.q_net_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

        # perform network update
        self.soft_update(self.q_net_local, self.q_net_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """
        Soft update model parameters, given by the function:
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: (PyTorch model) weights will be copied from
        :param target_model: (PyTorch model) weights will be copied to
        :param tau: (float) interpolation parameter
        :return:
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class DQN_agent(object):
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):

        self.env = env
        self.max_episode_steps = env._max_episode_steps
        """
            beta: The discounted factor of Q-value function
            (epsilon): The explore or exploit policy epsilon.
            initial_epsilon: When the 'steps' is 0, the epsilon is initial_epsilon, 1
            final_epsilon: After the number of 'steps' reach 'epsilon_decay_steps',
                The epsilon set to the 'final_epsilon' determinately.
            epsilon_decay_steps: The epsilon will decrease linearly along with the steps from 0 to 'epsilon_decay_steps'.
        """
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        """
            episode: Record training episode
            steps: Add 1 when predicting an action
            learning: The trigger of agent learning. It is on while training agent. It is off while testing agent.
            action_space: The action space of the current environment, e.g 2.
        """
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space
        """
            input_len The input length of the neural network. It equals to the length of the state vector.
            output_len: The output length of the neural network. It is equal to the action space.
            eval_model: The model for predicting action for the agent.
            target_model: The model for calculating Q-value of next_state to update 'eval_model'.
            use_target_model: Trigger for turn 'target_model' on/off
        """
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)
        # memory: Stores and samples experience replay transitions.
        self.memory = ReplayBuffer(hyper_params['memory_size'])
        """
            batch_size: Mini batch size for training model.
            update_steps: The frequence of traning model
            model_replace_freq: The frequence of replacing 'target_model' by 'eval_model'
        """
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

        print("agent initialized")

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        p = uniform(0, 1)
        # Get decreased epsilon
        epsilon = self.linear_decrease(self.initial_epsilon,
                                       self.final_epsilon, self.steps,
                                       self.epsilon_decay_steps)
        #if(np.random.randint(1000)==4):
        #print("epsilon",epsilon)
        if p < epsilon:
            #return action
            return randint(0, self.action_space - 1)
        else:
            #return action
            return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    # This next function will be called in the main RL loop to update the neural network model given a batch of experience
    # 1) Sample a 'batch_size' batch of experiences from the memory.
    # 2) Predict the Q-value from the 'eval_model' based on (states, actions)
    # 3) Predict the Q-value from the 'target_model' base on (next_states), and take the max of each Q-value vector, Q_max
    # 4) If is_terminal == 1, q_target = reward + discounted factor * Q_max, otherwise, q_target = reward
    # 5) Call fit() to do the back-propagation for 'eval_model'.
    def update_batch(self):
        if len(self.memory
               ) < self.batch_size or self.steps % self.update_steps != 0:
            return

        #print("fetching minibatch from replay memory")
        batch = self.memory.sample(self.batch_size)

        (states, actions, reward, next_states, is_terminal) = batch

        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)

        #q_values = q_values[np.arange(self.batch_size), actions]
        q_values = q_values[batch_index, actions]

        # Calculate target
        if self.use_target_model:
            #print("target_model.predict")
            best_actions, q_next = self.target_model.predict_batch(next_states)
        else:
            best_actions, q_next = self.eval_model.predict_batch(next_states)

        q_max = q_next[batch_index, best_actions]

        terminal = 1 - terminal
        q_max *= terminal
        q_target = reward + self.beta * q_max

        # update model
        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        test_number = training_episodes // test_interval
        all_results = []

        for i in range(test_number):
            # learn
            self.learn(test_interval)

            # evaluate
            avg_reward = self.evaluate()
            all_results.append(avg_reward)

        return all_results

    def learn(self, test_interval):
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                #INSERT YOUR CODE HERE
                # add experience from explore-exploit policy to memory
                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, info = self.env.step(action)
                self.memory.add(state, action, reward, next_state, done)

                # update the model every 'update_steps' of experience
                self.update_batch()

                # update the target network (if the target network is being used) every 'model_replace_freq' of experiences
                if self.use_target_model and (self.steps %
                                              self.model_replace_freq == 0):
                    self.target_model.replace(self.eval_model)

                self.steps += 1
                steps += 1
                state = next_state

    def evaluate(self, trials=30):
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward

        avg_reward = total_reward / trials
        print(avg_reward)
        with open(result_file, "a+") as f:
            f.write(str(avg_reward) + "\n")
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
        return avg_reward

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
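DQN_agent above is configured entirely through a hyper_params dictionary; the sketch below is an assumed usage example (classic Gym API, illustrative hyper-parameter values) that exercises only the keys read in __init__, and it presumes DQNModel, ReplayBuffer and ACTION_DICT come from the original module.
# Assumed usage sketch for the DQN_agent class above (classic Gym API); hyper-parameter values are
# illustrative, and DQNModel / ReplayBuffer / ACTION_DICT are expected to come from the original module.
import gym

hyper_params = {
    'beta': 0.99,                  # discount factor
    'learning_rate': 3e-4,
    'final_epsilon': 0.05,
    'epsilon_decay_steps': 100000,
    'use_target_model': True,
    'memory_size': 100000,
    'batch_size': 64,
    'update_steps': 4,             # train every 4 predicted actions
    'model_replace_freq': 2000,    # sync target_model every 2000 steps
}

env = gym.make('CartPole-v1')      # any env exposing _max_episode_steps via gym's TimeLimit wrapper
agent = DQN_agent(env, hyper_params, action_space=env.action_space.n)
agent.learn(test_interval=200)     # run 200 training episodes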
    def train(self):
        # initialize memory buffer
        buffer = ReplayBuffer(int(500000), self.batch_size, self.num_agents, 0)

        # iterate over training episodes (keep_awake can wrap this range to keep a workspace from disconnecting)
        for episode in range(self.number_of_episodes):
            env_info = self.env.reset(train_mode=True)[self.brain_name]

            agent_episode_rewards = [0, 0]

            for agent in self.maddpg.ddpg_agents:
                agent.noise.reset()

            for episode_t in range(self.max_episode_len):
                states = env_info.vector_observations
                states_t = to_tensor(states)

                with torch.no_grad():
                    action_ts = self.maddpg.act(states_t, noise=self.noise)
                    self.noise *= self.noise_reduction

                actions = torch.stack(action_ts).numpy()
                env_info = self.env.step(actions)[self.brain_name]

                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done

                for i in range(self.num_agents):
                    agent_episode_rewards[i] += rewards[i]

                full_state = np.concatenate(states)
                full_next_state = np.concatenate(next_states)

                buffer.add((states, full_state, actions, rewards, next_states, full_next_state, dones))

                # update once after every episode_per_update
                critic_losses = []
                actor_losses = []
                if len(buffer) > self.batch_size and episode % self.episode_per_update == 0:
                    for i in range(self.num_agents):
                        samples = buffer.sample()
                        cl, al = self.maddpg.update(samples, i)
                        critic_losses.append(cl)
                        actor_losses.append(al)
                    self.maddpg.update_targets()  # soft update the target network towards the actual networks

                if np.any(dones):
                    # if any of the agents are done break
                    break

            episode_reward = max(agent_episode_rewards)
            self.episode_rewards.append(episode_reward)
            self.last_100_episode_rewards.append(episode_reward)
            self.avg_rewards.append(np.mean(self.last_100_episode_rewards))
            # scores.append(episode_reward)
            print('\rEpisode {}\tAverage Score: {:.4f}\tScore: {:.4f}'.format(episode, self.avg_rewards[-1],
                                                                              episode_reward),
                  end="")

            if episode % self.print_period == 0:
                print('\rEpisode {}\tAverage Score: {:.4f}'.format(episode, self.avg_rewards[-1]))

            # saving successful model
            # training ends when the threshold value is reached.
            if self.avg_rewards[-1] >= self.threshold:
                save_dict_list = []

                for i in range(self.num_agents):
                    save_dict = {'actor_params': self.maddpg.ddpg_agents[i].actor.state_dict(),
                                 'actor_optim_params': self.maddpg.ddpg_agents[i].actor_optimizer.state_dict(),
                                 'critic_params': self.maddpg.ddpg_agents[i].critic.state_dict(),
                                 'critic_optim_params': self.maddpg.ddpg_agents[i].critic_optimizer.state_dict()}
                    save_dict_list.append(save_dict)

                # Save all agents' parameters once, after the list is complete
                torch.save(save_dict_list, self.ckpt)

                raw_score_plotter(self.episode_rewards)
                plotter('Tennis', len(self.episode_rewards), self.avg_rewards, self.threshold)
                break
Example #20
class DDPG_Agent:
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 actor_hidden=[400, 300],
                 critic_hidden=[400, 300],
                 id=0):
        super(DDPG_Agent, self).__init__()

        self.actor_local = Actor(state_size,
                                 action_size,
                                 random_seed,
                                 hidden_layer_param=actor_hidden).to(DEVICE)
        self.actor_target = Actor(state_size,
                                  action_size,
                                  random_seed,
                                  hidden_layer_param=actor_hidden).to(DEVICE)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   random_seed,
                                   hidden_layer_param=critic_hidden).to(DEVICE)
        self.critic_target = Critic(
            state_size,
            action_size,
            random_seed,
            hidden_layer_param=critic_hidden).to(DEVICE)

        self.actor_opt = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        self.critic_opt = optim.Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC)

        self.memory = ReplayBuffer(action_size, random_seed)

        self.seed = random.seed(random_seed)
        self.id = id
        print(critic_hidden)
        print("")
        print("--- Agent {} Params ---".format(self.id))
        print("Going to train on {}".format(DEVICE))
        print("Learning Rate:: Actor: {} | Critic: {}".format(
            LR_ACTOR, LR_CRITIC))
        print(
            "Replay Buffer:: Buffer Size: {} | Sampled Batch size: {}".format(
                BUFFER_SIZE, BATCH_SIZE))
        print("")
        print("Actor paramaters:: Input: {} | Hidden Layers: {} | Output: {}".
              format(state_size, actor_hidden, action_size))
        print("Critic paramaters:: Input: {} | Hidden Layers: {} | Output: {}".
              format(state_size,
                     [critic_hidden[0] + action_size, *critic_hidden[1:]], 1))
        print(self.actor_local)
        print(self.critic_local)
        print("")
        print("")

    # def act(self, state):
    #     state = torch.from_numpy(state).float().to(DEVICE)

    #     self.actor_local.eval()
    #     with torch.no_grad():
    #         actions = self.actor_local(state).cpu().data.numpy()
    #     self.actor_local.train()

    #     return actions

    def act(self, obs, noise=0.0):
        obs = obs.to(DEVICE)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(obs)  # + noise*self.noise.noise()
        self.actor_local.train()

        return action

    def step(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences)

    def learn(self, experiences):

        states, actions, rewards, next_states, dones = experiences

        # ---                   Teach Critic (with TD)              --- #
        recommended_actions = self.actor_target(next_states)
        Q_nexts = self.critic_target(next_states, recommended_actions)
        # TD target: the observed reward plus the bootstrapped value of the next state
        Q_targets = rewards + GAMMA * Q_nexts * (1 - dones)
        # The critic's current estimate for the taken state-action pairs
        Q_expected = self.critic_local(states, actions)
        critic_loss = CRITERION(Q_targets, Q_expected)

        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # ---                   Teach Actor                          --- #
        next_actions = self.actor_local(states)
        # Here we get the value of each state-actions.
        # This will be backpropagated to the weights that produced the action in the actor network.
        # Large values will make weights stronger, smaller values (less expected return for that state-action) weaker
        actor_loss = -self.critic_local(states, next_actions).mean()

        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # Mix model parameters in both Actor and Critic #
        self.soft_update(self.actor_local, self.actor_target)
        self.soft_update(self.critic_local, self.critic_target)

    def soft_update(self, local, target):
        """Soft update model parameters.
            θ_target = τ*θ_local + (1 - τ)*θ_target

            Params
            ======
                local_model: PyTorch model (weights will be copied from)
                target_model: PyTorch model (weights will be copied to)
                tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)
class Actor():
    def __init__(self, action_size, state_size, buffer_size, batch_size, actor_lr, critic_lr, device,
                 weight_decay, tau, shared_memory, noise, share_memory_flag, seed=0):
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.weight_decay = weight_decay
        self.device = device
        self.seed = seed
        self.actor_loss = []
        #self.critic_loss = []
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.tau = tau
        self.noise = OUNoise(self.action_size, self.seed)
        #self.noise = noise
        self.share_memory_flag = share_memory_flag
        if self.share_memory_flag:
            self.memory = shared_memory
        else:
            self.memory = ReplayBuffer(action_size, buffer_size, batch_size, self.device)

        ## Actor
        self.actor_local = ActorNN(self.state_size,self.action_size).to(self.device)
        self.actor_target = ActorNN(self.state_size,self.action_size).to(self.device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr = self.actor_lr)
        ## Critic
        #self.critic_local = Critic(self.state_size,self.action_size).to(self.device)
        #self.critic_target = Critic(self.state_size,self.action_size).to(self.device)
        #self.critic_optimizer = Adam(self.critic_local.parameters(), lr = self.critic_lr,  weight_decay=self.weight_decay)
        # initialize targets same as original networks
        self.hard_update(self.actor_target, self.actor_local)
        #self.hard_update(self.critic_target, self.critic_local)

    def reset(self):
        self.noise.reset()

    def act(self, state, noise=True, sd=1e-4):
        state = torch.from_numpy(state).float().to(self.device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if noise:
            # Alternative: Gaussian noise, e.g. np.random.normal(loc=0.0, scale=sd, size=action.size)
            action += self.noise.sample()
        action = np.clip(action, -1, 1).reshape(1, -1)
        return action

    def hard_update(self, target, source):
        """
        Copy network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def step(self, state, action, rewards, next_state, done, GAMMA=1.0):
        ## As per the description, we are not supposed to use a discount factor
        self.memory.add(state, action, rewards, next_state, done)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, number_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            number_agents (int): number of agents
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.number_agents = number_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise processes
        self.noise = OUNoise((number_agents, action_size), random_seed)
        #self.noise = GaussianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1)
        #self.noise = GeometricBrownianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experiences in replay memory, and use random sample from buffer to learn."""

        # We save experience tuples in the memory for each agent.
        for i in range(self.number_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn, if enough samples are available in memory (threshold value: BATCH_SIZE) and at learning interval settings
        if len(self.memory) > BATCH_SIZE:
            for _ in range(UPDATE_RATE):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

#     def act(self, states, add_noise=True):
#         """Returns actions for given state as per current policy."""
#                                                                   # The code has been adapted to implement batch normalization.
#         actions = np.zeros((self.number_agents, self.action_size))
#         self.actor_local.eval()
#         with torch.no_grad():
#             for agent_number, state in enumerate(states):
#                 state = torch.from_numpy(state).float().unsqueeze(0).to(device)   # The code has been adapted to implement batch normalization.
#                 action = self.actor_local(state).cpu().data.numpy()
#                 actions[agent_number, :] = action
#         self.actor_local.train()
#         if add_noise:
#             actions += self.noise.sample()
#         return np.clip(actions, -1, 1)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.number_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_number, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_number, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 mnoise=True,
                 split_state=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.mnoise = mnoise
        self.split_state = split_state

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # initialize targets same as original networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        # Noise process
        if self.mnoise:
            self.noise = OUNoise((2, action_size), random_seed)
        else:
            self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, states, actions, rewards, next_states, dones, step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        if self.split_state:
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                self.memory.add(state, action, reward, next_state, done)
        else:
            self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        """
        Copy network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
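For reference, a hedged sketch of how this DDPG-style Agent could be driven against a Gym continuous-control task; the environment name, episode budget, and per-episode step limit below are assumptions for illustration and not part of the original example.

import gym

# Minimal usage sketch (not from the original example): "Pendulum-v1", the
# 200-episode budget, and the 1000-step limit are assumptions.
env = gym.make("Pendulum-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
agent = Agent(state_size, action_size, random_seed=0,
              mnoise=False, split_state=False)

for episode in range(200):
    state = env.reset()
    agent.reset()                                  # reset the OU noise process
    for t in range(1000):
        action = agent.act(state)                  # noisy action from the local actor
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done, t)
        state = next_state
        if done:
            break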
Example #24
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 batch_size=128,
                 gamma=0.99,
                 mean_lambda=1e-3,
                 std_lambda=1e-3,
                 z_lambda=0.0):

        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.memory = ReplayBuffer(BUFFERSIZE, self.batch_size)

        self.mean_lambda = mean_lambda
        self.std_lambda = std_lambda
        self.z_lambda = z_lambda

        self.current_value = Value(state_size).to(device)
        self.target_value = Value(state_size).to(device)

        self.softQ = soft_Q(state_size, action_size)
        self.policy = Policy(state_size, action_size)

        self.value_optimizer = optim.Adam(self.current_value.parameters(),
                                          lr=3e-4)
        self.soft_q_optimizer = optim.Adam(self.softQ.parameters(), lr=3e-4)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=3e-4)

    def act(self, state):

        #state = torch.from_numpy(np.asarray(state)).float().to(device)
        action = self.policy.act(state)

        if self.memory.__len__() > self.batch_size:
            self.update()

        return action

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)

    def update(self):

        state, action, reward, next_state, done = self.memory.sample()

        expected_soft_q_value = self.softQ.forward(state, action)
        expected_value = self.current_value.forward(state)

        new_action, log_prob, z, mean, log_std = self.policy.evaluate(state)

        target_value = self.target_value.forward(next_state)
        next_soft_q_value = reward + self.gamma * target_value * (1 - done)

        q_val_mse = F.mse_loss(expected_soft_q_value,
                               next_soft_q_value.detach())

        expected_new_q_val = self.softQ.forward(state, new_action)
        next_value = expected_new_q_val - log_prob
        val_loss = F.mse_loss(expected_value, next_value.detach())

        log_prob_target = expected_new_q_val - expected_value
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        mean_loss = self.mean_lambda * mean.pow(2).mean()
        std_loss = self.std_lambda * log_std.pow(2).mean()
        z_loss = self.z_lambda * z.pow(2).sum(1).mean()

        policy_loss += mean_loss + std_loss + z_loss

        self.soft_q_optimizer.zero_grad()
        q_val_mse.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        val_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.soft_update(self.current_value, self.target_value, TAU)

    def soft_update(self, local_model, target_model, TRANSFER_RATE):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(TRANSFER_RATE * local_param.data +
                                    (1.0 - TRANSFER_RATE) * target_param.data)
def train(sess, env, actor, critic, RESTORE):

    sess.run(tf.global_variables_initializer())

    # Initialize random noise generator
    exploration_noise = OUNoise(env.action_space.n)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Store q values for illustration purposes
    q_max_array = []
    reward_array = []

    for i in range(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):

            # if i % 40 == 0 and i > 1:
            #     env.render()

            # Begin "Experimentation and Evaluation Phase"

            # Select next experimental action by adding noise to the action prescribed by the policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # If in a testing episode, do not add noise
            # if i%100 is not 49 and i%100 is not 99:
            noise = exploration_noise.noise()
            a = a + noise

            # Take step with experimental action
            action = np.argmax(a)
            s2, r, terminal, info = env.step(action)
            # s2, r, terminal, info = env.step(np.reshape(a.T,newshape=(env.action_space.n,)))

            # Add transition to replay buffer if not testing episode
            # if i%100 is not 49 and i%100 is not 99:
            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    MINIBATCH_SIZE)

                # Find target estimate to use for updating the Q-function

                # predict_target determines the Q-value of the next state
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Complete target estimate (R(t+1) + γ * Q(s(t+1), a(t+1)))
                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Perform gradient descent to update critic
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                # Perform "Learning" phase by moving policy parameters in direction of deterministic policy gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            # If episode is finished, print results
            if terminal:
                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                q_max_array.append(ep_ave_max_q / float(j))
                #reward_array.append(ep_reward)
                break

        ep_reward = 0
        s = env.reset()

        for j in range(MAX_EP_STEPS):
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))
            # Take step with experimental action
            action = np.argmax(a)
            s2, r, terminal, info = env.step(action)

            ep_reward += r
            s = s2

            if terminal:
                print('Normal | Reward: %.2i' % int(ep_reward), " | Episode",
                      i)
                reward_array.append(ep_reward)
                break

    # Max Q plot
    plt.plot(range(1, MAX_EPISODES + 1), q_max_array, 'b-')
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.savefig('Q.png')
    plt.show()

    # Reward plot
    plt.plot(range(1, MAX_EPISODES + 1), reward_array, 'g-')
    plt.xlabel('Episode Number')
    plt.ylabel('Reward')
    plt.savefig('Reward.png')
    plt.show()
    save_result([[str(i[0]) for i in q_max_array],
                 [str(i) for i in reward_array]])
class Agent():
    def __init__(self,
                 state_space,
                 action_space,
                 memory_size=1000000,
                 batch_size=32,
                 seed=0,
                 q_size=51):

        self.state_space = state_space
        self.action_space = action_space
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.seed = seed
        self.q_size = q_size

        self.current_model = QDQN(self.state_space,
                                  self.action_space,
                                  n_quantiles=self.q_size).to(device)
        self.target_model = QDQN(self.state_space,
                                 self.action_space,
                                 n_quantiles=self.q_size).to(device)
        self.optimizer = Adam(self.current_model.parameters(), lr=LR)

        self.memory = ReplayBuffer(self.action_space, self.memory_size,
                                   self.batch_size, self.seed)
        self.update_every = 0

        self.tau = (torch.Tensor(
            (2 * np.arange(self.current_model.n_quantiles) + 1) /
            (2.0 * self.current_model.n_quantiles)).view(1, -1)).to(device)

    def soft_update(self, local_model, target_model, TRANSFER_RATE):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(TRANSFER_RATE * local_param.data +
                                    (1.0 - TRANSFER_RATE) * target_param.data)

    def act(self, state, epsilon):

        if random.random() <= epsilon:
            action = random.choice(np.arange(self.action_space))
        else:
            action = self.current_model.act(state).cpu().numpy()
            #action = self.current_model.act(state, epsilon).cpu().numpy()
        return action

    def step(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)

        self.update_every += 1
        if self.update_every % UPDATE_FREQUENCY == 0:
            if len(self.memory) >= self.batch_size:
                experience = self.memory.sample()
                self.learn(experience, GAMMA)

    def learn(self, experience, gamma):

        sampled_state, sampled_action, sampled_reward, sampled_next_state, sampled_done = experience

        # Expand the sampled actions so they index one quantile distribution
        # per action: (batch, 1) -> (batch, 1, n_quantiles)
        action = sampled_action.unsqueeze(1).expand(self.batch_size, 1,
                                                    self.q_size)

        # Gather the predicted quantiles for the taken actions: (batch, n_quantiles)
        theta = self.current_model(sampled_state).gather(1, action).squeeze(1)

        # Target quantiles for the greedy next action (greedy with respect to
        # the mean of each action's quantile distribution)
        z_next = self.target_model(sampled_next_state).detach()
        z_next_max = z_next[np.arange(self.batch_size),
                            z_next.mean(2).max(1)[1]]

        # Distributional Bellman target and pairwise TD errors
        Ttheta = sampled_reward + GAMMA * (1 - sampled_done) * z_next_max
        diff = Ttheta.t().unsqueeze(-1) - theta

        loss = self.huber(diff) * (self.tau -
                                   (diff.detach() < 0).float()).abs()
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.current_model, self.target_model, TRANSFER_RATE)

    def huber(self, x, k=1.0):
        return torch.where(x.abs() < k, 0.5 * x.pow(2),
                           k * (x.abs() - 0.5 * k))
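As a side note, the loss assembled in learn() above is the quantile-regression Huber loss from distributional RL. The short sketch below reproduces the same computation on synthetic tensors so the broadcasting of the pairwise TD errors is easier to follow; the batch and quantile sizes are made up for illustration.

import torch

# Standalone sketch of the quantile Huber loss used in learn() above;
# batch of 4 and 8 quantiles are made-up numbers.
batch_size, n_quantiles, k = 4, 8, 1.0
theta = torch.randn(batch_size, n_quantiles)       # predicted quantiles for taken actions
Ttheta = torch.randn(batch_size, n_quantiles)      # distributional Bellman targets
tau = ((2 * torch.arange(n_quantiles).float() + 1) /
       (2.0 * n_quantiles)).view(1, -1)            # quantile midpoints, as in __init__ above

diff = Ttheta.t().unsqueeze(-1) - theta            # pairwise TD errors: (n_quantiles, batch, n_quantiles)
huber = torch.where(diff.abs() < k,
                    0.5 * diff.pow(2),
                    k * (diff.abs() - 0.5 * k))
loss = (huber * (tau - (diff.detach() < 0).float()).abs()).mean()
print(loss.item())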
Example #27
class Agent():
    def __init__(self,
                 env,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2):

        self.states = env.observation_space
        self.state_size = env.observation_space.shape[0]
        self.actions = env.action_space
        self.action_size = env.action_space.shape[0]
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, step, epsilon=True):

        state = torch.from_numpy(np.asarray(state)).float().to(device)
        action = self.actor.forward(state)
        action = action.detach().cpu().numpy()

        if epsilon:
            noise = np.random.normal(0, 0.1, action.shape[0])
            action += noise

        return action

    def update(self, step):

        state, action, reward, next_state, done = self.memory.sample()

        next_state_action = self.target_actor(next_state)

        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip,
                            self.noise_clip).to(device)

        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)

        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)

        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        if step % self.update_frequency == 0:

            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)
class Christophers_Agent():
    def __init__(self, task):
        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        self.w = np.random.normal(
            size=(
                self.state_size, self.action_size
            ),  # weights for simple linear policy: state_space x action_space
            scale=(self.action_range / (2 * self.state_size)
                   ))  # start producing actions in a decent range

        self.actor = Actor(self.state_size, self.action_size, self.action_low,
                           self.action_high)
        self.critic = Critic(self.state_size, self.action_size)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.gamma = 0.95
        self.tau = 0.001

        self.best_w = None
        self.best_score = -np.inf

        self.exploration_mu = 0.5
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.4
        self.noise = Noise(self.action_size, self.exploration_mu,
                           self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 32
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.best_score = -np.inf
        self.num_steps = 0

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        if self.get_score() > self.best_score:
            self.best_score = self.get_score()
        self.total_reward = 0.0
        self.num_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.num_steps += 1

        self.memory.add(self.last_state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor.model.predict(state)[0]
        action = list(action +
                      self.noise.sample())  # add some noise for exploration
        return action

    def get_score(self):
        return -np.inf if self.num_steps == 0 else self.total_reward / self.num_steps

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        done = np.array([e.done for e in experiences
                         if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - done)

        self.critic.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic.model, self.critic_target.model)
        self.soft_update(self.actor.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #29
def lunarworker(wid):
    import tensorflow as tf
    import numpy as np
    import gym
    import time
    import os

    from distagent import DistAgent
    from memory import ReplayBuffer
    from util import Linear, scale, RewMonitor, SkipEnv, StackEnv

    gpus = tf.config.experimental.get_visible_devices("GPU")

    # Select single gpu depending on wid
    total_gpus = 2
    gpu_nr = wid % total_gpus
    tf.config.set_visible_devices(gpus[gpu_nr], 'GPU')

    # Restrict memory growth so multiple TF processes can share one GPU
    tf.config.experimental.set_memory_growth(gpus[gpu_nr], True)

    # Train parameters
    N = int(8e6)
    eps = Linear(startval=0.1, endval=0.01, exploresteps=int(200e3))
    gamma = 0.99
    updatefreq = 4
    targetfreq = 1000
    savefreq = 80000

    # Setup
    env = gym.make("LunarLander-v2")
    env = RewMonitor(env)
    env = SkipEnv(env, skip=4)
    # env = StackEnv(env, n_frames=4)
    action_len = env.action_space.n
    agent = DistAgent(action_len,
                      dense=16,
                      supportsize=29,
                      vmin=-7.0,
                      vmax=7.0)
    mem = ReplayBuffer(size=int(20e3), batchsize=32)

    # Prefill
    tf.print("Collecting history...")
    prefill_end = int(10e3)
    state = env.reset()
    buff = []
    for t in range(1, prefill_end + 1):
        action = env.action_space.sample()
        endstate, rew, done, _ = env.step(action)
        data = (state, action, scale(rew), gamma, endstate, float(done))
        buff.append(data)
        if done:
            state = env.reset()
        else:
            state = endstate
        if t % 10000 == 0:
            tf.print(f"Collected {t} samples.")
    tf.print("Done.")

    tf.print("Storing history...")
    for data in buff:
        mem.add(data)
    tf.print("Done.")

    # Warm up
    states, _, _, _, _, _ = mem.sample()
    agent.probvalues(states)
    agent.t_probvalues(states)
    agent.update_target()

    # Initial dispatch
    tottime = time.time()

    # Training loop
    tf.print(f"Worker {wid} learning...")
    state = env.reset()
    episode_rewards = []
    buff = []
    for t in range(1, N + 1):
        t_eps = tf.constant(eps(t), dtype=tf.float32)
        action = agent.eps_greedy_action(
            state=np.reshape(state, [1, 8]).astype(np.float32),
            epsval=t_eps,
        )[0].numpy()
        endstate, rew, done, info = env.step(action)
        data = (state, action, scale(rew), gamma, endstate, float(done))
        buff.append(data)
        if info["Game Over"]:
            score = info["Episode Score"]
            episode_rewards.append(score)
            state = env.reset()
            if len(episode_rewards) % 100 == 0:
                tmptime = time.time()
                msit = (tmptime - tottime) / t * 1000
                ma100 = np.mean(episode_rewards[-100:])
                epstr = (f"Episode: {len(episode_rewards)}, " +
                         f"Step: {t}, " + f"MA100: {ma100}, " +
                         f"AvgSpeed: {msit:4.2f} ms/it")
                tf.print(epstr)
        else:
            state = endstate

        if t % updatefreq == 0:
            for data in buff:
                mem.add(data)
            buff = []
            (states, actions, drews, gexps, endstates, dones) = mem.sample()
            agent.train(states, actions, drews, gexps, endstates, dones)

        if t % targetfreq == 0:
            agent.update_target()

        if t % savefreq == 0:
            dir_str = f"lunarmodels/step{t}/"
            os.makedirs(dir_str, exist_ok=True)
            file_str = dir_str + "model-id-" + f"{wid}" + ".h5"
            agent.save(file_str)

    env.close()
    tmptime = time.time()
    tottime = tmptime - tottime
    msit = tottime / N * 1000
    tf.print(f"Learning done in {tottime:6.0f}s using {msit:4.2f} ms/it.")
    tf.print("Done.")
Example #30
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 action_sigma=0.1,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2,
                 seed=0):
        '''
        TD3 Agent
        :param state_size: State Dimension
        :param action_size: Action dimension
        :param action_sigma: standard deviation of the noise to be added to the action
        :param memory_size:
        :param batch:
        :param sigma: Standard deviation of the noise to be added to the target function (Chapter 5.3 of TD3 Paper)
        :param noise_clip: How much noise to allow
        :param gamma:
        :param update_frequency:
        :param seed:
        '''

        self.state_size = state_size
        self.action_size = action_size

        self.action_sigma = action_sigma
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency
        self.seed = seed

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        # second Critic as described in the paper
        # https://arxiv.org/pdf/1802.09477.pdf
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        # second Critic as described in the paper
        # https://arxiv.org/pdf/1802.09477.pdf
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch, seed=seed)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, epsilon=True):

        state = torch.from_numpy(np.asarray(state)).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor.forward(state).cpu().data.numpy()
        self.actor.train()

        if epsilon:
            #if we want to inject some noise
            noise = np.random.normal(0, self.action_sigma, action.shape[0])
            action += noise

        return action

    def update(self, step):
        '''
        https://arxiv.org/pdf/1802.09477.pdf
        The update is very similar to the standard DDPG update, except that:
        1) there are two critics to update
        2) the target uses the minimum of the two critics' outputs
        3) the target action has clipped noise injected into it (Section 5.3 of the paper)
        4) the actor update is delayed by a certain number of steps

        :param step: current step count, used to delay the actor update
        :return:
        '''

        state, action, reward, next_state, done = self.memory.sample()

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models

        next_state_action = self.target_actor(next_state)

        # sample random noise for target policy smoothing
        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip,
                            self.noise_clip).to(device)

        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)

        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)

        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss

        #as mentioned in the paper, we delay updating the actor network.

        if step % self.update_frequency == 0:

            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # ----------------------- update target networks ------------------- #
            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)
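To close, a hedged interaction-loop sketch for this TD3 agent; the environment, episode budget, and warm-up threshold are assumptions for illustration, not part of the original example.

import gym

# Usage sketch only (not from the original example): "Pendulum-v1", the
# 200-episode budget, and the 1000-step warm-up threshold are assumptions.
env = gym.make("Pendulum-v1")
agent = Agent(env.observation_space.shape[0], env.action_space.shape[0])

total_steps = 0
for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)                          # noisy action for exploration
        next_state, reward, done, _ = env.step(action)
        agent.add_to_memory(state, action, reward, next_state, done)
        total_steps += 1
        if total_steps > 1000:
            agent.update(total_steps)                      # delayed actor update handled inside
        state = next_state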