Example #1
class MADDPG:

    def __init__(self, state_size, action_size, num_agents, random_seed):


        super(MADDPG, self).__init__()

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)


        ## Create the agents acting in the environment

        self.agents = [Agent(state_size, action_size, random_seed, num_agents) for i in range(num_agents)]

        ### Create the replay buffer shared between agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise):
        return [agent.act(state, noise) for agent, state in zip(self.agents, states)]

    def step(self, states, actions, rewards, next_states, dones, num_current_episode):

        '''Save the experience in the shared replay memory and periodically sample from it to learn.'''
        self.memory.add(encode(states), encode(actions), rewards, encode(next_states), dones)


        if (len(self.memory)>BATCH_SIZE) and (num_current_episode % UPDATE_EVERY_NB_EPISODE==0):

            for i in range(MULTIPLE_LEARN_PER_UPDATE):
                experiences = self.memory.sample()  #SAMPLE A BATCH OF EXP FROM MEMORY
                ### As of now maddpg_learn only works with 2 agents;
                ### modify it to accept any number of agents.
                # Update agent 0
                self.maddpg_learn(experiences, own_idx=0, other_idx=1)
                experiences = self.memory.sample()
                # Update agent 1
                self.maddpg_learn(experiences, own_idx=1, other_idx=0)

    def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA):

        states, actions, rewards, next_states, dones = experiences

        # Filter out the agent OWN states, actions and next_states batch
        own_states =  decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states)

        # Filter out the OTHER agent states, actions and next_states batch
        other_states =  decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size, self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)

        # Concatenate both agent information (own agent first, other agent in second position)
        all_states=torch.cat((own_states, other_states), dim=1).to(device)
        all_actions=torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states=torch.cat((own_next_states, other_next_states), dim=1).to(device)

        agent = self.agents[own_idx]


        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        all_next_actions = torch.cat((agent.actor_target(own_states), agent.actor_target(other_states)),
                                     dim =1).to(device)
        #print("all states, all actions" + str(all_next_states.shape) + " " + str(all_next_actions.shape) )
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)


        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        all_actions_pred = torch.cat((agent.actor_local(own_states), agent.actor_local(other_states).detach()),
                                     dim = 1).to(device)
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()

        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)


    def maddpg_learn_old(self, experiences, own_idx, other_idx, gamma=GAMMA):
        '''Legacy version: only works for 2-agent systems; modify it for any number of agents'''
        states, actions, rewards, next_states, dones = experiences

        ##filtering out own states
        own_states = decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size,self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states)

        ##filter out other agent states
        other_states = decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size,self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)

        ## Concatenate both agents' info
        all_states = torch.cat((own_states, other_states), dim=1).to(device)
        all_actions = torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((own_next_states, other_next_states), dim=1).to(device)

        agent = self.agents[own_idx]


        ######update the critic#######
        # Get predicted next-state actions and Q values from target models
        all_next_actions = torch.cat((agent.actor_target(own_states), agent.actor_target(other_states)), dim=1).to(device)
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)
        Q_targets = rewards + (gamma*Q_targets_next*(1-dones))  #Q target for current state

        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        ## Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()


        #####Update Actor########
        all_actions_pred = torch.cat((agent.actor_local(own_states), agent.actor_local(other_states).detach()), dim=1).to(device)
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()

        ###minimize the loss####
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        #####UPDATE TARGET NETWORKS########
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)



    def checkpoints(self):
        """Save checkpoints for all Agents"""
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'model_dir/checkpoint_actor_local_' + str(idx) + '.pth'
            critic_local_filename = 'model_dir/checkpoint_critic_local_' + str(idx) + '.pth'
            actor_target_filename = 'model_dir/checkpoint_actor_target_' + str(idx) + '.pth'
            critic_target_filename = 'model_dir/checkpoint_critic_target_' + str(idx) + '.pth'
            torch.save(agent.actor_local.state_dict(), actor_local_filename)
            torch.save(agent.critic_local.state_dict(), critic_local_filename)
            torch.save(agent.actor_target.state_dict(), actor_target_filename)
            torch.save(agent.critic_target.state_dict(), critic_target_filename)
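
# The MADDPG class above relies on module-level names that are not part of this snippet:
# the hyperparameters, the device handle, and the encode/decode helpers that pack all
# agents' data into a single replay-buffer row. Below is a minimal sketch of what they
# could look like, inferred from how they are used; the constant values are illustrative
# placeholders, not the original settings.
import numpy as np
import torch

BUFFER_SIZE = int(1e5)            # replay buffer capacity (placeholder)
BATCH_SIZE = 256                  # minibatch size (placeholder)
GAMMA = 0.99                      # discount factor (placeholder)
TAU = 1e-3                        # soft-update interpolation factor (placeholder)
UPDATE_EVERY_NB_EPISODE = 4       # learn every N episodes (placeholder)
MULTIPLE_LEARN_PER_UPDATE = 3     # gradient updates per learning trigger (placeholder)
CLIP_CRITIC_GRADIENT = False      # whether to clip the critic gradient norm (placeholder)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def encode(sa):
    """Flatten a (num_agents, size) array into one 1-D vector so that a single
    replay-buffer entry holds every agent's data."""
    return np.array(sa).reshape(1, -1).squeeze()


def decode(size, num_agents, idx, tensor):
    """Slice agent `idx`'s columns back out of a batch of encoded rows:
    (batch, num_agents * size) -> (batch, size)."""
    assert tensor.shape[1] == num_agents * size
    return tensor[:, idx * size:(idx + 1) * size]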
Example #2
critic = Critic(
    sess, state_size=state_space, lr=LR_C
)  # we need a good teacher, so the teacher should learn faster than the actor

timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H%M")
writer = tf.summary.FileWriter("./logs/LabyrinthZone_Act1/%s" % timestamp,
                               sess.graph)
sess.run(tf.global_variables_initializer())

saver = tf.train.Saver()

max_t_interval = 100
scores = []  # list containing scores from each episode
scores_window = deque(maxlen=max_t_interval)  # last 100 scores

memory = ReplayBuffer(action_space, BUFFER_SIZE, BATCH_SIZE, 714)

state = None


def store_img(state, epoch, step):
    name = '../state_img/state_epoch_%i_%i.png' % (epoch, step)
    cv2.imwrite(name, state)


# RENDER = False
# eplison = 0.7
# decay = 0.95
# min_eplison = 0.05
max_mean_score = 2000
total_timestep = 0
class Maddpg():
    '''MADDPG Agent : Interacts with and learns from the environment'''
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Instantiate Multiple  Agent
        self.agents = [
            Agent(state_size, action_size, random_seed, num_agents)
            for i in range(num_agents)
        ]

        # Instantiate Memory replay Buffer (shared between agents)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def reset(self):
        '''reset agents'''
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise):
        '''Return the action to perform for each agent (per policy)'''
        return [
            agent.act(state, noise)
            for agent, state in zip(self.agents, states)
        ]

    def step(self, states, actions, rewards, next_states, dones,
             num_current_episode):
        '''Save experience in replay memory, and use random sample from buffer to learn'''
        self.memory.add(encode(states), encode(actions), rewards,
                        encode(next_states), dones)

        # If enough samples in the replay memory and if it is time to update
        if (len(self.memory) > BATCH_SIZE) and (num_current_episode %
                                                UPDATE_EVERY_NB_EPISODE == 0):

            # Note: this code only expects 2 agents
            assert (len(self.agents) == 2)

            # Allow to learn several time in a row in the same episode
            for i in range(MULTIPLE_LEARN_PER_UPDATE):
                # Sample a batch of experience from the replay buffer
                experiences = self.memory.sample()
                # Update Agent #0
                self.maddpg_learn(experiences, own_idx=0, other_idx=1)
                # Sample another batch of experience from the replay buffer
                experiences = self.memory.sample()
                # Update Agent #1
                self.maddpg_learn(experiences, own_idx=1, other_idx=0)

    def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA):
        states, actions, rewards, next_states, dones = experiences
        # Filter out the agent OWN states, actions and next_states batch
        own_states = decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx,
                             actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx,
                                 next_states)
        # Filter out the OTHER agent states, actions and next_states batch
        other_states = decode(self.state_size, self.num_agents, other_idx,
                              states)
        other_actions = decode(self.action_size, self.num_agents, other_idx,
                               actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx,
                                   next_states)
        # Concatenate both agent information (own agent first, other agent in second position)
        all_states = torch.cat((own_states, other_states), dim=1).to(device)
        all_actions = torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((own_next_states, other_next_states),
                                    dim=1).to(device)

        agent = self.agents[own_idx]

        # Update Critic
        # Get predicted next-state actions and Q values from target models
        all_next_actions = torch.cat(
            (agent.actor_target(own_states), agent.actor_target(other_states)),
            dim=1).to(device)
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # Update Actor
        # Compute actor loss
        all_actions_pred = torch.cat(
            (agent.actor_local(own_states),
             agent.actor_local(other_states).detach()),
            dim=1).to(device)
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # Update target networks
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)

    def checkpoints(self):
        '''Save checkpoints for all Agents'''
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'model_dir/checkpoint_actor_local_' + str(
                idx) + '.pth'
            critic_local_filename = 'model_dir/checkpoint_critic_local_' + str(
                idx) + '.pth'
            actor_target_filename = 'model_dir/checkpoint_actor_target_' + str(
                idx) + '.pth'
            critic_target_filename = 'model_dir/checkpoint_critic_target_' + str(
                idx) + '.pth'
            torch.save(agent.actor_local.state_dict(), actor_local_filename)
            torch.save(agent.critic_local.state_dict(), critic_local_filename)
            torch.save(agent.actor_target.state_dict(), actor_target_filename)
            torch.save(agent.critic_target.state_dict(),
                       critic_target_filename)
def ddpg(agent_name, multiple_agents = False, PER = False, n_episodes = 300, max_t = 1000):
    """ Deep Deterministic Policy Gradients
    Params
    ======
        agent_name (string): agent name
        multiple_agents (boolean): boolean for multiple agents
        PER (boolean): whether to use Prioritized Experience Replay
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    env, env_info, states, state_size, action_size, brain_name, num_agents = initialize_env(multiple_agents)
    
    device = get_device()
    scores_window = deque(maxlen=100)
    scores = np.zeros(num_agents)
    scores_episode = []
    
    agents = [] 
    shared_memory = ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE, RANDOM_SEED)
    for agent_id in range(num_agents):
        agents.append(Actor_Crtic_Agent(agent_name, agent_id, device, state_size, action_size))
    
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode = True)[brain_name]
        states = env_info.vector_observations
        
        for agent in agents:
            agent.reset()
            
        scores = np.zeros(num_agents)
            
        for t in range(max_t):      
            actions = np.array([agents[i].act(states[i]) for i in range(num_agents)])
            env_info = env.step(actions)[brain_name]       # send the action to the environment
            next_states = env_info.vector_observations     # get the next state
            rewards = env_info.rewards                     # get the reward
            dones = env_info.local_done        
            
            for i in range(num_agents):
                agents[i].step(states[i], actions[i], rewards[i], next_states[i], dones[i], shared_memory) 
            if shared_memory.batch_passed():
                # exit()
                experiences = shared_memory.sample()
                agents[0].learn(experiences, shared_memory)
                agents = share_learning(agents[0].actor_local, agents)
 
            states = next_states
            scores += rewards
            if t % 20 == 0:
                print('\rTimestep {}\tScore: {:.2f}\tmin: {:.2f}\tmax: {:.2f}'
                      .format(t, np.mean(scores), np.min(scores), np.max(scores)), end="") 
            if np.any(dones):
                break 
      
        score = np.mean(scores)
        scores_window.append(score)       # save most recent score
        scores_episode.append(score)

        print('\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}\tMax Score: {:.2f}'.format(i_episode, score, np.mean(scores_window), np.max(scores)), end="\n")
        update_csv(agent_name, i_episode, np.mean(scores_window), np.max(scores))
        agents[0].save_agent(agent_name)

        # Early stop
        if i_episode == 100:
            return scores_episode

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window)>=30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            agents[0].save_agent(agent_name + "Complete")
            break
            
    return scores_episode
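
# A brief, hypothetical usage sketch for the training entry point above; the agent
# name and the plotting code are illustrative, not taken from the original project.
import matplotlib.pyplot as plt

scores = ddpg("ddpg_reacher", multiple_agents=True, n_episodes=300)

plt.plot(range(1, len(scores) + 1), scores)
plt.xlabel("Episode")
plt.ylabel("Score")
plt.show()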
Example #5
class DQN:
    def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, policy_lr=0.01, batch_size=128, device="cpu"):
        self.actions_count = 0
        self.n_actions = n_actions  # total number of actions
        self.device = device  # device: cpu, gpu, etc.
        self.gamma = gamma
        # epsilon-greedy exploration parameters
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        # target_net starts as an exact copy of policy_net
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # disable BatchNorm and Dropout (evaluation mode)
        # note the difference between parameters() and state_dict(): the former has requires_grad=True
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def choose_action(self, state, train=True):
        '''Select an action (epsilon-greedy during training, greedy from the target net otherwise).
        '''
        if train:
            self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                math.exp(-1. * self.actions_count / self.epsilon_decay)
            self.actions_count += 1
            if random.random() > self.epsilon:
                with torch.no_grad():
                    # Convert to a tensor before feeding it to the network; the state entries are originally float64.
                    # Note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state]).
                    state = torch.tensor(
                        [state], device=self.device, dtype=torch.float32)
                    # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                    q_value = self.policy_net(state)
                    # tensor.max(1) returns the per-row maximum together with its index,
                    # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0])),
                    # so tensor.max(1)[1] is the index of the maximum, i.e. the action.
                    action = q_value.max(1)[1].item()
            else:
                action = random.randrange(self.n_actions)
            return action
        else:
            with torch.no_grad():
                # Convert to a tensor before feeding it to the network; the state entries are originally float64.
                state = torch.tensor(
                    [state], device='cpu', dtype=torch.float32)
                q_value = self.target_net(state)
                # tensor.max(1)[1] is the index of the maximum Q value, i.e. the greedy action.
                action = q_value.max(1)[1].item()
            return action
    def update(self):

        if len(self.memory) < self.batch_size:
            return
        # Sample a random batch of transitions from memory
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)
        # Convert to tensors
        # e.g. tensor([[-4.5543e-02, -2.3910e-01,  1.8344e-02,  2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02,  2.3400e-01]])
        state_batch = torch.tensor(
            state_batch, device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
            1)  # e.g. tensor([[1],...,[0]])
        reward_batch = torch.tensor(
            reward_batch, device=self.device, dtype=torch.float)  # tensor([1., 1.,...,1])
        next_state_batch = torch.tensor(
            next_state_batch, device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(
            done_batch), device=self.device)  # convert the bools to floats, then to a tensor

        # Compute Q(s_t, a) for the sampled (s_t, a) pairs
        # About torch.gather: for a = torch.Tensor([[1,2],[3,4]]),
        # a.gather(1, torch.LongTensor([[0],[1]])) = torch.Tensor([[1],[3]])
        q_predict = self.policy_net(state_batch).gather(
            dim=1, index=action_batch)  # equivalent to calling self.policy_net.forward
        # Compute max_a' Q'(s_{t+1}, a') over all next states, where Q' is the target network
        next_state_values = self.target_net(
            next_state_batch).max(1)[0].detach()  # e.g. tensor([ 0.0060, -0.0171,...,])
        # Compute the TD target q_target;
        # for terminal transitions done_batch is 1, so the expected Q value equals the reward
        q_target = reward_batch + self.gamma * next_state_values * (1 - done_batch)
        self.loss = nn.MSELoss()(q_predict, q_target.unsqueeze(1))  # mean-squared-error loss
        # Optimize the model
        self.optimizer.zero_grad()  # clear all the old gradients from the last step
        # loss.backward() uses backpropagation to compute the gradient of the loss w.r.t. all parameters that require gradients
        self.loss.backward()
        for param in self.policy_net.parameters():  # clip gradients to avoid them exploding
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()  # update the model

    def save_model(self,path):
        torch.save(self.target_net.state_dict(), path)

    def load_model(self,path):
        self.target_net.load_state_dict(torch.load(path))  
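
# A minimal training-loop sketch for the DQN class above. CartPole and the 10-episode
# target-network sync interval are illustrative choices, and the ReplayBuffer is assumed
# to expose a push(state, action, reward, next_state, done) method.
import gym

env = gym.make("CartPole-v0")
agent = DQN(n_states=env.observation_space.shape[0],
            n_actions=env.action_space.n)

for episode in range(200):
    state, done, ep_reward = env.reset(), False, 0.0
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, reward, next_state, done)  # assumed buffer API
        agent.update()
        state, ep_reward = next_state, ep_reward + reward
    if (episode + 1) % 10 == 0:
        # Periodically sync the target network with the policy network
        agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print("episode {}, reward {:.1f}".format(episode + 1, ep_reward))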
Example #6
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, config):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = config.state_size
        self.action_size = config.action_size
        self.seed = random.seed(config.random_seed)
        self.config = config
        self.t_step = 0
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 config.random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  config.random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   config.random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    config.random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config.lr_critic,
                                           weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(self.action_size, config.random_seed)

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, config.buffer_size,
                                   config.batch_size, config.random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.config.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.config.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.config.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
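# The Agent above depends on an OUNoise helper that is not shown here. The sketch below
# is a standard Ornstein-Uhlenbeck process matching the constructor and methods the Agent
# calls (OUNoise(size, seed), reset(), sample()); mu, theta and sigma are typical defaults,
# not necessarily the original values.
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process, a common exploration-noise choice for DDPG."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new state as the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state
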
def train(args, param):
    """

    Args:
    """

    # a CNN encoder converts the [1, 3, 84, 84] image observation into a [1, 200] feature vector
    torch.cuda.set_device(1)

    use_gym = False
    # in case of seed experiments
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    #args.repeat_opt = repeat_opt
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.env_name) + '-agent-' + str(args.policy)
    pathname += "_states_image_"
    pathname += '_update_freq: ' + str(
        args.target_update_freq) + "num_q_target_" + str(
            args.num_q_target) + "_seed_" + str(args.seed)
    text = "Star_training target_update_freq: {}  num_q_target: {}  use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file(pathname, text)
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = 'runs/' + pathname
    writer = SummaryWriter(tensorboard_name)

    if use_gym:
        env = gym.make(args.env_name)
        env.seed(args.seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        args.max_episode_steps = env._max_episode_steps
    else:
        size = 84
        env = suite.make(
            args.env_name,
            has_renderer=False,
            use_camera_obs=True,
            ignore_done=True,
            has_offscreen_renderer=True,
            camera_height=size,
            camera_width=size,
            render_collision_mesh=False,
            render_visual_mesh=True,
            camera_name='agentview',
            use_object_obs=False,
            camera_depth=True,
            reward_shaping=True,
        )

        state_dim = 200
        print("State dim:", state_dim)
        action_dim = env.dof
        max_action = 1
        args.max_episode_steps = 200

    if args.policy == "TD3_ad":
        policy = TD31v1(state_dim, action_dim, max_action, args)
    elif args.policy == "DDPG":
        policy = DDPG(state_dim, action_dim, max_action, args)

    file_name = "./pytorch_models/{}".format(args.env_name)
    replay_buffer = ReplayBuffer()
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            #env.seed(random.randint(0, 100))
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean,
                                  total_timesteps)
                writer.flush()
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(
                    total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {:.2f}  Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window),
                    time_format(time.time() - t0))

                print(text)
                write_into_file('search-' + pathname, text)
                #policy.train(replay_buffer, writer, episode_timesteps)
            # We evaluate the episode and we save the policy
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(
                    evaluate_policy(policy, writer, total_timesteps, args,
                                    env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # When the training step is done, we reset the state of the environment
            if use_gym:
                obs = env.reset()
            else:
                state = env.reset()
                obs, state_buffer = stacked_frames(state, size, args, policy)

            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before 10000 timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            if use_gym:
                action = env.action_space.sample()
            else:
                action = np.random.randn(env.dof)
        else:  # After 10000 timesteps, we switch to the model
            if use_gym:
                action = policy.select_action(np.array(obs))
                # If the explore_noise parameter is not 0, we add noise to the action and we clip it
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise,
                        size=env.action_space.shape[0])).clip(
                            env.action_space.low, env.action_space.high)
            else:
                action = (policy.select_action(np.array(obs)) +
                          np.random.normal(
                              0, max_action * args.expl_noise,
                              size=action_dim)).clip(-max_action, max_action)

        if total_timesteps % args.target_update_freq == 0:
            if args.policy == "TD3_ad":
                policy.hardupdate()
        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)

        if not use_gym:
            new_obs, state_buffer = create_next_obs(new_obs, size, args,
                                                    state_buffer, policy)
        # We check if the episode is done
        #done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(
            done)
        if not use_gym:
            if episode_timesteps + 1 == args.max_episode_steps:
                done = True
        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer obs ", obs.shape)
            print("add to buffer next_obs ", new_obs.shape)
        replay_buffer.add((obs, new_obs, action, reward, done_bool))
        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 1)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # We add the last policy evaluation to our list of evaluations and we save our model
    evaluations.append(
        evaluate_policy(policy, writer, total_timesteps, args, env))
Example #8
class MADDPGAgentGroup:
    """Group the MADDPG agents as a single entity"""
    def __init__(
            self,
            #  env,
            state_size,
            action_size,
            num_agents,
            writer,
            hparams,
            print_every=1000,
            result_dir='results'):
        self.num_agents = num_agents
        # self.env = env
        # self.brain_name = self.env.brain_names[0]
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = hparams.batch_size
        self.buffer_size = hparams.buffer_size
        self.seed = hparams.seed
        self.update_every = hparams.update_every
        random.seed(self.seed)
        self.writer = writer
        self.result_dir = result_dir

        self.hparams = hparams
        self.agents = [
            ma.MADDPGAgent(self.num_agents,
                           self.state_size,
                           self.action_size,
                           i,
                           self.writer,
                           self.hparams,
                           result_dir=self.result_dir)
            for i in range(self.num_agents)
        ]

        self.gamma = hparams.gamma

        self.memory = ReplayBuffer(
            self.buffer_size,
            self.batch_size,
            self.hparams.seed,
        )
        self.print_every = print_every
        self.learn_step = 0
        self.critic_loss = 0.0
        self.actor_loss = 0.0

    def act(self, states, add_noise=True):
        """Executes act on all the agents
        Parameters:
            states (list): list of states, one for each agent
            add_noise (bool): whether to apply noise to the actions
        """
        actions = []
        for i, agent in enumerate(self.agents):
            action = agent.act(states[i], add_noise)
            actions.append(action)
        return actions

    def reshape(self, states, actions, rewards, next_states, dones):
        """Reshape the inputs
        """
        # adding axis=0 to states, actions, and next_states
        states = np.expand_dims(states, axis=0)
        next_states = np.expand_dims(next_states, axis=0)
        assert (states.shape[0] == 1 and states.shape[1] == self.num_agents
                and states.shape[2] == self.state_size)

        actions = np.expand_dims(actions, axis=0)
        assert (actions.shape[0] == 1 and actions.shape[1] == self.num_agents
                and actions.shape[2] == self.action_size)

        # for rewards and dones, reshape then add axis=0
        rewards = np.expand_dims(np.array(rewards).reshape(
            self.num_agents, -1),
                                 axis=0)
        assert (rewards.shape[0] == 1 and rewards.shape[1] == self.num_agents
                and rewards.shape[2] == 1)
        dones = np.expand_dims(np.array(dones).reshape(self.num_agents, -1),
                               axis=0)

        return states, actions, rewards, next_states, dones

    def step(self, states, actions, rewards, next_states, dones):
        """Performs the learning step.
        """
        # store a single entry for results from all agents by adding axis=0
        states, actions, rewards, next_states, dones = self.reshape(
            states, actions, rewards, next_states, dones)
        self.memory.add(states, actions, rewards, next_states, dones)

        # Get agent to learn from experience if we have enough data/experiences in memory
        if len(
                self.memory
        ) > self.batch_size and self.learn_step % self.update_every == 0:

            experiences = self.memory.sample()
            actor_losses = []
            critic_losses = []

            for agent in self.agents:
                actor_loss, critic_loss = agent.learn(self.agents, experiences,
                                                      self.gamma)
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

            # Plot real-time graphs and store losses
            if self.learn_step % self.print_every == 0:
                # Save Critic loss
                utils.save_to_txt(
                    critic_losses,
                    '{}/critic_losses.txt'.format(self.result_dir))
                self.writer.text('critic loss: {}'.format(critic_losses),
                                 'Critic')
                self.writer.push(critic_losses, 'Loss(critic)')
                # Save Actor loss
                utils.save_to_txt(
                    actor_losses,
                    '{}/actor_losses.txt'.format(self.result_dir))
                self.writer.text('actor loss: {}'.format(actor_losses),
                                 'Actor')
                self.writer.push(actor_losses, 'Loss(actor)')

            self.critic_loss = np.array(critic_losses).mean()
            self.actor_loss = np.array(actor_losses).mean()

        # Count every call so that `learn_step % update_every` gates the updates as intended
        self.learn_step += 1

        return self.critic_loss, self.actor_loss

    def reset(self):
        """Resets the noise for each agent"""
        for agent in self.agents:
            agent.reset()

    def save(self):
        """Checkpoint actor and critic models"""
        for agent in self.agents:
            agent.check_point()
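
# A hypothetical episode loop that drives the agent group, using the same Unity ML-Agents
# style environment API that appears elsewhere in this file; `env` and `brain_name` are
# placeholders supplied by the surrounding training script.
import numpy as np


def run_episode(group, env, brain_name):
    """Run one episode, feeding group.step() at every timestep; returns the best agent score."""
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    group.reset()
    scores = np.zeros(group.num_agents)
    while True:
        actions = group.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards, dones = env_info.rewards, env_info.local_done
        group.step(states, actions, rewards, next_states, dones)
        scores += rewards
        states = next_states
        if np.any(dones):
            return np.max(scores)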
Example #9
class DDPGAgent(object):
    """ class of the DDPG Agent """
    def __init__(self, config):
        """Initialize an Agent object.

        Args:
            param1: (config)
        """

        self.state_size = config.state_dim
        self.action_size = config.action_dim
        self.seed = np.random.seed(config.seed)
        self.n_agents = config.n_agents
        self.batch_size = config.batch_size
        self.tau = config.tau
        self.gamma = config.gamma
        self.device = config.device
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(config).to(config.device)
        self.actor_target = Actor(config).to(config.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(config).to(config.device)
        self.critic_target = Critic(config).to(config.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config.lr_critic)

        # Noise process
        self.noise = OUNoise(config)

        # Replay memory
        self.memory = ReplayBuffer(config)
        #self.timesteps = 0

    def act(self, states, epsilon, add_noise=True):
        """ Given a list of states for each agent it returns the actions to be
        taken by each agent based on the current policy.
        Returns a numpy array of shape [n_agents, n_actions]
        NOTE: clips actions to be between -1, 1
        Args:
            states:    (torch) states
            epsilon: (float)
            add_noise: (bool) add noise to the actions
        """
        states = torch.from_numpy(states).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise and epsilon > np.random.random():
            actions += [self.noise.sample() for _ in range(self.n_agents)]
        return np.clip(actions, -1, 1)

    def reset_noise(self):
        """ reset noise"""
        self.noise.reset()

    def learn(self):
        """Update policy and value parameters using given batch of experience tuples.
        actor_target(state) -> action
        critic_target(state, action) -> Q-value
        """
        if self.batch_size > self.memory.size():
            return
        states, actions, rewards, next_states, dones = self.memory.sample()

        # ---------------------------- update critic ----------------------------

        # Get predicted next-state actions and Q values from target model

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Args:
         param1: (torch network) local_model
         param2: (torch network) target_model
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
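# The DDPGAgent above is built entirely from a config object (which is also passed to
# Actor, Critic, OUNoise and ReplayBuffer). Below is a hypothetical sketch of the fields
# it reads, with placeholder values; the extra fields the other classes need (e.g. a
# buffer size) are assumptions.
from types import SimpleNamespace

import torch

config = SimpleNamespace(
    state_dim=24,          # per-agent state dimension (placeholder)
    action_dim=2,          # per-agent action dimension (placeholder)
    n_agents=2,
    seed=0,
    batch_size=256,
    buffer_size=int(1e5),  # assumed to be read by ReplayBuffer(config)
    tau=1e-3,
    gamma=0.99,
    lr_actor=1e-4,
    lr_critic=1e-3,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)
agent = DDPGAgent(config)
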
class Maddpg():
    """MADDPG Agent : Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize a MADDPG Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        
        super(Maddpg, self).__init__()
        
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        
        # Instantiate Multiple  Agent
        self.agents = [ Agent(state_size,action_size, random_seed, num_agents) 
                       for i in range(num_agents) ]
        
        # Instantiate Memory replay Buffer (shared between agents)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
                  
    def reset(self):
        """Reset all the agents"""
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise):
        """Return action to perform for each agents (per policy)"""        
        return [ agent.act(state, noise) for agent, state in zip(self.agents, states) ]
                
    
    def step(self, states, actions, rewards, next_states, dones, num_current_episode):
        """ # Save experience in replay memory, and use random sample from buffer to learn"""
 
        # self.memory.add(states, actions, rewards, next_states, dones)
        self.memory.add(encode(states), 
                        encode(actions), 
                        rewards,
                        encode(next_states),
                        dones)

        # If enough samples in the replay memory and if it is time to update
        if (len(self.memory) > BATCH_SIZE) and (num_current_episode % UPDATE_EVERY_NB_EPISODE ==0) :
            
            # Note: this code only expects 2 agents
            assert(len(self.agents)==2)
            
            # Allow to learn several time in a row in the same episode
            for i in range(MULTIPLE_LEARN_PER_UPDATE):
                # Sample a batch of experience from the replay buffer 
                experiences = self.memory.sample()   
                # Update Agent #0
                self.maddpg_learn(experiences, own_idx=0, other_idx=1)
                # Sample another batch of experience from the replay buffer 
                experiences = self.memory.sample()   
                # Update Agent #1
                self.maddpg_learn(experiences, own_idx=1, other_idx=0)
                
    
    def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA):
        """
        Update the policy of the MADDPG "own" agent. The actors only have access to their own agent's
        information, whereas the critics have access to the information of all agents.
        
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> action
            critic_target(all_states, all_actions) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            own_idx (int) : index of the own agent to update in self.agents
            other_idx (int) : index of the other agent to update in self.agents
            gamma (float): discount factor
        """
        
        states, actions, rewards, next_states, dones = experiences
               
        # Filter out the agent OWN states, actions and next_states batch
        own_states =  decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states) 
                
        # Filter out the OTHER agent states, actions and next_states batch
        other_states =  decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size, self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)
        
        # Concatenate both agent information (own agent first, other agent in second position)
        all_states=torch.cat((own_states, other_states), dim=1).to(device)
        all_actions=torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states=torch.cat((own_next_states, other_next_states), dim=1).to(device)
   
        agent = self.agents[own_idx]
        
            
        # ---------------------------- Update Critic ---------------------------- #

        # Get predicted next-state actions and Q values from target models        
        all_next_actions = torch.cat((agent.actor_target(own_states), agent.actor_target(other_states)),
                                      dim =1).to(device) 
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)
        
        
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        
        # Compute critic loss
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # ---------------------------- Update Actor ---------------------------- #

        # Compute actor loss
        all_actions_pred = torch.cat((agent.actor_local(own_states), agent.actor_local(other_states).detach()),
                                     dim = 1).to(device)      
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()
        
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()        
        agent.actor_optimizer.step()

        # ----------------------- Update Target Networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)                   
    
    
                        
    def checkpoints(self):
        """Save checkpoints for all Agents"""
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'model_dir/checkpoint_actor_local_' + str(idx) + '.pth'
            critic_local_filename = 'model_dir/checkpoint_critic_local_' + str(idx) + '.pth'           
            actor_target_filename = 'model_dir/checkpoint_actor_target_' + str(idx) + '.pth'
            critic_target_filename = 'model_dir/checkpoint_critic_target_' + str(idx) + '.pth'            
            torch.save(agent.actor_local.state_dict(), actor_local_filename) 
            torch.save(agent.critic_local.state_dict(), critic_local_filename)             
            torch.save(agent.actor_target.state_dict(), actor_target_filename) 
            torch.save(agent.critic_target.state_dict(), critic_target_filename)
Example #11
class DDPG:
    def __init__(self,
                 n_states,
                 n_actions,
                 hidden_dim=30,
                 device="cpu",
                 critic_lr=1e-3,
                 actor_lr=1e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128):
        self.device = device
        self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
        self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)

        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.memory = ReplayBuffer(memory_capacity)
        self.batch_size = batch_size
        self.soft_tau = soft_tau
        self.gamma = gamma

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor(state)
        # detach() cuts the tensor out of the computation graph (no gradient flows back through it)
        return action.detach().cpu().numpy()[0, 0]

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        state, action, reward, next_state, done = self.memory.sample(
            self.batch_size)
        # Convert all variables to tensors
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
        # Note that the critic takes (s_t, a) as its input
        policy_loss = self.critic(state, self.actor(state))

        policy_loss = -policy_loss.mean()

        next_action = self.target_actor(next_state)
        target_value = self.target_critic(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * self.gamma * target_value
        expected_value = torch.clamp(expected_value, -np.inf, np.inf)

        value = self.critic(state, action)
        value_loss = nn.MSELoss()(value, expected_value.detach())

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()
        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)

    def save_model(self, path):
        torch.save(self.target_actor.state_dict(), path)

    def load_model(self, path):
        self.actor.load_state_dict(torch.load(path))
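The DDPG class above exposes select_action(), update(), and a shared self.memory. The snippet below is a minimal, hypothetical driver loop showing how such an agent could be run end to end; the environment name, episode counts, and the memory.push(state, action, reward, next_state, done) call are assumptions (the ReplayBuffer used by this example is not shown), and the classic 4-tuple gym step API is assumed as elsewhere in these examples.

import gym
import numpy as np

def run_ddpg(env_name="Pendulum-v1", episodes=10, max_steps=200):
    env = gym.make(env_name)
    agent = DDPG(n_states=env.observation_space.shape[0],
                 n_actions=env.action_space.shape[0])
    for ep in range(episodes):
        state, episode_reward = env.reset(), 0.0
        for _ in range(max_steps):
            action = agent.select_action(state)                # scalar action for a 1-D action space
            next_state, reward, done, _ = env.step(np.array([action]))
            agent.memory.push(state, action, reward, next_state, done)  # assumed buffer API
            agent.update()                                     # returns early until batch_size samples exist
            state, episode_reward = next_state, episode_reward + reward
            if done:
                break
        print("episode {}: reward {:.1f}".format(ep, episode_reward))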
Example #12
def train(config_file_path: str, save_dir: str, use_vime: bool, random_policy: bool, device: str, visualize_interval: int):
    conf_d = toml.load(open(config_file_path))
    conf = namedtuple('Config', conf_d.keys())(*conf_d.values())

    # Check if saving directory is valid
    if "test" in save_dir and os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    if os.path.exists(save_dir):
        raise ValueError("Directory {} already exists.".format(save_dir))
    # Create save dir
    os.makedirs(save_dir)
    ckpt_dir = os.path.join(save_dir, 'checkpoints')
    os.makedirs(ckpt_dir)
    log_dir = os.path.join(save_dir, 'logs')
    os.makedirs(log_dir)
    # Save config file
    shutil.copyfile(config_file_path, os.path.join(save_dir, os.path.basename(config_file_path)))

    # Set random seeds
    np.random.seed(int(time.time()))
    torch.manual_seed(int(time.time()))
    device = torch.device(device)
    if device.type == 'cuda':
        torch.cuda.manual_seed(int(time.time()))

    # Set up log metrics
    metrics = {
        'episode': [],
        'collected_samples': [],
        'reward': [], # cumulative reward
        'curiosity_reward': [], # cumulative reward including the information-gain bonus
        'likelihood': [], # likelihood of the learned dynamics model
        'D_KL_median': [], 'D_KL_mean': [],
        'q1_loss': [], 'policy_loss': [], 'alpha_loss': [], 'alpha': [],
        'ELBO': [],
        'step': [], 'step_reward': [],
        'test_episode': [], 'test_reward': [],
    }

    # Set up environment
    print("----------------------------------------\nTrain in {}\n----------------------------------------".format(conf.environment))
    env = gym.make(conf.environment)

    if use_vime:
        print("Use VIME")
    if random_policy:
        print("Keep using random policy.")

    # Training set up
    agent = SAC(env.observation_space, env.action_space, device, **conf.agent)
    memory = ReplayBuffer(conf.replay_buffer_capacity, env.observation_space.shape, env.action_space.shape)
    vime = VIME(env.observation_space.shape[0], env.action_space.shape[0], device, **conf.vime) if use_vime else None
    # Load checkpoint if specified in config
    if conf.checkpoint != '':
        ckpt = torch.load(conf.checkpoint, map_location=device)
        metrics = ckpt['metrics']
        agent.load_state_dict(ckpt['agent'])
        memory.load_state_dict(ckpt['memory'])
        if use_vime:
            vime.load_state_dict(ckpt['vime'])

    def save_checkpoint():
        # Save checkpoint
        ckpt = {'metrics': metrics, 'agent': agent.state_dict(), 'memory': memory.state_dict()}
        if use_vime:
            ckpt['vime'] = vime.state_dict()
        path = os.path.join(ckpt_dir, 'checkpoint.pth')
        torch.save(ckpt, path)

        # Save agent model only
        model_ckpt = {'agent': agent.state_dict()}
        model_path = os.path.join(ckpt_dir, 'model.pth')
        torch.save(model_ckpt, model_path)

        # Save metrics only
        metrics_ckpt = {'metrics': metrics}
        metrics_path = os.path.join(ckpt_dir, 'metrics.pth')
        torch.save(metrics_ckpt, metrics_path)

    # Train agent
    init_episode = 0 if len(metrics['episode']) == 0 else metrics['episode'][-1] + 1
    pbar = tqdm.tqdm(range(init_episode, conf.episodes))
    reward_moving_avg = None
    moving_avg_coef = 0.1
    agent_update_count = 0
    total_steps = 0

    for episode in pbar:
        o = env.reset()
        rewards, curiosity_rewards = [], []
        info_gains = []
        log_likelihoods = []
        q1_losses, q2_losses, policy_losses, alpha_losses, alphas = [],[],[],[],[]

        for t in range(conf.horizon):
            if len(memory) < conf.random_sample_num or random_policy:
                a = env.action_space.sample()
            else:
                a = agent.select_action(o, eval=False)

            o_next, r, done, _ = env.step(a)
            total_steps += 1
            metrics['step'].append(total_steps)
            metrics['step_reward'].append(r)
            done = False if t == env._max_episode_steps - 1 else bool(done)  # done should be False if an episode is terminated forcefully
            rewards.append(r)

            if use_vime and len(memory) >= conf.random_sample_num:
                # Calculate curiosity reward in VIME
                info_gain, log_likelihood = vime.calc_info_gain(o, a, o_next)
                assert not np.isnan(info_gain).any() and not np.isinf(info_gain).any(), "invalid information gain, {}".format(info_gains)
                info_gains.append(info_gain)
                log_likelihoods.append(log_likelihood)
                vime.memorize_episodic_info_gains(info_gain)            
                r = vime.calc_curiosity_reward(r, info_gain)
            curiosity_rewards.append(r)

            memory.append(o, a, r, o_next, done)
            o = o_next

            # Update agent
            if len(memory) >= conf.random_sample_num and not random_policy:
                for _ in range(conf.agent_update_per_step):
                    batch_data = memory.sample(conf.agent_update_batch_size)
                    q1_loss, q2_loss, policy_loss, alpha_loss, alpha = agent.update_parameters(batch_data, agent_update_count)
                    q1_losses.append(q1_loss)
                    q2_losses.append(q2_loss)
                    policy_losses.append(policy_loss)
                    alpha_losses.append(alpha_loss)
                    alphas.append(alpha)
                    agent_update_count += 1

            if done:
                break

        if len(log_likelihoods) == 0:
            log_likelihoods.append(-np.inf)

        # Display performance
        episodic_reward = np.sum(rewards)
        reward_moving_avg = episodic_reward if reward_moving_avg is None else (1-moving_avg_coef) * reward_moving_avg + moving_avg_coef * episodic_reward
        if use_vime:
            pbar.set_description("EPISODE {}, TOTAL STEPS {}, SAMPLES {} --- Steps {}, Curiosity {:.1f}, Rwd {:.1f} (m.avg {:.1f}), Likelihood {:.2E}".format(
                episode, memory.step, len(memory), len(rewards), np.sum(curiosity_rewards), episodic_reward, reward_moving_avg, np.mean(np.exp(log_likelihoods))))
        else:
            pbar.set_description("EPISODE {}, TOTAL STEPS {}, SAMPLES {} --- Steps {}, Rwd {:.1f} (mov avg {:.1f})".format(
                episode, memory.step, len(memory), len(rewards), episodic_reward, reward_moving_avg))

        # Save episodic metrics
        metrics['episode'].append(episode)
        metrics['collected_samples'].append(total_steps)
        metrics['reward'].append(episodic_reward)
        metrics['curiosity_reward'].append(np.sum(curiosity_rewards))
        metrics['likelihood'].append(np.mean(np.exp(log_likelihoods)))
        if episode % visualize_interval == 0:
            lineplot(metrics['step'][-len(metrics['step_reward']):], metrics['step_reward'], 'stepwise_reward', log_dir, xaxis='total step')
            lineplot(metrics['episode'][-len(metrics['reward']):], metrics['reward'], 'reward', log_dir)
            lineplot(metrics['collected_samples'][-len(metrics['reward']):], metrics['reward'], 'sample-reward', log_dir, xaxis='total step')
            lineplot(metrics['episode'][-len(metrics['curiosity_reward']):], metrics['curiosity_reward'], 'curiosity_reward', log_dir)
            lineplot(metrics['episode'][-len(metrics['likelihood']):], metrics['likelihood'], 'likelihood', log_dir)
        # Agent update related metrics
        if len(policy_losses) > 0 and not random_policy:
            metrics['q1_loss'].append(np.mean(q1_losses))
            metrics['policy_loss'].append(np.mean(policy_losses))
            metrics['alpha_loss'].append(np.mean(alpha_losses))
            metrics['alpha'].append(np.mean(alphas))
            if episode % visualize_interval == 0:
                lineplot(metrics['episode'][-len(metrics['q1_loss']):], metrics['q1_loss'], 'q1_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['policy_loss']):], metrics['policy_loss'], 'policy_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha_loss']):], metrics['alpha_loss'], 'alpha_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha']):], metrics['alpha'], 'alpha', log_dir)

        # Update VIME
        if use_vime and len(memory) >= conf.random_sample_num:
            for _ in range(conf.vime_update_per_episode):
                batch_s, batch_a, _, batch_s_next, _ = memory.sample(conf.vime_update_batch_size)
                elbo = vime.update_posterior(batch_s, batch_a, batch_s_next)
            metrics['ELBO'].append(elbo)
            lineplot(metrics['episode'][-len(metrics['ELBO']):], metrics['ELBO'], 'ELBO', log_dir)
            if len(info_gains) > 0:
                metrics['D_KL_median'].append(np.median(info_gains))
                metrics['D_KL_mean'].append(np.mean(info_gains))
                multiple_lineplot(metrics['episode'][-len(metrics['D_KL_median']):], np.array([metrics['D_KL_median'], metrics['D_KL_mean']]).T, 'D_KL', ['median', 'mean'], log_dir)

        # Test current policy
        if episode % conf.test_interval == 0:
            rewards = []
            for _ in range(conf.test_times):
                o = env.reset()
                done = False
                episode_reward = 0
                while not done:
                    a = agent.select_action(o, eval=True)
                    o_next, r, done, _ = env.step(a)
                    episode_reward += r
                    o = o_next

                rewards.append(episode_reward)

            mean, std = np.mean(rewards), np.std(rewards)
            print("\nTEST AT EPISODE {} ({} episodes) --- Avg. Reward {:.2f} (+- {:.2f})".format(episode, conf.test_times, mean, std))

            metrics['test_episode'].append(episode)
            metrics['test_reward'].append(rewards)
            lineplot(metrics['test_episode'][-len(metrics['test_reward']):], metrics['test_reward'], 'test_reward', log_dir)
            

        # Save checkpoint
        if episode % conf.checkpoint_interval == 0:
            save_checkpoint()

    save_checkpoint()
    # Save the final model
    torch.save({'agent': agent.state_dict()}, os.path.join(ckpt_dir, 'final_model.pth'))
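The train() function above writes checkpoints through save_checkpoint() and a final {'agent': agent.state_dict()} file. The sketch below shows one way such a final model could be loaded back and evaluated, reusing only calls that already appear above (SAC construction, agent.load_state_dict, agent.select_action(o, eval=True)); the function name, the agent_conf argument, and the checkpoint path are assumptions.

import gym
import numpy as np
import torch

def evaluate_final_model(ckpt_path, env_name, episodes=5, device="cpu", agent_conf=None):
    env = gym.make(env_name)
    agent = SAC(env.observation_space, env.action_space,
                torch.device(device), **(agent_conf or {}))   # agent_conf must match the conf.agent used in training
    agent.load_state_dict(torch.load(ckpt_path, map_location=device)['agent'])
    returns = []
    for _ in range(episodes):
        o, done, total = env.reset(), False, 0.0
        while not done:
            a = agent.select_action(o, eval=True)
            o, r, done, _ = env.step(a)
            total += r
        returns.append(total)
    return np.mean(returns), np.std(returns)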
Example #13
class Agent:
    def __init__(self) -> None:
        self.network = NetWork().to(device)
        print("Number of parameters in network:",
              count_parameters(self.network))
        self.criterion = MSELoss()
        self.optimizer = Adam(self.network.parameters(),
                              lr=0.001,
                              weight_decay=0.001)
        self.memory = ReplayBuffer(100000)
        self.remember = self.memory.remember()
        self.exploration = Exploration()
        self.explore = self.exploration.epsilonGreedy
        self.target_network = NetWork().to(device)
        self.placeholder_network = NetWork().to(device)

    def choose(self, pixels, hn, cn):
        self.network.hn, self.network.cn = hn, cn
        vals = self.network(pixels).reshape(15)
        return self.explore(
            vals), pixels, hn, cn, self.network.hn, self.network.cn

    def learn(self, double=False):
        gamma = 0.96
        obs, action, obs_next, reward, h0, c0, hn, sn, done = self.memory.sample_distribution(
            20)
        # self.network.hn, self.network.cn = hn, sn

        # if double:
        #     v_s_next = torch.gather(self.target_network(obs_next), 1, torch.argmax(self.network(obs_next), 1).view(-1, 1)).squeeze(1)
        # else:
        #     v_s_next, input_indexes = torch.max(self.target_network(obs_next), 1)

        # self.network.hn, self.network.cn = h0, c0
        # v_s = torch.gather(self.network(obs), 1, action)
        # #v_s, _ = torch.max(self.network(obs), 1)
        # td = (reward + gamma * v_s_next * done.type(torch.float)).detach().view(-1, 1)
        # loss = self.criterion(v_s, td)
        # loss.backward()
        # self.optimizer.step()
        # self.optimizer.zero_grad()
        # torch.cuda.empty_cache()
        self.autoEncode(obs)

    def autoEncode(self, obs):
        enc = self.network.color(obs)
        obs_Guess = self.network.colorReverse(enc)
        # print(enc.prod(1).sum())

        # print(f"[{str(float(enc_stand.max()))[:8]}]", end=" ")
        entro = (enc + 1).prod(1) - (1 + enc).max(1)[0]
        img = self.criterion(obs_Guess.view(20, -1), obs.view(20, -1) / 256)
        # print(enc.max(1)[0].max(1)[0].max(1)[0].shape)
        # print(enc.max(1, keepdim=True)[0].shape)
        maxi = enc.max(1, keepdim=False)[0]
        loss = img * 100 + (entro * entro).mean()
        loss.backward()

        self.optimizer.step()
        self.optimizer.zero_grad()
        print(f"[{str(float(loss))[:8]}]", end=" ")
        print(f"[{str(float(img*100))[:8]}]", end=" ")
        print(f"[{str(float((entro * entro).mean()))[:8]}]", end=" ")
        print(f"[{str(float(enc.min()))[:8]}]", end=" ")
        print(f"[{str(float(enc.max()))[:8]}]")
        # print(f"[{str(float(enc.mean()))[:8]}]")
        # print(f"[{str(float(entro.min()))[:8]}]", end=" ")
        # print(f"[{str(float(entro.max()))[:8]}]", end=" ")
        # print(f"[{str(float(enc.max()))[:8]}]")
        # print(*[[float(str(f)[:5]) for f in list(p.detach().cpu().numpy().reshape(-1))] for p in self.network.color.parameters()], *[[float(str(f)[:5]) for f in list(p.detach().cpu().numpy().reshape(-1))] for p in self.network.colorReverse.parameters()])
        torch.cuda.empty_cache()

    def update_target_network(self):
        self.target_network = copy.deepcopy(self.placeholder_network)
        self.placeholder_network = copy.deepcopy(self.network)
        self.memory.update_distribution()
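The commented-out block in Agent.learn() above is building a double-DQN TD target. The helper below is a generic, self-contained sketch of that target (not the author's code): the online network picks the argmax action, the target network evaluates it, and the conventional (1 - done) mask zeroes the bootstrap term at terminal transitions, whereas the commented line multiplies by done directly.

import torch

def double_dqn_target(online_net, target_net, obs_next, reward, done, gamma=0.96):
    with torch.no_grad():
        best_actions = online_net(obs_next).argmax(dim=1, keepdim=True)      # action choice: online net
        v_s_next = target_net(obs_next).gather(1, best_actions).squeeze(1)   # evaluation: target net
        return reward + gamma * v_s_next * (1.0 - done.float())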
Example #14
class DQNAgent:
    def __init__(self, settings):
        self.check_settings(settings)

        # Constants
        self.batch_size = settings["batch_size"]
        self.checkpoint_frequency = settings["checkpoint_frequency"]
        self.device = settings["device"]
        self.dtype = (torch.cuda.FloatTensor
                      if self.device.type == "cuda" else torch.FloatTensor)
        self.env_name = settings["env"]
        self.env = get_env(settings["env"], 6)
        self.eps_cliff = settings["eps_cliff"]
        self.eps_start = settings["eps_start"]
        self.eps_end = settings["eps_end"]
        self.frame_history_len = settings["frame_history_len"]
        self.gamma = settings["gamma"]
        self.learning_freq = settings["learning_freq"]
        self.learning_start = settings["learning_start"]
        self.logs_dir = settings["logs_dir"]
        self.log_freq = settings["log_freq"]
        self.memory_size = settings["memory_size"]
        self.model_name = settings["model_name"]
        self.num_actions = self.env.action_space.n
        settings["num_actions"] = self.num_actions
        settings["num_channels"] = self.frame_history_len
        self.out_dir = settings["out_dir"]
        self.target_update_freq = settings["target_update_freq"]
        self.total_timesteps = settings["total_timesteps"]

        # Init models
        self.Q = DQN(settings).to(self.device)
        self.target_Q = DQN(settings).to(self.device)
        self.target_Q.load_state_dict(self.Q.state_dict())
        self.target_Q.eval()

        # Init model supporting objects
        self.memory = ReplayBuffer(self.memory_size, self.frame_history_len)
        self.optimizer = optim.RMSprop(self.Q.parameters(),
                                       lr=settings["lr"],
                                       alpha=0.95,
                                       eps=0.01)
        self.loss = F.smooth_l1_loss

        # Logging
        self.writer = SummaryWriter(self.logs_dir)

    def check_settings(self, settings):
        required_settings = [
            "batch_size",
            "checkpoint_frequency",
            "device",
            "env",
            "eps_start",
            "eps_end",
            "eps_cliff",
            "frame_history_len",
            "gamma",
            "learning_start",
            "log_freq",
            "logs_dir",
            "lr",
            "memory_size",
            "model_name",
            "out_dir",
            "target_update_freq",
            "total_timesteps",
        ]

        if not settings_is_valid(settings, required_settings):
            raise Exception(
                f"Settings object {settings} missing some required settings.")

    def _get_epsilon(self, steps_done):
        if steps_done < self.eps_cliff:
            epsilon = (-(self.eps_start - self.eps_end) / self.eps_cliff *
                       steps_done + self.eps_start)
        else:
            epsilon = self.eps_end
        return epsilon

    def select_epsilon_greedy_action(self, state, steps_done, epsilon=None):
        if epsilon is None:
            threshold = self._get_epsilon(steps_done)
        else:
            threshold = epsilon
        if random.random() < threshold:
            return torch.IntTensor([random.randrange(self.num_actions)])
        obs = torch.from_numpy(state).type(self.dtype).unsqueeze(0) / 255.0
        with torch.no_grad():
            return self.Q(obs).argmax(dim=1).cpu()  # returns action

    def should_stop(self):
        return (get_wrapper_by_name(self.env, "Monitor").get_total_steps() >=
                self.max_steps)

    def eval_model(self, epoch, n=100):
        self.Q.eval()
        env = get_env(self.env_name, 6, monitor=False)
        rewards = []
        durations = []
        for _e in tqdm(range(n)):
            memory = ReplayBuffer(10000, self.frame_history_len)
            state = env.reset()[..., np.newaxis]
            reward_acc = 0.0
            for t in range(10000):
                if state is None:
                    break

                memory.store_frame(state)
                recent_observations = memory.encode_recent_observation()

                action = self.select_epsilon_greedy_action(
                    recent_observations, None, 0.05).item()
                state, reward, done, _ = env.step(action)

                if done:
                    state = env.reset()

                state = state[..., np.newaxis]
                reward_acc += reward

            rewards.append(reward_acc)
            durations.append(t)
        self.Q.train()
        sum_rewards = sum(rewards)
        sum_durations = sum(durations)
        self.writer.add_scalar(
            f"Mean Reward ({n} episodes)",
            round(sum_rewards / len(rewards), 2),
            epoch,
        )
        self.writer.add_scalar(
            f"Mean Duration ({n} episodes)",
            round(sum_durations / len(durations), 2),
            epoch,
        )
        self.writer.add_scalar(
            f"Mean Reward per Timestep ({n} episodes)",
            round(sum_rewards / sum_durations, 2),
            epoch,
        )

    def train(self):
        num_param_updates = 0
        loss_acc_since_last_log = 0.0
        param_updates_since_last_log = 0
        num_episodes = 0

        state = self.env.reset()[..., np.newaxis]
        for t in tqdm(range(self.total_timesteps)):
            last_idx = self.memory.store_frame(state)
            recent_observations = self.memory.encode_recent_observation()

            # Choose random action if learning hasn't started yet
            if t > self.learning_start:
                action = self.select_epsilon_greedy_action(
                    recent_observations, t).item()
            else:
                action = random.randrange(self.num_actions)

            # Advance a step
            next_state, reward, done, _ = self.env.step(action)
            next_state = next_state[..., np.newaxis]

            # Store result in memory
            self.memory.store_effect(last_idx, action, reward, done)

            # Reset if done (life lost, due to atari wrapper)
            if done:
                next_state = self.env.reset()
                next_state = next_state[..., np.newaxis]
            state = next_state

            # Train network using experience replay when
            # memory is sufficiently large.
            if (t > self.learning_start and t % self.learning_freq == 0
                    and self.memory.can_sample(self.batch_size)):
                # Sample from replay buffer
                (
                    state_batch,
                    act_batch,
                    r_batch,
                    next_state_batch,
                    done_mask,
                ) = self.memory.sample(self.batch_size)
                state_batch = torch.from_numpy(state_batch).type(
                    self.dtype) / 255.0
                act_batch = torch.from_numpy(act_batch).long().to(self.device)
                r_batch = torch.from_numpy(r_batch).to(self.device)
                next_state_batch = (
                    torch.from_numpy(next_state_batch).type(self.dtype) /
                    255.0)
                not_done_mask = torch.from_numpy(1 - done_mask).type(
                    self.dtype)

                # Calculate current Q value
                current_Q_vals = self.Q(state_batch).gather(
                    1, act_batch.unsqueeze(1))

                # Calculate next Q value based on action that gives max Q vals
                next_max_Q = self.target_Q(next_state_batch).detach().max(
                    dim=1)[0]
                next_Q_vals = not_done_mask * next_max_Q

                # Calculate target of current Q values
                target_Q_vals = r_batch + (self.gamma * next_Q_vals)

                # Calculate loss and backprop
                loss = F.smooth_l1_loss(current_Q_vals.squeeze(),
                                        target_Q_vals)
                self.optimizer.zero_grad()
                loss.backward()
                for param in self.Q.parameters():
                    param.grad.data.clamp_(-1, 1)

                # Update weights
                self.optimizer.step()
                num_param_updates += 1

                # Store stats
                loss_acc_since_last_log += loss.item()
                param_updates_since_last_log += 1

                # Update target network periodically
                if num_param_updates % self.target_update_freq == 0:
                    self.target_Q.load_state_dict(self.Q.state_dict())

                # Save model checkpoint
                if num_param_updates % self.checkpoint_frequency == 0:
                    save_model_checkpoint(
                        self.Q,
                        self.optimizer,
                        t,
                        f"{self.out_dir}/checkpoints/{self.model_name}_{num_param_updates}",
                    )

                # Log progress
                if (num_param_updates % (self.log_freq // 2) == 0
                        and param_updates_since_last_log > 0):
                    self.writer.add_scalar(
                        "Mean Loss per Update (Updates)",
                        loss_acc_since_last_log / param_updates_since_last_log,
                        num_param_updates,
                    )
                    loss_acc_since_last_log = 0.0
                    param_updates_since_last_log = 0

                if num_param_updates % self.log_freq == 0:
                    wrapper = get_wrapper_by_name(self.env, "Monitor")
                    episode_rewards = wrapper.get_episode_rewards()
                    mean_reward = round(np.mean(episode_rewards[-101:-1]), 2)
                    sum_reward = np.sum(episode_rewards[-101:-1])
                    episode_lengths = wrapper.get_episode_lengths()
                    mean_duration = round(np.mean(episode_lengths[-101:-1]), 2)
                    sum_duration = np.sum(episode_lengths[-101:-1])

                    self.writer.add_scalar(
                        f"Mean Reward (epoch = {self.log_freq} updates)",
                        mean_reward,
                        num_param_updates // self.log_freq,
                    )
                    self.writer.add_scalar(
                        f"Mean Duration (epoch = {self.log_freq} updates)",
                        mean_duration,
                        num_param_updates // self.log_freq,
                    )
                    self.writer.add_scalar(
                        f"Mean Reward per Timestep (epoch = {self.log_freq} updates)",
                        round(sum_reward / sum_duration, 2),
                        num_param_updates // self.log_freq,
                    )

            if done:
                num_episodes += 1

        # Save model
        save_model(self.Q, f"{self.out_dir}/{self.model_name}.model")

        self.env.close()

        print(f"Number of Episodes: {num_episodes}")

        return self.Q
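The _get_epsilon() schedule above decays epsilon linearly from eps_start to eps_end over eps_cliff steps and then holds it constant. The stand-alone version below (with assumed default values, not the original settings dict) makes the shape of the schedule explicit.

def linear_epsilon(steps_done, eps_start=1.0, eps_end=0.1, eps_cliff=1_000_000):
    if steps_done < eps_cliff:
        return eps_start - (eps_start - eps_end) * steps_done / eps_cliff
    return eps_end

# linear_epsilon(0) == 1.0, linear_epsilon(500_000) == 0.55, linear_epsilon(2_000_000) == 0.1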
Example #15
def train(args, param):
    """

    Args:
        param1(args): hyperparameter
    """

    # in case seed experements
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.env_name)
    if args.agent == "TD3_ad":
        pathname += '_update_freq_' + str(args.target_update_freq)
        pathname += "_num_q_target_" + str(args.num_q_target)
    pathname += "_seed_" + str(args.seed) + "_agent_" + args.agent
    tensorboard_name = args.locexp + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)
    env = gym.make(args.env_name)
    env.seed(args.seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    print(state_dim)
    if args.agent == "TD3_ad":
        policy = TD31v1(state_dim, action_dim, max_action, args)
    elif args.agent == "TD3":
        policy = TD3(state_dim, action_dim, max_action, args)
    replay_buffer = ReplayBuffer()
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    file_name = "%s_%s_%s" % (args.agent, args.env_name, str(args.seed))
    print("---------------------------------------")
    print("Settings: %s" % (file_name))
    print("---------------------------------------")
    # We start the main loop over 500,000 timesteps
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            #env.seed(random.randint(0, 100))
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean,
                                  total_timesteps)
            # If we are not at the very beginning, we log the episode statistics
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} Reward: {}  Average Re: {:.2f} Time: {}".format(
                    total_timesteps, episode_num, episode_reward,
                    np.mean(scores_window), time_format(time.time() - t0))
                print(text)
                write_into_file('search-' + pathname, text)
            # We evaluate the episode and we save the policy
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(
                    evaluate_policy(policy, writer, total_timesteps, args,
                                    episode_num))
            # When the training step is done, we reset the state of the environment
            obs = env.reset()
            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before args.start_timesteps timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:  # After args.start_timesteps timesteps, we switch to the model
            action = policy.select_action(np.array(obs))
            # If the explore_noise parameter is not 0, we add noise to the action and we clip it
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])).clip(
                        env.action_space.low, env.action_space.high)

        if args.agent == "TD3_ad":
            if total_timesteps % args.target_update_freq == 0:
                policy.hardupdate()
        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        # We check if the episode is done (time-limit truncation at 1000 steps is treated as non-terminal)
        done_bool = 0 if episode_timesteps + 1 == 1000 else float(done)
        # We increase the total reward
        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        replay_buffer.add((obs, new_obs, action, reward, done_bool))
        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 1)

    # We add the last policy evaluation to our list of evaluations and we save our model
    evaluations.append(
        evaluate_policy(policy, writer, total_timesteps, args, episode_num))
    if args.save_model:
        policy.save("%s" % (file_name), directory="./pytorch_models")
    np.save("./results/%s" % (file_name), evaluations)
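During environment interaction above, the policy's action is perturbed with Gaussian noise of standard deviation args.expl_noise and then clipped to the action bounds. The helper below is a small stand-alone sketch of that exploration step (names and defaults are illustrative only).

import numpy as np

def noisy_action(policy_action, action_low, action_high, expl_noise=0.1):
    noise = np.random.normal(0.0, expl_noise, size=policy_action.shape)
    return np.clip(policy_action + noise, action_low, action_high)

# e.g. noisy_action(np.array([0.95]), action_low=-1.0, action_high=1.0, expl_noise=0.2)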
Example #16
    def __init__(
        self,
        state_size,
        action_size,
        num_agents=2,
        actor_network_units=(64, 64),
        critic_network_units=(64, 64),
        optimizer_learning_rate_actor=1e-3,
        optimizer_learning_rate_critic=1e-3,
        optimizer_weight_decay_actor=0,
        optimizer_weight_decay_critic=0,
        noise_scale=0.1,
        noise_theta=0.2,
        noise_sigma=0.2,
        gamma=0.99,
        tau=1e-3,
        gradient_clip_actor=1.0,
        gradient_clip_critic=1.0,
        buffer_size=int(1e5),
        batch_size=128,
        update_every=1,
        device=None
    ):
        """Initializes a multi-agent training instance.

        :param state_size:  (int) Space size for state observations per agent
        :param action_size:  (int) Space size for actions per agent
        :param num_agents: (int) Number of agents used in problem
        :param actor_network_units:  (list of ints) Network topology for actor networks
        :param critic_network_units:  (list of ints) Network topology for critic networks
        :param optimizer_learning_rate_actor:  (float)  Learning rate for actor loss optimizer
        :param optimizer_learning_rate_critic:  (float)  Learning rate for critic loss optimizer
        :param optimizer_weight_decay_actor:  (float) Weight decay for actor loss optimizer
        :param optimizer_weight_decay_critic:  (float)  Weight decay for critic loss optimizer
        :param noise_scale:  (float)  Scale for noise process
        :param noise_theta:  (float)  Theta parameter for noise process
        :param noise_sigma:  (float)  Sigma parameter for noise process
        :param gamma:  (float)  Discount rate for rewards
        :param tau:  (float)  Update parameter for network soft updates
        :param gradient_clip_actor:  (float)  Gradient clipping parameter for actor loss optimizer
        :param gradient_clip_critic:  (float)  Gradient clipping parameter for critic loss optimizer
        :param buffer_size:  (int)  Size of replay memory buffer
        :param batch_size:  (int)  Size of training minibatches
        :param update_every:  (int)  Number of steps between training
        :param device:  (torch.device)  Object representing the device where to allocate tensors
        """
        if device is None:
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = device

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        self.gamma = gamma
        self.tau = tau
        self.gradient_clip_actor = gradient_clip_actor
        self.gradient_clip_critic = gradient_clip_critic

        self.update_every = update_every
        self.batch_size = batch_size
        self.t_step = 0
        self.episode = 0

        self.agents = []
        for i in range(num_agents):
            self.agents.append(DDPGAgent(
                state_size=state_size,
                action_size=action_size,
                actor_network_units=actor_network_units,
                critic_network_units=critic_network_units,
                num_agents=num_agents,
                optimizer_learning_rate_actor=optimizer_learning_rate_actor,
                optimizer_learning_rate_critic=optimizer_learning_rate_critic,
                actor_weight_decay=optimizer_weight_decay_actor,
                critic_weight_decay=optimizer_weight_decay_critic,
                noise_scale=noise_scale,
                noise_theta=noise_theta,
                noise_sigma=noise_sigma,
                device=device
            ))

        # Replay memory
        self.memory = ReplayBuffer(
            buffer_size=buffer_size,
            device=device
        )
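The tau parameter documented above controls soft updates of the target networks. The helper below is a generic sketch of that rule, matching the in-place update already shown in Example #11: the target parameters slowly track the local ones.

def soft_update(local_net, target_net, tau=1e-3):
    for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
        target_param.data.copy_((1.0 - tau) * target_param.data + tau * local_param.data)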
Example #17
class DDQN_Agent(object):
    
    def __init__(self,env,input_dim,n_actions,alpha,gamma,epsilon,batch_size,lr=5e-4,
                 epsilon_dec=0.995,epsilon_end=0.05,memory_size=10000000,replace_target=5,
                 filename='ddqn.h5'):
        self.env = env
        self.action_space= np.arange(n_actions)
        self.input_dim = input_dim
        self.n_actions = n_actions
        self.alpha = alpha #learning rate
        self.gamma=gamma #discount factor 
        self.epsilon = epsilon #eps-greedy
        self.batch_size=batch_size
        self.epsilon_dec = epsilon_dec
        self.epsilon_end = epsilon_end
        self.filename = filename
        self.memory = ReplayBuffer(memory_size,input_dim)
        self.scores = [] # to keep track of scores
        self.avg_scores=[]
        self.replace_target = replace_target
        self.online_network=Neural_Network(lr,n_actions,input_dim) #network for evaluation
        self.target_network=Neural_Network(lr,n_actions,input_dim) #network for computing target
        # online and target networks are the same, except that the target network's
        # parameters are copied from the online network every `replace_target` steps
        # and kept fixed in between
        
        
    # to interface with memory
    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)
        
    # choose epsilon greedy action (to keep exploration)
    def choose_action(self, state):
        state = state.reshape(1,-1)
        rand=np.random.random()
        if rand<self.epsilon:
            action=np.random.choice(self.action_space)
        else:
            actions=self.online_network.predict(state)
            action= np.argmax(actions)          
        return action 
    
    def update_online(self):  # update parameters of the online network
        # we start learning once there are at least batch_size samples in memory
        if self.memory.memory_count < self.batch_size:
            return   
        states, actions, rewards, new_states, done =self.memory.sample_buffer(self.batch_size)   
        q_target = self.online_network.predict(states)
        q_intermediate =  self.online_network.predict(new_states) # to estimate the action in the argmax
        q_next = self.target_network.predict(new_states) # to estimate the q value of the estimated action
        argmax_actions = np.argmax(q_intermediate,axis=1) # actions that maximize q value
        batch_index= np.arange(self.batch_size,dtype=np.int32)
        q_target[batch_index,actions] = rewards + self.gamma * q_next[batch_index,argmax_actions]*(1-done)
        # if the episode is over, 1 - done = 0, so the Q(terminal, ·) term vanishes
        self.online_network.fit(states,q_target,verbose=0)
        self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon>self.epsilon_end else self.epsilon_end
        if self.memory.memory_count % self.replace_target ==0:
            self.update_target()
        
    def update_target(self): #update the parameters of target network from online network
        self.target_network.model.set_weights(self.online_network.model.get_weights())
        
        
    def train(self,n_games,path):
        # path : path where to save the model
        for i  in range(n_games):   
            score=0
            done = False
            state = self.env.reset()
            while not done:
                action = self.choose_action(state)
                new_state,reward,done,info= self.env.step(action)
                score+= reward
                self.remember(state, action, reward, new_state, done)
                state = new_state
                self.update_online()     
            self.scores.append(score)
            avg_score = np.mean(self.scores[max(0,i-50):i+1]) # rolling score : mean 
            self.avg_scores.append(avg_score)
            print('episode ',i,'score = %.2f'%score,' Rolling-score = %.2f'%avg_score)  
            # save the model every 100 games
            if i%100 ==0 and i>0:
                self.save_model(path)
            
    def save_model(self,path):
        self.online_network.save(path+'/'+ self.filename)
    
    
    def load_model(self,path):
        self.online_network= load_model(path)
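update_online() above implements the double-DQN target: the online network selects the greedy action for the next state and the target network scores it. The tiny NumPy illustration below (made-up numbers) walks through that computation for a batch of two transitions.

import numpy as np

q_online_next = np.array([[1.0, 3.0], [2.0, 0.5]])   # online net on new_states
q_target_next = np.array([[0.8, 2.5], [1.5, 0.4]])   # target net on new_states
rewards = np.array([1.0, 0.0])
done = np.array([0, 1])
gamma = 0.99

argmax_actions = np.argmax(q_online_next, axis=1)    # [1, 0]
batch_index = np.arange(2)
targets = rewards + gamma * q_target_next[batch_index, argmax_actions] * (1 - done)
# targets == [1.0 + 0.99 * 2.5, 0.0] == [3.475, 0.0]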
Example #18
class MADDPG:

    def __init__(
        self,
        state_size,
        action_size,
        num_agents=2,
        actor_network_units=(64, 64),
        critic_network_units=(64, 64),
        optimizer_learning_rate_actor=1e-3,
        optimizer_learning_rate_critic=1e-3,
        optimizer_weight_decay_actor=0,
        optimizer_weight_decay_critic=0,
        noise_scale=0.1,
        noise_theta=0.2,
        noise_sigma=0.2,
        gamma=0.99,
        tau=1e-3,
        gradient_clip_actor=1.0,
        gradient_clip_critic=1.0,
        buffer_size=int(1e5),
        batch_size=128,
        update_every=1,
        device=None
    ):
        """Initializes a multi-agent training instance.

        :param state_size:  (int) Space size for state observations per agent
        :param action_size:  (int) Space size for actions per agent
        :param num_agents: (int) Number of agents used in problem
        :param actor_network_units:  (list of ints) Network topology for actor networks
        :param critic_network_units:  (list of ints) Network topology for critic networks
        :param optimizer_learning_rate_actor:  (float)  Learning rate for actor loss optimizer
        :param optimizer_learning_rate_critic:  (float)  Learning rate for critic loss optimizer
        :param optimizer_weight_decay_actor:  (float) Weight decay for actor loss optimizer
        :param optimizer_weight_decay_critic:  (float)  Weight decay for critic loss optimizer
        :param noise_scale:  (float)  Scale for noise process
        :param noise_theta:  (float)  Theta parameter for noise process
        :param noise_sigma:  (float)  Sigma parameter for noise process
        :param gamma:  (float)  Discount rate for rewards
        :param tau:  (float)  Update parameter for network soft updates
        :param gradient_clip_actor:  (float)  Gradient clipping parameter for actor loss optimizer
        :param gradient_clip_critic:  (float)  Gradient clipping parameter for critic loss optimizer
        :param buffer_size:  (int)  Size of replay memory buffer
        :param batch_size:  (int)  Size of training minibatches
        :param update_every:  (int)  Number of steps between training
        :param device:  (torch.device)  Object representing the device where to allocate tensors
        """
        if device is None:
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = device

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        self.gamma = gamma
        self.tau = tau
        self.gradient_clip_actor = gradient_clip_actor
        self.gradient_clip_critic = gradient_clip_critic

        self.update_every = update_every
        self.batch_size = batch_size
        self.t_step = 0
        self.episode = 0

        self.agents = []
        for i in range(num_agents):
            self.agents.append(DDPGAgent(
                state_size=state_size,
                action_size=action_size,
                actor_network_units=actor_network_units,
                critic_network_units=critic_network_units,
                num_agents=num_agents,
                optimizer_learning_rate_actor=optimizer_learning_rate_actor,
                optimizer_learning_rate_critic=optimizer_learning_rate_critic,
                actor_weight_decay=optimizer_weight_decay_actor,
                critic_weight_decay=optimizer_weight_decay_critic,
                noise_scale=noise_scale,
                noise_theta=noise_theta,
                noise_sigma=noise_sigma,
                device=device
            ))

        # Replay memory
        self.memory = ReplayBuffer(
            buffer_size=buffer_size,
            device=device
        )

    def step(self, state, action, reward, next_state, done):
        """ Store a single agent step, learning every N steps

         :param state: (array-like) Initial states on the visit
         :param action: (array-like) Actions on the visit
         :param reward: (array-like) Rewards received on the visit
         :param next_state:  (array-like) States reached after the visit
         :param done:  (array-like) Flag whether the next states are terminal states
         """

        self.memory.add(state, action, reward, next_state, done)

        # Learn every self.update_every time steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random batch and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample(self.batch_size)
                self.learn(experiences)

        # Keep track of episode number
        if np.any(done):
            self.episode += 1

    def act(self, states, target=False, noise=1.0):
        """ Returns the selected actions for the given states according to the current policy

        :param states: (array-like) Current states
        :param target:  (boolean, default False) Whether to use local networks or target networks
        :param noise:  (float, default 1)  Scaling parameter for noise process
        :return: action (array-like)  List of selected actions
        """

        if type(states) == np.ndarray:
            states = torch.from_numpy(states).float().to(self.device)

        actions = []
        with torch.no_grad():
            for i in range(self.num_agents):
                agent = self.agents[i]
                action = agent.act(states[i, :].view(1, -1), target=target, noise=noise)
                actions.append(action.squeeze())
        actions = torch.stack(actions)

        return actions.cpu().data.numpy()

    def learn(self, experiences):
        """ Performs training for each agent based on the selected set of experiencecs

        :param experiences:   Batch of experience tuples (s, a, r, s', d) collected from the replay buffer
        """

        state, action, rewards, next_state, done = experiences

        state = state.view(-1, self.num_agents, self.state_size)
        action = action.view(-1, self.num_agents, self.action_size)
        rewards = rewards.view(-1, self.num_agents)
        next_state = next_state.view(-1, self.num_agents, self.state_size)
        done = done.view(-1, self.num_agents)

        # Select agent being updated based on ensemble at time of samples
        for agent_number in range(self.num_agents):
            agent = self.agents[agent_number]

            # Compute the critic loss
            target_actions = []
            for i in range(self.num_agents):
                i_agent = self.agents[i]
                i_action = i_agent.act(next_state[:, i, :], target=True, noise=0.0, train=True)
                target_actions.append(i_action.squeeze())
            target_actions = torch.stack(target_actions)
            target_actions = target_actions.permute(1, 0, 2).contiguous()

            with torch.no_grad():
                flat_next_state = next_state.view(-1, self.num_agents * self.state_size)
                flat_target_actions = target_actions.view(-1, self.num_agents * self.action_size)
                Q_targets_next = agent.target_critic(flat_next_state, flat_target_actions).squeeze()

            Q_targets = rewards[:, agent_number] + self.gamma * Q_targets_next * (1 - done[:, agent_number])

            flat_state = state.view(-1, self.num_agents * self.state_size)
            flat_action = action.view(-1, self.num_agents * self.action_size)
            Q_expected = agent.critic(flat_state, flat_action).squeeze()

            critic_loss = F.mse_loss(Q_targets, Q_expected)

            # Minimize the critic loss
            agent.critic_optimizer.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), self.gradient_clip_critic)
            agent.critic_optimizer.step()

            # Compute the actor loss
            Q_input = []
            for i in range(self.num_agents):
                i_agent = self.agents[i]
                Q_input.append(i_agent.actor(state[:, i, :]))
            Q_input = torch.stack(Q_input)
            Q_input = Q_input.permute(1, 0, 2).contiguous()
            flat_Q_input = Q_input.view(-1, self.num_agents * self.action_size)

            actor_loss = -agent.critic(flat_state, flat_Q_input).mean()

            # Minimize the actor loss
            agent.actor_optimizer.zero_grad()
            actor_loss.backward()
            torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), self.gradient_clip_actor)
            agent.actor_optimizer.step()

            # soft update target
            agent.soft_update(self.tau)

    def save(self, filename):
        """Saves the model networks to a file.

        :param filename:  Filename where to save the networks
        """
        checkpoint = {}

        for index, agent in enumerate(self.agents):
            checkpoint['actor_' + str(index)] = agent.actor.state_dict()
            checkpoint['target_actor_' + str(index)] = agent.target_actor.state_dict()
            checkpoint['critic_' + str(index)] = agent.critic.state_dict()
            checkpoint['target_critic_' + str(index)] = agent.target_critic.state_dict()

        torch.save(checkpoint, filename)

    def load(self, filename):
        """Loads the model networks from a file.

        :param filename: Filename from where to load the networks
        """
        checkpoint = torch.load(filename)

        for i in range(self.num_agents):
            agent = self.agents[i]

            agent.actor.load_state_dict(checkpoint['actor_' + str(i)])
            agent.target_actor.load_state_dict(checkpoint['target_actor_' + str(i)])
            agent.critic.load_state_dict(checkpoint['critic_' + str(i)])
            agent.target_critic.load_state_dict(checkpoint['target_critic_' + str(i)])
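A hypothetical interaction loop for the MADDPG trainer above is sketched below. The observation and action sizes, the placeholder transitions, and the checkpoint filename are assumptions; the trainer API used (act, step, save) is the one defined above, and learning only triggers once the replay buffer holds more than batch_size samples.

import numpy as np

maddpg = MADDPG(state_size=24, action_size=2, num_agents=2)
states = np.zeros((2, 24), dtype=np.float32)          # placeholder observations for two agents
for t in range(5):
    actions = maddpg.act(states, noise=1.0)            # (2, 2) array of per-agent actions
    next_states = np.random.randn(2, 24).astype(np.float32)
    rewards = np.zeros(2)
    dones = np.array([False, False])
    maddpg.step(states, actions, rewards, next_states, dones)
    states = next_states
maddpg.save("maddpg_checkpoint.pth")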
Example #19
class DQNAgent():
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size=32,
                 eps_min=0.1,
                 eps_dec=1e-5,
                 tau=1000,
                 env_name='Doom',
                 chkpt_dir='models/'):
        self.action_to_game = [
            list(a) for a in itertools.product([0, 1], repeat=3)
        ]

        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.tau = tau
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DeepQNetwork(lr, n_actions, f'{env_name}_q_eval.pth',
                                   input_dims, chkpt_dir)
        self.q_next = DeepQNetwork(lr, n_actions, f'{env_name}_q_next.pth',
                                   input_dims, chkpt_dir).eval()

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            obs = observation.unsqueeze(0).to(self.q_eval.device)
            action = self.q_eval.forward(obs).argmax().item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, states_, done = self.memory.sample_buffer(
            self.batch_size)
        states = torch.tensor(state).to(self.q_eval.device)
        rewards = torch.tensor(reward).to(self.q_eval.device)
        dones = torch.tensor(done).to(self.q_eval.device)
        actions = torch.tensor(action).to(self.q_eval.device)
        states_new = torch.tensor(states_).to(self.q_eval.device)
        return states, actions, rewards, states_new, dones

    def update_target_network(self):
        if self.learn_step_counter % self.tau == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_eps(self):
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.batch_size > self.memory.mem_cntr:
            return
        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(
            states
        )[indices,
          actions]  # select the Q-values only for the actions the agent actually took
        # q_next = self.q_eval.forward(states_).detach().max(dim=1)[0]  # vanilla DQN alternative (not used)
        with torch.no_grad():
            q_next = self.q_next.forward(states_).detach().max(
                dim=1)[0]  # predict the max next-state Q-value with the target network
        q_next[
            dones] = 0.0  # for terminal states there is no next state, so the bootstrapped value is 0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        self.q_eval.optimizer.zero_grad()
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.update_target_network(
        )  # sync the weights of q_next every self.tau learning steps
        self.decrement_eps()
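A small numeric illustration of the terminal masking used in learn() above: the bootstrapped next-state values are zeroed wherever the transition ended the episode before the target is formed. Values are made up.

import torch

q_next = torch.tensor([2.0, 3.0, 1.5])
dones = torch.tensor([False, True, False])
q_next[dones] = 0.0
rewards = torch.tensor([1.0, 1.0, 0.0])
q_target = rewards + 0.99 * q_next        # tensor([2.9800, 1.0000, 1.4850])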
Example #20
class DQN:
    def __init__(self,
                 screen_height=0,
                 screen_width=0,
                 n_actions=0,
                 gamma=0.999,
                 epsilon_start=0.9,
                 epsilon_end=0.05,
                 epsilon_decay=200,
                 memory_capacity=10000,
                 batch_size=128,
                 device="cpu"):
        self.actions_count = 0
        self.n_actions = n_actions  # total number of actions
        self.device = device  # device, e.g. cpu or gpu
        self.gamma = gamma
        # epsilon-greedy policy parameters
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = CNN(screen_height, screen_width,
                              n_actions).to(self.device)
        self.target_net = CNN(screen_height, screen_width,
                              n_actions).to(self.device)
        self.target_net.load_state_dict(
            self.policy_net.state_dict())  # target_net starts as an exact copy of policy_net's parameters
        self.target_net.eval()  # disable BatchNormalization and Dropout (eval mode)
        self.optimizer = optim.RMSprop(self.policy_net.parameters(
        ))  # see the difference between parameters() and state_dict(); the former has requires_grad=True
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def select_action(self, state):
        '''Select an action with an epsilon-greedy policy.
        Args:
            state [array]: current state
        Returns:
            action [tensor]: the selected action
        '''
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.actions_count / self.epsilon_decay)
        self.actions_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                q_value = self.policy_net(
                    state)  # q_value, e.g. tensor([[-0.2522,  0.3887]])
                # tensor.max(1) returns each row's max value together with its index,
                # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0]))
                # so tensor.max(1)[1] gives the index of the max value, i.e. the action
                action = q_value.max(1)[1].view(
                    1, 1)  # note that action is a tensor here, e.g. tensor([[1]])
                return action
        else:
            return torch.tensor([[random.randrange(self.n_actions)]],
                                device=self.device,
                                dtype=torch.long)

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = self.memory.Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)

        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)  # tensor([1., 1.,...,])

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)  #tensor([[ 1.1217],...,[ 0.8314]])

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size, device=self.device)

        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Compute the expected Q values
        expected_state_action_values = (
            next_state_values *
            self.gamma) + reward_batch  # tensor([0.9685, 0.9683,...,])

        # Compute Huber loss
        self.loss = F.smooth_l1_loss(
            state_action_values,
            expected_state_action_values.unsqueeze(1))  # .unsqueeze(1) adds a dimension so the shapes match
        # Optimize the model
        self.optimizer.zero_grad(
        )  # zero_grad clears old gradients from the last step (otherwise you’d just accumulate the gradients from all loss.backward() calls).
        self.loss.backward(
        )  # loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation.
        for param in self.policy_net.parameters():  # clip gradients to prevent them from exploding
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step(
        )  # causes the optimizer to take a step based on the gradients of the parameters.
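The update() above assumes a replay memory that exposes a Transition namedtuple (self.memory.Transition), a sample() method and __len__(). Those pieces are not shown in this snippet, so the following is only a minimal sketch of a compatible buffer; the push() name and the field order are assumptions.

import random
from collections import namedtuple, deque

class ReplayBuffer:
    """Minimal cyclic experience buffer matching what the DQN agent above expects."""
    Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # old transitions are evicted automatically

    def push(self, *args):
        self.buffer.append(self.Transition(*args))  # store one transition

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)  # uniform sampling without replacement

    def __len__(self):
        return len(self.buffer)

Terminal transitions would be stored with next_state=None, which is what the non_final_mask in update() relies on.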
def batch_ddpg(agent_name, multiple_agents = False, PER = False, n_episodes = 300, max_t = 1000):
    """ Batch processed the states in a single forward pass with a single neural network
    Params
    ======
        multiple_agents (boolean): boolean for multiple agents
        PER (boolean): 
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    env, env_info, states, state_size, action_size, brain_name, num_agents = initialize_env(multiple_agents)
    
    device = get_device()
    scores_window = deque(maxlen=100)
    scores = np.zeros(num_agents)
    scores_episode = []
    
    shared_memory = ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE, RANDOM_SEED)
    agent = AC_Agent(brain_name, agent_name, device, state_size, action_size)

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode = True)[brain_name]
        states = env_info.vector_observations
        
        agent.reset()
 
        scores = np.zeros(num_agents)
            
        for t in range(max_t):      
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]       # send the action to the environment
            next_states = env_info.vector_observations     # get the next state
            rewards = env_info.rewards                     # get the reward
            dones = env_info.local_done        
            
            if multiple_agents:
                agent.step(states, actions, rewards, next_states, dones, shared_memory)
            else:
                agent.step(states, np.expand_dims(actions, axis=0), rewards, next_states, dones, shared_memory)

            if shared_memory.batch_passed():
                experiences = shared_memory.sample()
                agent.learn(experiences, shared_memory)
 
            states = next_states
            scores += rewards
            if t % 20 == 0:  # print progress every 20 timesteps
                print('\rTimestep {}\tScore: {:.2f}\tmin: {:.2f}\tmax: {:.2f}'
                      .format(t, np.mean(scores), np.min(scores), np.max(scores)), end="") 
            if np.any(dones):
                break 

        score = np.mean(scores)
        scores_window.append(score)       # save most recent score
        scores_episode.append(score)

        print('\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}\tMax Score: {:.2f}'.format(i_episode, score, np.mean(scores_window), np.max(scores)), end="\n")
        update_csv(agent_name, i_episode, np.mean(scores_window), np.max(scores))
        agent.save_agent(agent_name)

        # Early stop
        if i_episode == 100:
            return scores_episode

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window)>=30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            agent.save_agent(agent_name + "Complete")
            break
            
    return scores_episode
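batch_ddpg() returns the per-episode scores, so a typical driver just calls it and plots the learning curve. initialize_env, AC_Agent and the hyperparameter constants come from the surrounding repository; the agent name and the plotting below are purely illustrative.

import matplotlib.pyplot as plt

# Hypothetical driver: train the batched DDPG agent and plot its learning curve.
scores = batch_ddpg("reacher_ddpg", multiple_agents=True, n_episodes=300)

plt.plot(scores)               # one point per episode (mean score over agents)
plt.xlabel("Episode")
plt.ylabel("Mean score")
plt.show()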
Example #22
0
def main():
  # define arguments
  parser = argparse.ArgumentParser()
  parser.add_argument("--render", action="store_true",
      help="Render the state")
  parser.add_argument("--render_interval", type=int, default=10,
      help="Number of rollouts to skip before rendering")
  parser.add_argument("--num_rollouts", type=int, default=-1,
      help="Number of max rollouts")
  parser.add_argument("--logfile", type=str,
      help="Indicate where to save rollout data")
  parser.add_argument("--load_params", type=str,
      help="Load previously learned parameters from [LOAD_PARAMS]")
  parser.add_argument("--save_params", type=str,
      help="Save learned parameters to [SAVE_PARAMS]")
  args = parser.parse_args()

  signal.signal(signal.SIGINT, stopsigCallback)
  global stopsig

  # create the basketball environment
  env = BasketballVelocityEnv(fps=60.0, timeInterval=0.1,
      goal=[0, 5, 0],
      initialLengths=np.array([0, 0, 1, 1, 0, 1, 1]),
      initialAngles=np.array([-5, 45, -10, -10, 0, -10, 0]))

  # create space
  stateSpace = ContinuousSpace(ranges=env.state_range())
  actionRange = env.action_range()
  actionSpace = DiscreteSpace(intervals=[15 for i in range(5)] + [1],
      ranges=[actionRange[0],
              actionRange[1],
              actionRange[2],
              actionRange[3],
              actionRange[5],
              actionRange[7]])
  processor = JointProcessor(actionSpace)

  # create the model and policy functions
  modelFn = MxFullyConnected(sizes=[stateSpace.n + actionSpace.n, 512, 256, 1],
      alpha=0.001, use_gpu=True)
  if args.load_params:
    print("loading params...")
    modelFn.load_params(args.load_params)

  softmax = lambda s: np.exp(s) / np.sum(np.exp(s))
  policyFn = EpsilonGreedyPolicy(epsilon=0.5,
      getActionsFn=lambda state: actionSpace.sample(1024),
      distributionFn=lambda qstate: softmax(modelFn(qstate)))
  dataset = ReplayBuffer()
  if args.logfile:
    log = open(args.logfile, "a")

  rollout = 0
  while args.num_rollouts == -1 or rollout < args.num_rollouts:
    print("Iteration:", rollout)
    state = env.reset()
    reward = 0
    done = False
    steps = 0
    while not done:
      if stopsig:
        break
      action = policyFn(state)
      nextState, reward, done, info = env.step(
          createAction(processor.process_env_action(action)))
      dataset.append(state, action, reward, nextState)
      state = nextState
      steps += 1
      if args.render and rollout % args.render_interval == 0:
        env.render()
    if stopsig:
      break

    dataset.reset() # push trajectory into the dataset buffer
    modelFn.fit(processor.process_Q(dataset.sample(1024)), num_epochs=10)
    print("Reward:", reward if (reward >= 0.00001) else 0, "with Error:",
        modelFn.score(), "with steps:", steps)
    if args.logfile:
      log.write("[" + str(rollout) + ", " + str(reward) + ", " +
          str(modelFn.score()) + "]\n")

    rollout += 1
    if rollout % 100 == 0:
      policyFn.epsilon *= 0.95
      print("Epsilon is now:", policyFn.epsilon)

  if args.logfile:
    log.close()
  if args.save_params:
    print("saving params...")
    modelFn.save_params(args.save_params)
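The inline softmax above, np.exp(s) / np.sum(np.exp(s)), overflows once the predicted Q-values become large. A numerically stable, mathematically equivalent variant (a drop-in sketch, not part of the original code):

import numpy as np

def stable_softmax(scores):
    # Subtracting the maximum leaves the softmax unchanged but keeps np.exp() from overflowing.
    shifted = scores - np.max(scores)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

q = np.array([1000.0, 1001.0, 999.0])   # the naive form overflows here
print(stable_softmax(q))                # ~[0.245, 0.665, 0.090]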
Example #23
0
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Score tracker and learning parameters
        self.total_reward = 0
        self.count = 0
        self.score = 0
        self.best_score = -np.inf
        self.last_state = None

    def reset_episode(self):
        #initialize the parameters
        self.total_reward = 0
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        self.total_reward += reward

        self.count += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(states)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.best_score < self.score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
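The OUNoise process used for exploration in this example is defined elsewhere; the sketch below only illustrates the usual Ornstein-Uhlenbeck update dx = theta * (mu - x) + sigma * N(0, 1), with the same (size, mu, theta, sigma) constructor arguments assumed.

import numpy as np

class OUNoise:
    """Sketch of an Ornstein-Uhlenbeck process, the standard temporally correlated DDPG exploration noise."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)      # restart at the long-run mean

    def sample(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx                # noise drifts back toward mu over time
        return self.state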
Example #24
0
 def test_ReplayBuffer(self):
     mem = ReplayBuffer(2)
     mem.push(1)
     mem.push(2)
     [sample] = mem.sample(2)
     self.assertEqual(sorted(sample), [1, 2])
     mem.push(3)
     [sample] = mem.sample(2)
     self.assertEqual(sorted(sample), [2, 3])
     mem.push(4)
     [sample] = mem.sample(2)
     self.assertEqual(sorted(sample), [3, 4])
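The buffer under test is defined elsewhere; one minimal implementation that these assertions would pass against (note that sample(n) has to return the batch wrapped in a one-element list, which is why the test unpacks it with [sample] = mem.sample(2)):

import random
from collections import deque

class ReplayBuffer:
    """Fixed-capacity FIFO buffer consistent with test_ReplayBuffer above (a sketch)."""
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)   # oldest item is dropped once capacity is reached

    def push(self, item):
        self.buffer.append(item)

    def sample(self, batch_size):
        return [random.sample(list(self.buffer), batch_size)]   # batch wrapped in a one-element list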
Example #25
0
class DQN_agent(object):
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):

        self.env = env
        self.max_episode_steps = env._max_episode_steps

        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']

        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        self.memory = ReplayBuffer(hyper_params['memory_size'])

        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        p = uniform(0, 1)
        # Get decreased epsilon
        epsilon = self.linear_decrease(self.initial_epsilon,
                                       self.final_epsilon, self.steps,
                                       self.epsilon_decay_steps)

        if p < epsilon:
            #return action
            return randint(0, self.action_space - 1)
        else:
            #return action
            return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    def update_batch(self):
        if len(self.memory
               ) < self.batch_size or self.steps % self.update_steps != 0:
            return
        # 1) Sample a 'batch_size' batch of experiences from the memory.
        batch = self.memory.sample(self.batch_size)

        (states, actions, reward, next_states, is_terminal) = batch

        states = states
        next_states = next_states
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values --- 2) Predict the Q-value from the 'eval_model' based on (states, actions)
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Calculate target --- 3) Predict the Q-value from the 'target model' based on (next_states), and take max of each Q-value vector, Q_max
        if self.use_target_model:
            actions, q_next = self.target_model.predict_batch(next_states)
        else:
            actions, q_next = self.eval_model.predict_batch(next_states)

        q_next = q_next[batch_index, actions]
        q_target = FloatTensor([
            reward[index] if is_terminal[index] else reward[index] +
            self.beta * q_next[index] for index in range(self.batch_size)
        ])

        # update model
        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        test_number = training_episodes // test_interval
        all_results = []

        for i in range(test_number):
            # learn
            self.learn(test_interval)

            # evaluate
            avg_reward = self.evaluate()
            all_results.append(avg_reward)

        return all_results

    def learn(self, test_interval):
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:

                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, _ = self.env.step(action)
                # Store history
                self.memory.add(state, action, reward, next_state, done)
                # Update the model
                if self.steps % self.update_steps == 0:
                    self.update_batch()
                # Update the target network if DQN uses it
                if self.use_target_model:
                    if self.steps % self.model_replace_freq == 0:
                        self.target_model.replace(self.eval_model)
                # Update information for the next loop
                state = next_state
                steps += 1
                self.steps += 1

    def evaluate(self, trials=30):
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward

        avg_reward = total_reward / trials
        print(avg_reward)
        f = open(result_file, "a+")
        f.write(str(avg_reward) + "\n")
        f.close()
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
        return avg_reward

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
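The epsilon schedule produced by linear_decrease() above falls linearly from initial_epsilon to final_epsilon over epsilon_decay_steps and then stays flat. A quick standalone check (the hyperparameter values here are assumptions; the real ones come from hyper_params):

def linear_decrease(initial_value, final_value, curr_steps, final_decay_steps):
    decay_rate = min(curr_steps / final_decay_steps, 1.0)   # clamp, as the method above does
    return initial_value - (initial_value - final_value) * decay_rate

for step in (0, 25_000, 50_000, 100_000, 200_000):
    print(step, round(linear_decrease(1.0, 0.1, step, 100_000), 3))
# 0 -> 1.0, 25000 -> 0.775, 50000 -> 0.55, 100000 -> 0.1, 200000 -> 0.1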
Example #26
0
def main():
    seeding()
    # number of parallel agents

    env = UnityEnvironment(file_name="Tennis.x86_64")
    env_name = 'Tennis'

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents
    num_agents = len(env_info.agents)

    # size of each action
    action_size = brain.vector_action_space_size

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[-1]

    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 10000
    episode_length = 10000
    batchsize = 128

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1
    noise_reduction = 0.9999

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    # initialize memory buffer
    buffer = ReplayBuffer(int(500000), batchsize, 0)

    # initialize policy and critic
    maddpg = MADDPG(state_size,
                    action_size,
                    num_agents,
                    seed=12345,
                    discount_factor=0.95,
                    tau=0.02)

    #how often to update the MADDPG model
    episode_per_update = 2
    # training loop

    PRINT_EVERY = 5
    scores_deque = deque(maxlen=100)

    # holds raw scores
    scores = []
    # holds avg scores of the last 100 episodes
    avg_last_100 = []

    threshold = 0.5

    # use keep_awake to keep workspace from disconnecting
    for episode in range(number_of_episodes):

        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations  # get the current state (for each agent)
        episode_reward_agent0 = 0
        episode_reward_agent1 = 0

        for agent in maddpg.maddpg_agent:
            agent.noise.reset()

        for episode_t in range(episode_length):

            actions = maddpg.act(torch.tensor(state, dtype=torch.float),
                                 noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            env_info = env.step(actions_array)[brain_name]
            next_state = env_info.vector_observations

            reward = env_info.rewards
            done = env_info.local_done

            episode_reward_agent0 += reward[0]
            episode_reward_agent1 += reward[1]
            # add data to buffer
            '''
            I can either hstack/concat the two agents' states here or do it in the MADDPG update
            function. It is easier to do it here, since the update function also has the batch
            dimension to deal with. The trade-off is that the replay buffer has to hold two extra
            variables (full_state, full_next_state) that carry the same information as state and
            next_state, just concatenated.
            '''
            full_state = np.concatenate((state[0], state[1]))
            full_next_state = np.concatenate((next_state[0], next_state[1]))

            buffer.add(state, full_state, actions_array, reward, next_state,
                       full_next_state, done)

            state = next_state

            # update once after every episode_per_update
            if len(buffer) > batchsize and episode % episode_per_update == 0:
                for i in range(num_agents):
                    samples = buffer.sample()
                    maddpg.update(samples, i)
                maddpg.update_targets(
                )  # soft update the target network towards the actual networks

            if np.any(done):
                #if any of the agents are done break
                break

        episode_reward = max(episode_reward_agent0, episode_reward_agent1)
        scores.append(episode_reward)
        scores_deque.append(episode_reward)
        avg_last_100.append(np.mean(scores_deque))
        # scores.append(episode_reward)
        print('\rEpisode {}\tAverage Score: {:.4f}\tScore: {:.4f}'.format(
            episode, avg_last_100[-1], episode_reward),
              end="")

        if episode % PRINT_EVERY == 0:
            print('\rEpisode {}\tAverage Score: {:.4f}'.format(
                episode, avg_last_100[-1]))

        # saving successful model
        #training ends when the threshold value is reached.
        if avg_last_100[-1] >= threshold:
            save_dict_list = []

            for i in range(num_agents):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))
            # plots graphs
            raw_score_plotter(scores)
            plotter(env_name, len(scores), avg_last_100, threshold)
            break
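The key data-layout trick in this training loop is full_state / full_next_state: each agent's observation is kept separately for the actors, while the centralized critics see both observations concatenated. A tiny sketch of that layout (the 2 x 24 observation shape is assumed from the Tennis environment):

import numpy as np

state = np.random.randn(2, 24)                     # per-agent observations, one row per agent
full_state = np.concatenate((state[0], state[1]))  # shape (48,): input to the centralized critic
print(state.shape, full_state.shape)               # (2, 24) (48,)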
Example #27
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, config, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.config = config
        self.device = config['device']

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.noise_epsilon = config['NOISE_EPSILON']

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config['LR_ACTOR'])
        self.hard_update(self.actor_local, self.actor_target)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config['LR_CRITIC'],
                                           weight_decay=config['WEIGHT_DECAY'])
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise((1, action_size), random_seed, 0.0,
                             config['OU_THETA'], config['OU_SIGMA'])
        self.noise_epsilon = config['NOISE_EPSILON']

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.config, random_seed)

    def step(self, t, state, action, reward, next_state, done, agent_index):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)

        if t % self.config['DDPG_UPDATE_EVERY'] == 0 and len(
                self.memory) > self.config['BATCH_SIZE']:
            for _ in range(self.config['DDPG_LEARN_TIMES']):
                experiences = self.memory.sample()
                self.learn(experiences, agent_index)

    def act(self, states):
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((1, self.action_size))

        self.actor_local.eval()

        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action

        self.actor_local.train()
        actions += self.noise_epsilon * self.noise.sample()

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, agent_index):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        gamma = self.config['GAMMA']
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        if agent_index == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        if agent_index == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        # ---------------------------- update noise ---------------------------- #
        self.noise_epsilon = max(
            self.noise_epsilon - self.config['NOISE_EPSILON_DECAY'],
            self.config['NOISE_EPSILON_MIN'])
        self.noise.reset()

    def hard_update(self, local_model, target_model):
        """Hard update model parameters.
        θ_target = θ_local
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        tau = self.config['TAU']
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
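hard_update() and soft_update() above implement θ_target = θ_local and θ_target = τ*θ_local + (1 - τ)*θ_target respectively. A small standalone check of the soft-update rule on throwaway layers (tau and the layer sizes are illustrative only):

import torch
import torch.nn as nn

tau = 0.01
local, target = nn.Linear(2, 2), nn.Linear(2, 2)

with torch.no_grad():
    for target_param, local_param in zip(target.parameters(), local.parameters()):
        # each call moves the target weights a small step toward the local weights
        target_param.copy_(tau * local_param + (1.0 - tau) * target_param)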
Example #28
0
class MADDPG:
    
    def __init__(self, num_agents, state_size, action_size, random_seed):
        
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        
        self.agents = [
            Agent(state_size, action_size, random_seed, i) 
            for i in range(num_agents)
        ]
        self.memory = ReplayBuffer(state_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
       
    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise_counter):
        actions = []
        
        for agent, state in zip(self.agents, states):
            action = agent.act(state, noise_counter)
            actions.append(action)
        
        out = np.array(actions).reshape(1, -1)
        
        return out
            
    def step(self, states, actions, rewards, next_states, dones, t):
        
        states = states.reshape(1, -1)
        next_states = next_states.reshape(1, -1)

        # add to shared replay memory 
        self.memory.add(states, actions, rewards, next_states, dones)
        
        if t % LEARN_EVERY == 0:
            if len(self.memory) >= BATCH_SIZE:
                # use the same for all agents
                e = self.memory.sample()
                experiences = [e for _ in range(self.num_agents)]

                # each agent learns (loops over each agent in self.learn())
                self.learn(experiences, GAMMA)
                
    def learn(self, sample, gamma):
        
        next_actions = []
        actions = []
        
        # loop over each agent
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = sample[i]
            
            # get agent_id
            agent_id = torch.tensor([i]).to(device)
            
            # extract agent i state and get action via actor network
            state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            action = agent.actor_local(state) # predict action
            actions.append(action)
            
            # extract agent i next state and get action via target actor network
            next_state = next_states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            next_actions.append(next_action)
        
        # let each agent learn from his experiences
        for i, agent in enumerate(self.agents):
            agent.learn(sample[i], GAMMA, actions, next_actions, i)
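learn() above pulls a single agent's observation out of the flattened batch with reshape and index_select. A minimal sketch of that slicing (the batch size and the 2 x 24 observation layout are assumptions matching the reshape(-1, 2, 24) call):

import torch

batch_size, num_agents, obs_size = 5, 2, 24
states = torch.randn(batch_size, num_agents * obs_size)   # each row holds both agents' observations

agent_id = torch.tensor([1])                               # pick agent 1
agent_states = states.reshape(-1, num_agents, obs_size).index_select(1, agent_id).squeeze(1)
print(agent_states.shape)                                  # torch.Size([5, 24])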
Example #29
0
class DDPG():
    def __init__(self,
                 env,
                 action_dim,
                 state_dim,
                 device,
                 critic_lr=3e-4,
                 actor_lr=3e-4,
                 gamma=0.99,
                 batch_size=100,
                 validate_steps=100,
                 max_episode_length=150):
        """
        param: env: An gym environment
        param: action_dim: Size of action space
        param: state_dim: Size of state space
        param: critic_lr: Learning rate of the critic
        param: actor_lr: Learning rate of the actor
        param: gamma: The discount factor
        param: batch_size: The batch size for training
        param: device: The device used for training
        param: validate_steps: Number of iterations after which we evaluate trained policy 
        """
        self.gamma = gamma
        self.batch_size = batch_size
        self.env = env
        self.device = device
        self.eval_env = deepcopy(env)
        self.validate_steps = validate_steps
        self.max_episode_length = max_episode_length

        # actor and actor_target where both networks have the same initial weights
        self.actor = Actor(state_dim=state_dim,
                           action_dim=action_dim).to(self.device)
        self.actor_target = deepcopy(self.actor)

        # critic and critic_target where both networks have the same initial weights
        self.critic = Critic(state_dim=state_dim,
                             action_dim=action_dim).to(self.device)
        self.critic_target = deepcopy(self.critic)

        # Optimizer for the actor and critic
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.optimizer_critic = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)

        # Replay buffer
        self.ReplayBuffer = ReplayBuffer(buffer_size=10000, init_length=1000, state_dim=state_dim, \
                                         action_dim=action_dim, env=env, device = device)

    def update_target_networks(self):
        """
        A function to update the target networks
        """
        weighSync(self.actor_target, self.actor)
        weighSync(self.critic_target, self.critic)

    def update_network(self, batch):
        """
        A function to update the function just once
        """

        # Sample and parse batch
        state, action, reward, state_next, done = self.ReplayBuffer.batch_sample(
            batch)

        # Predicting the next action and q_value
        action_next = self.actor_target(state_next)
        q_next = self.critic_target(state_next, action_next)
        target_q = reward + (self.gamma * done * q_next)

        q = self.critic(state, action)

        # Critic update
        self.critic.zero_grad()
        value_loss = F.mse_loss(q, target_q)
        value_loss.backward()
        self.optimizer_critic.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic(state, self.actor(state)).mean()
        policy_loss.backward()
        self.optimizer_actor.step()

        # Target update
        self.update_target_networks()
        return value_loss.item(), policy_loss.item()

    def select_action(self, state, isEval):

        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        action = self.actor(state).squeeze(0).detach()
        if isEval:
            return action.cpu().numpy()
        action += torch.normal(0, 0.1, size=action.shape).to(self.device)
        action = torch.clamp(action, -1., 1.).cpu().numpy()
        return action

    def train(self, num_steps):
        """
        Train the policy for the given number of iterations
        :param num_steps:The number of steps to train the policy for
        """
        value_losses, policy_losses, validation_reward, validation_steps = [],[],[],[]

        step, episode, episode_steps, episode_reward, state = 0, 0, 0, 0., None

        while step < num_steps:
            # reset if it is the start of episode
            if state is None:
                state = deepcopy(self.env.reset())

            action = self.select_action(state, False)
            # env response with next_state, reward, terminate_info
            state_next, reward, done, _ = self.env.step(action)
            state_next = deepcopy(state_next)

            if self.max_episode_length and episode_steps >= self.max_episode_length - 1:
                done = True

            # observe and store in replay buffer
            self.ReplayBuffer.buffer_add(
                Exp(state=state,
                    action=action,
                    reward=reward,
                    state_next=state_next,
                    done=done))

            # update policy based on sampled batch
            batch = self.ReplayBuffer.buffer_sample(self.batch_size)
            value_loss, policy_loss = self.update_network(batch)
            value_losses.append(value_loss)
            policy_losses.append(policy_loss)

            # evaluate
            if step % self.validate_steps == 0:
                validate_reward, steps = self.evaluate()
                validation_reward.append(validate_reward)
                validation_steps.append(steps)
                print(
                    "[Eval {:06d}/{:06d}] Steps: {:06d}, Episode Reward:{:04f}"
                    .format(step, int(num_steps), steps, validate_reward))

            # update
            step += 1
            episode_steps += 1
            episode_reward += reward
            state = deepcopy(state_next)

            if done:  # reset at the end of episode
                #print("[Train {:06d}/{:06d}] - Episode Reward:{:04f} ".format(step, num_steps, step, episode_reward))
                episode_steps, episode_reward, state = 0, 0., None
                episode += 1

        return value_losses, policy_losses, validation_reward, validation_steps

    def evaluate(self):
        """
        Evaluate the policy trained so far in an evaluation environment
        """
        state, done, total_reward, steps = self.eval_env.reset(), False, 0., 0

        while not done:
            action = self.select_action(state, True)
            state_next, reward, done, _ = self.eval_env.step(action)
            total_reward += reward
            steps += 1
            state = state_next
        return total_reward / steps, steps
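weighSync(), called from update_target_networks(), is defined elsewhere in that codebase; presumably it performs the usual Polyak soft update of the target networks. A sketch under that assumption (the tau value is illustrative):

def weighSync(target_model, source_model, tau=0.001):
    # Move every target parameter a small step toward the corresponding online parameter.
    for target_param, source_param in zip(target_model.parameters(), source_model.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)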
Example #30
0
def train(sess, env, actor, critic, RESTORE):

    sess.run(tf.global_variables_initializer())

    # Initialize random noise generator
    exploration_noise = OUNoise(env.action_space.shape[0])

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    totSteps = 0

    # Store q values for illustration purposes
    q_max_array = []

    actor.learning_rate = MAX_ACTOR_LEARNING_RATE
    critic.learning_rate = MAX_CRITIC_LEARNING_RATE

    for i in xrange(MAX_EPISODES):

        s = env.reset()
        s = normalize(s)

        ep_reward = 0
        ep_ave_max_q = 0

        # update learning rates using cosine annealing
        T_cur = i % LR_CYCLE
        actor.learning_rate = MIN_ACTOR_LEARNING_RATE +\
                              0.5 * (MAX_ACTOR_LEARNING_RATE - MIN_ACTOR_LEARNING_RATE) * \
                              (1 + np.cos(np.pi * T_cur / LR_CYCLE))

        critic.learning_rate = MIN_CRITIC_LEARNING_RATE +\
                              0.5 * (MAX_CRITIC_LEARNING_RATE - MIN_CRITIC_LEARNING_RATE) * \
                              (1 + np.cos(np.pi * T_cur / LR_CYCLE))

        for j in xrange(MAX_EP_STEPS):

            totSteps += 1

            # Begin "Experimentation and Evaluation Phase"

            # Select next experimental action by adding noise to action prescribed by policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim, 1)))

            # If in a testing episode, do not add noise
            if i < EXPLORATION_SIZE and not (i % 100 == 49 or i % 100 == 99):
                noise = exploration_noise.noise()
                a = a + noise

            # Constrain action
            a = np.clip(a, -15, 15)

            # Take step with experimental action
            s2, r, terminal, info = env.step(
                np.reshape(a.T, newshape=(env.action_space.shape[0], )),
                CONST_THROTTLE)

            #print("car pos: " + str(env.car_dist_s))
            #print("action: " + str(a))
            #print("reward: " + str(r))

            s2 = normalize(s2)

            # Add transition to replay buffer if not testing episode
            if i % 100 != 49 and i % 100 != 99:
                replay_buffer.add(np.reshape(s, (actor.s_dim, 1)),
                                  np.reshape(a, (actor.a_dim, )), r, terminal,
                                  np.reshape(s2, (actor.s_dim, 1)))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > MEMORY_WARMUP:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        MINIBATCH_SIZE)

                    # Find target estimate to use for updating the Q-function

                    # predict_target determines the Q-value of the next state
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    # Complete target estimate (r(t+1) + GAMMA * Q(s(t+1), a(t+1)))
                    y_i = []
                    for k in xrange(MINIBATCH_SIZE):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + GAMMA * target_q[k])

                    # Perform gradient descent to update critic
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                    ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                    # Perform "Learning" phase by moving policy parameters in direction of deterministic policy gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

            s = s2
            ep_reward += r

            # If episode is finished, print results
            if terminal:

                if i % 100 == 49 or i % 100 == 99:
                    print("Testing")

                    kmodel = Sequential()
                    actVars = []
                    for var in tf.trainable_variables():
                        if 'non-target' in str(var):
                            actVars.append(var)

                    kmodel.add(
                        Dense(units=l1size,
                              activation='tanh',
                              weights=[
                                  sess.run(actVars[0]),
                                  sess.run(actVars[1])
                              ],
                              input_dim=actor.s_dim))
                    kmodel.add(
                        Dense(units=l2size,
                              activation='tanh',
                              weights=[
                                  sess.run(actVars[2]),
                                  sess.run(actVars[3])
                              ]))
                    kmodel.add(
                        Dense(units=1,
                              activation='tanh',
                              weights=[
                                  sess.run(actVars[4]),
                                  sess.run(actVars[5])
                              ]))
                    optimizer = optimizers.RMSprop(lr=0.00025,
                                                   rho=0.9,
                                                   epsilon=1e-06)
                    kmodel.compile(loss="mse", optimizer=optimizer)
                    kmodel.save(modelfile)

                else:
                    print("Training")

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                q_max_array.append(ep_ave_max_q / float(j))

                print('Finished in ' + str(j) + ' steps')

                break

    plt.plot(q_max_array)
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.show()

    kmodel = Sequential()
    actVars = []
    for var in tf.trainable_variables():
        if 'non-target' in str(var):
            actVars.append(var)

    kmodel.add(
        Dense(units=l1size,
              activation='tanh',
              weights=[sess.run(actVars[0]),
                       sess.run(actVars[1])],
              input_dim=actor.s_dim))
    kmodel.add(
        Dense(units=l2size,
              activation='tanh',
              weights=[sess.run(actVars[2]),
                       sess.run(actVars[3])]))
    kmodel.add(
        Dense(units=1,
              activation='tanh',
              weights=[sess.run(actVars[4]),
                       sess.run(actVars[5])]))
    optimizer = optimizers.RMSprop(lr=0.00025, rho=0.9, epsilon=1e-06)
    kmodel.compile(loss="mse", optimizer=optimizer)
    kmodel.summary()
    kmodel.save(modelfile)
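The learning-rate schedule at the top of the episode loop is cosine annealing with warm restarts: within each LR_CYCLE of episodes the rate follows half a cosine from its maximum down to its minimum, then jumps back up. A standalone sketch of that schedule (the constants below are illustrative, not the ones used above):

import numpy as np

def cosine_annealed_lr(episode, lr_min=1e-4, lr_max=1e-3, cycle=100):
    t_cur = episode % cycle
    return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(np.pi * t_cur / cycle))

print(cosine_annealed_lr(0), cosine_annealed_lr(50), cosine_annealed_lr(99))
# 0.001 at the start of a cycle, ~0.00055 halfway through, ~0.0001 just before the restart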