def main():
    env = DialogEnvironment()
    experiment_name = args.logdir.split('/')[1]  # model name
    torch.manual_seed(args.seed)

    actor = Actor(hidden_size=args.hidden_size, num_layers=args.num_layers,
                  device='cuda', input_size=args.input_size, output_size=args.input_size)
    actor.to(device)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)

    # load demonstrations
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:  # TODO
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        actor.load_state_dict(ckpt['actor'])

    episodes = 0

    for iter in range(args.max_iter_num):
        actor.eval()
        steps = 0
        states = []
        expert_actions = []
        similarity_scores = []

        # Collect a batch of (state, expert_action) pairs from the environment.
        while steps < args.batch_size:
            state, expert_action, raw_state, raw_expert_action = env.reset()

            state = state[:args.seq_len, :].to(device)
            expert_action = expert_action[:args.seq_len, :].to(device)

            states.append(state)
            expert_actions.append(expert_action)

            steps += 1
            episodes += 1

        states = torch.stack(states)
        expert_actions = torch.stack(expert_actions)

        actions_pred, _ = actor(states)

        # Track how close the predicted turns are to the expert turns.
        for action, expert_action in zip(actions_pred, expert_actions):
            similarity_scores.append(
                get_cosine_sim(expert=expert_action, action=action.squeeze(), seq_len=5))
        similarity_score_avg = np.mean(similarity_scores)
        print('{}:: {} episode similarity score is {:.2f}'.format(iter, episodes, similarity_score_avg))

        # Behavioral cloning: regress the actor's outputs onto the expert actions.
        actor.train()
        loss = F.mse_loss(actions_pred, expert_actions)
        actor_optim.zero_grad()
        loss.backward()
        actor_optim.step()

        writer.add_scalar('log/loss', float(loss), iter)
        writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0], iter)
        raw_action = get_raw_action(actions_pred[-1])  # TODO
        writer.add_text('log/raw_action', raw_action, iter)
        writer.add_text('log/raw_expert_action', raw_expert_action, iter)

        if iter % 100 == 0:
            # Append the latest sample to a per-experiment log file.
            with open(experiment_name + '.txt', 'a') as file_object:
                result_str = str(iter) + '|' + raw_state[0] + '|' + raw_action + '|' + raw_expert_action + '\n'
                file_object.write(result_str)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, experiment_name + '_ckpt_' + str(iter) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'args': args,
                'score': similarity_score_avg,
            }, filename=ckpt_path)
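# `get_cosine_sim` and `save_checkpoint` are imported from the project's utility module
# and are not shown in this section. Minimal sketches of what they might look like are
# given below; the exact signatures and conventions are assumptions, not the project's
# definitive implementations.
import torch
import torch.nn.functional as F


def get_cosine_sim(expert, action, seq_len=5):
    """Assumed behavior: mean cosine similarity between the first `seq_len`
    turn embeddings of the expert action and the generated action."""
    action = torch.as_tensor(action, dtype=expert.dtype, device=expert.device)
    sims = F.cosine_similarity(expert[:seq_len, :], action[:seq_len, :], dim=1)
    return sims.mean().item()


def save_checkpoint(state, filename):
    """Assumed to be a thin wrapper around torch.save."""
    torch.save(state, filename)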
def main():
    env = DialogEnvironment()
    experiment_name = args.logdir.split('/')[1]  # model name
    torch.manual_seed(args.seed)

    actor = Actor(hidden_size=args.hidden_size, num_layers=args.num_layers,
                  device='cuda', input_size=args.input_size, output_size=args.input_size)
    critic = Critic(hidden_size=args.hidden_size, num_layers=args.num_layers,
                    input_size=args.input_size, seq_len=args.seq_len)
    discrim = Discriminator(hidden_size=args.hidden_size, num_layers=args.num_layers,
                            input_size=args.input_size, seq_len=args.seq_len)

    actor.to(device), critic.to(device), discrim.to(device)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:  # TODO
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []
        similarity_scores = []

        while steps < args.total_sample_size:
            state, expert_action, raw_state, raw_expert_action = env.reset()
            score = 0
            similarity_score = 0

            state = state[:args.seq_len, :].to(device)
            expert_action = expert_action[:args.seq_len, :].to(device)

            for _ in range(10000):
                steps += 1

                mu, std = actor(state.unsqueeze(0))  # add a batch dimension
                action = get_action(mu.cpu(), std.cpu())[0]

                # Manual padding: zero out the generated turns wherever the expert
                # sequence is padding (an all-zero embedding row).
                for i in range(5):
                    emb_sum = expert_action[i, :].sum().cpu().item()
                    if emb_sum == 0:
                        action[i:, :] = 0
                        break

                done = env.step(action)
                irl_reward = get_reward(discrim, state, action, args)
                mask = 0 if done else 1

                memory.append([state, torch.from_numpy(action).to(device),
                               irl_reward, mask, expert_action])

                score += irl_reward
                similarity_score += get_cosine_sim(expert=expert_action,
                                                   action=action.squeeze(), seq_len=5)
                if done:
                    break

            episodes += 1
            scores.append(score)
            similarity_scores.append(similarity_score)

        score_avg = np.mean(scores)
        similarity_score_avg = np.mean(similarity_scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        print('{}:: {} episode similarity score is {:.2f}'.format(iter, episodes, similarity_score_avg))

        actor.train(), critic.train(), discrim.train()

        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            writer.add_scalar('log/expert_acc', float(expert_acc), iter)
            writer.add_scalar('log/learner_acc', float(learner_acc), iter)
            writer.add_scalar('log/avg_acc', float(learner_acc + expert_acc) / 2, iter)
            if args.suspend_accu_exp is not None:  # only check when thresholds are given
                if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                    train_discrim_flag = False

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        writer.add_scalar('log/score', float(score_avg), iter)
        writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0], iter)
        raw_action = get_raw_action(action)  # TODO
        writer.add_text('log/raw_action', raw_action, iter)
        writer.add_text('log/raw_expert_action', raw_expert_action, iter)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            # Append the latest sample to a per-experiment log file.
            with open(experiment_name + '.txt', 'a') as file_object:
                result_str = str(iter) + '|' + raw_state[0] + '|' + raw_action + '|' + raw_expert_action + '\n'
                file_object.write(result_str)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, experiment_name + '_ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'args': args,
                'score': score_avg,
            }, filename=ckpt_path)
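# `get_action` and `get_reward` are imported from the project's utility module and are
# not shown in this section. The sketches below illustrate one common GAIL convention
# (Gaussian sampling for the policy, -log(1 - D) as the learned reward); the exact
# signatures, the discriminator's forward interface, and the reward transform used in
# this project are assumptions.
import torch


def get_action(mu, std):
    """Sample an action sequence from N(mu, std) and return it as a numpy array."""
    action = torch.normal(mu, std)
    return action.detach().numpy()


def get_reward(discrim, state, action, args):
    """Score a (state, action) pair with the discriminator and map it to a reward.

    Assumes the discriminator takes the state and action sequences as separate
    batched inputs and outputs the probability that the pair came from the expert.
    """
    action = torch.as_tensor(action, dtype=state.dtype, device=state.device)
    with torch.no_grad():
        d = discrim(state.view(1, args.seq_len, args.input_size),
                    action.view(1, args.seq_len, args.input_size))
    return -torch.log(torch.clamp(1.0 - d, min=1e-8)).item()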
class DDPGAgent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        np.random.seed(random_seed)  # set the numpy seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)

        # add OU noise for exploration
        self.noise = OUNoise(action_size, scale=1.0, sigma=.1)

    def reset(self):
        self.noise.reset()

    def step(self, states, actions, rewards, next_states, dones, time_step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward (for each agent)
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory, and only every LEARN_STEPS steps
        if len(self.memory) > BATCH_SIZE and time_step % LEARN_STEPS == 0:
            for _ in range(N_UPDATES):  # sample N_UPDATES batches and perform N_UPDATES updates
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, epsilon=0.0, add_noise=True):
        """Returns actions for the given states as per the current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # add noise, scaled by epsilon, for exploration
            actions += self.noise.noise() * epsilon
        return np.clip(actions, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        self.__update_critic_local(actions, dones, gamma, next_states, rewards, states)
        self.__update_actor_local(states)

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def __update_critic_local(self, actions, dones, gamma, next_states, rewards, states):
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

    def __update_actor_local(self, states):
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def network_summary(self):
        print('- Actor Summary (both local and target): ')
        self.actor_local.to(device).summary()
        print('- Critic Summary (both local and target): ')
        self.critic_local.to(device).summary()

    def save(self, checkpoint_actor_name='checkpoint_actor',
             checkpoint_critic_name='checkpoint_critic'):
        """Save the actor and critic network weights."""
        torch.save(self.actor_local.state_dict(),
                   path_result_folder(f'{checkpoint_actor_name}.pth'))
        torch.save(self.critic_local.state_dict(),
                   path_result_folder(f'{checkpoint_critic_name}.pth'))

    @staticmethod
    def load(env: UnityEnvironment, random_seed=0, checkpoint_actor_name='checkpoint_actor',
             checkpoint_critic_name='checkpoint_critic'):
        """Load the actor and critic network weights."""
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        env_info = env.reset(train_mode=True)[brain_name]

        state_size = len(env_info.vector_observations[0])
        action_size = brain.vector_action_space_size

        loaded_agent = DDPGAgent(state_size, action_size, random_seed)
        loaded_agent.actor_local.load_state_dict(
            torch.load(path_result_folder(f'{checkpoint_actor_name}.pth')))
        loaded_agent.critic_local.load_state_dict(
            torch.load(path_result_folder(f'{checkpoint_critic_name}.pth')))
        return loaded_agent
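# A minimal sketch of how DDPGAgent might be driven from a Unity ML-Agents environment
# (the same brain-based API that DDPGAgent.load already relies on). The episode counts
# and the epsilon schedule are illustrative assumptions; only the agent interface
# (reset / act / step / save) comes from the class above.
import numpy as np


def run_training(env, n_episodes=200, max_t=1000, eps_start=1.0, eps_decay=0.999, eps_min=0.01):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    state_size = len(env_info.vector_observations[0])
    action_size = brain.vector_action_space_size
    agent = DDPGAgent(state_size, action_size, random_seed=0)

    epsilon = eps_start
    scores = []
    for episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        episode_scores = np.zeros(len(env_info.agents))

        for t in range(max_t):
            actions = agent.act(states, epsilon=epsilon)
            env_info = env.step(actions)[brain_name]

            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            agent.step(states, actions, rewards, next_states, dones, t)
            states = next_states
            episode_scores += rewards
            if np.any(dones):
                break

        epsilon = max(eps_min, epsilon * eps_decay)
        scores.append(episode_scores.mean())
        print('Episode {}\taverage score: {:.2f}'.format(episode, episode_scores.mean()))

    agent.save()
    return scores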
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, actor_layers, critic_layers):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): size of the environment state
            action_size (int): size of the environment action
            random_seed (int): seed for the random number generators
            actor_layers (array[int]): array containing the size of each layer of the actor network
            critic_layers (array[int]): array containing the size of each layer of the critic network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed
        random.seed(random_seed)
        np.random.seed(random_seed)

        # Actor
        print(f'Agent running on {DEVICE}')
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.random_seed, *actor_layers).to(DEVICE)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.random_seed, *actor_layers).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.random_seed, *critic_layers).to(DEVICE)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.random_seed, *critic_layers).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise (Ornstein-Uhlenbeck process)
        self.noise = OrsnteinUhlenbeck(self.action_size, self.random_seed)

        # Replay Buffer
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE, self.random_seed)

    def step(self, states, actions, rewards, next_states, dones, time_step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn only if there are enough samples in memory, and only every LEARN_STEPS steps
        if len(self.memory) > BATCH_SIZE and time_step % LEARN_STEPS == 0:
            for _ in range(N_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True, epsilon=1.0):
        """Returns actions for the given state as per the current policy."""
        state = torch.from_numpy(state).float().to(DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # actions += self.noise.sample()
            actions += np.random.normal(0, .3) * epsilon  # Gaussian noise scaled by epsilon
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Critic update
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor update
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target network weights
        self.soft_update(self.actor_local, self.actor_target, TAU)
        self.soft_update(self.critic_local, self.critic_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
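# Both agents above construct a ReplayBuffer that is defined elsewhere in the project.
# The sketch below shows a standard fixed-size experience replay buffer matching the
# calls made above (add, sample, __len__); the field names and the device handling
# (a device argument with a CPU default) are assumptions.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device='cpu'):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple('Experience',
                                     field_names=['state', 'action', 'reward', 'next_state', 'done'])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and return them as device tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)

        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)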