def reset(self):
    # Creation of the train batcher
    model = copy.deepcopy(self.learning_model)
    self.train_batcher = EpisodeBatcher(
        n_timesteps=self.config["max_episode_steps"],
        n_slots=self.config["n_envs"] * self.config["n_threads"],
        create_agent=self._create_agent,
        create_env=self._create_env,
        env_args={
            "n_envs": self.config["n_envs"],
            "max_episode_steps": self.config["max_episode_steps"],
            **{k: self.config[k] for k in self.config if k.startswith("environment/")},
        },
        agent_args={"n_actions": self.n_actions, "model": model},
        n_threads=self.config["n_threads"],
        seeds=[
            self.config["env_seed"] + k * 10
            for k in range(self.config["n_threads"])
        ],
    )

    # Creation of the evaluation batcher
    model = copy.deepcopy(self.learning_model)
    self.evaluation_batcher = EpisodeBatcher(
        n_timesteps=self.config["max_episode_steps"],
        n_slots=self.config["n_evaluation_rollouts"],
        create_agent=self._create_agent,
        create_env=self._create_env,
        env_args={
            "n_envs": self.config["n_envs"],
            "max_episode_steps": self.config["max_episode_steps"],
            **{k: self.config[k] for k in self.config if k.startswith("environment/")},
        },
        agent_args={"n_actions": self.n_actions, "model": model},
        n_threads=self.config["n_evaluation_threads"],
        seeds=self.config["env_seed"] * 10,
    )

    self.register_batcher(self.train_batcher)
    self.register_batcher(self.evaluation_batcher)
class A2C:
    def __init__(self, config, create_env, create_train_env, create_agent):
        self.config = config
        # Creation of the logger (that saves in tensorboard and CSV)
        self.logger = TFLogger(log_dir=self.config["logdir"], hps=self.config)
        self._create_env = create_env
        self._create_train_env = create_train_env
        self._create_agent = create_agent

        # Create one env instance to get the dimensionality of observations
        # and the number of actions
        env = self._create_env(
            self.config["n_envs"], seed=0, env_name=self.config["env_name"]
        )
        self.n_actions = env.action_space.n
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def run(self):
        # Instantiate the learning model and the baseline model
        self.learning_model = AgentModel(self.obs_dim, self.n_actions, 32)
        self.critic_model = BaselineModel(self.obs_dim, 32)

        # We create a batcher dedicated to evaluation
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_episodes"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_evaluation_threads"])
            ],
        )

        # Creation of the batcher for sampling pieces of trajectories (i.e. Batcher).
        # The batcher samples n_threads * n_envs trajectory pieces of
        # n_timesteps=self.config["a2c_timesteps"] transitions at each call
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = Batcher(
            n_timesteps=self.config["a2c_timesteps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_train_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_threads"])
            ],
        )

        # Creation of the optimizer
        optimizer = torch.optim.Adam(
            nn.Sequential(self.learning_model, self.critic_model).parameters(),
            lr=self.config["lr"],
        )

        # Training loop
        _start_time = time.time()
        self.iteration = 0

        # We launch the evaluation batcher (in deterministic mode)
        n_episodes = self.config["n_evaluation_episodes"]
        agent_info = DictTensor({"stochastic": torch.tensor([False]).repeat(n_episodes)})
        self.evaluation_batcher.execute(n_episodes=n_episodes, agent_info=agent_info)
        self.evaluation_iteration = self.iteration

        # Initialize the training batcher such that agents start to acquire
        # pieces of episodes
        self.train_batcher.update(self.learning_model.state_dict())
        n_episodes = self.config["n_envs"] * self.config["n_threads"]
        agent_info = DictTensor({"stochastic": torch.tensor([True]).repeat(n_episodes)})
        self.train_batcher.reset(agent_info=agent_info)

        while time.time() - _start_time < self.config["time_limit"]:
            # Call the batcher to get a sample of trajectories.
            # 2) We get the pieces of episodes. Since the env is an infinite
            # env, we will always receive a new piece of episode
            self.train_batcher.execute()
            trajectories = self.train_batcher.get(blocking=True)

            # 3) Now we compute the loss
            dt = self.get_loss(trajectories)
            [self.logger.add_scalar(k, dt[k].item(), self.iteration) for k in dt.keys()]

            # Computation of the final loss
            ld = self.config["critic_coef"] * dt["critic_loss"]
            lr = self.config["a2c_coef"] * dt["a2c_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]
            floss = ld - le - lr
            # Rescale w.r.t. the number of trajectory pieces actually sampled
            floss = floss / n_episodes * trajectories.n_elems()

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            # Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            self.iteration += 1

            # We check the evaluation batcher
            evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None:  # trajectories are available
                # Compute the cumulated reward
                cumulated_reward = (
                    (evaluation_trajectories["_reward"] * evaluation_trajectories.mask())
                    .sum(1)
                    .mean()
                )
                self.logger.add_scalar(
                    "evaluation_reward", cumulated_reward.item(), self.evaluation_iteration
                )
                print(
                    "At iteration %d, reward is %f"
                    % (self.evaluation_iteration, cumulated_reward.item())
                )
                # We re-execute the evaluation batcher (with the same agent_info
                # and the same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories):
        # First, we want to compute the cumulated reward per trajectory.
        # The reward is at t+1 at each timestep (since it is obtained after the
        # action), so we use the '_reward' field of the trajectory.
        # The 'reward' field corresponds to the reward at time t.
        reward = trajectories["_reward"]

        # We get the mask that tells which transition is in a trajectory (1) or not (0)
        mask = trajectories.mask()

        # We remove the reward values that are not in the trajectories
        reward = reward * mask
        max_length = trajectories.lengths.max().item()

        # Now we compute the action probabilities over the trajectories such
        # that we will be able to call 'backward'
        action_probabilities = []
        for t in range(max_length):
            proba = self.learning_model(trajectories["frame"][:, t])
            # We append the probability and introduce the temporal dimension (2nd dimension)
            action_probabilities.append(proba.unsqueeze(1))
        action_probabilities = torch.cat(action_probabilities, dim=1)
        # Now we have a B x T x n_actions tensor

        # We compute the critic value for t=0 to T (i.e. including the very last observation)
        critic = []
        for t in range(max_length):
            b = self.critic_model(trajectories["frame"][:, t])
            critic.append(b.unsqueeze(1))
        # The last value is duplicated to build a B x (T+1) tensor; the extra
        # slot is overwritten below with the critic of the final observation
        critic = torch.cat(critic + [b.unsqueeze(1)], dim=1).squeeze(-1)

        # We also need the critic value of the last observation of each
        # trajectory (to compute the TD). It may be the last element of the
        # trajectory (if the episode is not finished), or the last frame of
        # the episode
        idx = torch.arange(trajectories.n_elems())
        last_critic = self.critic_model(
            trajectories["_frame"][idx, trajectories.lengths - 1]
        ).squeeze(-1)
        critic[idx, trajectories.lengths] = last_critic

        # We compute the temporal difference
        target = reward + self.config["discount_factor"] * (
            1 - trajectories["_done"].float()
        ) * critic[:, 1:].detach()
        td = critic[:, :-1] - target

        critic_loss = td ** 2
        # We sum the loss for each episode (considering the mask)
        critic_loss = (critic_loss * mask).sum(1) / mask.sum(1)
        # We average the loss over all the trajectories
        avg_critic_loss = critic_loss.mean()

        # We do the same on the A2C loss
        action_distribution = torch.distributions.Categorical(action_probabilities)
        log_proba = action_distribution.log_prob(trajectories["action"])
        a2c_loss = -log_proba * td.detach()
        a2c_loss = (a2c_loss * mask).sum(1) / mask.sum(1)
        avg_a2c_loss = a2c_loss.mean()

        # We compute the entropy loss
        entropy = action_distribution.entropy()
        entropy = (entropy * mask).sum(1) / mask.sum(1)
        avg_entropy = entropy.mean()

        return DictTensor(
            {
                "critic_loss": avg_critic_loss,
                "a2c_loss": avg_a2c_loss,
                "entropy_loss": avg_entropy,
            }
        )
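# A minimal launch sketch for the A2C class above, with illustrative (assumed)
# config values. The create_env / create_train_env / create_agent factories are
# the parameters expected by A2C.__init__ and must come from your own
# environment/agent definitions (as in the rlstructures tutorials); the env
# name and hyperparameter values here are assumptions, not prescribed ones.
if __name__ == "__main__":
    import torch.multiprocessing as mp

    mp.set_start_method("spawn")
    config = {
        "env_name": "CartPole-v0",   # assumed environment
        "n_envs": 4,
        "n_threads": 4,
        "n_evaluation_threads": 2,
        "n_evaluation_episodes": 16,
        "max_episode_steps": 100,
        "a2c_timesteps": 5,
        "env_seed": 42,
        "lr": 1e-3,
        "critic_coef": 1.0,
        "a2c_coef": 1.0,
        "entropy_coef": 0.01,
        "discount_factor": 0.95,
        "time_limit": 600,           # in seconds
        "logdir": "./results",
    }
    A2C(config, create_env, create_train_env, create_agent).run()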
class SAC(BaseExperiment):
    def __init__(self, config, create_env, create_agent):
        super().__init__(config, create_env, create_agent)
        env = self._create_env(
            self.config["n_envs"],
            seed=0,
            **{k: self.config[k] for k in self.config if k.startswith("environment/")}
        )
        self.action_dim = env.action_space.sample().shape[0]
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def check_arguments(self, args):
        assert (
            args["n_evaluation_rollouts"]
            % (args["n_envs"] * args["n_evaluation_threads"])
            == 0
        )
        assert args["evaluation_mode"] in ("deterministic", "stochastic")
        return True

    def save(self):
        super().save()
        # evaluate() returns None while no evaluation trajectories are ready,
        # so we loop until a reward is available
        reward = self.evaluate(relaunch=False)
        while reward is None:
            reward = self.evaluate(relaunch=False)
        f = open(self.config["logdir"] + "/out.out", "wb")
        pickle.dump({"reward": reward}, f)
        time.sleep(np.random.rand() * 10)
        f.close()

    def reset(self):
        self.q1 = self._create_q()
        self.q2 = self._create_q()
        self.target_q1 = self._create_q()
        self.target_q2 = self._create_q()
        self.target_q1.load_state_dict(self.q1.state_dict())
        self.target_q2.load_state_dict(self.q2.state_dict())

        model = copy.deepcopy(self.learning_model)
        self.train_batcher = Batcher(
            n_timesteps=self.config["batch_timesteps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode": "train",
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{k: self.config[k] for k in self.config if k.startswith("environment/")},
            },
            agent_args={"action_dim": self.action_dim, "policy": model},
            n_threads=self.config["n_threads"],
            seeds=self.config["env_seed"],
        )

        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_rollouts"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode": "evaluation",
                "max_episode_steps": self.config["max_episode_steps"],
                "n_envs": self.config["n_envs"],
                **{k: self.config[k] for k in self.config if k.startswith("environment/")},
            },
            agent_args={"action_dim": self.action_dim, "policy": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=self.config["env_seed"] * 10,
        )

        self.register_batcher(self.train_batcher)
        self.register_batcher(self.evaluation_batcher)

    def _state_dict(self, model, device):
        sd = model.state_dict()
        for k, v in sd.items():
            sd[k] = v.to(device)
        return sd

    def soft_update_params(self, net, target_net, tau):
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def run(self):
        self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)
        self.q1.to(device)
        self.q2.to(device)
        self.target_q1.to(device)
        self.target_q2.to(device)

        optimizer = torch.optim.Adam(self.learning_model.parameters(), lr=self.config["lr"])
        optimizer_q1 = torch.optim.Adam(self.q1.parameters(), lr=self.config["lr"])
        optimizer_q2 = torch.optim.Adam(self.q2.parameters(), lr=self.config["lr"])

        self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
        self.evaluation_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))

        n_episodes = self.config["n_envs"] * self.config["n_threads"]
        self.train_batcher.reset(
            agent_info=DictTensor({"stochastic": torch.zeros(n_episodes).eq(0.0)})
        )

        logging.info("Sampling initial transitions")
        n_iterations = int(
            self.config["n_starting_transitions"]
            / (n_episodes * self.config["batch_timesteps"])
        )
        for k in range(n_iterations):
            self.train_batcher.execute()
            trajectories = self.train_batcher.get()
            self.replay_buffer.push(trajectories)
        print("replay_buffer_size = ", self.replay_buffer.size())

        n_episodes = self.config["n_evaluation_rollouts"]
        stochastic = torch.tensor(
            [self.config["evaluation_mode"] == "stochastic"]
        ).repeat(n_episodes)
        self.evaluation_batcher.execute(
            agent_info=DictTensor({"stochastic": stochastic}), n_episodes=n_episodes
        )

        logging.info("Starting Learning")
        _start_time = time.time()
        while time.time() - _start_time < self.config["time_limit"]:
            self.train_batcher.execute()
            trajectories = self.train_batcher.get()
            self.replay_buffer.push(trajectories)
            self.logger.add_scalar(
                "replay_buffer_size", self.replay_buffer.size(), self.iteration
            )

            for k in range(self.config["n_batches_per_epochs"]):
                transitions = self.replay_buffer.sample(n=self.config["size_batches"])

                dt, transitions = self.get_q_loss(transitions, device)
                [self.logger.add_scalar(k, dt[k].item(), self.iteration) for k in dt.keys()]

                optimizer_q1.zero_grad()
                dt["q1_loss"].backward()
                optimizer_q1.step()

                optimizer_q2.zero_grad()
                dt["q2_loss"].backward()
                optimizer_q2.step()

                optimizer.zero_grad()
                dt = self.get_policy_loss(transitions)
                [self.logger.add_scalar(k, dt[k].item(), self.iteration) for k in dt.keys()]
                dt["policy_loss"].backward()
                optimizer.step()

                tau = self.config["tau"]
                self.soft_update_params(self.q1, self.target_q1, tau)
                self.soft_update_params(self.q2, self.target_q2, tau)

                self.iteration += 1

            self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
            self.evaluate()

    def evaluate(self, relaunch=True):
        evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
        if evaluation_trajectories is None:
            return
        avg_reward = (
            (evaluation_trajectories["_reward"] * evaluation_trajectories.mask())
            .sum(1)
            .mean()
            .item()
        )
        self.logger.add_scalar(
            "avg_reward/" + self.config["evaluation_mode"], avg_reward, self.iteration
        )
        if self.config["verbose"]:
            print("Iteration " + str(self.iteration) + ", Reward = " + str(avg_reward))
        if relaunch:
            cpu_parameters = self._state_dict(self.learning_model, torch.device("cpu"))
            self.evaluation_batcher.update(cpu_parameters)
            self.evaluation_batcher.reexecute()
        return avg_reward

    def get_q_loss(self, transitions, device):
        transitions = transitions.to(device)
        action = transitions["action"]
        reward = transitions["_reward"]
        frame = transitions["frame"]
        _frame = transitions["_frame"]
        _done = transitions["_done"].float()

        # Sample the action for s'
        mean_prime, var_prime = self.learning_model(_frame)
        distribution = torch.distributions.Normal(mean_prime, var_prime)
        next_action = distribution.sample().detach()

        # Compute the targets: min of the two target critics, minus the
        # entropy term
        q1 = self.target_q1(_frame, next_action).detach().squeeze(-1)
        q2 = self.target_q2(_frame, next_action).detach().squeeze(-1)
        q = torch.min(q1, q2)
        lp = distribution.log_prob(next_action).detach().sum(-1)
        q = q - self.config["lambda_entropy"] * lp
        target_value = q * (1.0 - _done) * self.config["discount_factor"] + reward

        q1_loss = (target_value.detach() - self.q1(frame, action).squeeze(-1)) ** 2
        q2_loss = (target_value.detach() - self.q2(frame, action).squeeze(-1)) ** 2
        dt = {
            "q1_loss": q1_loss.mean(),
            "q2_loss": q2_loss.mean(),
        }
        return DictTensor(dt), transitions

    def get_policy_loss(self, transitions):
        frame = transitions["frame"]

        # Now, compute the policy term
        mean, var = self.learning_model(frame)
        distribution = torch.distributions.Normal(mean, var)
        entropy = distribution.entropy().mean()
        # rsample applies the reparametrization trick, so gradients flow
        # through the sampled action
        action_tilde = distribution.rsample()
        q1 = self.q1(frame, action_tilde).squeeze(-1)
        q2 = self.q2(frame, action_tilde).squeeze(-1)
        q = torch.min(q1, q2)
        loss = q - self.config["lambda_entropy"] * distribution.log_prob(action_tilde).sum(-1)
        dt = {
            "policy_loss": -loss.mean(),
            "entropy": entropy.detach(),
            "avg_var": var.mean().detach(),
            "avg_mean": mean.mean().detach(),
        }
        return DictTensor(dt)
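# Standalone sketch of the soft (Polyak) target update performed by
# soft_update_params above: each target parameter moves a fraction tau toward
# the corresponding online parameter. The throwaway linear networks here are
# purely illustrative (assumed, not part of the SAC class).
import torch
import torch.nn as nn

online = nn.Linear(3, 1)
target = nn.Linear(3, 1)
target.load_state_dict(online.state_dict())
tau = 0.005
with torch.no_grad():
    for p, tp in zip(online.parameters(), target.parameters()):
        # target <- tau * online + (1 - tau) * target
        tp.copy_(tau * p + (1.0 - tau) * tp)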
class PPO(BaseExperiment):
    def __init__(self, config, create_env, create_agent):
        super().__init__(config, create_env, create_agent)
        env = self._create_env(
            self.config["n_envs"],
            seed=0,
            **{k: self.config[k] for k in self.config if k.startswith("environment/")}
        )
        self.n_actions = env.action_space.n
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def check_arguments(self, args):
        assert (
            args["n_evaluation_rollouts"]
            % (args["n_envs"] * args["n_evaluation_threads"])
            == 0
        )
        assert args["evaluation_mode"] in ("deterministic", "stochastic")
        return True

    def reset(self):
        # Creation of the batchers
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = Batcher(
            n_timesteps=self.config["learning_timesteps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode": "train",
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{k: self.config[k] for k in self.config if k.startswith("environment/")},
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=self.config["env_seed"],
        )

        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_rollouts"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode": "evaluation",
                "max_episode_steps": self.config["max_episode_steps"],
                "n_envs": self.config["n_envs"],
                **{k: self.config[k] for k in self.config if k.startswith("environment/")},
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=self.config["env_seed"] * 10,
        )

        self.register_batcher(self.train_batcher)
        self.register_batcher(self.evaluation_batcher)

    def _state_dict(self, model, device):
        sd = model.state_dict()
        for k, v in sd.items():
            sd[k] = v.to(device)
        return sd

    def run(self):
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)
        optimizer = torch.optim.Adam(self.learning_model.parameters(), lr=self.config["lr"])

        cpu_parameters = self._state_dict(self.learning_model, torch.device("cpu"))
        self.train_batcher.update(cpu_parameters)
        self.evaluation_batcher.update(cpu_parameters)

        n_episodes = self.config["n_evaluation_rollouts"]
        self.evaluation_batcher.execute(
            agent_info=DictTensor({"stochastic": torch.ones(n_episodes)}),
            n_episodes=n_episodes,
        )

        # Initialize the train batcher
        n_episodes = self.config["n_envs"] * self.config["n_threads"]
        self.train_batcher.reset(
            agent_info=DictTensor({"stochastic": torch.ones(n_episodes)})
        )

        _start_time = time.time()
        while time.time() - _start_time < self.config["time_limit"]:
            self.train_batcher.execute()
            trajectories = self.train_batcher.get()

            for K in range(self.config["k_epochs"]):
                optimizer.zero_grad()
                dt = self.get_loss(trajectories)
                [
                    self.logger.add_scalar("loss/" + k, dt[k].item(), self.iteration)
                    for k in dt.keys()
                ]

                # Computation of the final loss
                ld = self.config["coef_critic"] * dt["value_loss"]
                lr = self.config["coef_ppo"] * dt["ppo_loss"]
                le = self.config["coef_entropy"] * dt["entropy_loss"]
                floss = ld - le - lr
                floss.backward()

                if self.config["clip_grad"] > 0:
                    n = torch.nn.utils.clip_grad_norm_(
                        self.learning_model.parameters(), self.config["clip_grad"]
                    )
                    self.logger.add_scalar("grad_norm", n.item(), self.iteration)
                optimizer.step()
                self.evaluate()
                self.iteration += 1

            cpu_parameters = self._state_dict(self.learning_model, torch.device("cpu"))
            self.train_batcher.update(cpu_parameters)
            self.evaluate()
            self.iteration += 1

    def evaluate(self, relaunch=True):
        evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
        if evaluation_trajectories is None:
            return
        avg_reward = (
            (evaluation_trajectories["_reward"] * evaluation_trajectories.mask())
            .sum(1)
            .mean()
            .item()
        )
        self.logger.add_scalar(
            "avg_reward/" + self.config["evaluation_mode"], avg_reward, self.iteration
        )
        if self.config["verbose"]:
            print("Iteration " + str(self.iteration) + ", Reward = " + str(avg_reward))
        if relaunch:
            cpu_parameters = self._state_dict(self.learning_model, torch.device("cpu"))
            self.evaluation_batcher.update(cpu_parameters)
            self.evaluation_batcher.reexecute()
        return avg_reward

    def get_loss(self, trajectories):
        device = self.config["learner_device"]
        trajectories = trajectories.to(device)
        max_length = trajectories.lengths.max().item()
        assert trajectories.lengths.eq(max_length).all()
        actions = trajectories["action"]
        actions_probabilities = trajectories["action_probabilities"]
        reward = trajectories["_reward"]
        frame = trajectories["frame"]
        last_action = trajectories["last_action"]
        done = trajectories["_done"].float()

        # Recompute the model outputs over the trajectories
        n_action_scores = []
        n_values = []
        hidden_state = trajectories["agent_state"][:, 0]
        for T in range(max_length):
            hidden_state = masked_tensor(
                hidden_state,
                trajectories["agent_state"][:, T],
                trajectories["initial_state"][:, T],
            )
            _as, _v, hidden_state = self.learning_model(
                hidden_state, frame[:, T], last_action[:, T]
            )
            n_action_scores.append(_as.unsqueeze(1))
            n_values.append(_v.unsqueeze(1))
        n_action_scores = torch.cat(n_action_scores, dim=1)
        n_values = torch.cat(
            [*n_values, torch.zeros(trajectories.n_elems(), 1, 1).to(device)], dim=1
        ).squeeze(-1)

        # Compute the value function for the last state
        _idx = torch.arange(trajectories.n_elems()).to(device)
        _hidden_state = hidden_state.detach()
        _frame = trajectories["_frame"][_idx, trajectories.lengths - 1]
        _last_action = trajectories["_last_action"][_idx, trajectories.lengths - 1]
        _, _v, _ = self.learning_model(_hidden_state, _frame, _last_action)
        n_values[_idx, trajectories.lengths] = _v.squeeze(-1)

        advantage = self.get_gae(
            trajectories,
            n_values,
            discount_factor=self.config["discount_factor"],
            _lambda=self.config["gae_lambda"],
        )
        value_loss = advantage ** 2
        avg_value_loss = value_loss.mean()

        n_action_probabilities = torch.softmax(n_action_scores, dim=2)
        n_action_distribution = torch.distributions.Categorical(n_action_probabilities)
        log_a = torch.distributions.Categorical(actions_probabilities).log_prob(actions)
        log_na = n_action_distribution.log_prob(actions)
        ratios = torch.exp(log_na - log_a)
        surr1 = ratios * advantage
        # Standard PPO clipping of the ratio to [1 - eps_clip, 1 + eps_clip]
        surr2 = (
            torch.clamp(ratios, 1 - self.config["eps_clip"], 1 + self.config["eps_clip"])
            * advantage
        )
        ppo_loss = torch.min(surr1, surr2)
        avg_ppo_loss = ppo_loss.mean()

        entropy_loss = n_action_distribution.entropy()
        avg_entropy_loss = entropy_loss.mean()

        return DictTensor(
            {
                "entropy_loss": avg_entropy_loss,
                "ppo_loss": avg_ppo_loss,
                "value_loss": avg_value_loss,
            }
        )

    def get_gae(self, trajectories, values, discount_factor=1, _lambda=0):
        r = trajectories["_reward"]
        v = values[:, 1:].detach()
        d = trajectories["_done"].float()
        delta = r + discount_factor * v * (1.0 - d) - values[:, :-1]
        T = trajectories.lengths.max().item()
        gae = delta[:, -1]
        gaes = [gae]
        for t in range(T - 2, -1, -1):
            gae = delta[:, t] + discount_factor * _lambda * (1 - d[:, t]) * gae
            gaes.append(gae)
        gaes = list([g.unsqueeze(-1) for g in reversed(gaes)])
        fgae = torch.cat(gaes, dim=1)
        return fgae
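# Standalone illustration of the GAE recursion implemented in get_gae above:
# gae_t = delta_t + gamma * lambda * (1 - done_t) * gae_{t+1}, with
# delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t), computed backward
# in time. The toy tensors and values below are assumptions for illustration.
import torch

def gae(reward, done, values, gamma=0.99, lam=0.95):
    # reward, done: B x T ; values: B x (T+1)
    delta = reward + gamma * values[:, 1:] * (1.0 - done) - values[:, :-1]
    T = reward.size(1)
    out = torch.zeros_like(reward)
    out[:, -1] = delta[:, -1]
    for t in range(T - 2, -1, -1):
        out[:, t] = delta[:, t] + gamma * lam * (1.0 - done[:, t]) * out[:, t + 1]
    return out

r = torch.tensor([[1.0, 1.0, 0.0]])        # rewards for one trajectory (B=1, T=3)
d = torch.tensor([[0.0, 0.0, 1.0]])        # episode terminates at the last step
v = torch.tensor([[0.5, 0.4, 0.3, 0.0]])   # value estimates, including V(s_T)
print(gae(r, d, v))                        # B x T tensor of advantages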
class Reinforce:
    def __init__(self, config, create_env, create_agent):
        self.config = config
        # Creation of the logger (that saves in tensorboard and CSV)
        self.logger = TFLogger(log_dir=self.config["logdir"], hps=self.config)
        self._create_env = create_env
        self._create_agent = create_agent

        # Create one env instance to get the dimensionality of observations
        # and the number of actions
        env = self._create_env(
            self.config["n_envs"], seed=0, env_name=self.config["env_name"]
        )
        self.n_actions = env.action_space.n
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def run(self):
        # Instantiate the learning model and the baseline model
        self.learning_model = AgentModel(self.obs_dim, self.n_actions, 16)
        self.baseline_model = BaselineModel(self.obs_dim, 16)

        # We create a batcher dedicated to evaluation
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_episodes"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_evaluation_threads"])
            ],
        )

        # Creation of the batcher for sampling complete episodes (i.e. EpisodeBatcher).
        # The batcher samples n_threads * n_envs trajectories at each call.
        # To have a fast batcher, we configure it with
        # n_timesteps=self.config["max_episode_steps"]
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_threads"])
            ],
        )

        # Creation of the optimizer
        optimizer = torch.optim.Adam(
            nn.Sequential(self.learning_model, self.baseline_model).parameters(),
            lr=self.config["lr"],
        )

        # Training loop
        _start_time = time.time()
        self.iteration = 0

        # We launch the evaluation batcher (in deterministic mode)
        n_episodes = self.config["n_evaluation_episodes"]
        agent_info = DictTensor({"stochastic": torch.tensor([False]).repeat(n_episodes)})
        self.evaluation_batcher.execute(n_episodes=n_episodes, agent_info=agent_info)
        self.evaluation_iteration = self.iteration

        while time.time() - _start_time < self.config["time_limit"]:
            # Update the batcher with the last version of the learning model
            self.train_batcher.update(self.learning_model.state_dict())

            # Call the batcher to get a sample of trajectories.
            # 1) The policy will be executed in 'stochastic' mode
            n_episodes = self.config["n_envs"] * self.config["n_threads"]
            agent_info = DictTensor({"stochastic": torch.tensor([True]).repeat(n_episodes)})
            self.train_batcher.execute(n_episodes=n_episodes, agent_info=agent_info)

            # 2) We get the trajectories (and wait until they have been sampled)
            trajectories = self.train_batcher.get(blocking=True)

            # 3) Now we compute the loss
            dt = self.get_loss(trajectories)
            [self.logger.add_scalar(k, dt[k].item(), self.iteration) for k in dt.keys()]

            # Computation of the final loss
            ld = self.config["baseline_coef"] * dt["baseline_loss"]
            lr = self.config["reinforce_coef"] * dt["reinforce_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]
            floss = ld - le - lr

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            # Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            print(
                "At iteration %d, avg (discounted) reward is %f"
                % (self.iteration, dt["avg_reward"].item())
            )
            print(
                "\t Avg trajectory length is %f"
                % (trajectories.lengths.float().mean().item())
            )
            print(
                "\t Curves can be visualized using 'tensorboard --logdir=%s'"
                % self.config["logdir"]
            )
            self.iteration += 1

            # We check the evaluation batcher
            evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None:  # trajectories are available
                # Compute the cumulated reward
                cumulated_reward = (
                    (evaluation_trajectories["_reward"] * evaluation_trajectories.mask())
                    .sum(1)
                    .mean()
                )
                self.logger.add_scalar(
                    "evaluation_reward", cumulated_reward.item(), self.evaluation_iteration
                )
                # We re-execute the evaluation batcher (with the same agent_info
                # and the same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories):
        # First, we want to compute the cumulated reward per trajectory.
        # The reward is at t+1 at each timestep (since it is obtained after the
        # action), so we use the '_reward' field of the trajectory.
        # The 'reward' field corresponds to the reward at time t.
        reward = trajectories["_reward"]

        # We get the mask that tells which transition is in a trajectory (1) or not (0)
        mask = trajectories.mask()

        # We remove the reward values that are not in the trajectories
        reward = reward * mask

        # We compute the future cumulated reward at each timestep (by reverse computation)
        max_length = trajectories.lengths.max().item()
        cumulated_reward = torch.zeros_like(reward)
        cumulated_reward[:, max_length - 1] = reward[:, max_length - 1]
        for t in range(max_length - 2, -1, -1):
            cumulated_reward[:, t] = (
                reward[:, t]
                + self.config["discount_factor"] * cumulated_reward[:, t + 1]
            )

        # Now we compute the action probabilities over the trajectories such
        # that we will be able to call 'backward'
        action_probabilities = []
        for t in range(max_length):
            proba = self.learning_model(trajectories["frame"][:, t])
            # We append the probability and introduce the temporal dimension (2nd dimension)
            action_probabilities.append(proba.unsqueeze(1))
        action_probabilities = torch.cat(action_probabilities, dim=1)
        # Now we have a B x T x n_actions tensor

        # We compute the baseline
        baseline = []
        for t in range(max_length):
            b = self.baseline_model(trajectories["frame"][:, t])
            baseline.append(b.unsqueeze(1))
        baseline = torch.cat(baseline, dim=1).squeeze(-1)
        # Now we have a B x T tensor

        # We compute the baseline loss
        baseline_loss = (baseline - cumulated_reward) ** 2
        # We sum the loss for each episode (considering the mask)
        baseline_loss = (baseline_loss * mask).sum(1) / mask.sum(1)
        # We average the loss over all the trajectories
        avg_baseline_loss = baseline_loss.mean()

        # We do the same on the REINFORCE loss
        action_distribution = torch.distributions.Categorical(action_probabilities)
        log_proba = action_distribution.log_prob(trajectories["action"])
        reinforce_loss = log_proba * (cumulated_reward - baseline).detach()
        reinforce_loss = (reinforce_loss * mask).sum(1) / mask.sum(1)
        avg_reinforce_loss = reinforce_loss.mean()

        # We compute the entropy loss
        entropy = action_distribution.entropy()
        entropy = (entropy * mask).sum(1) / mask.sum(1)
        avg_entropy = entropy.mean()

        return DictTensor(
            {
                "avg_reward": cumulated_reward[:, 0].mean(),
                "baseline_loss": avg_baseline_loss,
                "reinforce_loss": avg_reinforce_loss,
                "entropy_loss": avg_entropy,
            }
        )
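# Small numeric check of the backward computation of discounted returns used in
# get_loss above: R_t = r_t + gamma * R_{t+1}, filled from the last timestep to
# the first. Toy values (assumed), purely illustrative.
import torch

reward = torch.tensor([[1.0, 1.0, 1.0]])  # one trajectory, B=1, T=3
gamma = 0.9
ret = torch.zeros_like(reward)
ret[:, -1] = reward[:, -1]
for t in range(reward.size(1) - 2, -1, -1):
    ret[:, t] = reward[:, t] + gamma * ret[:, t + 1]
print(ret)  # tensor([[2.7100, 1.9000, 1.0000]])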
class DQN(BaseExperiment):
    def __init__(self, config, create_env, create_agent):
        super().__init__(config, create_env, create_agent)
        env = self._create_env(
            self.config["n_envs"],
            seed=0,
            **{k: self.config[k] for k in self.config if k.startswith("environment/")}
        )
        self.n_actions = env.action_space.n
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def check_arguments(self, args):
        assert (
            args["n_evaluation_rollouts"]
            % (args["n_envs"] * args["n_evaluation_threads"])
            == 0
        )
        return True

    def reset(self):
        self.target_model = copy.deepcopy(self.learning_model)
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = Batcher(
            n_timesteps=self.config["batch_timesteps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode": "train",
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{k: self.config[k] for k in self.config if k.startswith("environment/")},
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=self.config["env_seed"],
        )

        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_rollouts"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode": "evaluation",
                "max_episode_steps": self.config["max_episode_steps"],
                "n_envs": self.config["n_envs"],
                **{k: self.config[k] for k in self.config if k.startswith("environment/")},
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=self.config["env_seed"] * 10,
        )

        self.register_batcher(self.train_batcher)
        self.register_batcher(self.evaluation_batcher)

    def _state_dict(self, model, device):
        sd = model.state_dict()
        for k, v in sd.items():
            sd[k] = v.to(device)
        return sd

    def run(self):
        self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)
        self.target_model.to(device)
        optimizer = torch.optim.Adam(self.learning_model.parameters(), lr=self.config["lr"])

        self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
        self.evaluation_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))

        n_episodes = self.config["n_envs"] * self.config["n_threads"]
        self.train_batcher.reset(
            agent_info=DictTensor(
                {"epsilon": torch.ones(n_episodes) * self.config["epsilon_greedy"]}
            )
        )

        logging.info("Sampling initial transitions")
        for k in range(self.config["initial_buffer_epochs"]):
            self.train_batcher.execute()
            trajectories = self.train_batcher.get()
            self.replay_buffer.push(trajectories)

        n_episodes = self.config["n_evaluation_rollouts"]
        self.evaluation_batcher.execute(
            agent_info=DictTensor({"epsilon": torch.zeros(n_episodes)}),
            n_episodes=n_episodes,
        )

        logging.info("Starting Learning")
        _start_time = time.time()
        while time.time() - _start_time < self.config["time_limit"]:
            self.train_batcher.execute()
            trajectories = self.train_batcher.get()
            self.replay_buffer.push(trajectories)
            self.logger.add_scalar(
                "replay_buffer_size", self.replay_buffer.size(), self.iteration
            )

            for k in range(self.config["qvalue_epochs"]):
                optimizer.zero_grad()
                dt = self.get_loss(device)
                [self.logger.add_scalar(k, dt[k].item(), self.iteration) for k in dt.keys()]
                floss = dt["q_loss"]
                floss.backward()
                if self.config["clip_grad"] > 0:
                    n = torch.nn.utils.clip_grad_norm_(
                        self.learning_model.parameters(), self.config["clip_grad"]
                    )
                    self.logger.add_scalar("grad_norm", n.item(), self.iteration)
                self.iteration += 1
                optimizer.step()
                tau = self.config["tau"]
                self.soft_update_params(self.learning_model, self.target_model, tau)

            self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
            self.evaluate()
            self.iteration += 1

    def soft_update_params(self, net, target_net, tau):
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def evaluate(self, relaunch=True):
        evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
        if evaluation_trajectories is None:
            return
        avg_reward = (
            (evaluation_trajectories["_reward"] * evaluation_trajectories.mask())
            .sum(1)
            .mean()
            .item()
        )
        self.logger.add_scalar("avg_reward", avg_reward, self.iteration)
        if self.config["verbose"]:
            print(
                "Iteration " + str(self.iteration)
                + ", Reward = " + str(avg_reward)
                + ", Buffer size = " + str(self.replay_buffer.size())
            )
        if relaunch:
            self.evaluation_batcher.update(
                self._state_dict(self.learning_model, torch.device("cpu"))
            )
            self.evaluation_batcher.reexecute()
        return avg_reward

    def get_loss(self, device):
        transitions = self.replay_buffer.sample(n=self.config["n_batches"])
        transitions = transitions.to(device)
        B = transitions.n_elems()
        Bv = torch.arange(B)
        action = transitions["action"]
        reward = transitions["_reward"]
        frame = transitions["frame"]
        _frame = transitions["_frame"]
        _done = transitions["_done"].float()

        # Q-values of the actions taken in the sampled transitions
        q = self.learning_model(frame)
        qa = q[Bv, action]
        # Double DQN: the online network selects the next action...
        qp = self.learning_model(_frame)
        actionp = qp.max(1)[1]
        # ...and the target network evaluates it
        _q_target = self.target_model(_frame).detach()
        _q_target_a = _q_target[Bv, actionp]
        _target_value = (
            _q_target_a * (1 - _done) * self.config["discount_factor"] + reward
        )
        td = (_target_value - qa) ** 2
        dt = DictTensor({"q_loss": td.mean()})
        return dt
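# Isolated illustration of the double-DQN target computed in get_loss above:
# the online network picks the argmax action on s', the target network
# evaluates that action. The random Q tensors below are assumptions used only
# to show the indexing, not real model outputs.
import torch

B, n_actions = 2, 3
reward = torch.tensor([0.0, 1.0])
done = torch.tensor([0.0, 1.0])
gamma = 0.99
q_online_next = torch.rand(B, n_actions)  # stands in for learning_model(_frame)
q_target_next = torch.rand(B, n_actions)  # stands in for target_model(_frame)
a_star = q_online_next.max(1)[1]          # action selection by the online net
bv = torch.arange(B)
target = q_target_next[bv, a_star] * (1 - done) * gamma + reward
print(target)  # one bootstrap target per transition; zero bootstrap when done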
if __name__ == "__main__":
    import torch.multiprocessing as mp

    mp.set_start_method("spawn")

    # The **EpisodeBatcher** samples full episodes (until the environment
    # returns done==True).
    # If one considers a rlstructures.VecEnv env and n_threads (or processes),
    # then the batcher samples n_episodes = N * env.n_envs() * n_threads
    # episodes at each execution (where N is chosen by the user).
    # *seeds* is a list of environment seeds, one seed per process.
    # The batcher has to be configured 'at the right size' since all the
    # processes share a common *Buffer* to store trajectories.
    # The simplest case is to choose *n_slots = maximum number of acquired
    # episodes* and *n_timesteps = max size of the episodes*. We will describe
    # later the exact meaning of these arguments.
    batcher = EpisodeBatcher(
        n_timesteps=100,
        n_slots=128,
        n_threads=4,
        seeds=[1, 2, 3, 4],
        create_agent=create_agent,
        agent_args={"n_actions": 2},
        create_env=create_env,
        env_args={"max_episode_steps": 100},
    )

    # Execution starts the acquisition process (so that other computations can
    # be done in parallel with the episode acquisition)
    n_episodes = 32
    # Since we will sample 32 episodes, we need to configure the 32 agents and
    # the 32 environments that will interact
    agent_info = DictTensor({"agent_id": torch.arange(32)})
    env_info = DictTensor({"env_id": torch.arange(32)})

    # Running the batcher: a non-blocking call that launches the acquisition
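    # (Assumed continuation: the calls below follow the execute()/get()/close()
    # pattern used throughout this file; exact keyword arguments may differ
    # depending on the rlstructures version.)
    batcher.execute(n_episodes=n_episodes, agent_info=agent_info, env_info=env_info)

    # ... other computations can run here, in parallel with the acquisition ...

    trajectories = batcher.get()  # blocking: waits until all episodes are acquired
    batcher.close()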