def run(self):
    self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
    device = torch.device(self.config["learner_device"])
    self.learning_model.to(device)
    self.target_model.to(device)
    optimizer = torch.optim.Adam(
        self.learning_model.parameters(), lr=self.config["lr"]
    )
    self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
    self.evaluation_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))

    n_episodes = self.config["n_envs"] * self.config["n_threads"]
    self.train_batcher.reset(
        agent_info=DictTensor({"epsilon": torch.ones(n_episodes) * self.config["epsilon_greedy"]})
    )

    logging.info("Sampling initial transitions")
    for k in range(self.config["initial_buffer_epochs"]):
        self.train_batcher.execute()
        trajectories = self.train_batcher.get()
        self.replay_buffer.push(trajectories)

    n_episodes = self.config["n_evaluation_rollouts"]
    self.evaluation_batcher.execute(
        agent_info=DictTensor({"epsilon": torch.zeros(n_episodes)}),
        n_episodes=n_episodes,
    )

    logging.info("Starting Learning")
    _start_time = time.time()
    logging.info("Learning")
    while time.time() - _start_time < self.config["time_limit"]:
        self.train_batcher.execute()
        trajectories = self.train_batcher.get()
        self.replay_buffer.push(trajectories)
        self.logger.add_scalar("replay_buffer_size", self.replay_buffer.size(), self.iteration)

        for k in range(self.config["qvalue_epochs"]):
            optimizer.zero_grad()
            dt = self.get_loss(device)
            [self.logger.add_scalar(key, dt[key].item(), self.iteration) for key in dt.keys()]
            floss = dt["q_loss"]
            floss.backward()
            if self.config["clip_grad"] > 0:
                n = torch.nn.utils.clip_grad_norm_(
                    self.learning_model.parameters(), self.config["clip_grad"]
                )
                self.logger.add_scalar("grad_norm", n.item(), self.iteration)
            self.iteration += 1
            optimizer.step()

            tau = self.config["tau"]
            self.soft_update_params(self.learning_model, self.target_model, tau)

        self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
        self.evaluate()
        self.iteration += 1
def __call__(self, state, observation, agent_info=None, history=None):
    """
    Execute one step of the agent
    """
    initial_state = observation["initial_state"]
    B = observation.n_elems()

    if agent_info is None:
        agent_info = DictTensor({"stochastic": torch.tensor([True]).repeat(B)})

    model_initial_state = self.model.initial_state(B)
    agent_state = None
    agent_step = None
    if state is None:
        # The batcher is starting: all observations are initial states
        assert initial_state.all()
        agent_state = model_initial_state
        agent_step = torch.zeros(B).long()
    else:
        # Some observations may be the first step of a new episode:
        # for these, the recurrent state is reinitialized
        _is = initial_state.float().unsqueeze(-1).repeat(1, model_initial_state.size()[1])
        agent_state = _is * model_initial_state + (1 - _is) * state["agent_state"]
        agent_step = (
            initial_state.float() * torch.zeros(B)
            + (1 - initial_state.float()) * state["agent_step"]
        ).long()

    score_action, value, next_state = self.model(
        agent_state, observation["frame"], observation["last_action"]
    )
    action_proba = torch.softmax(score_action, dim=1)
    dist = torch.distributions.Categorical(action_proba)
    action_sampled = dist.sample()
    action_max = action_proba.max(1)[1]
    smask = agent_info["stochastic"].float()
    # Keep the sampled action in stochastic mode, the argmax action otherwise
    action = (action_sampled * smask + (1 - smask) * action_max).long()

    new_state = DictTensor({"agent_state": next_state, "agent_step": agent_step + 1})
    agent_do = DictTensor({"action": action, "action_probabilities": action_proba})
    state = DictTensor({"agent_state": agent_state, "agent_step": agent_step})
    return state, agent_do, new_state
def reset(self, agent_info=DictTensor({}), env_info=DictTensor({})):
    n_workers = len(self.workers)
    assert isinstance(agent_info, DictTensor) and (
        agent_info.empty() or agent_info.n_elems() == self.n_envs * n_workers
    )
    assert isinstance(env_info, DictTensor) and (
        env_info.empty() or env_info.n_elems() == self.n_envs * n_workers
    )
    pos = 0
    for k in range(n_workers):
        n = self.n_envs
        wi = None if agent_info is None else agent_info.slice(pos, pos + n)
        ei = None if env_info is None else env_info.slice(pos, pos + n)
        self.workers[k].reset(agent_info=wi, env_info=ei)
        pos += n
def __call__(self, state, observation, agent_info=None, history=None):
    """
    Execute one step of the agent
    """
    initial_state = observation["initial_state"]
    B = observation.n_elems()

    if agent_info is None:
        agent_info = DictTensor({"stochastic": torch.tensor([True]).repeat(B)})

    # Create the initial state of the recurrent policy
    agent_initial = self.model.initial_state(B)
    if state is None:
        # The batcher is starting
        state = DictTensor({
            "agent_state": agent_initial,
            "agent_step": torch.zeros(B).long(),
        })
    else:
        # Some observations may be the initial state of a new episode.
        # For these, we must reinitialize the internal state of the policy.
        istate = DictTensor({
            "agent_state": agent_initial,
            "agent_step": torch.zeros(B).long(),
        })
        state = masked_dicttensor(istate, state, initial_state)

    new_z, action_proba = self.model(state["agent_state"], observation["frame"])

    # We sample an action following the distribution
    dist = torch.distributions.Categorical(action_proba)
    action_sampled = dist.sample()

    # Depending on the agent_info variable that tells us whether we are in
    # 'stochastic' or 'deterministic' mode, we keep the sampled action or
    # take the action with the max score
    action_max = action_proba.max(1)[1]
    action = masked_tensor(action_max, action_sampled, agent_info["stochastic"])

    new_state = DictTensor({
        "agent_state": new_z,
        "agent_step": state["agent_step"] + 1,
    })
    agent_do = DictTensor({"action": action, "action_probabilities": action_proba})
    return state, agent_do, new_state
def __call__(self, state, observation, agent_info=None, history=None):
    B = observation.n_elems()
    agent_state = None
    if state is None:
        agent_state = DictTensor({"timestep": torch.zeros(B).long()})
    else:
        agent_state = state
    scores = torch.randn(B, self.n_actions)
    probabilities = torch.softmax(scores, dim=1)
    actions = torch.distributions.Categorical(probabilities).sample()
    new_state = DictTensor({"timestep": agent_state["timestep"] + 1})
    return agent_state, DictTensor({"action": actions}), new_state
def step(self, policy_output):
    assert policy_output.n_elems() == self.envs_running.size()[0]
    outputs = policy_output.unfold()
    alls = []
    env_run = {}
    for b in range(len(outputs)):
        idx_env = self.envs_running[b]
        action = policy_output["action"][b]
        last_action = action
        if isinstance(self.gym_envs[0].action_space, gym.spaces.Discrete):
            action = action.item()
            last_action = last_action.unsqueeze(0)
        else:
            action = action.tolist()
            last_action = last_action.unsqueeze(0)
        initial_state = torch.tensor([False])
        act = action

        frame, reward, done, unused_info = self.gym_envs[idx_env].step(act)
        reward = torch.tensor([reward])
        frame = format_frame(frame)
        if isinstance(frame, torch.Tensor):
            frame = {"frame": frame}
        if not done:
            env_run[b] = idx_env
        done = torch.tensor([done])

        r = DictTensor({
            "reward": reward,
            "done": done,
            "initial_state": initial_state,
            "last_action": last_action,
            **frame,
        })
        alls.append(r)

    d = DictTensor.cat(alls)
    keys = []
    values = []
    for key, value in env_run.items():
        keys.append(key)
        values.append(value)
    # Keep only the environments that are still running
    dd = d.index(torch.tensor(keys).long())
    old_envs_running = self.envs_running
    self.envs_running = torch.tensor(values)
    return (d, old_envs_running), (dd, self.envs_running)
def run(self):
    device = torch.device(self.config["learner_device"])
    self.learning_model.to(device)
    optimizer = torch.optim.Adam(
        self.learning_model.parameters(), lr=self.config["lr"]
    )
    cpu_parameters = self._state_dict(self.learning_model, torch.device("cpu"))
    self.train_batcher.update(cpu_parameters)
    self.evaluation_batcher.update(cpu_parameters)

    n_episodes = self.config["n_evaluation_rollouts"]
    self.evaluation_batcher.execute(
        agent_info=DictTensor({"stochastic": torch.ones(n_episodes)}),
        n_episodes=n_episodes,
    )

    # Initialize the train batcher
    n_episodes = self.config["n_envs"] * self.config["n_threads"]
    self.train_batcher.reset(
        agent_info=DictTensor({"stochastic": torch.ones(n_episodes)})
    )

    _start_time = time.time()
    while time.time() - _start_time < self.config["time_limit"]:
        self.train_batcher.execute()
        trajectories = self.train_batcher.get()

        for K in range(self.config["k_epochs"]):
            optimizer.zero_grad()
            dt = self.get_loss(trajectories)
            [
                self.logger.add_scalar("loss/" + k, dt[k].item(), self.iteration)
                for k in dt.keys()
            ]

            # Computation of the final loss
            ld = self.config["coef_critic"] * dt["value_loss"]
            lr = self.config["coef_ppo"] * dt["ppo_loss"]
            le = self.config["coef_entropy"] * dt["entropy_loss"]
            floss = ld - le - lr
            floss.backward()

            if self.config["clip_grad"] > 0:
                n = torch.nn.utils.clip_grad_norm_(
                    self.learning_model.parameters(), self.config["clip_grad"]
                )
                self.logger.add_scalar("grad_norm", n.item(), self.iteration)
            optimizer.step()
            self.evaluate()
            self.iteration += 1

        cpu_parameters = self._state_dict(self.learning_model, torch.device("cpu"))
        self.train_batcher.update(cpu_parameters)
        self.evaluate()
        self.iteration += 1
def sample(self, n=1):
    limit = self.pos
    if self.full:
        limit = self.N
    transitions = torch.randint(0, high=limit, size=(n,))
    d = {k: self.buffer[k][transitions] for k in self.buffer}
    return DictTensor(d)
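# A minimal usage sketch (assuming a `buffer` instance already filled through `push`):
# `sample` returns a DictTensor whose first dimension is the number of transitions,
# drawn uniformly with replacement from the stored data.
batch = buffer.sample(n=32)
print(batch.n_elems())   # 32
print(batch.keys())      # the transition fields stored in the buffer, e.g. "frame", "action"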
def reset(self, env_info: DictTensor = DictTensor({})):
    """
    Reset the environment instances.

    :param env_info: a DictTensor of size n_envs, such that each value
        will be transmitted to the corresponding environment instance
    :type env_info: DictTensor, optional
    """
    pass
def get_loss(self, device):
    transitions = self.replay_buffer.sample(n=self.config["n_batches"])
    transitions = transitions.to(device)
    B = transitions.n_elems()
    Bv = torch.arange(B)
    action = transitions["action"]
    reward = transitions["_reward"]
    frame = transitions["frame"]
    _frame = transitions["_frame"]
    _done = transitions["_done"].float()

    # Q-value of the chosen action at time t
    q = self.learning_model(frame)
    qa = q[Bv, action]

    # Double-DQN target: the next action is selected with the learning model
    # but evaluated with the target model
    qp = self.learning_model(_frame)
    actionp = qp.max(1)[1]
    _q_target = self.target_model(_frame).detach()
    _q_target_a = _q_target[Bv, actionp]
    _target_value = _q_target_a * (1 - _done) * self.config["discount_factor"] + reward

    td = (_target_value - qa) ** 2
    dt = DictTensor({"q_loss": td.mean()})
    return dt
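# A toy illustration (numbers made up, not part of the original file) of the double-DQN
# target computed above: the next action is selected with the learning network and
# evaluated with the target network.
import torch

q_next_learning = torch.tensor([[1.0, 3.0], [2.0, 0.5]])   # Q_learning(s', .)
q_next_target = torch.tensor([[0.8, 2.5], [1.5, 0.9]])     # Q_target(s', .)
reward = torch.tensor([1.0, 0.0])
done = torch.tensor([0.0, 1.0])
gamma = 0.9

a_star = q_next_learning.max(1)[1]                          # argmax under the learning net
target = q_next_target[torch.arange(2), a_star] * (1 - done) * gamma + reward
print(target)                                               # tensor([3.2500, 0.0000])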
def get_q_loss(self, transitions, device):
    transitions = transitions.to(device)
    B = transitions.n_elems()
    Bv = torch.arange(B)
    action = transitions["action"]
    reward = transitions["_reward"]
    frame = transitions["frame"]
    _frame = transitions["_frame"]
    _done = transitions["_done"].float()

    # Sample the action for s' from the current policy
    mean_prime, var_prime = self.learning_model(_frame)
    distribution = torch.distributions.Normal(mean_prime, var_prime)
    next_action = distribution.sample().detach()

    # Compute the (entropy-regularized) target with the two target critics
    q1 = self.target_q1(_frame, next_action).detach().squeeze(-1)
    q2 = self.target_q2(_frame, next_action).detach().squeeze(-1)
    q = torch.min(q1, q2)
    lp = distribution.log_prob(next_action).detach().sum(-1)
    q = q - self.config["lambda_entropy"] * lp
    target_value = q * (1.0 - _done) * self.config["discount_factor"] + reward

    q1_loss = (target_value.detach() - self.q1(frame, action).squeeze(-1)) ** 2
    q2_loss = (target_value.detach() - self.q2(frame, action).squeeze(-1)) ** 2
    dt = {
        "q1_loss": q1_loss.mean(),
        "q2_loss": q2_loss.mean(),
    }
    return DictTensor(dt), transitions
def __call__(self, state, observation, agent_info=None, history=None):
    """
    Execute one step of the agent
    """
    initial_state = observation["initial_state"]
    B = observation.n_elems()

    if agent_info is None:
        agent_info = DictTensor({"epsilon": torch.zeros(B)})

    agent_step = None
    if state is None:
        assert initial_state.all()
        agent_step = torch.zeros(B).long()
    else:
        agent_step = (
            initial_state.float() * torch.zeros(B)
            + (1 - initial_state.float()) * state["agent_step"]
        ).long()

    q = self.model(observation["frame"])
    qs, action = q.max(1)

    # Epsilon-greedy exploration: with probability epsilon, take a random action
    raction = torch.tensor(
        np.random.randint(low=0, high=self.n_actions, size=(action.size()[0],))
    )
    epsilon = agent_info["epsilon"]
    mask = torch.rand(action.size()[0]).lt(epsilon).float()
    action = mask * raction + (1 - mask) * action
    action = action.long()

    new_state = DictTensor({"agent_step": agent_step + 1})
    agent_do = DictTensor({"action": action, "q": q})
    state = DictTensor({"agent_step": agent_step})
    return state, agent_do, new_state
def execute(self, n_episodes, agent_info=DictTensor({}), env_info=DictTensor({})):
    n_workers = len(self.workers)
    assert n_episodes % (self.n_envs * n_workers) == 0
    assert isinstance(agent_info, DictTensor) and (
        agent_info.empty() or agent_info.n_elems() == n_episodes
    )
    assert isinstance(env_info, DictTensor) and (
        env_info.empty() or env_info.n_elems() == n_episodes
    )
    self.n_per_worker = [int(n_episodes / n_workers) for w in range(n_workers)]
    pos = 0
    for k in range(n_workers):
        n = self.n_per_worker[k]
        assert n % self.n_envs == 0
        wi = agent_info.slice(pos, pos + n)
        ei = env_info.slice(pos, pos + n)
        self.workers[k].acquire_episodes(
            n_episodes=self.n_per_worker[k], agent_info=wi, env_info=ei
        )
        pos += n
    assert pos == n_episodes
def get_loss(self, trajectories):
    # First, we want to compute the cumulated reward per trajectory.
    # The reward is obtained at t+1 (i.e., after the action), so we use the
    # '_reward' field of the trajectory; the 'reward' field corresponds to
    # the reward received at time t.
    reward = trajectories["_reward"]

    # The mask tells which transitions belong to a trajectory (1) or are padding (0)
    mask = trajectories.mask()

    # Zero out reward values that are not part of the trajectories
    reward = reward * mask
    max_length = trajectories.lengths.max().item()

    # Compute the action probabilities over the trajectories so that we can backpropagate
    action_probabilities = []
    for t in range(max_length):
        proba = self.learning_model(trajectories["frame"][:, t])
        # Append the probabilities, introducing the temporal dimension (second dimension)
        action_probabilities.append(proba.unsqueeze(1))
    action_probabilities = torch.cat(action_probabilities, dim=1)
    # We now have a B x T x n_actions tensor

    # Compute the critic value for t = 0 to T-1, plus one extra slot for the value
    # of the very last observation (filled just below)
    critic = []
    for t in range(max_length):
        b = self.critic_model(trajectories["frame"][:, t])
        critic.append(b.unsqueeze(1))
    critic = torch.cat(critic + [b.unsqueeze(1)], dim=1).squeeze(-1)
    # We now have a B x (T+1) tensor

    # We also need the critic value of the last observation of each trajectory
    # (to compute the temporal difference). It may be the last element of the
    # trajectory (if the episode is not finished) or the last frame of the episode.
    idx = torch.arange(trajectories.n_elems())
    last_critic = self.critic_model(
        trajectories["_frame"][idx, trajectories.lengths - 1]
    ).squeeze(-1)
    critic[idx, trajectories.lengths] = last_critic

    # Compute the temporal difference
    target = reward + self.config["discount_factor"] * (
        1 - trajectories["_done"].float()
    ) * critic[:, 1:].detach()
    td = critic[:, :-1] - target

    critic_loss = td ** 2
    # Average the loss over each episode (considering the mask) ...
    critic_loss = (critic_loss * mask).sum(1) / mask.sum(1)
    # ... then over all the trajectories
    avg_critic_loss = critic_loss.mean()

    # Same computation for the A2C (policy) loss
    action_distribution = torch.distributions.Categorical(action_probabilities)
    log_proba = action_distribution.log_prob(trajectories["action"])
    a2c_loss = -log_proba * td.detach()
    a2c_loss = (a2c_loss * mask).sum(1) / mask.sum(1)
    avg_a2c_loss = a2c_loss.mean()

    # Entropy loss
    entropy = action_distribution.entropy()
    entropy = (entropy * mask).sum(1) / mask.sum(1)
    avg_entropy = entropy.mean()

    return DictTensor({
        "critic_loss": avg_critic_loss,
        "a2c_loss": avg_a2c_loss,
        "entropy_loss": avg_entropy,
    })
def __call__(self, state, observation, agent_info=None, history=None):
    B = observation.n_elems()
    agent_state = None

    # Initialize agent_info if it is not specified
    if agent_info is None:
        agent_info = DictTensor({"agent_id": torch.tensor([0]).repeat(B)})

    # Initialize the state of the agent if it is not specified
    if state is None:
        agent_state = DictTensor({"timestep": torch.zeros(B).long()})
    else:
        agent_state = state

    scores = torch.randn(B, self.n_actions)
    probabilities = torch.softmax(scores, dim=1)
    actions = torch.distributions.Categorical(probabilities).sample()
    new_state = DictTensor({"timestep": agent_state["timestep"] + 1})

    # We also decide to output the action probabilities
    return (
        agent_state,
        DictTensor({
            "action": actions,
            "action_probabilities": probabilities,
            "agent_id": agent_info["agent_id"],
        }),
        new_state,
    )
def get_loss(self, trajectories):
    # First, we want to compute the cumulated reward per trajectory.
    # The reward is obtained at t+1 (i.e., after the action), so we use the
    # '_reward' field of the trajectory; the 'reward' field corresponds to
    # the reward received at time t.
    reward = trajectories["_reward"]

    # The mask tells which transitions belong to a trajectory (1) or are padding (0)
    mask = trajectories.mask()

    # Zero out reward values that are not part of the trajectories
    reward = reward * mask

    # Compute the future cumulated (discounted) reward at each timestep,
    # by iterating backward in time
    max_length = trajectories.lengths.max().item()
    cumulated_reward = torch.zeros_like(reward)
    cumulated_reward[:, max_length - 1] = reward[:, max_length - 1]
    for t in range(max_length - 2, -1, -1):
        cumulated_reward[:, t] = (
            reward[:, t] + self.config["discount_factor"] * cumulated_reward[:, t + 1]
        )

    # Compute the action probabilities over the trajectories so that we can backpropagate
    action_probabilities = []
    for t in range(max_length):
        proba = self.learning_model(trajectories["frame"][:, t])
        # Append the probabilities, introducing the temporal dimension (second dimension)
        action_probabilities.append(proba.unsqueeze(1))
    action_probabilities = torch.cat(action_probabilities, dim=1)
    # We now have a B x T x n_actions tensor

    # Compute the baseline
    baseline = []
    for t in range(max_length):
        b = self.baseline_model(trajectories["frame"][:, t])
        baseline.append(b.unsqueeze(1))
    baseline = torch.cat(baseline, dim=1).squeeze(-1)
    # We now have a B x T tensor

    # Baseline loss
    baseline_loss = (baseline - cumulated_reward) ** 2
    # Average the loss over each episode (considering the mask) ...
    baseline_loss = (baseline_loss * mask).sum(1) / mask.sum(1)
    # ... then over all the trajectories
    avg_baseline_loss = baseline_loss.mean()

    # Same computation for the REINFORCE loss
    action_distribution = torch.distributions.Categorical(action_probabilities)
    log_proba = action_distribution.log_prob(trajectories["action"])
    reinforce_loss = log_proba * (cumulated_reward - baseline).detach()
    reinforce_loss = (reinforce_loss * mask).sum(1) / mask.sum(1)
    avg_reinforce_loss = reinforce_loss.mean()

    # Entropy loss
    entropy = action_distribution.entropy()
    entropy = (entropy * mask).sum(1) / mask.sum(1)
    avg_entropy = entropy.mean()

    return DictTensor({
        "avg_reward": cumulated_reward[:, 0].mean(),
        "baseline_loss": avg_baseline_loss,
        "reinforce_loss": avg_reinforce_loss,
        "entropy_loss": avg_entropy,
    })
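# A quick sanity check (standalone, with made-up values) of the backward recursion used
# above to compute the discounted future return at each timestep.
import torch

rewards = torch.tensor([[1.0, 0.0, 2.0]])   # one trajectory of length 3
discount = 0.9
returns = torch.zeros_like(rewards)
returns[:, -1] = rewards[:, -1]
for t in range(rewards.size(1) - 2, -1, -1):
    returns[:, t] = rewards[:, t] + discount * returns[:, t + 1]
print(returns)                               # tensor([[2.6200, 1.8000, 2.0000]])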
def __call__(self, state, observation, agent_info=None, history=None):
    """
    Execute one step of the agent
    """
    initial_state = observation["initial_state"]
    B = observation.n_elems()

    if agent_info is None:
        agent_info = DictTensor({"stochastic": torch.tensor([True]).repeat(B)})

    agent_step = None
    if state is None:
        assert initial_state.all()
        agent_step = torch.zeros(B).long()
    else:
        agent_step = (
            initial_state.float() * torch.zeros(B)
            + (1 - initial_state.float()) * state["agent_step"]
        ).long()

    _mean, _var = self.model(observation["frame"])
    distribution = torch.distributions.Normal(_mean, _var)
    action_sampled = distribution.sample()
    action_max = _mean
    # Keep the sampled action in stochastic mode, the mean action otherwise
    smask = agent_info["stochastic"].float().unsqueeze(-1).repeat(1, self.action_dim)
    action = action_sampled * smask + (1.0 - smask) * action_max

    new_state = DictTensor({"agent_step": agent_step + 1})
    agent_do = DictTensor({"action": action, "mean": _mean, "std": _var})
    state = DictTensor({"agent_step": agent_step})
    return state, agent_do, new_state
def reset(self, env_info=DictTensor({})):
    N = self.n_envs()
    self.envs_running = torch.arange(N)
    reward = torch.zeros(N)

    last_action = None
    if isinstance(self.gym_envs[0].action_space, gym.spaces.Discrete):
        last_action = torch.zeros(N, dtype=torch.int64)
    else:
        a = self.gym_envs[0].action_space.sample()
        a = torch.tensor(a).unsqueeze(0).repeat(N, 1)
        last_action = a

    done = torch.zeros(N).bool()
    initial_state = torch.ones(N).bool()

    frames = None
    if env_info.empty():
        frames = [format_frame(e.reset()) for e in self.gym_envs]
    else:
        frames = []
        for n in range(len(self.gym_envs)):
            v = {k: env_info[k][n].tolist() for k in env_info.keys()}
            frames.append(format_frame(self.gym_envs[n].reset(env_info=v)))

    _frames = []
    for f in frames:
        if isinstance(f, torch.Tensor):
            _frames.append({"frame": f})
        else:
            _frames.append(f)
    frames = [DictTensor(_f) for _f in _frames]
    frames = DictTensor.cat(frames)
    frames.set("reward", reward)
    frames.set("done", done)
    frames.set("initial_state", initial_state)
    frames.set("last_action", last_action)
    return frames, self.envs_running
def get_policy_loss(self, transitions):
    frame = transitions["frame"]
    B = transitions.n_elems()

    # Compute the policy term
    mean, var = self.learning_model(frame)
    distribution = torch.distributions.Normal(mean, var)
    entropy = distribution.entropy().mean()
    # rsample() keeps the sampling differentiable (reparameterization trick)
    action_tilde = distribution.rsample()
    q1 = self.q1(frame, action_tilde).squeeze(-1)
    q2 = self.q2(frame, action_tilde).squeeze(-1)
    q = torch.min(q1, q2)
    loss = q - self.config["lambda_entropy"] * distribution.log_prob(action_tilde).sum(-1)
    dt = {
        "policy_loss": -loss.mean(),
        "entropy": entropy.detach(),
        "avg_var": var.mean().detach(),
        "avg_mean": mean.mean().detach(),
    }
    return DictTensor(dt)
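# A small, self-contained illustration (values made up) of why rsample() is used in the
# policy loss above: rsample() draws actions with the reparameterization trick, so the
# gradient flows back into the distribution parameters, whereas sample() detaches it.
import torch

mean = torch.zeros(3, requires_grad=True)
dist = torch.distributions.Normal(mean, torch.ones(3))
a = dist.rsample()          # a = mean + std * eps, differentiable w.r.t. mean
a.sum().backward()
print(mean.grad)            # tensor([1., 1., 1.])
# With dist.sample(), the drawn actions are detached and no gradient would reach `mean`.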
def run(self):
    # Instantiate the learning model and the critic model
    self.learning_model = AgentModel(self.obs_dim, self.n_actions, 32)
    self.critic_model = BaselineModel(self.obs_dim, 32)

    # We create a batcher dedicated to evaluation
    model = copy.deepcopy(self.learning_model)
    self.evaluation_batcher = EpisodeBatcher(
        n_timesteps=self.config["max_episode_steps"],
        n_slots=self.config["n_evaluation_episodes"],
        create_agent=self._create_agent,
        create_env=self._create_env,
        env_args={
            "n_envs": self.config["n_envs"],
            "max_episode_steps": self.config["max_episode_steps"],
            "env_name": self.config["env_name"],
        },
        agent_args={"n_actions": self.n_actions, "model": model},
        n_threads=self.config["n_evaluation_threads"],
        seeds=[
            self.config["env_seed"] + k * 10
            for k in range(self.config["n_evaluation_threads"])
        ],
    )

    # Creation of the batcher for sampling pieces of trajectories (i.e. Batcher).
    # The batcher will sample n_threads * n_envs trajectory pieces at each call,
    # each piece being at most a2c_timesteps long.
    model = copy.deepcopy(self.learning_model)
    self.train_batcher = Batcher(
        n_timesteps=self.config["a2c_timesteps"],
        n_slots=self.config["n_envs"] * self.config["n_threads"],
        create_agent=self._create_agent,
        create_env=self._create_train_env,
        env_args={
            "n_envs": self.config["n_envs"],
            "max_episode_steps": self.config["max_episode_steps"],
            "env_name": self.config["env_name"],
        },
        agent_args={"n_actions": self.n_actions, "model": model},
        n_threads=self.config["n_threads"],
        seeds=[
            self.config["env_seed"] + k * 10
            for k in range(self.config["n_threads"])
        ],
    )

    # Creation of the optimizer
    optimizer = torch.optim.Adam(
        nn.Sequential(self.learning_model, self.critic_model).parameters(),
        lr=self.config["lr"],
    )

    # Training loop
    _start_time = time.time()
    self.iteration = 0

    # We launch the evaluation batcher (in deterministic mode)
    n_episodes = self.config["n_evaluation_episodes"]
    agent_info = DictTensor({"stochastic": torch.tensor([False]).repeat(n_episodes)})
    self.evaluation_batcher.execute(n_episodes=n_episodes, agent_info=agent_info)
    self.evaluation_iteration = self.iteration

    # Initialize the training batcher such that agents start to acquire pieces of episodes
    self.train_batcher.update(self.learning_model.state_dict())
    n_episodes = self.config["n_envs"] * self.config["n_threads"]
    agent_info = DictTensor({"stochastic": torch.tensor([True]).repeat(n_episodes)})
    self.train_batcher.reset(agent_info=agent_info)

    while time.time() - _start_time < self.config["time_limit"]:
        # 1) Call the batcher to get a sample of trajectories
        # 2) Get the pieces of episodes. Since the env is an infinite env,
        #    we will always receive a new piece of episode.
        self.train_batcher.execute()
        trajectories = self.train_batcher.get(blocking=True)

        # 3) Now, we compute the loss
        dt = self.get_loss(trajectories)
        [self.logger.add_scalar(k, dt[k].item(), self.iteration) for k in dt.keys()]

        # Computation of the final loss
        ld = self.config["critic_coef"] * dt["critic_loss"]
        lr = self.config["a2c_coef"] * dt["a2c_loss"]
        le = self.config["entropy_coef"] * dt["entropy_loss"]
        floss = ld - le - lr
        floss = floss / n_episodes * trajectories.n_elems()

        optimizer.zero_grad()
        floss.backward()
        optimizer.step()

        # Update the train batcher with the updated model
        self.train_batcher.update(self.learning_model.state_dict())
        self.iteration += 1

        # We check the evaluation batcher
        evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
        if not evaluation_trajectories is None:  # trajectories are available
            # Compute the cumulated reward
            cumulated_reward = (
                evaluation_trajectories["_reward"] * evaluation_trajectories.mask()
            ).sum(1).mean()
            self.logger.add_scalar(
                "evaluation_reward", cumulated_reward.item(), self.evaluation_iteration
            )
            print(
                "At iteration %d, reward is %f"
                % (self.evaluation_iteration, cumulated_reward.item())
            )
            # Re-execute the evaluation batcher (with the same agent_info
            # and the same number of episodes)
            self.evaluation_batcher.update(self.learning_model.state_dict())
            self.evaluation_iteration = self.iteration
            self.evaluation_batcher.reexecute()

    self.train_batcher.close()
    self.evaluation_batcher.get()  # To wait for the last trajectories
    self.evaluation_batcher.close()
    self.logger.update_csv()  # To save as a CSV file in logdir
    self.logger.close()
# The **EpisodeBatcher** will sample full episodes (until the environment returns done==True)
# If one considers a rlstructures.VecEnv env and n_threads (or processes), then the batcher
# will sample n_episodes = N * env.n_envs() * n_threads episodes at each execution
# (where N is chosen by the user)
# *seeds* is a list of environment seeds, one seed per process
# The batcher has to be configured 'at the right size' since all the processes share
# a common *Buffer* to store trajectories
# The simplest case is:
# *n_slots = env.n_envs() x n_threads*
# *n_timesteps* is the number of timesteps that will be acquired at each call

batcher = Batcher(
    n_timesteps=10,
    n_slots=16,
    n_threads=4,
    seeds=[1, 2, 3, 4],
    create_agent=create_agent,
    agent_args={"n_actions": 2},
    create_env=create_env,
    env_args={"max_episode_steps": 100},
)

# A trajectory batcher has to be *reset*
# Then calling *execute* will acquire the next T timesteps
# The *get* method will return *None* once all environments have stopped
batcher.reset(
    agent_info=DictTensor({"agent_id": torch.arange(16)}),
    env_info=DictTensor({"env_id": torch.arange(16)}),
)

import time

batcher.execute()
t = batcher.get()
while not t is None:
    print(t.lengths)
    batcher.execute()
    t = batcher.get()
def run(self):
    # Instantiate the learning model and the baseline model
    self.learning_model = AgentModel(self.obs_dim, self.n_actions, 16)
    self.baseline_model = BaselineModel(self.obs_dim, 16)

    # We create a batcher dedicated to evaluation
    model = copy.deepcopy(self.learning_model)
    self.evaluation_batcher = EpisodeBatcher(
        n_timesteps=self.config["max_episode_steps"],
        n_slots=self.config["n_evaluation_episodes"],
        create_agent=self._create_agent,
        create_env=self._create_env,
        env_args={
            "n_envs": self.config["n_envs"],
            "max_episode_steps": self.config["max_episode_steps"],
            "env_name": self.config["env_name"],
        },
        agent_args={"n_actions": self.n_actions, "model": model},
        n_threads=self.config["n_evaluation_threads"],
        seeds=[
            self.config["env_seed"] + k * 10
            for k in range(self.config["n_evaluation_threads"])
        ],
    )

    # Creation of the batcher for sampling complete episodes (i.e. EpisodeBatcher).
    # The batcher will sample n_threads * n_envs trajectories at each call.
    # To have a fast batcher, we configure it with n_timesteps=self.config["max_episode_steps"]
    model = copy.deepcopy(self.learning_model)
    self.train_batcher = EpisodeBatcher(
        n_timesteps=self.config["max_episode_steps"],
        n_slots=self.config["n_envs"] * self.config["n_threads"],
        create_agent=self._create_agent,
        create_env=self._create_env,
        env_args={
            "n_envs": self.config["n_envs"],
            "max_episode_steps": self.config["max_episode_steps"],
            "env_name": self.config["env_name"],
        },
        agent_args={"n_actions": self.n_actions, "model": model},
        n_threads=self.config["n_threads"],
        seeds=[
            self.config["env_seed"] + k * 10
            for k in range(self.config["n_threads"])
        ],
    )

    # Creation of the optimizer
    optimizer = torch.optim.Adam(
        nn.Sequential(self.learning_model, self.baseline_model).parameters(),
        lr=self.config["lr"],
    )

    # Training loop
    _start_time = time.time()
    self.iteration = 0

    # We launch the evaluation batcher (in deterministic mode)
    n_episodes = self.config["n_evaluation_episodes"]
    agent_info = DictTensor({"stochastic": torch.tensor([False]).repeat(n_episodes)})
    self.evaluation_batcher.execute(n_episodes=n_episodes, agent_info=agent_info)
    self.evaluation_iteration = self.iteration

    while time.time() - _start_time < self.config["time_limit"]:
        # Update the batcher with the last version of the learning model
        self.train_batcher.update(self.learning_model.state_dict())

        # 1) Call the batcher to get a sample of trajectories;
        #    the policy is executed in 'stochastic' mode
        n_episodes = self.config["n_envs"] * self.config["n_threads"]
        agent_info = DictTensor({"stochastic": torch.tensor([True]).repeat(n_episodes)})
        self.train_batcher.execute(n_episodes=n_episodes, agent_info=agent_info)

        # 2) Get the trajectories (waiting until they have been sampled)
        trajectories = self.train_batcher.get(blocking=True)

        # 3) Now, we compute the loss
        dt = self.get_loss(trajectories)
        [self.logger.add_scalar(k, dt[k].item(), self.iteration) for k in dt.keys()]

        # Computation of the final loss
        ld = self.config["baseline_coef"] * dt["baseline_loss"]
        lr = self.config["reinforce_coef"] * dt["reinforce_loss"]
        le = self.config["entropy_coef"] * dt["entropy_loss"]
        floss = ld - le - lr

        optimizer.zero_grad()
        floss.backward()
        optimizer.step()

        # Update the train batcher with the updated model
        self.train_batcher.update(self.learning_model.state_dict())
        print(
            "At iteration %d, avg (discounted) reward is %f"
            % (self.iteration, dt["avg_reward"].item())
        )
        print(
            "\t Avg trajectory length is %f"
            % (trajectories.lengths.float().mean().item())
        )
        print(
            "\t Curves can be visualized using 'tensorboard --logdir=%s'"
            % self.config["logdir"]
        )
        self.iteration += 1

        # We check the evaluation batcher
        evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
        if not evaluation_trajectories is None:  # trajectories are available
            # Compute the cumulated reward
            cumulated_reward = (
                evaluation_trajectories["_reward"] * evaluation_trajectories.mask()
            ).sum(1).mean()
            self.logger.add_scalar(
                "evaluation_reward", cumulated_reward.item(), self.evaluation_iteration
            )
            # Re-execute the evaluation batcher (with the same agent_info
            # and the same number of episodes)
            self.evaluation_batcher.update(self.learning_model.state_dict())
            self.evaluation_iteration = self.iteration
            self.evaluation_batcher.reexecute()

    self.train_batcher.close()
    self.evaluation_batcher.get()  # To wait for the last trajectories
    self.evaluation_batcher.close()
    self.logger.update_csv()  # To save as a CSV file in logdir
    self.logger.close()
def step(self, policy_output):
    assert policy_output.n_elems() == self.n_envs()
    outputs = policy_output.unfold()
    alls = []
    alls_after = []
    for b in range(len(outputs)):
        action = policy_output["action"][b]
        last_action = action
        if isinstance(self.gym_envs[0].action_space, gym.spaces.Discrete):
            action = action.item()
            last_action = last_action.unsqueeze(0)
        else:
            action = action.tolist()
            last_action = last_action.unsqueeze(0)
        initial_state = torch.tensor([False])
        act = action

        frame, reward, done, unused_info = self.gym_envs[b].step(act)
        reward = torch.tensor([reward])
        frame = format_frame(frame)
        if isinstance(frame, torch.Tensor):
            frame = {"frame": frame}
        done = torch.tensor([done])

        r = DictTensor({
            "reward": reward,
            "done": done,
            "initial_state": initial_state,
            "last_action": last_action,
            **frame,
        })
        alls.append(r)

        if done:
            # The episode is finished: reset the environment and build the
            # observation that starts the next episode
            if "set" in dir(self.gym_envs[b]):
                self.gym_envs[b].set(self.env_info[b])
            if self.env_info.empty():
                frame = self.gym_envs[b].reset()
            else:
                v = {k: self.env_info[k][b].tolist() for k in self.env_info.keys()}
                frame = self.gym_envs[b].reset(env_info=v)
            frame = format_frame(frame)
            if isinstance(frame, torch.Tensor):
                frame = {"frame": frame}
            last_action = None
            if isinstance(self.gym_envs[0].action_space, gym.spaces.Discrete):
                last_action = torch.zeros(1, dtype=torch.int64)
            else:
                a = self.gym_envs[0].action_space.sample()
                a = torch.tensor([a])
                last_action = a
            initial_state = torch.tensor([True])
            reward = torch.tensor([0.0])
            r = DictTensor({
                "reward": reward,
                "done": done,
                "initial_state": initial_state,
                "last_action": last_action,
                **frame,
            })
            alls_after.append(r)
        else:
            alls_after.append(r)

    next_observation = DictTensor.cat(alls)
    next_observation_next_slot = DictTensor.cat(alls_after)
    return (
        (next_observation, torch.arange(self.n_envs())),
        (next_observation_next_slot, torch.arange(self.n_envs())),
    )
def acquire_slot(
    buffer,
    env,
    agent,
    agent_state,
    observation,
    agent_info,
    env_running,
):
    """
    Run the agent to fill one slot in the buffer

    Args:
        buffer (SlottedTemporalBuffer): the buffer to store the information
        env (VecEnv): the environment
        agent (Agent): the agent
        agent_state (DictTensor): the current state of the agent
        observation (DictTensor): the current observation from the environment
        agent_info (DictTensor): the information given to the agent
        env_running (torch.LongTensor): the mapping between the batch dimension
            (in agent_state and observation) and the env idx

    Return:
        env_to_slot (dict): a mapping from env_idx to slot indexes
        agent_state, observation, agent_info, env_running: the state at the end of
            the execution; if env_running.size()[0] == 0, there is nothing more to run
    """
    with torch.no_grad():
        require_history = agent.require_history()
        B = env_running.size()[0]
        id_slots = buffer.get_free_slots(B)
        env_to_slot = {
            env_running[i].item(): id_slots[i] for i in range(len(id_slots))
        }
        t = 0
        for t in range(buffer.s_slots):
            _id_slots = [
                env_to_slot[env_running[i].item()]
                for i in range(env_running.size()[0])
            ]
            history = None
            if require_history:
                history = buffer.get_single_slots(_id_slots, erase=False)
            old_agent_state, agent_output, new_agent_state = agent(
                agent_state, observation, agent_info, history=history
            )
            (nobservation, env_running), (nnobservation, nenv_running) = env.step(
                agent_output
            )
            position_in_slot = torch.tensor([t]).repeat(len(_id_slots))
            to_write = (
                observation
                + agent_output
                + old_agent_state
                + new_agent_state.prepend_key("_")
                + nobservation.prepend_key("_")
                + DictTensor({"position_in_slot": position_in_slot})
            )
            id_slots = [
                env_to_slot[env_running[i].item()]
                for i in range(env_running.size()[0])
            ]
            assert id_slots == _id_slots
            buffer.write(id_slots, to_write)

            # Now, let us prepare the next step
            observation = nnobservation
            idxs = [
                k
                for k in range(env_running.size()[0])
                if env_running[k].item() in nenv_running
            ]
            if len(idxs) == 0:
                return env_to_slot, None, None, None, nenv_running
            idxs = torch.tensor(idxs)
            agent_state = new_agent_state.index(idxs)
            agent_info = agent_info.index(idxs)
            env_running = nenv_running
            assert len(agent_state.keys()) == 0 or (
                agent_state.n_elems() == observation.n_elems()
            )
            if nenv_running.size()[0] == 0:
                return env_to_slot, agent_state, observation, agent_info, env_running
        return env_to_slot, agent_state, observation, agent_info, env_running
from rlstructures.env_wrappers import GymEnv
from rlstructures import DictTensor
import torch

envs = [MyEnv() for k in range(4)]
# Each instance of the gym.Env will be initialized with seed+i such that
# the multiple instances have different seeds
env = GymEnv(envs, seed=80)

# Interaction with the environment is easy, and made through DictTensor
obs, who_is_still_running = env.reset()
print(obs)
n_running = who_is_still_running.size()[0]
while n_running > 0:  # While some envs are still running
    action = DictTensor({"action": torch.tensor([0]).repeat(n_running)})
    (obs, who_was_running), (obs2, who_is_still_running) = env.step(action)
    n_running = who_is_still_running.size()[0]
    print(obs2)

# Note that the gym wrappers work with continuous and discrete action spaces,
# but may not work with environments where the action space is more complicated.
# If you are facing gym envs with a complex action space, you may develop your own wrapper.
# A good starting point is the rlstructures.GymEnv code, which is very simple and can be
# used as a template for a new wrapper.
# All the other rlstructures components work with complex action spaces without modification.

# Trajectories in RLStructures
# When acquiring trajectories through the *batcher.get* execution, one receives a **TemporalDictTensor**
# * Each element of the trajectories (at time t) is a complete transition
# To illustrate the structure, let us consider an example:
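# A minimal sketch of what such a trajectory looks like, assuming a `batcher` built as in
# the Batcher example earlier in this document; the exact keys depend on the agent and the
# environment ("_reward", "_done", ... are the transition fields used throughout this document).
batcher.execute()
trajectories = batcher.get()             # a TemporalDictTensor of B trajectories
print(trajectories.lengths)              # number of valid timesteps per trajectory
print(trajectories.mask())               # B x T mask: 1 for real transitions, 0 for padding
print(trajectories["action"].size())     # action taken at time t          (B x T)
print(trajectories["_reward"].size())    # reward observed at time t+1     (B x T)
print(trajectories["_done"].size())      # whether the episode ends at t+1 (B x T)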
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

###### DictTensor

# A DictTensor is a dictionary of pytorch tensors. It assumes that the
# first dimension of each tensor contained in the DictTensor is the batch dimension.
# The easiest way to build a DictTensor is to use a dictionary of tensors as input

from rlstructures import DictTensor
import torch

d = DictTensor({"x": torch.randn(3, 5), "y": torch.randn(3, 8)})

# The number of batches is accessible through n_elems()
print(d.n_elems(), " <- number of elements in the batch")

# Many methods can be used over DictTensor (see the DictTensor documentation)
d["x"]     # Returns the tensor 'x' in the DictTensor
d.keys()   # Returns the names of the variables of the DictTensor

# An empty DictTensor can be defined as follows:
d = DictTensor({})

###### TemporalDictTensor

# A TemporalDictTensor is a sequence of DictTensors. In memory, it is stored as a
# dictionary of tensors, where the first dimension is the batch dimension and the
# second dimension is time, together with a `lengths` tensor giving the length of
# each sequence.
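# A small constructed example (the shapes and the constructor call follow the
# TemporalDictTensor usage visible elsewhere in this document; treat it as an
# illustrative sketch rather than API documentation): a batch of 3 sequences over
# at most 6 timesteps, with per-sequence lengths.
from rlstructures import TemporalDictTensor

tdt = TemporalDictTensor(
    {"frame": torch.randn(3, 6, 5), "action": torch.zeros(3, 6).long()},
    lengths=torch.tensor([6, 4, 2]),
)
print(tdt.lengths)         # number of valid timesteps per sequence
print(tdt.mask())          # 3 x 6 mask: 1 for valid timesteps, 0 for padding
print(tdt["frame"][:, 0])  # the values of "frame" at time t = 0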
def get_loss(self, trajectories):
    device = self.config["learner_device"]
    trajectories = trajectories.to(device)
    max_length = trajectories.lengths.max().item()
    assert trajectories.lengths.eq(max_length).all()

    actions = trajectories["action"]
    actions_probabilities = trajectories["action_probabilities"]
    reward = trajectories["_reward"]
    frame = trajectories["frame"]
    last_action = trajectories["last_action"]
    done = trajectories["_done"].float()

    # Recompute the model outputs over the trajectories
    n_action_scores = []
    n_values = []
    hidden_state = trajectories["agent_state"][:, 0]
    for T in range(max_length):
        # Reinitialize the recurrent state at the beginning of each episode
        hidden_state = masked_tensor(
            hidden_state,
            trajectories["agent_state"][:, T],
            trajectories["initial_state"][:, T],
        )
        _as, _v, hidden_state = self.learning_model(
            hidden_state, frame[:, T], last_action[:, T]
        )
        n_action_scores.append(_as.unsqueeze(1))
        n_values.append(_v.unsqueeze(1))
    n_action_scores = torch.cat(n_action_scores, dim=1)
    n_values = torch.cat(
        [*n_values, torch.zeros(trajectories.n_elems(), 1, 1).to(device)], dim=1
    ).squeeze(-1)

    # Compute the value function for the last state
    _idx = torch.arange(trajectories.n_elems()).to(device)
    _hidden_state = hidden_state.detach()
    _frame = trajectories["_frame"][_idx, trajectories.lengths - 1]
    _last_action = trajectories["_last_action"][_idx, trajectories.lengths - 1]
    _, _v, _ = self.learning_model(_hidden_state, _frame, _last_action)
    n_values[_idx, trajectories.lengths] = _v.squeeze(-1)

    advantage = self.get_gae(
        trajectories,
        n_values,
        discount_factor=self.config["discount_factor"],
        _lambda=self.config["gae_lambda"],
    )
    value_loss = advantage ** 2
    avg_value_loss = value_loss.mean()

    # Clipped PPO objective
    n_action_probabilities = torch.softmax(n_action_scores, dim=2)
    n_action_distribution = torch.distributions.Categorical(n_action_probabilities)
    log_a = torch.distributions.Categorical(actions_probabilities).log_prob(actions)
    log_na = n_action_distribution.log_prob(actions)
    ratios = torch.exp(log_na - log_a)
    surr1 = ratios * advantage
    surr2 = (
        torch.clamp(ratios, 1 - self.config["eps_clip"], 1 + self.config["eps_clip"])
        * advantage
    )
    ppo_loss = torch.min(surr1, surr2)
    avg_ppo_loss = ppo_loss.mean()

    entropy_loss = n_action_distribution.entropy()
    avg_entropy_loss = entropy_loss.mean()

    dt = DictTensor({
        "entropy_loss": avg_entropy_loss,
        "ppo_loss": avg_ppo_loss,
        "value_loss": avg_value_loss,
    })
    return dt
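# A toy illustration (values made up) of the clipped surrogate used above: ratios outside
# [1 - eps_clip, 1 + eps_clip] stop contributing extra objective once clipped.
import torch

ratios = torch.tensor([0.5, 1.0, 1.5])
advantage = torch.tensor([1.0, 1.0, 1.0])
eps = 0.2
surr1 = ratios * advantage
surr2 = torch.clamp(ratios, 1 - eps, 1 + eps) * advantage
print(torch.min(surr1, surr2))               # tensor([0.5000, 1.0000, 1.2000])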
def get_single(self, slots, position):
    assert isinstance(slots, list)
    assert isinstance(slots[0], int)
    idx = torch.tensor(slots).to(self._device).long()
    d = {k: self.buffers[k][idx, position] for k in self.buffers}
    return DictTensor(d)
def run(self):
    self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
    device = torch.device(self.config["learner_device"])
    self.learning_model.to(device)
    self.q1.to(device)
    self.q2.to(device)
    self.target_q1.to(device)
    self.target_q2.to(device)

    optimizer = torch.optim.Adam(self.learning_model.parameters(), lr=self.config["lr"])
    optimizer_q1 = torch.optim.Adam(self.q1.parameters(), lr=self.config["lr"])
    optimizer_q2 = torch.optim.Adam(self.q2.parameters(), lr=self.config["lr"])

    self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
    self.evaluation_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))

    n_episodes = self.config["n_envs"] * self.config["n_threads"]
    self.train_batcher.reset(
        agent_info=DictTensor({"stochastic": torch.zeros(n_episodes).eq(0.0)})
    )

    logging.info("Sampling initial transitions")
    n_iterations = int(
        self.config["n_starting_transitions"] / (n_episodes * self.config["batch_timesteps"])
    )
    for k in range(n_iterations):
        self.train_batcher.execute()
        trajectories = self.train_batcher.get()
        self.replay_buffer.push(trajectories)
    print("replay_buffer_size = ", self.replay_buffer.size())

    n_episodes = self.config["n_evaluation_rollouts"]
    stochastic = torch.tensor([self.config["evaluation_mode"] == "stochastic"]).repeat(n_episodes)
    self.evaluation_batcher.execute(
        agent_info=DictTensor({"stochastic": stochastic}), n_episodes=n_episodes
    )

    logging.info("Starting Learning")
    _start_time = time.time()
    logging.info("Learning")
    while time.time() - _start_time < self.config["time_limit"]:
        self.train_batcher.execute()
        trajectories = self.train_batcher.get()
        self.replay_buffer.push(trajectories)
        self.logger.add_scalar("replay_buffer_size", self.replay_buffer.size(), self.iteration)

        for k in range(self.config["n_batches_per_epochs"]):
            transitions = self.replay_buffer.sample(n=self.config["size_batches"])

            # Update of the two critics
            dt, transitions = self.get_q_loss(transitions, device)
            [self.logger.add_scalar(key, dt[key].item(), self.iteration) for key in dt.keys()]
            optimizer_q1.zero_grad()
            dt["q1_loss"].backward()
            optimizer_q1.step()
            optimizer_q2.zero_grad()
            dt["q2_loss"].backward()
            optimizer_q2.step()

            # Update of the policy
            optimizer.zero_grad()
            dt = self.get_policy_loss(transitions)
            [self.logger.add_scalar(key, dt[key].item(), self.iteration) for key in dt.keys()]
            dt["policy_loss"].backward()
            optimizer.step()

            # Soft update of the target critics
            tau = self.config["tau"]
            self.soft_update_params(self.q1, self.target_q1, tau)
            self.soft_update_params(self.q2, self.target_q2, tau)

            self.iteration += 1

        self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
        self.evaluate()
def get(self):
    with torch.no_grad():
        obs, is_running = self.env.reset(self.env_info)
        n_elems = obs.n_elems()
        observations = [{k: obs[k] for k in obs.keys()}]
        states = []
        agent_state = None
        agent_info = self.agent_info
        if agent_info is None:
            agent_info = DictTensor({})
        t = 0
        length = torch.zeros(is_running.size()[0]).long()
        first_state = None
        first_info = agent_info

        while is_running.size()[0] > 0:
            old_agent_state, agent_output, new_agent_state = self.agent(
                agent_state, obs, agent_info
            )
            if len(states) == 0:
                first_state = old_agent_state
                s = {k: old_agent_state[k] for k in old_agent_state.keys()}
                s = {**s, **{k: agent_output[k] for k in agent_output.keys()}}
                s = {**s, **{"_" + k: new_agent_state[k] for k in new_agent_state.keys()}}
                states.append(s)
            else:
                s = {k: old_agent_state[k] for k in old_agent_state.keys()}
                s = {**s, **{k: agent_output[k] for k in agent_output.keys()}}
                s = {**s, **{"_" + k: new_agent_state[k] for k in new_agent_state.keys()}}
                ns = {k: states[0][k].clone() for k in states[0]}
                for k in states[0]:
                    ns[k][is_running] = s[k]
                states.append(ns)

            (l_o, l_is_running), (obs, is_running) = self.env.step(agent_output)

            for k in l_o.keys():
                observations[t]["_" + k] = observations[0][k].clone()
            for k in l_o.keys():
                observations[t]["_" + k][l_is_running] = l_o[k]
            length[l_is_running] += 1
            t += 1

            if is_running.size()[0] > 0:
                observations.append({})
                for k in obs.keys():
                    observations[t][k] = observations[0][k].clone()
                for k in obs.keys():
                    observations[t][k][is_running] = obs[k]
                ag = {k: first_state[k].clone() for k in first_state.keys()}
                for k in ag:
                    ag[k][l_is_running] = new_agent_state[k]
                agent_state = DictTensor({k: ag[k][is_running] for k in ag})
                ai = {k: first_info[k].clone() for k in first_info.keys()}
                agent_info = DictTensor({k: ai[k][is_running] for k in ai})

        f_observations = {}
        for k in observations[0]:
            _all = [o[k].unsqueeze(1) for o in observations]
            f_observations[k] = torch.cat(_all, dim=1)
        f_states = {}
        for k in states[0]:
            _all = [o[k].unsqueeze(1) for o in states]
            f_states[k] = torch.cat(_all, dim=1)
        return TemporalDictTensor({**f_observations, **f_states}, lengths=length)