def get(self, blocking=True):
    # In non-blocking mode, return None if any worker is still running.
    if not blocking:
        for w in range(len(self.workers)):
            b = self.workers[w].finished()
            if not b:
                return None

    # Collect the buffer slot ids produced by each worker and track the
    # maximum number of slots used by a single trajectory.
    max_length = 0
    buffer_slot_id_lists = []
    for w in range(len(self.workers)):
        if self.n_per_worker[w] > 0:
            sid_lists = self.workers[w].get()
            for sids in sid_lists:
                buffer_slot_id_lists.append(sids)
                max_length = max(max_length, len(sids))

    if max_length > 1:
        if not self.warning:
            logging.info(
                "================== EpisodeBatcher: trajectories over"
                + " multiple slots => may slow down the acquisition process"
            )
            self.warning = True
        return self.buffer.get_multiple_slots(buffer_slot_id_lists, erase=True)
    else:
        buffer_slot_ids = [i[0] for i in buffer_slot_id_lists]
        return self.buffer.get_single_slots(buffer_slot_ids, erase=True)
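# Usage sketch for get(): with blocking=True the call waits for all workers;
# with blocking=False it returns None as soon as one worker is still running,
# so a caller can poll. `batcher` below is a hypothetical EpisodeBatcher
# instance used only for illustration:
#
#   trajectories = batcher.get(blocking=False)
#   if trajectories is None:
#       pass  # workers still running; retry later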
def __init__(
    self,
    n_timesteps,
    n_slots,
    create_agent,
    agent_args,
    create_env,
    env_args,
    n_threads,
    seeds=None,
):
    # Buffer creation: instantiate a temporary agent/env pair to infer the
    # specs of the agent state, agent output and environment observations.
    agent = create_agent(**agent_args)
    env = create_env(**{**env_args, "seed": 0})
    obs, who = env.reset()
    with torch.no_grad():
        a, b, c = agent(None, obs)
    self.n_envs = env.n_envs()
    specs_agent_state = a.specs()
    specs_agent_output = b.specs()
    specs_environment = obs.specs()
    del env
    del agent

    self.buffer = LocalBuffer(
        n_slots=n_slots,
        s_slots=n_timesteps,
        specs_agent_state=specs_agent_state,
        specs_agent_output=specs_agent_output,
        specs_environment=specs_environment,
    )
    self.workers = []
    self.n_per_worker = []
    self.warning = False

    # One seed per thread: None (default), a single int (expanded below),
    # or an explicit list of length n_threads.
    if seeds is None:
        logging.info(
            "Seeds for batcher environments have not been chosen. Default"
            + " is None"
        )
        seeds = [None for k in range(n_threads)]
    if isinstance(seeds, int):
        s = seeds
        seeds = [s + k * 64 for k in range(n_threads)]
    assert len(seeds) == n_threads, "You have to choose one seed per thread"

    logging.info("[EpisodeBatcher] Creating %d threads" % (n_threads))
    for k in range(n_threads):
        e_args = {**env_args, "seed": seeds[k]}
        worker = ThreadWorker(
            len(self.workers),
            create_agent,
            agent_args,
            create_env,
            e_args,
            self.buffer,
        )
        self.workers.append(worker)
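# Construction sketch following the signature above. `my_create_agent`,
# `my_create_env` and their argument dicts are hypothetical names used only
# for illustration, not part of this codebase:
#
#   batcher = EpisodeBatcher(
#       n_timesteps=100,
#       n_slots=128,
#       create_agent=my_create_agent,
#       agent_args={"n_actions": 4},
#       create_env=my_create_env,
#       env_args={"max_episode_steps": 100},
#       n_threads=4,
#       seeds=42,  # an int seed expands to 42 + 64 * k for thread k
#   )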
def run(self):
    self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
    device = torch.device(self.config["learner_device"])
    self.learning_model.to(device)
    self.target_model.to(device)
    optimizer = torch.optim.Adam(
        self.learning_model.parameters(), lr=self.config["lr"]
    )

    # Send the initial weights (on CPU) to the acquisition batchers.
    self.train_batcher.update(
        self._state_dict(self.learning_model, torch.device("cpu"))
    )
    self.evaluation_batcher.update(
        self._state_dict(self.learning_model, torch.device("cpu"))
    )

    n_episodes = self.config["n_envs"] * self.config["n_threads"]
    self.train_batcher.reset(
        agent_info=DictTensor(
            {"epsilon": torch.ones(n_episodes) * self.config["epsilon_greedy"]}
        )
    )

    logging.info("Sampling initial transitions")
    for k in range(self.config["initial_buffer_epochs"]):
        self.train_batcher.execute()
        trajectories = self.train_batcher.get()
        self.replay_buffer.push(trajectories)

    n_episodes = self.config["n_evaluation_rollouts"]
    self.evaluation_batcher.execute(
        agent_info=DictTensor({"epsilon": torch.zeros(n_episodes)}),
        n_episodes=n_episodes,
    )

    logging.info("Starting Learning")
    _start_time = time.time()
    while time.time() - _start_time < self.config["time_limit"]:
        # Acquire new trajectories and add them to the replay buffer.
        self.train_batcher.execute()
        trajectories = self.train_batcher.get()
        self.replay_buffer.push(trajectories)
        self.logger.add_scalar(
            "replay_buffer_size", self.replay_buffer.size(), self.iteration
        )

        for k in range(self.config["qvalue_epochs"]):
            optimizer.zero_grad()
            dt = self.get_loss(device)
            for key in dt.keys():
                self.logger.add_scalar(key, dt[key].item(), self.iteration)
            floss = dt["q_loss"]
            floss.backward()
            if self.config["clip_grad"] > 0:
                n = torch.nn.utils.clip_grad_norm_(
                    self.learning_model.parameters(), self.config["clip_grad"]
                )
                self.logger.add_scalar("grad_norm", n.item(), self.iteration)
            self.iteration += 1
            optimizer.step()

            # Soft update of the target network.
            tau = self.config["tau"]
            self.soft_update_params(self.learning_model, self.target_model, tau)

        self.train_batcher.update(
            self._state_dict(self.learning_model, torch.device("cpu"))
        )
        self.evaluate()
        self.iteration += 1
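# Sketch of the soft (Polyak) target update called above. soft_update_params
# is defined elsewhere in the codebase; this is only a minimal sketch of the
# assumed behaviour, with tau the interpolation coefficient from config["tau"]:
#
#   def soft_update_params(self, net, target_net, tau):
#       for param, target_param in zip(net.parameters(), target_net.parameters()):
#           target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)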
def run(self):
    self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
    device = torch.device(self.config["learner_device"])
    self.learning_model.to(device)
    self.q1.to(device)
    self.q2.to(device)
    self.target_q1.to(device)
    self.target_q2.to(device)

    # One optimizer for the policy and one for each Q network.
    optimizer = torch.optim.Adam(
        self.learning_model.parameters(), lr=self.config["lr"]
    )
    optimizer_q1 = torch.optim.Adam(self.q1.parameters(), lr=self.config["lr"])
    optimizer_q2 = torch.optim.Adam(self.q2.parameters(), lr=self.config["lr"])

    self.train_batcher.update(
        self._state_dict(self.learning_model, torch.device("cpu"))
    )
    self.evaluation_batcher.update(
        self._state_dict(self.learning_model, torch.device("cpu"))
    )

    n_episodes = self.config["n_envs"] * self.config["n_threads"]
    self.train_batcher.reset(
        agent_info=DictTensor({"stochastic": torch.zeros(n_episodes).eq(0.0)})
    )

    logging.info("Sampling initial transitions")
    n_iterations = int(
        self.config["n_starting_transitions"]
        / (n_episodes * self.config["batch_timesteps"])
    )
    for k in range(n_iterations):
        self.train_batcher.execute()
        trajectories = self.train_batcher.get()
        self.replay_buffer.push(trajectories)
    logging.info("replay_buffer_size = %d" % self.replay_buffer.size())

    n_episodes = self.config["n_evaluation_rollouts"]
    stochastic = torch.tensor(
        [self.config["evaluation_mode"] == "stochastic"]
    ).repeat(n_episodes)
    self.evaluation_batcher.execute(
        agent_info=DictTensor({"stochastic": stochastic}), n_episodes=n_episodes
    )

    logging.info("Starting Learning")
    _start_time = time.time()
    while time.time() - _start_time < self.config["time_limit"]:
        self.train_batcher.execute()
        trajectories = self.train_batcher.get()
        self.replay_buffer.push(trajectories)
        self.logger.add_scalar(
            "replay_buffer_size", self.replay_buffer.size(), self.iteration
        )

        for k in range(self.config["n_batches_per_epochs"]):
            transitions = self.replay_buffer.sample(n=self.config["size_batches"])

            # Update the two Q networks.
            dt, transitions = self.get_q_loss(transitions, device)
            for key in dt.keys():
                self.logger.add_scalar(key, dt[key].item(), self.iteration)
            optimizer_q1.zero_grad()
            dt["q1_loss"].backward()
            optimizer_q1.step()
            optimizer_q2.zero_grad()
            dt["q2_loss"].backward()
            optimizer_q2.step()

            # Update the policy.
            optimizer.zero_grad()
            dt = self.get_policy_loss(transitions)
            for key in dt.keys():
                self.logger.add_scalar(key, dt[key].item(), self.iteration)
            dt["policy_loss"].backward()
            optimizer.step()

            # Soft update of the target Q networks.
            tau = self.config["tau"]
            self.soft_update_params(self.q1, self.target_q1, tau)
            self.soft_update_params(self.q2, self.target_q2, tau)

            self.iteration += 1

        self.train_batcher.update(
            self._state_dict(self.learning_model, torch.device("cpu"))
        )
        self.evaluate()