    def get(self, blocking=True):
        # In non-blocking mode, return None if any worker is still running.
        if not blocking:
            for w in range(len(self.workers)):
                if not self.workers[w].finished():
                    return None

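        # Gather the buffer slot ids produced by each worker and track the
        # largest number of slots spanned by a single trajectory.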
        max_length = 0
        buffer_slot_id_lists = []
        for w in range(len(self.workers)):
            if self.n_per_worker[w] > 0:
                sid_lists = self.workers[w].get()
                for sids in sid_lists:
                    buffer_slot_id_lists.append(sids)
                    max_length = max(max_length, len(sids))
        if max_length > 1:
            if not self.warning:
                logging.info(
                    "================== EpisodeBatcher: trajectories over"
                    + " multiple slots => may slow down the acquisition process"
                )
                self.warning = True
            return self.buffer.get_multiple_slots(buffer_slot_id_lists, erase=True)
        else:
            buffer_slot_ids = [i[0] for i in buffer_slot_id_lists]
            return self.buffer.get_single_slots(buffer_slot_ids, erase=True)
    def __init__(
        self,
        n_timesteps,
        n_slots,
        create_agent,
        agent_args,
        create_env,
        env_args,
        n_threads,
        seeds=None,
    ):
        # Buffer creation: build a dummy agent/environment pair to infer the
        # tensor specifications needed to allocate the shared buffer.
        agent = create_agent(**agent_args)
        env = create_env(**{**env_args, "seed": 0})
        obs, who = env.reset()
        with torch.no_grad():
            agent_state, agent_output, _ = agent(None, obs)

        self.n_envs = env.n_envs()
        specs_agent_state = agent_state.specs()
        specs_agent_output = agent_output.specs()
        specs_environment = obs.specs()
        del env
        del agent

        self.buffer = LocalBuffer(
            n_slots=n_slots,
            s_slots=n_timesteps,
            specs_agent_state=specs_agent_state,
            specs_agent_output=specs_agent_output,
            specs_environment=specs_environment,
        )
        self.workers = []
        self.n_per_worker = []
        self.warning = False

        if seeds is None:
            logging.info(
                "Seeds for the batcher environments have not been chosen."
                + " Defaulting to None"
            )
            seeds = [None] * n_threads

        if isinstance(seeds, int):
            # A single int seed is expanded into one distinct seed per thread.
            s = seeds
            seeds = [s + k * 64 for k in range(n_threads)]
        assert len(seeds) == n_threads, "You have to choose one seed per thread"
        logging.info("[EpisodeBatcher] Creating %d threads", n_threads)
        for k in range(n_threads):
            e_args = {**env_args, "seed": seeds[k]}
            worker = ThreadWorker(
                len(self.workers),
                create_agent,
                agent_args,
                create_env,
                e_args,
                self.buffer,
            )
            self.workers.append(worker)
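
A construction sketch for reference (hypothetical: the factory functions make_agent/make_env and all argument values below are illustrative, not taken from this snippet):

# Hypothetical usage -- names and values are illustrative only.
batcher = EpisodeBatcher(
    n_timesteps=200,        # size of one buffer slot, in timesteps
    n_slots=512,            # number of slots in the LocalBuffer
    create_agent=make_agent,
    agent_args={},
    create_env=make_env,
    env_args={},
    n_threads=4,
    seeds=42,               # an int is expanded to 42 + 64 * k per thread
)
# Once episodes have been launched (through a method not shown here),
# get() returns the acquired trajectories; get(blocking=False) returns
# None while any worker is still running.
trajectories = batcher.get(blocking=False)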
Example #3
    def run(self):
        self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)
        self.target_model.to(device)
        optimizer = torch.optim.Adam(
            self.learning_model.parameters(), lr=self.config["lr"]
        )

        # Send an initial CPU copy of the model weights to the batchers.
        self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
        self.evaluation_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))

        n_episodes = self.config["n_envs"] * self.config["n_threads"]
        self.train_batcher.reset(
            agent_info=DictTensor(
                {"epsilon": torch.ones(n_episodes) * self.config["epsilon_greedy"]}
            )
        )
        logging.info("Sampling initial transitions")
        for k in range(self.config["initial_buffer_epochs"]):
            self.train_batcher.execute()
            trajectories = self.train_batcher.get()
            self.replay_buffer.push(trajectories)
        
        n_episodes = self.config["n_evaluation_rollouts"]
        self.evaluation_batcher.execute(
            agent_info=DictTensor({"epsilon": torch.zeros(n_episodes)}),
            n_episodes=n_episodes,
        )
        
        logging.info("Starting Learning")
        _start_time = time.time()

        while time.time() - _start_time < self.config["time_limit"]:
            
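            # Acquire new trajectories with the current policy and add them
            # to the replay buffer.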
            self.train_batcher.execute()
            trajectories = self.train_batcher.get()
            self.replay_buffer.push(trajectories)
            self.logger.add_scalar("replay_buffer_size", self.replay_buffer.size(), self.iteration)

            for k in range(self.config["qvalue_epochs"]):
                optimizer.zero_grad()
                dt = self.get_loss(device)
                for key in dt.keys():
                    self.logger.add_scalar(key, dt[key].item(), self.iteration)

                floss = dt["q_loss"]
                floss.backward()
                if self.config["clip_grad"] > 0:
                    n = torch.nn.utils.clip_grad_norm_(
                        self.learning_model.parameters(), self.config["clip_grad"]
                    )
                    self.logger.add_scalar("grad_norm", n.item(), self.iteration)
                optimizer.step()
                self.iteration += 1

                # Polyak averaging of the target network towards the learner.
                tau = self.config["tau"]
                self.soft_update_params(self.learning_model, self.target_model, tau)

            self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
            self.evaluate()
            self.iteration += 1
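
soft_update_params is not defined in this snippet. A minimal sketch of the Polyak-averaging update it presumably performs (the signature matches the calls above; the body is an assumption):

    def soft_update_params(self, source, target, tau):
        # Assumed behavior: target <- tau * source + (1 - tau) * target,
        # applied parameter-wise without tracking gradients.
        with torch.no_grad():
            for s_param, t_param in zip(source.parameters(), target.parameters()):
                t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)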
Example #4
    def run(self):
        self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)

        self.q1.to(device)
        self.q2.to(device)
        self.target_q1.to(device)
        self.target_q2.to(device)
        optimizer = torch.optim.Adam(
            self.learning_model.parameters(), lr=self.config["lr"]
        )
        optimizer_q1 = torch.optim.Adam(self.q1.parameters(), lr=self.config["lr"])
        optimizer_q2 = torch.optim.Adam(self.q2.parameters(), lr=self.config["lr"])

        self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
        self.evaluation_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))

        n_episodes = self.config["n_envs"] * self.config["n_threads"]
        # "stochastic" is an all-True mask: exploration uses the stochastic policy.
        self.train_batcher.reset(
            agent_info=DictTensor({"stochastic": torch.zeros(n_episodes).eq(0.0)})
        )
        logging.info("Sampling initial transitions")
        n_iterations = int(
            self.config["n_starting_transitions"]
            / (n_episodes * self.config["batch_timesteps"])
        )
        for k in range(n_iterations):
            self.train_batcher.execute()
            trajectories = self.train_batcher.get()
            self.replay_buffer.push(trajectories)
        logging.info("replay_buffer_size = %d", self.replay_buffer.size())

        n_episodes = self.config["n_evaluation_rollouts"]
        stochastic = torch.tensor(
            [self.config["evaluation_mode"] == "stochastic"]
        ).repeat(n_episodes)
        self.evaluation_batcher.execute(
            agent_info=DictTensor({"stochastic": stochastic}), n_episodes=n_episodes
        )

        logging.info("Starting Learning")
        _start_time = time.time()

        while time.time() - _start_time < self.config["time_limit"]:
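            # Collect fresh trajectories from the workers and store them in
            # the replay buffer.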
            self.train_batcher.execute()
            trajectories = self.train_batcher.get()
            self.replay_buffer.push(trajectories)
            self.logger.add_scalar("replay_buffer_size", self.replay_buffer.size(), self.iteration)

            for k in range(self.config["n_batches_per_epochs"]):
                transitions = self.replay_buffer.sample(n=self.config["size_batches"])

                # Critic update: fit both Q networks on the sampled batch.
                dt, transitions = self.get_q_loss(transitions, device)
                for key in dt.keys():
                    self.logger.add_scalar(key, dt[key].item(), self.iteration)
                optimizer_q1.zero_grad()
                dt["q1_loss"].backward()
                optimizer_q1.step()

                optimizer_q2.zero_grad()
                dt["q2_loss"].backward()
                optimizer_q2.step()

                # Actor update on the same transitions.
                optimizer.zero_grad()
                dt = self.get_policy_loss(transitions)
                for key in dt.keys():
                    self.logger.add_scalar(key, dt[key].item(), self.iteration)
                dt["policy_loss"].backward()
                optimizer.step()

                # Polyak averaging of both target critics.
                tau = self.config["tau"]
                self.soft_update_params(self.q1, self.target_q1, tau)
                self.soft_update_params(self.q2, self.target_q2, tau)

                self.iteration += 1

            self.train_batcher.update(self._state_dict(self.learning_model, torch.device("cpu")))
            self.evaluate()
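
_state_dict (used in both examples to ship weights to the batchers) is also not shown. A plausible minimal implementation, assuming it only moves a copy of the parameters to the requested device:

    def _state_dict(self, model, device):
        # Assumed helper: copy every tensor of the state dict to `device`
        # (CPU above) so that batcher workers can load it without touching
        # the learner's GPU memory.
        sd = model.state_dict()
        return {k: v.detach().to(device) for k, v in sd.items()}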