class BaseExperiment:
    def __init__(self, config, create_env, create_agent):
        assert self.check_arguments(config)
        self.config = config
        # Creation of the Logger (that saves in tensorboard and CSV)
        self.logger = TFLogger(log_dir=self.config["logdir"], hps=self.config)
        self.batchers = []
        self._create_env = create_env
        self._create_agent = create_agent

    def check_arguments(self, arguments):
        """Check that the arguments provided in the config are the expected ones."""
        return True

    def register_batcher(self, batcher):
        """Register a new batcher when you create one, to ensure a correct closing of the experiment."""
        self.batchers.append(batcher)

    def _create_model(self):
        # self.learning_model = ...
        raise NotImplementedError

    def create_model(self):
        self.learning_model = self._create_model()
        self.iteration = 0

    def reset(self):
        raise NotImplementedError

    def run(self):
        raise NotImplementedError

    def terminate(self):
        for b in self.batchers:
            b.close()
        self.logger.close()

    def go(self):
        self.create_model()
        self.reset()
        self.run()
        self.terminate()
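# The sketch below is not part of the original code: it is a minimal, hypothetical
# illustration of how BaseExperiment is meant to be subclassed and launched.
# The Linear model, the "obs_dim"/"n_actions" config keys and the commented-out
# usage are assumptions made only for this example (it assumes torch.nn is
# imported as nn, as elsewhere in this code).


class LinearExperimentSketch(BaseExperiment):
    def _create_model(self):
        # Any torch.nn.Module works here; a single Linear layer keeps the sketch small.
        return nn.Linear(self.config["obs_dim"], self.config["n_actions"])

    def reset(self):
        # Batchers created here should be registered so that terminate() closes them:
        # self.register_batcher(my_batcher)
        pass

    def run(self):
        # The training loop of the concrete experiment goes here.
        pass


# Typical usage (assuming a suitable config dict and env/agent factory functions):
# experiment = LinearExperimentSketch(config, create_env, create_agent)
# experiment.go()  # create_model() -> reset() -> run() -> terminate()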
class A2C:
    def __init__(self, config, create_env, create_train_env, create_agent):
        self.config = config
        # Creation of the Logger (that saves in tensorboard and CSV)
        self.logger = TFLogger(log_dir=self.config["logdir"], hps=self.config)
        self._create_env = create_env
        self._create_train_env = create_train_env
        self._create_agent = create_agent
        # Creation of one env instance to get the dimensionality of observations and the number of actions
        env = self._create_env(self.config["n_envs"], seed=0, env_name=self.config["env_name"])
        self.n_actions = env.action_space.n
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def run(self):
        # Instantiate the learning model and the critic model
        self.learning_model = AgentModel(self.obs_dim, self.n_actions, 32)
        self.critic_model = BaselineModel(self.obs_dim, 32)

        # We create a batcher dedicated to evaluation
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_episodes"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=[self.config["env_seed"] + k * 10 for k in range(self.config["n_evaluation_threads"])],
        )

        # Creation of the batcher for sampling pieces of trajectories (i.e. Batcher)
        # The batcher will sample n_threads * n_envs trajectory pieces at each call,
        # each of length n_timesteps=self.config["a2c_timesteps"]
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = Batcher(
            n_timesteps=self.config["a2c_timesteps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_train_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=[self.config["env_seed"] + k * 10 for k in range(self.config["n_threads"])],
        )

        # Creation of the optimizer
        optimizer = torch.optim.Adam(
            nn.Sequential(self.learning_model, self.critic_model).parameters(),
            lr=self.config["lr"],
        )

        # Training loop
        _start_time = time.time()
        self.iteration = 0

        # We launch the evaluation batcher (in deterministic mode)
        n_episodes = self.config["n_evaluation_episodes"]
        agent_info = DictTensor({"stochastic": torch.tensor([False]).repeat(n_episodes)})
        self.evaluation_batcher.execute(n_episodes=n_episodes, agent_info=agent_info)
        self.evaluation_iteration = self.iteration

        # Initialize the training batcher such that agents start to acquire pieces of episodes
        self.train_batcher.update(self.learning_model.state_dict())
        n_episodes = self.config["n_envs"] * self.config["n_threads"]
        agent_info = DictTensor({"stochastic": torch.tensor([True]).repeat(n_episodes)})
        self.train_batcher.reset(agent_info=agent_info)

        while time.time() - _start_time < self.config["time_limit"]:
            # Call the batcher to get a sample of trajectories.
            # Since the env is an infinite env, we always receive a new piece of episode.
            self.train_batcher.execute()
            trajectories = self.train_batcher.get(blocking=True)

            # Now, we compute the loss
            dt = self.get_loss(trajectories)
            for k in dt.keys():
                self.logger.add_scalar(k, dt[k].item(), self.iteration)

            # Computation of the final loss
            ld = self.config["critic_coef"] * dt["critic_loss"]
            lr = self.config["a2c_coef"] * dt["a2c_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]
            floss = ld - le - lr
            # Rescale the loss by the number of trajectory pieces actually received
            floss = floss / n_episodes * trajectories.n_elems()

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            # Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            self.iteration += 1

            # We check the evaluation batcher
            evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None:  # trajectories are available
                # Compute the cumulated reward
                cumulated_reward = (evaluation_trajectories["_reward"] * evaluation_trajectories.mask()).sum(1).mean()
                self.logger.add_scalar("evaluation_reward", cumulated_reward.item(), self.evaluation_iteration)
                print("At iteration %d, reward is %f" % (self.evaluation_iteration, cumulated_reward.item()))
                # We re-execute the evaluation batcher (with the same agent_info and the same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories):
        # The reward is obtained at t+1 (after the action), so we use the '_reward' field
        # of the trajectory. The 'reward' field corresponds to the reward at time t.
        reward = trajectories["_reward"]

        # We get the mask that tells which transition is in a trajectory (1) or not (0)
        mask = trajectories.mask()

        # We remove the reward values that are not in the trajectories
        reward = reward * mask
        max_length = trajectories.lengths.max().item()

        # Now, we compute the action probabilities over the trajectories such that we will be able to do 'backward'
        action_probabilities = []
        for t in range(max_length):
            proba = self.learning_model(trajectories["frame"][:, t])
            # We append the probability and introduce the temporal (2nd) dimension
            action_probabilities.append(proba.unsqueeze(1))
        action_probabilities = torch.cat(action_probabilities, dim=1)  # Now, we have a B x T x n_actions tensor

        # We compute the critic value for t=0 to T (i.e. including the very last observation)
        critic = []
        for t in range(max_length):
            b = self.critic_model(trajectories["frame"][:, t])
            critic.append(b.unsqueeze(1))
        critic = torch.cat(critic + [b.unsqueeze(1)], dim=1).squeeze(-1)  # Now, we have a B x (T+1) tensor

        # We also need the critic value for the last observation of each trajectory (to compute the TD).
        # It may be the last element of the trajectory (if the episode is not finished) or the last frame of the episode.
        idx = torch.arange(trajectories.n_elems())
        last_critic = self.critic_model(trajectories["_frame"][idx, trajectories.lengths - 1]).squeeze(-1)
        critic[idx, trajectories.lengths] = last_critic

        # We compute the temporal difference
        target = reward + self.config["discount_factor"] * (1 - trajectories["_done"].float()) * critic[:, 1:].detach()
        td = critic[:, :-1] - target
        critic_loss = td ** 2

        # We average the loss over each episode (considering the mask)
        critic_loss = (critic_loss * mask).sum(1) / mask.sum(1)

        # We average the loss over all the trajectories
        avg_critic_loss = critic_loss.mean()

        # We do the same for the a2c loss
        action_distribution = torch.distributions.Categorical(action_probabilities)
        log_proba = action_distribution.log_prob(trajectories["action"])
        a2c_loss = -log_proba * td.detach()
        a2c_loss = (a2c_loss * mask).sum(1) / mask.sum(1)
        avg_a2c_loss = a2c_loss.mean()

        # We compute the entropy loss
        entropy = action_distribution.entropy()
        entropy = (entropy * mask).sum(1) / mask.sum(1)
        avg_entropy = entropy.mean()

        return DictTensor({
            "critic_loss": avg_critic_loss,
            "a2c_loss": avg_a2c_loss,
            "entropy_loss": avg_entropy,
        })
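# AgentModel and BaselineModel are used above (and in Reinforce below) but are not
# defined in this file. The sketch below is only an assumption about their shape,
# consistent with how they are called: AgentModel(obs_dim, n_actions, hidden_size)
# returning action probabilities, and BaselineModel(obs_dim, hidden_size) returning
# one value per observation. It is illustrative, not the original implementation,
# and assumes torch / torch.nn (as nn) are imported as elsewhere in this code.


class AgentModelSketch(nn.Module):
    """Maps a batch of observations to a probability distribution over discrete actions."""

    def __init__(self, obs_dim, n_actions, hidden_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, n_actions),
        )

    def forward(self, frame):
        scores = self.net(frame)
        return torch.softmax(scores, dim=-1)  # B x n_actions probabilities


class BaselineModelSketch(nn.Module):
    """Maps a batch of observations to a single baseline/critic value each."""

    def __init__(self, obs_dim, hidden_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, frame):
        return self.net(frame)  # B x 1 values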
class Reinforce:
    def __init__(self, config, create_env, create_agent):
        self.config = config
        # Creation of the Logger (that saves in tensorboard and CSV)
        self.logger = TFLogger(log_dir=self.config["logdir"], hps=self.config)
        self._create_env = create_env
        self._create_agent = create_agent
        # Creation of one env instance to get the dimensionality of observations and the number of actions
        env = self._create_env(self.config["n_envs"], seed=0, env_name=self.config["env_name"])
        self.n_actions = env.action_space.n
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def run(self):
        # Instantiate the learning model and the baseline model
        self.learning_model = AgentModel(self.obs_dim, self.n_actions, 16)
        self.baseline_model = BaselineModel(self.obs_dim, 16)

        # We create a batcher dedicated to evaluation
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_episodes"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=[self.config["env_seed"] + k * 10 for k in range(self.config["n_evaluation_threads"])],
        )

        # Creation of the batcher for sampling complete episodes (i.e. EpisodeBatcher)
        # The batcher will sample n_threads * n_envs trajectories at each call
        # To have a fast batcher, we have to configure it with n_timesteps=self.config["max_episode_steps"]
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=[self.config["env_seed"] + k * 10 for k in range(self.config["n_threads"])],
        )

        # Creation of the optimizer
        optimizer = torch.optim.Adam(
            nn.Sequential(self.learning_model, self.baseline_model).parameters(),
            lr=self.config["lr"],
        )

        # Training loop
        _start_time = time.time()
        self.iteration = 0

        # We launch the evaluation batcher (in deterministic mode)
        n_episodes = self.config["n_evaluation_episodes"]
        agent_info = DictTensor({"stochastic": torch.tensor([False]).repeat(n_episodes)})
        self.evaluation_batcher.execute(n_episodes=n_episodes, agent_info=agent_info)
        self.evaluation_iteration = self.iteration

        while time.time() - _start_time < self.config["time_limit"]:
            # Update the batcher with the last version of the learning model
            self.train_batcher.update(self.learning_model.state_dict())

            # Call the batcher to get a sample of trajectories
            # 1) The policy will be executed in 'stochastic' mode
            n_episodes = self.config["n_envs"] * self.config["n_threads"]
            agent_info = DictTensor({"stochastic": torch.tensor([True]).repeat(n_episodes)})
            self.train_batcher.execute(n_episodes=n_episodes, agent_info=agent_info)

            # 2) We get the trajectories (and wait until the trajectories have been sampled)
            trajectories = self.train_batcher.get(blocking=True)

            # 3) Now, we compute the loss
            dt = self.get_loss(trajectories)
            for k in dt.keys():
                self.logger.add_scalar(k, dt[k].item(), self.iteration)

            # Computation of the final loss
            ld = self.config["baseline_coef"] * dt["baseline_loss"]
            lr = self.config["reinforce_coef"] * dt["reinforce_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]
            floss = ld - le - lr

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            # Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())

            print("At iteration %d, avg (discounted) reward is %f" % (self.iteration, dt["avg_reward"].item()))
            print("\t Avg trajectory length is %f" % (trajectories.lengths.float().mean().item()))
            print("\t Curves can be visualized using 'tensorboard --logdir=%s'" % self.config["logdir"])
            self.iteration += 1

            # We check the evaluation batcher
            evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None:  # trajectories are available
                # Compute the cumulated reward
                cumulated_reward = (evaluation_trajectories["_reward"] * evaluation_trajectories.mask()).sum(1).mean()
                self.logger.add_scalar("evaluation_reward", cumulated_reward.item(), self.evaluation_iteration)
                # We re-execute the evaluation batcher (with the same agent_info and the same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories):
        # First, we want to compute the cumulated reward per trajectory.
        # The reward is obtained at t+1 (after the action), so we use the '_reward' field
        # of the trajectory. The 'reward' field corresponds to the reward at time t.
        reward = trajectories["_reward"]

        # We get the mask that tells which transition is in a trajectory (1) or not (0)
        mask = trajectories.mask()

        # We remove the reward values that are not in the trajectories
        reward = reward * mask

        # We compute the future cumulated reward at each timestep (by reverse computation)
        max_length = trajectories.lengths.max().item()
        cumulated_reward = torch.zeros_like(reward)
        cumulated_reward[:, max_length - 1] = reward[:, max_length - 1]
        for t in range(max_length - 2, -1, -1):
            cumulated_reward[:, t] = reward[:, t] + self.config["discount_factor"] * cumulated_reward[:, t + 1]

        # Now, we compute the action probabilities over the trajectories such that we will be able to do 'backward'
        action_probabilities = []
        for t in range(max_length):
            proba = self.learning_model(trajectories["frame"][:, t])
            # We append the probability and introduce the temporal (2nd) dimension
            action_probabilities.append(proba.unsqueeze(1))
        action_probabilities = torch.cat(action_probabilities, dim=1)  # Now, we have a B x T x n_actions tensor

        # We compute the baseline
        baseline = []
        for t in range(max_length):
            b = self.baseline_model(trajectories["frame"][:, t])
            baseline.append(b.unsqueeze(1))
        baseline = torch.cat(baseline, dim=1).squeeze(-1)  # Now, we have a B x T tensor

        # We compute the baseline loss
        baseline_loss = (baseline - cumulated_reward) ** 2

        # We average the loss over each episode (considering the mask)
        baseline_loss = (baseline_loss * mask).sum(1) / mask.sum(1)

        # We average the loss over all the trajectories
        avg_baseline_loss = baseline_loss.mean()

        # We do the same for the reinforce loss
        action_distribution = torch.distributions.Categorical(action_probabilities)
        log_proba = action_distribution.log_prob(trajectories["action"])
        reinforce_loss = log_proba * (cumulated_reward - baseline).detach()
        reinforce_loss = (reinforce_loss * mask).sum(1) / mask.sum(1)
        avg_reinforce_loss = reinforce_loss.mean()

        # We compute the entropy loss
        entropy = action_distribution.entropy()
        entropy = (entropy * mask).sum(1) / mask.sum(1)
        avg_entropy = entropy.mean()

        return DictTensor({
            "avg_reward": cumulated_reward[:, 0].mean(),
            "baseline_loss": avg_baseline_loss,
            "reinforce_loss": avg_reinforce_loss,
            "entropy_loss": avg_entropy,
        })
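# The block below is an illustrative launch sketch, not part of the original code.
# Every config key is one that Reinforce actually reads above; the concrete values,
# the "CartPole-v0" environment name and the availability of create_env/create_agent
# factory functions are assumptions made only for this example (A2C would additionally
# need "a2c_timesteps", "critic_coef", "a2c_coef" and a create_train_env factory).

if __name__ == "__main__":
    config = {
        "env_name": "CartPole-v0",
        "n_envs": 4,
        "n_threads": 4,
        "n_evaluation_threads": 2,
        "n_evaluation_episodes": 16,
        "max_episode_steps": 100,
        "env_seed": 42,
        "discount_factor": 0.9,
        "lr": 0.001,
        "baseline_coef": 0.1,
        "reinforce_coef": 1.0,
        "entropy_coef": 0.01,
        "time_limit": 600,  # training budget in seconds
        "logdir": "./results",
    }
    # With create_env and create_agent in scope, training would be launched with:
    # experiment = Reinforce(config, create_env, create_agent)
    # experiment.run()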