Example #1
    def initialize(self, env_spaces, share_memory=False,
            global_B=1, env_ranks=None):
        super().initialize(env_spaces, share_memory,
            global_B=global_B, env_ranks=env_ranks)
        self.target_model = self.ModelCls(**self.env_model_kwargs,
            **self.model_kwargs)
        self.target_model.load_state_dict(self.model.state_dict())
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)
Example #2
class DqnAgent(EpsilonGreedyAgentMixin, BaseAgent):
    def __call__(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        q = self.model(*model_inputs)
        return q.cpu()

    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        super().initialize(env_spaces,
                           share_memory,
                           global_B=global_B,
                           env_ranks=env_ranks)
        self.target_model = self.ModelCls(**self.env_model_kwargs,
                                          **self.model_kwargs)
        self.target_model.load_state_dict(self.model.state_dict())
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)

    def to_device(self, cuda_idx=None):
        super().to_device(cuda_idx)
        self.target_model.to(self.device)

    def state_dict(self):
        return dict(model=self.model.state_dict(),
                    target=self.target_model.state_dict())

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        q = self.model(*model_inputs)
        q = q.cpu()
        action = self.distribution.sample(q)
        agent_info = AgentInfo(q=q)
        # action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def target(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        target_q = self.target_model(*model_inputs)
        return target_q.cpu()

    def update_target(self, tau=1):
        update_state_dict(self.target_model, self.model.state_dict(), tau)
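
Note: update_state_dict() above blends the main network's parameters into the target network. A minimal sketch of such a helper, assuming a Polyak-style rule in which tau=1 is a hard copy and 0 < tau < 1 is a soft update (an illustration, not rlpyt's exact implementation):

import torch

def update_state_dict(target_model, new_state_dict, tau=1.0):
    """Blend new_state_dict into target_model: tau=1 copies the parameters
    outright, 0 < tau < 1 does target = tau * new + (1 - tau) * target."""
    if tau == 1:
        target_model.load_state_dict(new_state_dict)
    else:
        blended = {k: tau * v + (1 - tau) * target_model.state_dict()[k]
                   for k, v in new_state_dict.items()}
        target_model.load_state_dict(blended)

# Usage with throwaway modules:
model = torch.nn.Linear(4, 2)
target = torch.nn.Linear(4, 2)
target.load_state_dict(model.state_dict())                 # start identical, as in initialize()
update_state_dict(target, model.state_dict(), tau=0.005)   # slow soft tracking
update_state_dict(target, model.state_dict(), tau=1.0)     # periodic hard copy
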
Example #3
    def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
        """Along with standard initialization, creates vector-valued epsilon
        for exploration, if applicable, with a different epsilon for each
        environment instance."""
        self.model = self.ModelCls(
            image_shape=env_spaces.observation.shape,
            action_size=env_spaces.action.n,
            **self.model_kwargs
        )
        if self.load_conv:
            logger.log("Agent loading state dict: " + self.state_dict_filename)
            loaded_state_dict = torch.load(
                self.state_dict_filename, map_location=torch.device("cpu")
            )
            # From UL, saves snapshot: params["algo_state_dict"]["encoder"]
            loaded_state_dict = loaded_state_dict.get(
                "algo_state_dict", loaded_state_dict
            )
            loaded_state_dict = loaded_state_dict.get("encoder", loaded_state_dict)
            # A bit onerous, but ensures that state dicts match:
            conv_state_dict = OrderedDict(
                [
                    (k.replace("conv.", "", 1), v)
                    for k, v in loaded_state_dict.items()
                    if k.startswith("conv.")
                ]
            )
            self.model.conv.load_state_dict(conv_state_dict)
            logger.log("Agent loaded CONV state dict.")
        elif self.load_all:
            # From RL, saves snapshot: params["agent_state_dict"]
            loaded_state_dict = torch.load(
                self.state_dict_filename, map_location=torch.device("cpu")
            )
            self.load_state_dict(loaded_state_dict["agent_state_dict"])
            logger.log("Agnet loaded FULL state dict.")
        else:
            logger.log("Agent NOT loading state dict.")

        self.target_model = copy.deepcopy(self.model)
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)
        if share_memory:
            self.model.share_memory()
            self.shared_model = self.model
        if self.initial_model_state_dict is not None:
            raise NotImplementedError
        self.env_spaces = env_spaces
        self.share_memory = share_memory
Example #4
    def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
        """
        初始化agent。这个函数在Sampler类(例如SerialSampler)中的 initialize() 里会被调用。

        :param env_spaces: 参考 Env.spaces(),类型为 EnvSpaces 这样一个 namedtuple,包含observation space 和 action space两个属性。
        :param share_memory: 为 True 时使得模型参数可以在多进程间共享,为 False 时不共享。
        :param global_B: 在BatchSpec中,表示独立的trajectory的数量,即environment实例的数量。这里的global_B可能是指所有env的总数
        :param env_ranks: 其含义参考我写的文章 https://www.codelast.com/?p=10932
        """
        super().initialize(env_spaces, share_memory, global_B=global_B, env_ranks=env_ranks)
        self.target_model = self.ModelCls(**self.env_model_kwargs, **self.model_kwargs)  # a subclass of torch.nn.Module
        self.target_model.load_state_dict(self.model.state_dict())  # load the parameters so the target network starts out identical to the main network
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)  # explore with the ε-greedy method; n is the action dimension (number of discrete actions)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)
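
Note: the EpsilonGreedy distribution created in initialize() is what turns Q-values into actions in step(). A toy stand-in (not rlpyt's EpsilonGreedy class) showing the underlying rule, assuming a single fixed epsilon:

import torch

class ToyEpsilonGreedy:
    """Toy epsilon-greedy sampler: greedy action with probability 1 - epsilon,
    uniform random action otherwise (illustrative only)."""

    def __init__(self, dim, epsilon=0.1):
        self.dim = dim          # number of discrete actions
        self.epsilon = epsilon

    def sample(self, q):
        greedy = torch.argmax(q, dim=-1)                  # best action per row
        random = torch.randint(self.dim, greedy.shape)    # uniform random action
        explore = torch.rand(greedy.shape) < self.epsilon
        return torch.where(explore, random, greedy)

dist = ToyEpsilonGreedy(dim=6, epsilon=0.05)
q = torch.randn(8, 6)        # batch of Q-values, one row per environment instance
actions = dist.sample(q)     # LongTensor of shape (8,)
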
Example #5
    def initialize(self, env_spaces, share_memory=False):
        env_model_kwargs = self.make_env_to_model_kwargs(env_spaces)
        self.model = self.ModelCls(**env_model_kwargs, **self.model_kwargs)
        if share_memory:
            self.model.share_memory()
            self.shared_model = self.model
        if self.initial_model_state_dict is not None:
            self.model.load_state_dict(self.initial_model_state_dict)
        self.target_model = self.ModelCls(**env_model_kwargs,
                                          **self.model_kwargs)
        self.target_model.load_state_dict(self.model.state_dict())
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        self.env_spaces = env_spaces
        self.env_model_kwargs = env_model_kwargs
        self.share_memory = share_memory
        super().initialize(env_spaces, share_memory)
Example #6
    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        """Along with standard initialization, creates vector-valued epsilon
        for exploration, if applicable, with a different epsilon for each
        environment instance."""
        super().initialize(env_spaces,
                           share_memory,
                           global_B=global_B,
                           env_ranks=env_ranks)
        self.target_model = self.ModelCls(**self.env_model_kwargs,
                                          **self.model_kwargs)
        self.target_model.load_state_dict(self.model.state_dict())
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)
Example #7
class AtariDqnAgent(EpsilonGreedyAgentMixin, BaseAgent):
    """
    Standard agent for DQN algorithms with epsilon-greedy exploration.  
    """
    def __init__(self,
                 ModelCls=AtariDqnModel,
                 model_kwargs=None,
                 load_conv=False,
                 load_all=False,
                 state_dict_filename=None,
                 store_latent=False,
                 **kwargs):
        if model_kwargs is None:
            model_kwargs = dict()
        assert not (load_conv and load_all)
        save__init__args(locals())
        super().__init__(ModelCls=ModelCls, **kwargs)

    def __call__(self, observation, prev_action, prev_reward):
        """Returns Q-values for states/observations (with grad)."""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        q, _conv = self.model(*model_inputs)
        return q.cpu()

    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        """Along with standard initialization, creates vector-valued epsilon
        for exploration, if applicable, with a different epsilon for each
        environment instance."""
        self.model = self.ModelCls(image_shape=env_spaces.observation.shape,
                                   action_size=env_spaces.action.n,
                                   **self.model_kwargs)
        if self.load_conv:
            logger.log("Agent loading state dict: " + self.state_dict_filename)
            loaded_state_dict = torch.load(self.state_dict_filename,
                                           map_location=torch.device('cpu'))
            # From UL, saves snapshot: params["algo_state_dict"]["encoder"]
            loaded_state_dict = loaded_state_dict.get("algo_state_dict",
                                                      loaded_state_dict)
            loaded_state_dict = loaded_state_dict.get("encoder",
                                                      loaded_state_dict)
            # A bit onerous, but ensures that state dicts match:
            conv_state_dict = OrderedDict([
                (k.replace("conv.", "", 1), v)
                for k, v in loaded_state_dict.items() if k.startswith("conv.")
            ])
            self.model.conv.load_state_dict(conv_state_dict)
            logger.log("Agent loaded CONV state dict.")
        elif self.load_all:
            # From RL, saves snapshot: params["agent_state_dict"]
            loaded_state_dict = torch.load(self.state_dict_filename,
                                           map_location=torch.device('cpu'))
            self.load_state_dict(loaded_state_dict["agent_state_dict"])
            logger.log("Agnet loaded FULL state dict.")
        else:
            logger.log("Agent NOT loading state dict.")

        self.target_model = copy.deepcopy(self.model)
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)
        if share_memory:
            self.model.share_memory()
            self.shared_model = self.model
        if self.initial_model_state_dict is not None:
            raise NotImplementedError
        self.env_spaces = env_spaces
        self.share_memory = share_memory

    def to_device(self, cuda_idx=None):
        super().to_device(cuda_idx)
        self.target_model.to(self.device)

    def state_dict(self):
        return dict(model=self.model.state_dict(),
                    target=self.target_model.state_dict())

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        """Computes Q-values for states/observations and selects actions by
        epsilon-greedy. (no grad)"""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        q, conv = self.model(*model_inputs)
        q = q.cpu()
        action = self.distribution.sample(q)
        agent_info = AgentInfoConv(q=q,
                                   conv=conv if self.store_latent else None)
        # action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def target(self, observation, prev_action, prev_reward):
        """Returns the target Q-values for states/observations."""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        target_q, _conv = self.target_model(*model_inputs)
        return target_q.cpu()

    def update_target(self, tau=1):
        """Copies the model parameters into the target model."""
        update_state_dict(self.target_model, self.model.state_dict(), tau)
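
Note: the load_conv branch above restores only the convolutional encoder from a larger snapshot, so it filters the loaded keys and strips the "conv." prefix to match self.model.conv's own state_dict(). A self-contained illustration of that key manipulation (the keys and values here are made up):

from collections import OrderedDict

loaded_state_dict = OrderedDict([
    ("conv.0.weight", "w0"),
    ("conv.0.bias", "b0"),
    ("head.weight", "wh"),   # non-conv entry, gets dropped
])

# Keep only convolutional entries and strip the leading "conv." so the keys
# line up with a bare conv module's state_dict().
conv_state_dict = OrderedDict(
    (k.replace("conv.", "", 1), v)
    for k, v in loaded_state_dict.items()
    if k.startswith("conv.")
)
print(conv_state_dict)   # OrderedDict([('0.weight', 'w0'), ('0.bias', 'b0')])
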
Example #8
class DqnAgent(EpsilonGreedyAgentMixin, BaseAgent):

    def __call__(self, observation, prev_action, prev_reward):
        """
        __call__使得一个class可以像一个method一样调用,即:假设agent为DqnAgent的一个对象,那么agent(observation, prev_action,
        prev_reward)就等同于调用agent.__call__(observation, prev_action, prev_reward)
        """
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device)
        q = self.model(*model_inputs)  # self.model is an instance of a torch.nn.Module subclass; calling it goes through torch.nn.Module.__call__, i.e. computes the model output (a Tensor)
        return q.cpu()  # move the tensor to the CPU (host memory)

    def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
        """
        初始化agent。这个函数在Sampler类(例如SerialSampler)中的 initialize() 里会被调用。

        :param env_spaces: 参考 Env.spaces(),类型为 EnvSpaces 这样一个 namedtuple,包含observation space 和 action space两个属性。
        :param share_memory: 为 True 时使得模型参数可以在多进程间共享,为 False 时不共享。
        :param global_B: 在BatchSpec中,表示独立的trajectory的数量,即environment实例的数量。这里的global_B可能是指所有env的总数
        :param env_ranks: 其含义参考我写的文章 https://www.codelast.com/?p=10932
        """
        super().initialize(env_spaces, share_memory, global_B=global_B, env_ranks=env_ranks)
        self.target_model = self.ModelCls(**self.env_model_kwargs, **self.model_kwargs)  # a subclass of torch.nn.Module
        self.target_model.load_state_dict(self.model.state_dict())  # load the parameters so the target network starts out identical to the main network
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)  # explore with the ε-greedy method; n is the action dimension (number of discrete actions)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)

    def to_device(self, cuda_idx=None):
        """
        指定把模型数据(parameter和buffer)放在什么设备上(CPU/GPU)。
        父类是指定self.model的数据放在哪个GPU上,在本子类中是指定self.target_model的数据放在哪。

        :param cuda_idx: GPU编号
        """
        super().to_device(cuda_idx)
        self.target_model.to(self.device)  # self.device was hard-coded to the CPU at initialization, so this places the target model on the CPU

    def state_dict(self):
        """
        返回main network和target network两个网络的state数据。例如网络的weight,bias等。

        :return: 一个dict。
        """
        return dict(model=self.model.state_dict(), target=self.target_model.state_dict())

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        """
        在environment中走一步。environment类(例如AtariEnv也有一个step(),那个step()和这里的step()的主要区别是:这里的step()根据policy
        network选取了一个action,而AtariEnv里的step()输入的action是已经选取好的action,并且AtariEnv的step()会计算reward,记录一些统计
        信息等,这里的step()不会去计算reward。
        这个函数在Collector类的collect_batch()函数中会被调用。
        这里会发生policy network的前向传播过程(比较耗计算资源的操作),即根据输入(例如observation)计算下一步要采取的action。

        :param observation: 其义自明。
        :param prev_action: 前一个action。
        :param prev_reward: 之前累积的reward。
        :return: 要采取的action(类型为torch.Tensor),以及agent的信息(例如Q值)
        """
        prev_action = self.distribution.to_onehot(prev_action)  # returns a torch.Tensor
        model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device)  # policy-network inputs (torch.Tensor)
        q = self.model(*model_inputs)  # self.model is a torch.nn.Module subclass object; feeding the inputs through it computes the network output, so the NN forward pass happens here
        q = q.cpu()  # move the tensor to the CPU (host memory); returns a torch.Tensor
        action = self.distribution.sample(q)  # choose an action (torch.Tensor)
        agent_info = AgentInfo(q=q)
        # action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def target(self, observation, prev_action, prev_reward):
        """
        计算Q值。

        :param observation: 如其名。
        :param prev_action: 前一个action。
        :param prev_reward: 前一个reward。
        :return: CPU(内存)里的Q值对应的Tensor。
        """
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device)
        # Compute the Q-values; self.target_model is an instance of a torch.nn.Module subclass, so calling it goes through torch.nn.Module.__call__ and computes the model output (a Tensor)
        target_q = self.target_model(*model_inputs)
        return target_q.cpu()  # move the tensor to the CPU (host memory)

    def update_target(self, tau=1):
        """
        更新target network,即把main network(self.model)的参数拷贝到target network(self.target_model)上。
        当τ>0的时候会使用soft update算法来更新参数。
        为了保持learning过程的稳定性以及高效性,target network不是实时更新,而是周期性地更新一次。例如,在DQN.optimize_agent()函数中,
        会看到每隔一定的周期才会调用一次本函数的代码逻辑。

        :param tau: soft update算法里的τ参数。
        """
        update_state_dict(self.target_model, self.model.state_dict(), tau)
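
Note: buffer_to() in the methods above moves a whole (possibly nested) tuple of tensors onto the agent's device before the forward pass. A rough stand-in showing the idea (not rlpyt's buffer_to, which handles more buffer types):

import torch
from collections import namedtuple

def move_to(buffer, device):
    """Recursively move tensors inside (named)tuples to a device."""
    if isinstance(buffer, torch.Tensor):
        return buffer.to(device)
    if isinstance(buffer, tuple):
        moved = [move_to(b, device) for b in buffer]
        # namedtuples are rebuilt field by field, plain tuples directly
        return type(buffer)(*moved) if hasattr(buffer, "_fields") else tuple(moved)
    return buffer

Inputs = namedtuple("Inputs", ["observation", "prev_action", "prev_reward"])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = Inputs(torch.zeros(2, 4, 84, 84), torch.zeros(2, 6), torch.zeros(2))
inputs = move_to(inputs, device)   # then: q = self.model(*inputs)
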
Example #9
class DqnAgent(EpsilonGreedyAgentMixin, BaseAgent):
    """
    Standard agent for DQN algorithms with epsilon-greedy exploration.  
    """
    def __call__(self, observation, prev_action, prev_reward):
        """Returns Q-values for states/observations (with grad)."""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        q = self.model(*model_inputs)
        return q.cpu()

    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        """Along with standard initialization, creates vector-valued epsilon
        for exploration, if applicable, with a different epsilon for each
        environment instance."""
        super().initialize(env_spaces,
                           share_memory,
                           global_B=global_B,
                           env_ranks=env_ranks)
        self.target_model = self.ModelCls(**self.env_model_kwargs,
                                          **self.model_kwargs)
        self.target_model.load_state_dict(self.model.state_dict())
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)

    def to_device(self, cuda_idx=None):
        super().to_device(cuda_idx)
        self.target_model.to(self.device)

    def state_dict(self):
        return dict(model=self.model.state_dict(),
                    target=self.target_model.state_dict())

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        """Computes Q-values for states/observations and selects actions by
        epsilon-greedy. (no grad)"""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        q = self.model(*model_inputs)
        q = q.cpu()
        action = self.distribution.sample(q)
        agent_info = AgentInfo(q=q)
        # action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def target(self, observation, prev_action, prev_reward):
        """Returns the target Q-values for states/observations."""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        target_q = self.target_model(*model_inputs)
        return target_q.cpu()

    def update_target(self, tau=1):
        """Copies the model parameters into the target model."""
        update_state_dict(self.target_model, self.model.state_dict(), tau)
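
Note: the agent exposes three forward paths, __call__ (with grad, for training), step() (no grad, for sampling), and target() (target-network Q-values). A minimal sketch of how a DQN loss could combine their outputs, with made-up tensors standing in for a sampled batch (illustrative only, not rlpyt's algorithm code):

import torch
import torch.nn.functional as F

batch_size, n_actions, discount = 32, 6, 0.99
q = torch.randn(batch_size, n_actions, requires_grad=True)   # would come from agent(obs, ...)
target_q = torch.randn(batch_size, n_actions)                # would come from agent.target(next_obs, ...)
action = torch.randint(n_actions, (batch_size,))
reward = torch.randn(batch_size)
done = torch.zeros(batch_size)

q_taken = q.gather(1, action.unsqueeze(1)).squeeze(1)        # Q(s, a) for the sampled actions
with torch.no_grad():
    y = reward + discount * (1 - done) * target_q.max(dim=1).values
loss = F.mse_loss(q_taken, y)
loss.backward()   # gradients flow into q only, never into the target values
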
Example #10
class DqnAgent(EpsilonGreedyAgentMixin, BaseAgent):
    def __call__(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        q = self.model(*model_inputs)
        return q.cpu()

    def initialize(self, env_spaces, share_memory=False):
        env_model_kwargs = self.make_env_to_model_kwargs(env_spaces)
        self.model = self.ModelCls(**env_model_kwargs, **self.model_kwargs)
        if share_memory:
            self.model.share_memory()
            self.shared_model = self.model
        if self.initial_model_state_dict is not None:
            self.model.load_state_dict(self.initial_model_state_dict)
        self.target_model = self.ModelCls(**env_model_kwargs,
                                          **self.model_kwargs)
        self.target_model.load_state_dict(self.model.state_dict())
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        self.env_spaces = env_spaces
        self.env_model_kwargs = env_model_kwargs
        self.share_memory = share_memory
        super().initialize(env_spaces, share_memory)

    def initialize_cuda(self, cuda_idx=None, ddp=False):
        if cuda_idx is None:
            return  # CPU
        if self.shared_model is not None:
            self.model = self.ModelCls(**self.env_model_kwargs,
                                       **self.model_kwargs)
            self.model.load_state_dict(self.shared_model.state_dict())
        self.device = torch.device("cuda", index=cuda_idx)
        self.model.to(self.device)
        if ddp:
            self.model = DDP(self.model,
                             device_ids=[cuda_idx],
                             output_device=cuda_idx)
            logger.log("Initialized DistributedDataParallel agent model "
                       f"on device: {self.device}.")
        else:
            logger.log(f"Initialized agent model on device: {self.device}.")
        self.target_model.to(self.device)

    def make_env_to_model_kwargs(self, env_spaces):
        raise NotImplementedError

    def state_dict(self):
        return dict(model=self.model.state_dict(),
                    target=self.target_model.state_dict())

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        q = self.model(*model_inputs)
        q = q.cpu()
        action = self.distribution.sample(q)
        agent_info = AgentInfo(q=q)
        # action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def target(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        target_q = self.target_model(*model_inputs)
        return target_q.cpu()

    def update_target(self):
        self.target_model.load_state_dict(self.model.state_dict())
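
Note: make_env_to_model_kwargs() is left abstract in this variant; a concrete agent supplies the mapping from the environment spaces to the model constructor's keyword arguments. A hypothetical subclass sketch (the argument names follow the other examples above and are not a verified rlpyt signature):

class MyImageDqnAgent(DqnAgent):
    """Hypothetical concrete agent for image observations and discrete actions."""

    def make_env_to_model_kwargs(self, env_spaces):
        # Assumes the observation space exposes .shape and the action space
        # exposes .n, as used throughout the examples above.
        return dict(
            image_shape=env_spaces.observation.shape,
            action_size=env_spaces.action.n,
        )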