Example #1
 async def adapt(self,
                 train_futures,
                 first_order=None,
                 args=None,
                 inner=None):
     if first_order is None:
         first_order = self.first_order
     # Loop over the number of steps of adaptation
     params = None
     for futures in train_futures:
         inner_loss = reinforce_loss(self.policy,
                                     await futures,
                                     params=params)
         if inner:
             params = self.policy.update_params_inner(
                 inner_loss,
                 params=params,
                 step_size=self.fast_lr,
                 first_order=first_order,
                 args=args)
         else:
             params = self.policy.update_params_outer(
                 inner_loss,
                 params=params,
                 step_size=self.fast_lr,
                 first_order=first_order)
     return params
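
Note: `update_params_inner` / `update_params_outer` are not shown in this snippet. As a rough reference only, a functional MAML-style parameter update typically differentiates the inner loss with respect to the current adapted parameters and returns a new OrderedDict without touching the module itself; the helper below is a hypothetical sketch of that pattern, with `create_graph=not first_order` controlling whether second-order gradients can later flow through the update.

from collections import OrderedDict
import torch

def functional_sgd_step(loss, params, step_size=0.5, first_order=False):
    # Differentiate the inner loss w.r.t. the current (possibly already adapted) parameters.
    # Keeping the graph (create_graph=True) allows a second-order MAML update later on.
    grads = torch.autograd.grad(loss,
                                list(params.values()),
                                create_graph=not first_order)
    # Return the new parameters functionally; the policy module itself is left unchanged.
    return OrderedDict((name, param - step_size * grad)
                       for (name, param), grad in zip(params.items(), grads))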
Example #2
    def sample(self, index, num_steps=1, fast_lr=0.5, gamma=0.95, gae_lambda=1.0, device='cpu'):
        # Sample the training trajectories with the initial policy and adapt the
        # policy to the task, based on the REINFORCE loss computed on the
        # training trajectories. The gradient update in the fast adaptation uses
        # `first_order=True` no matter if the second order version of MAML is
        # applied since this is only used for sampling trajectories, and not
        # for optimization.
        params = None
        for step in range(num_steps):
            train_episodes = self.create_episodes(params=params,
                                                  gamma=gamma,
                                                  gae_lambda=gae_lambda,
                                                  device=device)
            train_episodes.log('_enqueueAt', datetime.now(timezone.utc))
            # QKFIX: Deep copy the episodes before sending them to their
            # respective queues, to avoid a race condition. This issue would 
            # cause the policy pi = policy(observations) to be miscomputed for
            # some timesteps, which in turn makes the loss explode.
            self.train_queue.put((index, step, deepcopy(train_episodes)))

            with self.policy_lock:
                loss = reinforce_loss(self.policy, train_episodes, params=params)
                params = self.policy.update_params(loss,
                                                   params=params,
                                                   step_size=fast_lr,
                                                   first_order=True)

        # Sample the validation trajectories with the adapted policy
        valid_episodes = self.create_episodes(params=params,
                                              gamma=gamma,
                                              gae_lambda=gae_lambda,
                                              device=device)
        valid_episodes.log('_enqueueAt', datetime.now(timezone.utc))
        self.valid_queue.put((index, None, deepcopy(valid_episodes)))
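
`reinforce_loss` is defined elsewhere in the repository. As an assumption-laden sketch only, the standard REINFORCE surrogate it refers to is the negative log-likelihood of the sampled actions weighted by the advantages stored in the episodes; the attribute names below (`observations`, `actions`, `advantages`) are illustrative.

def reinforce_loss_sketch(policy, episodes, params=None):
    # Evaluate the (optionally adapted) policy on the collected observations.
    pi = policy(episodes.observations, params=params)
    log_probs = pi.log_prob(episodes.actions)
    # Minimizing the negative weighted log-likelihood maximizes the expected return.
    return -(log_probs * episodes.advantages).mean()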
Example #3
    async def adapt(self, train_futures, first_order=None):
        if first_order is None:
            first_order = self.first_order
        # Loop over the number of steps of adaptation
        params = None
        # `await futures` suspends this coroutine until the future resolves and the
        # sampled episodes are available; only then is the REINFORCE loss computed on them.

        params_show_maml_trpo = self.policy.state_dict()  # (debug) snapshot of the unadapted parameters

        for futures in train_futures:
            inner_loss = reinforce_loss(self.policy,
                                        await futures,
                                        params=params)
            # Compute the updated parameters. They are returned functionally and do not seem to be
            # written back into the module: self.policy.state_dict() still holds the original values.
            params = self.policy.update_params(inner_loss,
                                               params=params,
                                               step_size=self.fast_lr,
                                               first_order=first_order)

            params_show_maml_trpo_test = self.policy.state_dict()  # (debug) unchanged by the functional update

        return params
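
For context on the `await futures` pattern noted above: each element of `train_futures` is an awaitable that resolves to one batch of training episodes, so trajectory collection can overlap with loss computation. A minimal self-contained sketch with placeholder data instead of real episodes:

import asyncio

async def fake_episode_batch(step, delay=0.01):
    await asyncio.sleep(delay)            # stands in for asynchronous trajectory collection
    return f"episodes for step {step}"

async def main():
    loop = asyncio.get_running_loop()
    train_futures = [loop.create_task(fake_episode_batch(step)) for step in range(3)]
    for futures in train_futures:
        episodes = await futures          # suspend until this batch is ready
        print(episodes)                   # `adapt` would compute reinforce_loss here

asyncio.run(main())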
Example #4
for step in range(500):  # number of inner-loop adaptation steps (num_steps)
    train_episodes = worker.create_episodes(params=params,
                                            gamma=0.99,
                                            gae_lambda=1.0,
                                            device='cpu')

    rewards.append(train_episodes.rewards.mean().item())
    # train_episodes.log('_enqueueAt', datetime.now(timezone.utc))
    # QKFIX: Deep copy the episodes before sending them to their
    # respective queues, to avoid a race condition. This issue would 
    # cause the policy pi = policy(observations) to be miscomputed for
    # some timesteps, which in turn makes the loss explode.
    # self.train_queue.put((index, step, deepcopy(train_episodes)))
    
    # with self.policy_lock:
    loss = reinforce_loss(worker.policy, train_episodes, params=params)
    params = worker.policy.update_params(loss,
                                        params=params,
                                        step_size=3e-2,
                                        first_order=True)

plt.plot(list(range(len(rewards))), rewards)
plt.show()

pos = train_episodes.observations[:, 1]  # one rollout from the batch; the first two observation dimensions are treated as (x, y)
pos_x = pos[:, 0]
pos_y = pos[:, 1]

plt.plot(pos_x, pos_y)
plt.scatter([1], [1], c="r")  # mark the reference point at (1, 1) in red
plt.scatter(pos_x, pos_y, c=list(range(len(pos_x))), cmap="Greens")  # color points by timestep
Example #5
    def sample(self,
               index,
               num_steps=1,
               fast_lr=0.5,
               gamma=0.95,
               gae_lambda=1.0,
               device='cpu'):
        """
        Sample the training trajectories with the initial policy and adapt the
        policy to the task, based on the REINFORCE loss computed on the
        training trajectories. The gradient update in the fast adaptation uses
        `first_order=True` no matter if the second order version of MAML is
        applied since this is only used for sampling trajectories, and not
        for optimization.
        """

        """
        训练阶段:
            采样训练轨迹数据 train_episodes,计算loss,更新原有网络参数
            采样验证轨迹数据 valid_episodes
        MAML 内部循环更新num_steps次 inner loop / fast adaptation
        """
        # `params` starts as None, so the policy falls back to its own parameters (an OrderedDict)
        params = None

        params_show_multi_task_sampler = self.policy.state_dict()  # (debug) snapshot of the policy parameters before adaptation

        for step in range(num_steps):
            # Collect all the trajectories of this batch and store them in `train_episodes`
            train_episodes = self.create_episodes(params=params,
                                                  gamma=gamma,
                                                  gae_lambda=gae_lambda,
                                                  device=device)
            train_episodes.log('_enqueueAt', datetime.now(timezone.utc))
            # QKFIX: Deep copy the episodes before sending them to their
            # respective queues, to avoid a race condition. This issue would 
            # cause the policy pi = policy(observations) to be miscomputed for
            # some timesteps, which in turn makes the loss explode.
            self.train_queue.put((index, step, deepcopy(train_episodes)))

            """
                计算 reinforce loss, 更新网络参数 params
            """
            # Safe use of the shared mutable policy across threads:
            # `with self.policy_lock:` guarantees that only one thread executes the block below at a time;
            # the `with` statement acquires the lock before the block runs and releases it afterwards.
            with self.policy_lock:
                loss = reinforce_loss(self.policy, train_episodes, params=params)
                params = self.policy.update_params(loss,
                                                   params=params,
                                                   step_size=fast_lr,
                                                   first_order=True)

                params_show_multi_task_sampler_test = self.policy.state_dict()  # (debug) unchanged by the functional update

        # Sample the validation trajectories with the adapted policy
        valid_episodes = self.create_episodes(params=params,
                                              gamma=gamma,
                                              gae_lambda=gae_lambda,
                                              device=device)
        valid_episodes.log('_enqueueAt', datetime.now(timezone.utc))
        self.valid_queue.put((index, None, deepcopy(valid_episodes)))
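
The lock-related comments above describe a standard `threading.Lock` pattern: the `with` block guarantees that only one sampler thread reads and updates the shared policy at a time. A minimal stand-alone sketch of that pattern (the names below are illustrative, not taken from the repository):

import threading

policy_lock = threading.Lock()
shared_state = {'updates': 0}

def worker():
    for _ in range(1000):
        with policy_lock:                  # acquired on entry, released on exit
            shared_state['updates'] += 1   # the guarded read-modify-write cannot interleave

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(shared_state['updates'])             # always 4000 thanks to the lock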
Example #6
    def sample(self,
               task=None,
               num_steps=1,
               fast_lr=0.5,
               gamma=0.95,
               gae_lambda=1.0,
               device='cpu'):
        """
        Sample the training trajectories with the initial policy and adapt the
        policy to the task, based on the REINFORCE loss computed on the
        training trajectories. The gradient update in the fast adaptation uses
        `first_order=True` no matter if the second order version of MAML is
        applied since this is only used for sampling trajectories, and not
        for optimization.
        """
        """
        训练阶段:
            采样训练轨迹数据 train_episodes,计算loss,更新原有网络参数
            采样验证轨迹数据 valid_episodes
        MAML 内部循环更新num_steps次 inner loop / fast adaptation
        """

        self.env.reset_task(task)

        # # `params` starts as None, so the policy falls back to its own parameters (an OrderedDict)
        # params = None
        #
        # params_show_multi_task_sampler = self.policy.state_dict()
        # train_episodes = []
        # First, sample the training-phase trajectory data
        for step in range(num_steps):
            # Collect all the trajectories of this batch and store them in `train_episodes`
            # for i in range(self.batch_size):
            train_episodes = self.create_episodes(gamma=gamma,
                                                  gae_lambda=gae_lambda,
                                                  device=device)

            train_episodes.log('_enqueueAt', datetime.now(timezone.utc))
            # QKFIX: Deep copy the episodes before sending them to their
            # respective queues, to avoid a race condition. This issue would
            # cause the policy pi = policy(observations) to be miscomputed for
            # some timesteps, which in turn makes the loss explode.
            """
                计算 reinforce loss, 更新网络参数 params
            """

            # In a multithreaded program, mutable shared objects must be used safely:
            # `with` + a lock guarantees that only one thread executes the guarded block at a time;
            # the `with` statement acquires the lock before the block runs and releases it afterwards.
            train_loss = reinforce_loss(self.policy, train_episodes)
            train_loss = train_loss.mean()
            lr = 1e-3
            self.policy.train()
            optimizer = optim.Adam(self.policy.parameters(), lr)  # note: re-created every step, so Adam's moment estimates reset
            # Take a gradient step: compute the gradients and apply the update
            grad_step(train_loss, optimizer)

            # params_show_multi_task_sampler_test = self.policy.state_dict()

        # Sample the validation trajectories with the adapted policy
        valid_episodes = self.create_episodes(gamma=gamma,
                                              gae_lambda=gae_lambda,
                                              device=device)
        valid_loss = reinforce_loss(self.policy, valid_episodes)
        valid_episodes.log('_enqueueAt', datetime.now(timezone.utc))

        return train_episodes, train_loss, valid_episodes, valid_loss
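
`grad_step` is not defined in this snippet; presumably it wraps the usual PyTorch optimizer step. A minimal sketch under that assumption:

def grad_step(loss, optimizer):
    optimizer.zero_grad()   # clear gradients accumulated by the previous step
    loss.backward()         # backpropagate through the REINFORCE loss
    optimizer.step()        # apply the Adam update to the policy parameters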