Example #1
    def update(self):
        self.ac.train()
        data = self.buffer.get()
        obs_ = data['obs']
        act_ = data['act']
        ret_ = data['ret']
        adv_ = data['adv']
        logp_old_ = data['logp']

        for index in BatchSampler(
                SubsetRandomSampler(range(self.steps_per_epoch)),
                self.batch_size, False):
            obs = obs_[index]
            act = act_[index]
            ret = ret_[index]
            adv = adv_[index]
            logp_old = logp_old_[index]

            # ---------------------Recording the losses before the updates --------------------------------
            pi, logp = self.ac.pi(obs, act)
            ratio = torch.exp(logp - logp_old)
            clipped_adv = torch.clamp(ratio, 1 - self.clip_ratio,
                                      1 + self.clip_ratio) * adv
            loss_pi = -torch.min(ratio * adv, clipped_adv).mean()
            v = self.ac.v(obs)
            loss_v = ((v - ret)**2).mean()

            self.logger.store(LossV=loss_v.item(), LossPi=loss_pi.item())
            # --------------------------------------------------------------------------------------------

            # Update Policy
            for i in range(self.train_pi_iters):
                # Policy loss
                self.pi_optimizer.zero_grad()
                pi, logp = self.ac.pi(obs, act)
                ratio = torch.exp(logp - logp_old)
                clipped_adv = torch.clamp(ratio, 1 - self.clip_ratio,
                                          1 + self.clip_ratio) * adv
                loss_pi = -torch.min(ratio * adv, clipped_adv).mean()
                # Sample-based estimate of KL(old || new); note the (logp_old - logp) order
                approx_kl = (logp_old - logp).mean().item()
                if approx_kl > 1.5 * self.target_kl:
                    print(f"Early stopping at step {i} due to reaching max kl")
                    break
                loss_pi.backward()
                self.pi_optimizer.step()

            # Update Value Function
            for _ in range(self.train_v_iters):
                self.v_optimizer.zero_grad()
                v = self.ac.v(obs)
                loss_v = ((v - ret)**2).mean()
                loss_v.backward()
                self.v_optimizer.step()
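Nearly every PPO example on this page relies on the same indexing idiom: wrap a SubsetRandomSampler in a BatchSampler to get shuffled lists of indices, then slice flat buffer tensors with them. A minimal self-contained sketch of that idiom (toy shapes, standard PyTorch API only, not taken from the repository above):

import torch
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

steps_per_epoch, batch_size = 16, 4
obs = torch.randn(steps_per_epoch, 8)      # stand-in for buffered observations
adv = torch.randn(steps_per_epoch)         # stand-in for buffered advantages

# Each iteration yields a shuffled list of `batch_size` indices
# (drop_last=False keeps the final, possibly smaller, batch).
for index in BatchSampler(SubsetRandomSampler(range(steps_per_epoch)), batch_size, False):
    obs_mb, adv_mb = obs[index], adv[index]
    print(len(index), obs_mb.shape, adv_mb.shape)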
Example #2
    def update(self, i_ep):
        # Read states, actions, and rewards from the buffer
        state = torch.tensor([t.state for t in self.buffer], dtype=torch.float)
        action = torch.tensor([t.action for t in self.buffer], dtype=torch.long).view(-1, 1)
        reward = [t.reward for t in self.buffer]
        # The old action probabilities come from the buffer, which plays the role of theta'
        # (the previous actor); the new theta/actor is updated against them
        old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer], dtype=torch.float).view(-1, 1)

        R = 0
        Gt = []

        for r in reward[::-1]:  # iterate over the rewards in reverse
            R = r + gamma * R  # accumulate the discounted return
            Gt.insert(0, R)  # insert at the front so Gt stays in time order

        Gt = torch.tensor(Gt, dtype=torch.float)
        for i in range(self.ppo_update_time):
            # Draw batch_size samples from the buffer and iterate over their indices
            for index in BatchSampler(SubsetRandomSampler(range(len(self.buffer))), self.batch_size, False):
                if self.training_step % 1000 == 0:
                    print('I_ep {}, train {} times'.format(i_ep, self.training_step))

                Gt_index = Gt[index].view(-1, 1)
                V = self.critic_net(state[index])  # value estimate for each state
                delta = Gt_index - V  # difference between return and estimate, i.e. the advantage
                advantage = delta.detach()  # stop gradients from flowing through the advantage

                action_prob = self.actor_net(state[index]).gather(1, action[index])  # probability of the taken action under the current actor

                ratio = (action_prob / old_action_log_prob[index])  # probability ratio new/old, the core of PPO (despite its name, a_log_prob appears to store the probability, not its log)
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * advantage

                #update actor network
                action_loss = -torch.min(surr1, surr2).mean()  # maximize the surrogate => minimize its negative
                self.writer.add_scalar('loss/action_loss', action_loss, global_step=self.training_step)
                self.actor_optimizer.zero_grad()
                action_loss.backward()
                nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm)  # gradient clipping to prevent exploding gradients
                self.actor_optimizer.step()

                #update critic network
                value_loss = F.mse_loss(Gt_index, V)
                self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.training_step)
                self.critic_net_optimizer.zero_grad()
                value_loss.backward()
                nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm)  # gradient clipping to prevent exploding gradients
                self.critic_net_optimizer.step()

                self.training_step += 1

        del self.buffer[:]  #clear experience
Example #3
    def update(self, i_ep):
        state = torch.tensor([t.state for t in self.buffer], dtype=torch.float32)
        action = torch.tensor([t.action for t in self.buffer], dtype=torch.long)
        reward = [t.reward for t in self.buffer]
        # update: don't need next_state
        # reward = torch.tensor([t.reward for t in self.buffer], dtype=torch.float).view(-1, 1)
        # next_state = torch.tensor([t.next_state for t in self.buffer], dtype=torch.float)
        old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer], dtype=torch.float32).view(-1, 1)

        R = 0
        Gt = []
        for r in reward[::-1]:
            R = r + gamma * R
            Gt.insert(0, R)
        Gt = torch.tensor(Gt, dtype=torch.float32)
        print("The agent is updateing....")
        for i in range(self.ppo_update_time):
            for index in BatchSampler(SubsetRandomSampler(range(len(self.buffer))), self.batch_size, False):
                '''
                if self.training_step % 100 ==0:
                    print('I_ep {} ,train {} times'.format(i_ep,self.training_step))
                '''
                # with torch.no_grad():
                Gt_index = Gt[index].view(-1, 1)
                V = self.critic_net(state[index])
                delta = Gt_index - V
                advantage = delta.detach()
                # epoch iteration, PPO core!!!
                action_prob = torch.gather(self.actor_net(state[index]), 2, action[index, :].unsqueeze(-1))
                action_prob = torch.prod(action_prob.squeeze(), dim=-1)
                ratio = (action_prob / old_action_log_prob.squeeze()[index])
                surr1 = ratio * advantage.squeeze()
                surr2 = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * advantage

                # update actor network
                action_loss = -torch.min(surr1, surr2).mean()  # maximize the surrogate => minimize its negative
                self.writer.add_scalar('loss/action_loss', action_loss, global_step=self.training_step)
                self.actor_optimizer.zero_grad()
                action_loss.backward()
                nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm)
                self.actor_optimizer.step()

                # update critic network
                value_loss = F.mse_loss(Gt_index, V)
                self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.training_step)
                self.critic_net_optimizer.zero_grad()
                value_loss.backward()
                nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm)
                self.critic_net_optimizer.step()
                self.training_step += 1

        del self.buffer[:]  # clear experience, on policy!
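Both update() variants above build the reward-to-go returns Gt by scanning the reward list backwards. Pulled out as a standalone function (a sketch with made-up rewards, not part of either repository):

import torch

def discounted_returns(rewards, gamma=0.99):
    """Reward-to-go: G_t = r_t + gamma * G_{t+1}, built by iterating in reverse."""
    R, Gt = 0.0, []
    for r in reversed(rewards):
        R = r + gamma * R
        Gt.insert(0, R)           # prepend so the list stays in time order
    return torch.tensor(Gt, dtype=torch.float32)

print(discounted_returns([1.0, 0.0, 2.0]))  # tensor([2.9602, 1.9800, 2.0000])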
Example #4
    def feed_forward_generator(self, advantages, num_mini_batch):
        num_steps, num_processes = self.rewards.size()[0:2]
        batch_size = num_processes * num_steps
        assert batch_size >= num_mini_batch, (
            "PPO requires the number of processes ({}) "
            "* number of steps ({}) = {} "
            "to be greater than or equal to the number of PPO mini batches ({})."
            "".format(num_processes, num_steps, num_processes * num_steps,
                      num_mini_batch))
        mini_batch_size = batch_size // num_mini_batch
        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
                               mini_batch_size,
                               drop_last=False)
        for indices in sampler:
            obs_im_batch = self.obs_im[:-1].view(
                -1,
                *self.obs_im.size()[2:])[indices]
            if self.encoder_type == "rgb+map":
                obs_sm_batch = self.obs_sm[:-1].view(
                    -1,
                    *self.obs_sm.size()[2:])[indices]
                obs_lm_batch = self.obs_lm[:-1].view(
                    -1,
                    *self.obs_lm.size()[2:])[indices]
            else:
                obs_sm_batch = None
                obs_lm_batch = None
            recurrent_hidden_states_batch = self.recurrent_hidden_states[:-1].view(
                -1, self.recurrent_hidden_states.size(-1))[indices]
            actions_batch = self.actions.view(-1,
                                              self.actions.size(-1))[indices]
            value_preds_batch = self.value_preds[:-1].view(-1, 1)[indices]
            return_batch = self.returns[:-1].view(-1, 1)[indices]
            masks_batch = self.masks[:-1].view(-1, 1)[indices]
            collisions_batch = self.collisions[:-1].view(-1, 1)[indices]
            old_action_log_probs_batch = self.action_log_probs.view(-1,
                                                                    1)[indices]
            adv_targ = advantages.view(-1, 1)[indices]

            yield (
                obs_im_batch,
                obs_sm_batch,
                obs_lm_batch,
                recurrent_hidden_states_batch,
                actions_batch,
                value_preds_batch,
                return_batch,
                masks_batch,
                collisions_batch,
                old_action_log_probs_batch,
                adv_targ,
            )
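The generator above flattens a rollout tensor of shape [T + 1, N, ...] over the time and process dimensions before indexing it with the sampled mini-batch. A self-contained illustration of that reshape-and-slice step (hypothetical shapes, unrelated to the storage class above):

import torch
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

T, N = 8, 4                                  # num_steps, num_processes
obs = torch.randn(T + 1, N, 3, 16, 16)       # T steps plus the bootstrap observation
batch_size = T * N

for indices in BatchSampler(SubsetRandomSampler(range(batch_size)), 8, drop_last=False):
    # Drop the bootstrap step, merge [T, N] into one batch axis, then pick the mini-batch.
    obs_batch = obs[:-1].view(-1, *obs.size()[2:])[indices]
    print(obs_batch.shape)                   # torch.Size([8, 3, 16, 16])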
Example #5
def ppo_update_stage1(policy, optimizer, batch_size, memory, epoch,
               coeff_entropy=0.02, clip_value=0.2,
               num_step=2048, num_env=12, frames=1, obs_size=24, act_size=4):
    obss, goals, speeds, actions, logprobs, targets, values, rewards, advs = memory

    advs = (advs - advs.mean()) / advs.std()

    obss = obss.reshape((num_step*num_env, frames, obs_size))
    goals = goals.reshape((num_step*num_env, 2))
    speeds = speeds.reshape((num_step*num_env, 2))
    actions = actions.reshape(num_step*num_env, act_size)
    logprobs = logprobs.reshape(num_step*num_env, 1)
    advs = advs.reshape(num_step*num_env, 1)
    targets = targets.reshape(num_step*num_env, 1)

    for update in range(epoch):
        sampler = BatchSampler(SubsetRandomSampler(list(range(advs.shape[0]))), batch_size=batch_size,
                               drop_last=False)
        for i, index in enumerate(sampler):
            sampled_obs = Variable(torch.from_numpy(obss[index])).float().cuda()
            sampled_goals = Variable(torch.from_numpy(goals[index])).float().cuda()
            sampled_speeds = Variable(torch.from_numpy(speeds[index])).float().cuda()

            sampled_actions = Variable(torch.from_numpy(actions[index])).float().cuda()
            sampled_logprobs = Variable(torch.from_numpy(logprobs[index])).float().cuda()
            sampled_targets = Variable(torch.from_numpy(targets[index])).float().cuda()
            sampled_advs = Variable(torch.from_numpy(advs[index])).float().cuda()


            new_value, new_logprob, dist_entropy = policy.evaluate_actions(sampled_obs, sampled_goals, sampled_speeds, sampled_actions)

            sampled_logprobs = sampled_logprobs.view(-1, 1)
            ratio = torch.exp(new_logprob - sampled_logprobs)

            sampled_advs = sampled_advs.view(-1, 1)
            surrogate1 = ratio * sampled_advs
            surrogate2 = torch.clamp(ratio, 1 - clip_value, 1 + clip_value) * sampled_advs
            policy_loss = -torch.min(surrogate1, surrogate2).mean()

            sampled_targets = sampled_targets.view(-1, 1)
            value_loss = F.mse_loss(new_value, sampled_targets)

            loss = policy_loss + 20 * value_loss - coeff_entropy * dist_entropy
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            info_p_loss, info_v_loss, info_entropy = float(policy_loss.detach().cpu().numpy()), \
                                                     float(value_loss.detach().cpu().numpy()), float(
                                                    dist_entropy.detach().cpu().numpy())
            logger_ppo.info('{}, {}, {}'.format(info_p_loss, info_v_loss, info_entropy))

    print('update')
Example #6
    def feed_forward_generator(self,
                               advantages,
                               num_mini_batch=None,
                               mini_batch_size=None):
        num_steps, num_processes = self.rewards.size()[0:2]
        batch_size = num_processes * num_steps

        if mini_batch_size is None:
            assert batch_size >= num_mini_batch, (
                "PPO requires the number of processes ({}) "
                "* number of steps ({}) = {} "
                "to be greater than or equal to the number of PPO mini batches ({})."
                "".format(num_processes, num_steps, num_processes * num_steps,
                          num_mini_batch))
            mini_batch_size = batch_size // num_mini_batch

        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
                               mini_batch_size,
                               drop_last=True)

        for indices in sampler:
            obs_batch = self.obs[:-1].view(-1, *self.obs.size()[2:])[indices]
            actions_batch = self.actions.view(-1,
                                              self.actions.size(-1))[indices]
            if self.args.recurrent_policy:
                recurrent_hidden_states_batch = self.recurrent_hidden_states[:-1].view(
                    -1, self.recurrent_hidden_states.size(-1))[indices]
            else:
                recurrent_hidden_states_batch = None

            if self.add_input.shape[-1] > 0:
                add_input_batch = self.add_input[:-1].view(
                    -1,
                    *self.add_input.size()[2:])[indices]
            else:
                add_input_batch = None

            value_preds_batch = self.value_preds[:-1].view(
                -1, self.value_dim)[indices]
            return_batch = self.returns[:-1].view(-1, self.value_dim)[indices]
            masks_batch = self.masks[:-1].view(-1, 1)[indices]
            old_action_log_probs_batch = self.action_log_probs.view(
                -1, self.value_dim)[indices]
            if advantages is None:
                adv_targ = None
            else:
                adv_targ = advantages.view(-1, self.value_dim)[indices]

            yield obs_batch, recurrent_hidden_states_batch, actions_batch, \
                value_preds_batch, return_batch, masks_batch, \
                old_action_log_probs_batch, adv_targ, \
                add_input_batch
Example #7
def ppo_update(policy,
               optimizer,
               batch_size,
               memory,
               nupdates,
               coeff_entropy=0.02,
               clip_value=0.2,
               writer=None):
    obs, actions, logprobs, returns, values = memory
    advantages = returns - values
    advantages = (advantages - advantages.mean()) / advantages.std()

    for update in range(nupdates):
        sampler = BatchSampler(SubsetRandomSampler(
            list(range(advantages.shape[0]))),
                               batch_size=batch_size,
                               drop_last=False)
        for i, index in enumerate(sampler):
            sampled_obs = Variable(torch.from_numpy(obs[index])).float().cuda()
            sampled_actions = Variable(torch.from_numpy(
                actions[index])).float().cuda()
            sampled_logprobs = Variable(torch.from_numpy(
                logprobs[index])).float().cuda()
            sampled_returns = Variable(torch.from_numpy(
                returns[index])).float().cuda()
            sampled_advs = Variable(torch.from_numpy(
                advantages[index])).float().cuda()

            new_value, new_logprob, dist_entropy = policy.evaluate_actions(
                sampled_obs, sampled_actions)

            sampled_logprobs = sampled_logprobs.view(-1, 1)
            ratio = torch.exp(new_logprob - sampled_logprobs)

            sampled_advs = sampled_advs.view(-1, 1)
            surrogate1 = ratio * sampled_advs
            surrogate2 = torch.clamp(ratio, 1 - clip_value,
                                     1 + clip_value) * sampled_advs
            policy_loss = -torch.min(surrogate1, surrogate2).mean()

            sampled_returns = sampled_returns.view(-1, 1)
            value_loss = F.mse_loss(new_value, sampled_returns)

            loss = policy_loss + value_loss - coeff_entropy * dist_entropy
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if writer is not None:
                writer.add_scalar('ppo/value_loss', value_loss.item())
                writer.add_scalar('ppo/policy_loss', policy_loss.item())
                writer.add_scalar('ppo/entropy', dist_entropy.item())
    return value_loss.item(), policy_loss.item(), dist_entropy.item()
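Examples #5 and #7 wrap their NumPy mini-batches in the pre-0.4 Variable(...).float().cuda() API, which is deprecated on current PyTorch. A rough modern equivalent of that conversion step (a sketch with a dummy array, not the authors' code):

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logprobs = np.random.randn(64, 1).astype(np.float32)    # stand-in for memory contents
index = list(range(16))                                  # stand-in for sampled indices

# torch.as_tensor replaces Variable(torch.from_numpy(...)).float().cuda()
sampled_logprobs = torch.as_tensor(logprobs[index], dtype=torch.float32, device=device)
print(sampled_logprobs.shape, sampled_logprobs.device)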
Example #8
 def __init__(self,
              num_samples,
              num_items,
              batch_size,
              drop_last,
              update_callback,
              replacement=True):
     self.num_items = num_items
     weighted_sampler = WeightedRandomSampler(torch.ones(num_items),
                                              num_samples, replacement)
     self.sampler = BatchSampler(weighted_sampler, batch_size, drop_last)
     self.update_callback = update_callback
     self.update_callback.connect_sampler(self)
Example #9
 def get_generator(self, batch_size=1024):
     iterator = BatchSampler(SubsetRandomSampler(
         range(self.num_steps * self.num_envs)),
                             batch_size,
                             drop_last=True)
     for indices in iterator:
         obs = self.obs[:-1].reshape(-1, *self.obs_shape)[indices].cuda()
         action = self.action.reshape(-1)[indices].cuda()
         log_prob = self.log_prob.reshape(-1)[indices].cuda()
         value = self.value[:-1].reshape(-1)[indices].cuda()
         returns = self.returns.reshape(-1)[indices].cuda()
         advantage = self.advantage.reshape(-1)[indices].cuda()
         yield obs, action, log_prob, value, returns, advantage
Example #10
def create_dataloader(dataset: Dataset,
                      batch_size: int,
                      bptt: int,
                      rand: bool) -> DataLoader:
    return DataLoader(dataset,
                      batch_sampler=BatchSampler(
                          SkipSampler(dataset,
                                      batch_size=batch_size,
                                      bptt=bptt,
                                      rand=rand),
                          batch_size=batch_size * (bptt + 1),
                          drop_last=True),
                      collate_fn=collate_batch(batch_size))
Example #11
def test_oom_batch_sampler():
    data = list(range(-4, 14))
    sampler = SequentialSampler(data)
    batch_sampler = BatchSampler(sampler, 4, False)
    oom_sampler = OomBatchSampler(batch_sampler,
                                  lambda i: data[i],
                                  num_batches=3)
    list_ = list(oom_sampler)
    # The largest batches are first
    assert [data[i] for i in list_[0]] == [8, 9, 10, 11]
    assert [data[i] for i in list_[1]] == [12, 13]
    assert [data[i] for i in list_[2]] == [4, 5, 6, 7]
    assert len(list_) == 5
Example #12
 def getBaseSampler(self, type, batchSize, offset):
     """Get base sampler."""
     if type == "samespeaker":
         return SameSpeakerSampler(batchSize, self.speakerLabel,
                                   self.sizeWindow, offset)
     if type == "samesequence":
         return SameSpeakerSampler(batchSize, self.seqLabel,
                                   self.sizeWindow, offset)
     if type == "sequential":
         return SequentialSampler(len(self.data), self.sizeWindow, offset,
                                  batchSize)
     sampler = UniformAudioSampler(len(self.data), self.sizeWindow, offset)
     return BatchSampler(sampler, batchSize, True)
Example #13
 def sample(self, states, actions, log_probs, returns, batch_size):
     """
     Converts a sample of size batch_size to multiple mini batches of
     size num_M
     """
     states = torch.cat(states)
     actions = torch.cat(actions)
     log_probs = torch.cat(log_probs)
     returns = torch.cat(returns)
     sampler = BatchSampler(SubsetRandomSampler(range(batch_size)), self.num_mini_batch, drop_last=True)
     for indices in sampler:
         # print(indices)
         yield states[indices], actions[indices], log_probs[indices], returns[indices]
Example #14
    def __init__(self, config, name):
        super(BoxesDataLoader, self).__init__(
            dataset=boxes_dataset.ImagesWithBoxesDataset(config=config, name=name),
            batch_size=config['data_loader']['batch_size_%s' % name],
            drop_last=config['data_loader']['drop_last']
        )
        self.batch_sampler = BatchSampler(
            SequentialSampler(self.dataset),
            batch_size=self.batch_size,
            drop_last=self.drop_last
        )

        self.config = config
Example #15
    def getBaseSampler(self, samplingType, batchSize, offset):
        if samplingType == "samecategory":
            return SameTrackSampler(batchSize, self.categoryLabel,
                                    self.sizeWindow, offset)
        if samplingType == "samesequence":
            return SameTrackSampler(batchSize, self.seqLabel, self.sizeWindow,
                                    offset)
        if samplingType == "sequential":
            return SequentialSampler(len(self.data), self.sizeWindow, offset,
                                     batchSize)

        sampler = UniformAudioSampler(len(self.data), self.sizeWindow, offset)
        return BatchSampler(sampler, batchSize, True)
Example #16
def get_training_loader(training_dataset, params):
    if params.seqtrain:
        train_sampler = SequentialSampler([i for i in range(len(training_dataset))])
    else:
        train_sampler = SubsetRandomSampler([i for i in range(len(training_dataset))])

    training_dataloader = DataLoader(training_dataset, batch_size=None,
                                     shuffle=False, num_workers=0,
                                     sampler=BatchSampler(train_sampler,
                                                          batch_size=params.batch_size,
                                                          drop_last=True))

    return training_dataloader
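Example #16 passes the BatchSampler through the sampler= argument with batch_size=None, so the DataLoader hands a whole index list to the dataset at once instead of collating single items. A minimal sketch of that pattern with a TensorDataset (toy data, not the project's dataset):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.sampler import BatchSampler, SequentialSampler

dataset = TensorDataset(torch.arange(10).float().unsqueeze(1), torch.arange(10))
loader = DataLoader(dataset, batch_size=None, shuffle=False,
                    sampler=BatchSampler(SequentialSampler(dataset),
                                         batch_size=4, drop_last=True))

for x, y in loader:
    # Each "sample" is already a batch because the dataset was indexed with a list.
    print(x.shape, y)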
Example #17
 def fit(self, dataset, batch_size, learning_rate, iterations, shuffle=True, *args, **kwargs):
     optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
     batch_sampler = BatchSampler(
         sampler=self.get_base_sampler(len(dataset), shuffle), batch_size=batch_size, drop_last=False)
     batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations=iterations, start_iter=0)
     loader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler)
     for step, (data, targets) in tqdm.tqdm(
             enumerate(loader), disable=self.logger.level > 15, total=len(loader)):
         self.model.zero_grad()
         prediction = self.model(data)
         loss = self.criterion(prediction, targets)
         loss.backward()
         optimizer.step()
Example #18
 def _get_sampler(self, storage):
     obs = storage.get_def_obs_seq()
     ob_shape = rutils.get_obs_shape(self.policy.obs_space)
     self.agent_obs_pairs = {
         'state': obs[:-1].view(-1, *ob_shape),
         'next_state': obs[1:].view(-1, *ob_shape),
         'mask': storage.masks[:-1].view(-1, 1),
     }
     failure_sampler = BatchSampler(SubsetRandomSampler(
         range(self.args.num_steps)),
                                    self.args.traj_batch_size,
                                    drop_last=True)
     return self.expert_train_loader, failure_sampler
Example #19
    def feed_forward_generator(self,
                               advantages,
                               num_mini_batch=None,
                               mini_batch_size=None):
        num_steps, num_processes = self.rewards.size()[0:2]
        batch_size = num_steps * num_processes

        if mini_batch_size is None:
            assert batch_size >= num_mini_batch, (
                "PPO requires number of processes ({}) * number of steps ({}) = {} to be"
                "greater than or equal to number of PPO mini-batches ({}).".
                format(num_processes, num_steps, num_processes * num_steps,
                       num_mini_batch))
            mini_batch_size = batch_size // num_mini_batch

        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
                               mini_batch_size,
                               drop_last=True)

        for indices in sampler:
            obs_batch = self.obs[:-1].view(-1, *self.obs.size()[2:])[indices]
            recurrent_hidden_states_batch = self.recurrent_hidden_states[:-1].view(
                -1, self.recurrent_hidden_states.size(-1))[indices]
            if self.multi_action_head:
                actions_batch = [
                    self.actions[i].view(-1, self.actions[i].size(-1))[indices]
                    for i in range(len(self.actions))
                ]
                actions_mask_batch = [
                    self.action_masks[i][:-1].view(
                        -1, self.action_masks[i].size(-1))[indices]
                    for i in range(len(self.action_masks))
                ]
            else:
                actions_batch = self.actions.view(
                    -1, self.actions.size(-1))[indices]
                actions_mask_batch = self.action_masks[:-1].view(
                    -1, self.action_masks.size(-1))[indices]
            value_preds_batch = self.value_preds[:-1].view(-1, 1)[indices]
            return_batch = self.returns[:-1].view(-1, 1)[indices]
            masks_batch = self.masks[:-1].view(-1, 1)[indices]
            old_action_log_probs_batch = self.action_log_probs.view(-1,
                                                                    1)[indices]

            if advantages is None:
                adv_targ = None
            else:
                adv_targ = advantages.view(-1, 1)[indices]

            yield obs_batch, recurrent_hidden_states_batch, actions_batch, actions_mask_batch, \
                value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, adv_targ
Example #20
    def feed_forward_generator(self,
                               advantages,
                               num_mini_batch=None,
                               mini_batch_size=None):
        num_steps, num_processes = self.rewards_raw.size()[0:2]
        batch_size = num_processes * num_steps

        if mini_batch_size is None:
            assert batch_size >= num_mini_batch, (
                "PPO requires the number of processes ({}) "
                "* number of steps ({}) = {} "
                "to be greater than or equal to the number of PPO mini batches ({})."
                "".format(num_processes, num_steps, num_processes * num_steps,
                          num_mini_batch))
            mini_batch_size = batch_size // num_mini_batch
        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
                               mini_batch_size,
                               drop_last=True)
        for indices in sampler:

            if self.normalise_observations:
                prev_obs = self.prev_obs_normalised
            else:
                prev_obs = self.prev_obs_raw

            obs_batch = prev_obs[:-1].reshape(-1,
                                              *prev_obs.size()[2:])[indices]
            actions_batch = self.actions.reshape(
                -1, self.actions.size(-1))[indices]

            if self.latent_dim is not None and self.latent_mean is not None:
                latent_sample_batch = torch.cat(
                    self.latent_samples[:-1])[indices]
                latent_mean_batch = torch.cat(self.latent_mean[:-1])[indices]
                latent_logvar_batch = torch.cat(
                    self.latent_logvar[:-1])[indices]
            else:
                latent_sample_batch = latent_mean_batch = latent_logvar_batch = None

            value_preds_batch = self.value_preds[:-1].reshape(-1, 1)[indices]
            return_batch = self.returns[:-1].reshape(-1, 1)[indices]

            old_action_log_probs_batch = self.action_log_probs.reshape(
                -1, 1)[indices]
            if advantages is None:
                adv_targ = None
            else:
                adv_targ = advantages.reshape(-1, 1)[indices]

            yield obs_batch, actions_batch, latent_sample_batch, latent_mean_batch, latent_logvar_batch, \
                  value_preds_batch, return_batch, old_action_log_probs_batch, adv_targ
Example #21
    def update(self):
        actions = torch.tensor([m.action for m in self.memory],
                               dtype=torch.long).view(-1, 1).to(self.device)
        rewards = [m.reward for m in self.memory]
        is_terminals = [m.done for m in self.memory]
        old_action_log_probs = torch.tensor(
            [m.a_log_prob for m in self.memory],
            dtype=torch.float).view(-1, 1).to(self.device)

        discounted_reward = 0
        mc_rewards = []
        for reward, is_terminal in zip(reversed(rewards),
                                       reversed(is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + self.gamma * discounted_reward
            mc_rewards.insert(0, discounted_reward)
        mc_rewards = torch.tensor(mc_rewards,
                                  dtype=torch.float).to(self.device)

        for kt in range(self.k_epochs):
            for index in BatchSampler(
                    SubsetRandomSampler(range(len(self.memory))),
                    self.batch_size, False):
                mc_rewards_index = mc_rewards[index].view(-1, 1)
                states = self.encode_sample(index)
                states_as_tensor = torch.from_numpy(states).float().to(
                    self.device)
                value_index = self.critic(states_as_tensor)
                delta = mc_rewards_index - value_index
                advantage = delta.detach()

                action_prob = self.actor(states_as_tensor).gather(
                    1, actions[index])

                ratio = (action_prob / old_action_log_probs[index])
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1 - self.eps_clip,
                                    1 + self.eps_clip) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                self.actor_optim.zero_grad()
                actor_loss.backward()
                self.actor_optim.step()

                critic_loss = self.loss_func(mc_rewards_index, value_index)
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

        del self.memory[:]
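Example #21 differs from Examples #2 and #25 in that it resets the discounted sum at episode boundaries using the stored done flags. The same computation isolated as a standalone snippet (hypothetical rewards and dones):

rewards = [1.0, 0.0, 1.0, 1.0, 0.0]
dones   = [False, True, False, False, True]
gamma = 0.99

discounted, returns = 0.0, []
for r, done in zip(reversed(rewards), reversed(dones)):
    if done:                      # start a fresh return at each episode boundary
        discounted = 0.0
    discounted = r + gamma * discounted
    returns.insert(0, discounted)
print(returns)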
Example #22
    def train_ppo_epochs(self, epoch):
        """
        Does one series of ppo updates
        """

        # Samples a set of molecules of size self.episode_size (along with rewards, actions, etc)
        rewards, advantages, actions, old_log_probs, smiles = self.sample_and_process_episode(
        )
        sampler = BatchSampler(SubsetRandomSampler(range(self.episode_size)),
                               self.batch_size,
                               drop_last=False)

        # Randomly samples batches of size self.batch_size and then performs one ppo_update.
        # This is repeated self.ppo_epochs times.
        self.model.train()
        for indices in sampler:
            rewards_batch = rewards.view(-1, rewards.size(-1))[indices]
            actions_batch = actions.view(-1, actions.size(-1))[indices]
            old_log_probs_batch = old_log_probs.view(
                -1, old_log_probs.size(-1))[indices]
            smiles_batch = list(np.array(smiles)[indices])
            advantages_batch = advantages.view(-1,
                                               advantages.size(-1))[indices]

            log_probs_batch, values_batch, entropies_batch, kl_divs_batch = self.action_replay.replay(
                model=self.model, prior=self.prior, actions=actions_batch)

            ratio = torch.exp(log_probs_batch - old_log_probs_batch)
            surr1 = ratio * advantages_batch
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                1.0 + self.clip_param) * advantages_batch
            policy_loss = -torch.min(surr1,
                                     surr2).mean()  # standard ppo policy loss.

            value_loss = self._calculate_value_loss(smiles_batch, values_batch,
                                                    rewards_batch)

            entropy_loss = self._calculate_entropy_loss(
                smiles_batch, entropies_batch)

            kl_div_loss = self._calculate_kl_div_loss(smiles_batch,
                                                      kl_divs_batch)

            loss = policy_loss + value_loss + entropy_loss + kl_div_loss

            self.optimizer.zero_grad()
            loss.backward()

            self.optimizer.step()

        self._print_stats(epoch=epoch, smiles=smiles)
Example #23
    def feed_forward_generator(self, advantages, args):
        advantages = advantages.view([-1, 1])
        num_steps = self.rewards.size(0)
        num_training_per_episode = self.rewards.size(1)
        num_processes = self.rewards.size(2)
        num_total = num_training_per_episode * num_processes
        obs_shape = self.observations.shape[3:]
        action_shape = self.actions.shape[3:]
        state_size = self.states.size(3)

        batch_size = num_processes * num_steps
        mini_batch_size = batch_size // args.num_mini_batch
        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
                               mini_batch_size,
                               drop_last=False)

        # We reshape these so that the trajectories per agent instead look like new processes.
        observations = self.observations.view(
            [num_steps + 1, num_total, *obs_shape])
        states = self.states.view([num_steps + 1, num_total, state_size])
        rewards = self.rewards.view([num_steps, num_total, 1])
        value_preds = self.value_preds.view([num_steps + 1, num_total, 1])
        returns = self.returns.view([num_steps + 1, num_total, 1])
        actions = self.actions.view([num_steps, num_total, *action_shape])
        action_log_probs = self.action_log_probs.view(
            [num_steps, num_total, 1])
        masks = self.masks.view([num_steps + 1, num_total, 1])

        for indices in sampler:
            indices = torch.LongTensor(indices)

            if advantages.is_cuda:
                indices = indices.cuda()

            observations_batch = observations[:-1].contiguous().view(
                (args.num_steps * num_total),
                *observations.size()[2:])[indices]
            states_batch = states[:-1].contiguous().view(
                (args.num_steps * num_total), 1)[indices]
            actions_batch = actions.contiguous().view(
                (args.num_steps * num_total), 1)[indices]
            return_batch = returns[:-1].contiguous().view(
                (args.num_steps * num_total), 1)[indices]
            masks_batch = masks[:-1].contiguous().view(
                (args.num_steps * num_total), 1)[indices]
            old_action_log_probs_batch = action_log_probs.contiguous().view(
                (args.num_steps * num_total), 1)[indices]
            adv_targ = advantages.contiguous().view(-1, 1)[indices]

            yield observations_batch, states_batch, actions_batch, \
                return_batch, masks_batch, old_action_log_probs_batch, adv_targ
Example #24
    def __init__(self,
                 dataset,
                 batch_size=1,
                 shuffle=False,
                 sampler=None,
                 batch_sampler=None,
                 num_workers=0,
                 collate_fn=default_collate,
                 pin_memory=False,
                 drop_last=False,
                 timeout=0,
                 worker_init_fn=None):
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.collate_fn = collate_fn
        self.pin_memory = pin_memory
        self.drop_last = drop_last
        self.timeout = timeout
        self.worker_init_fn = worker_init_fn

        if timeout < 0:
            raise ValueError('timeout option should be non-negative')

        if batch_sampler is not None:
            if batch_size > 1 or shuffle or sampler is not None or drop_last:
                raise ValueError('batch_sampler option is mutually exclusive '
                                 'with batch_size, shuffle, sampler, and '
                                 'drop_last')
            self.batch_size = None
            self.drop_last = None

        if sampler is not None and shuffle:
            raise ValueError('sampler option is mutually exclusive with '
                             'shuffle')

        if self.num_workers < 0:
            raise ValueError('num_workers option cannot be negative; '
                             'use num_workers=0 to disable multiprocessing.')

        if batch_sampler is None:
            if sampler is None:
                if shuffle:
                    sampler = RandomSampler(dataset)
                else:
                    sampler = SequentialSampler(dataset)
            batch_sampler = BatchSampler(sampler, batch_size, drop_last)

        self.sampler = sampler
        self.batch_sampler = batch_sampler
        self.__initialized = True
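The __init__ above mirrors torch.utils.data.DataLoader: when a batch_sampler is supplied, batch_size, shuffle, sampler and drop_last must be left at their defaults, and the loader simply iterates over the index lists it produces. A small usage sketch against the real DataLoader (toy dataset):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.sampler import BatchSampler, RandomSampler

ds = TensorDataset(torch.arange(12).float())
batches = BatchSampler(RandomSampler(ds), batch_size=4, drop_last=False)

# batch_sampler is mutually exclusive with batch_size/shuffle/sampler/drop_last,
# exactly as the constructor above enforces.
loader = DataLoader(ds, batch_sampler=batches)
for (x,) in loader:
    print(x)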
Example #25
    def update(self, num_episode):
        state = torch.tensor([t.state for t in self.memory], dtype=torch.float)
        action = torch.tensor([t.action for t in self.memory],
                              dtype=torch.long).view(-1, 1)
        reward = [t.reward for t in self.memory]
        last_action_log_prob = torch.tensor(
            [t.a_log_prob for t in self.memory],
            dtype=torch.float).view(-1, 1)

        R = 0
        Gt = []
        for r in reward[::-1]:
            R = r + self.gamma * R
            Gt.insert(0, R)
        Gt = torch.tensor(Gt, dtype=torch.float)
        for i in range(self.update_time):
            for index in BatchSampler(
                    SubsetRandomSampler(range(len(self.memory))),
                    self.batch_size, False):
                if self.training_step % 100 == 0:
                    print(
                        f'Episode #{num_episode}#, train #{self.training_step}# times.'
                    )
                Gt_index = Gt[index].view(-1, 1)
                state_values = self.critics(state[index])
                tmp_adv = Gt_index - state_values
                advantage = tmp_adv.detach()

                action_prob = self.actor(state[index]).gather(1, action[index])
                ratio = action_prob / last_action_log_prob[index]
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1 - self.clip_param,
                                    1 + self.clip_param) * advantage

                action_loss = -1 * torch.min(surr1, surr2).mean()
                self.actor_optimizer.zero_grad()
                action_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(),
                                        self.max_grad_norm)
                self.actor_optimizer.step()

                value_loss = F.mse_loss(Gt_index, state_values)
                self.critics_optimizer.zero_grad()
                value_loss.backward()
                nn.utils.clip_grad_norm_(self.critics.parameters(),
                                        self.max_grad_norm)
                self.critics_optimizer.step()

                self.training_step += 1

        del self.memory[:]
Example #26
def load_images(folder, batch_size, k):
    train_filename = '{}/train.large.input.npy'.format(folder)
    valid_filename = '{}/val.input.npy'.format(folder)
    test_filename = '{}/test.input.npy'.format(folder)

    train_dataset = ImageDataset(train_filename)
    valid_dataset = ImageDataset(
        valid_filename, mean=train_dataset.mean,
        std=train_dataset.std)  # All features are normalized with mean and std
    test_dataset = ImageDataset(test_filename,
                                mean=train_dataset.mean,
                                std=train_dataset.std)

    train_data = DataLoader(train_dataset,
                            num_workers=1,
                            pin_memory=True,
                            batch_sampler=BatchSampler(ImagesSampler(
                                train_dataset, k, shuffle=True),
                                                       batch_size=batch_size,
                                                       drop_last=False))

    valid_data = DataLoader(valid_dataset,
                            num_workers=1,
                            pin_memory=True,
                            batch_sampler=BatchSampler(ImagesSampler(
                                valid_dataset, k, shuffle=False),
                                                       batch_size=batch_size,
                                                       drop_last=False))

    test_data = DataLoader(test_dataset,
                           num_workers=1,
                           pin_memory=True,
                           batch_sampler=BatchSampler(ImagesSampler(
                               test_dataset, k, shuffle=False),
                                                      batch_size=batch_size,
                                                      drop_last=False))

    return train_data, valid_data, test_data
Example #27
def load_images_smart(folder, batch_size, k):
	train_filename = '{}/train'.format(folder)
	valid_filename = '{}/val'.format(folder)
	test_filename = '{}/test'.format(folder)
	noise_filename = '{}/noise'.format(folder)
	train_dataset = ImageDatasetSmart(train_filename)
	valid_dataset = ImageDatasetSmart(valid_filename) # All features are normalized with mean and std
	test_dataset = ImageDatasetSmart(test_filename)
	noise_dataset = ImageDatasetSmart(noise_filename)

	train_data = DataLoader(train_dataset, num_workers=1, pin_memory=True,
		batch_sampler=BatchSampler(ImagesSampler(train_dataset, k, shuffle=True), batch_size=batch_size, drop_last=False))

	valid_data = DataLoader(valid_dataset, num_workers=1, pin_memory=True,
		batch_sampler=BatchSampler(ImagesSampler(valid_dataset, k, shuffle=False), batch_size=batch_size, drop_last=False))

	test_data = DataLoader(test_dataset, num_workers=1, pin_memory=True,
		batch_sampler=BatchSampler(ImagesSampler(test_dataset, k, shuffle=False), batch_size=batch_size, drop_last=False))

	noise_data = DataLoader(noise_dataset, num_workers=1, pin_memory=True,
		batch_sampler=BatchSampler(ImagesSampler(noise_dataset, k, shuffle=False), batch_size=batch_size, drop_last=False))

	return train_data, valid_data, test_data, noise_data
Example #28
def create_data_loaders(dataconfig, args):
    assert_exactly_one([args.exclude_unlabeled, args.labeled_batch_size])

    meta = cdc.ASVSpoof19Meta(data_dir=dataconfig['root'],
                              meta_dir=dataconfig['processed_meta'],
                              folds_num=1,  # default
                              attack_type=dataconfig['attack_type'])

    fl_train = meta.fold_list(fold=1, data_split=cdc.ASVSpoof19Meta.DataSplit.train)
    dataset = cdd.ArkDataGenerator(data_file=dataconfig['feat_storage'],
                                   fold_list=fl_train,
                                   transform=dataconfig['train_trans'],
                                   rand_slides=True)

    if args.labels:
        with open(args.labels) as f:
            labels = dict(line.split('\t') for line in f.read().splitlines())
        labeled_idxs, unlabeled_idxs = data.relabel_dataset(dataset, labels)

    if args.exclude_unlabeled:
        sampler = SubsetRandomSampler(labeled_idxs)
        batch_sampler = BatchSampler(sampler, args.batch_size, drop_last=True)
    elif args.labeled_batch_size:
        batch_sampler = data.TwoStreamBatchSampler(
            unlabeled_idxs, labeled_idxs, args.batch_size, args.labeled_batch_size)
    else:
        assert False, "labeled batch size {}".format(args.labeled_batch_size)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_sampler=batch_sampler,
                                               num_workers=args.workers,
                                               pin_memory=True)

    #####
    fl_eval = meta.fold_list(fold=1, data_split=cdc.ASVSpoof19Meta.DataSplit.validation)  # TODO note val == train
    eval_data = cdd.ArkDataGenerator(data_file=dataconfig['feat_storage'],
                                     fold_list=fl_eval,
                                     transform=dataconfig['eval_trans'],
                                     rand_slides=True)
    #####

    eval_loader = torch.utils.data.DataLoader(
        eval_data,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=2 * args.workers,  # Needs images twice as fast
        pin_memory=True,
        drop_last=False)

    return train_loader, eval_loader
Example #29
 def __init__(self,
              data,
              batch_size,
              drop_last=False,
              sort_key='utr_len',
              bucket_size_multiplier=100):
     self.data = data
     self.sampler = Basic_sampler(data)
     super().__init__(self.sampler, batch_size, drop_last)
     self.sort_key = sort_key
     _bucket_size = batch_size * bucket_size_multiplier
     if hasattr(self.sampler, "__len__"):
         _bucket_size = min(_bucket_size, len(self.sampler))
     self.bucket_sampler = BatchSampler(self.sampler, _bucket_size, False)
Example #30
def test_IterationBasedBatchSampler():
    from torch.utils.data.sampler import SequentialSampler, BatchSampler
    sampler = SequentialSampler([i for i in range(10)])
    batch_sampler = BatchSampler(sampler, batch_size=2, drop_last=True)
    batch_sampler = IterationBasedBatchSampler(batch_sampler, 5)

    # check __len__
    assert len(batch_sampler) == 5
    for i, index in enumerate(batch_sampler):
        assert [i * 2, i * 2 + 1] == index

    # check start iter
    batch_sampler.start_iter = 2
    assert len(batch_sampler) == 3
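The test above pins down the expected behaviour of IterationBasedBatchSampler: it re-iterates the wrapped BatchSampler until a fixed number of batches has been emitted, and its length is num_iterations minus start_iter. A minimal wrapper consistent with that test (a sketch, not necessarily the project's actual implementation):

class IterationBasedBatchSampler:
    """Yields batches from a wrapped BatchSampler, cycling it until
    `num_iterations` batches (counted from `start_iter`) have been produced."""

    def __init__(self, batch_sampler, num_iterations, start_iter=0):
        self.batch_sampler = batch_sampler
        self.num_iterations = num_iterations
        self.start_iter = start_iter

    def __iter__(self):
        iteration = self.start_iter
        while iteration < self.num_iterations:
            for batch in self.batch_sampler:
                iteration += 1
                if iteration > self.num_iterations:
                    break
                yield batch

    def __len__(self):
        return self.num_iterations - self.start_iter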