Example #1
  def play_episode(self, episode: int):
    state = self.env.reset()
    previous_x = None
    episode_actions = torch.empty(size=(0,), dtype=torch.long, device=self.device)
    episode_logits = torch.empty(size=(0, self.env.action_space.n), device=self.device)
    average_rewards = numpy.empty(shape=(0,), dtype=numpy.float64)
    episode_rewards = numpy.empty(shape=(0,), dtype=numpy.float64)

    while True:
      #if not self.render:
      #  self.env.render()
      current_x = self.PreProcessing(state)
      x = current_x - previous_x if previous_x is not None else numpy.zeros_like(current_x)
      previous_x = current_x
      action_logits = self.agent(torch.tensor(x).float().unsqueeze(dim=0).to(self.device))
      episode_logits = torch.cat((episode_logits, action_logits), dim=0)
      action = Categorical(logits=action_logits).sample()
      episode_actions = torch.cat((episode_actions, action), dim=0)

      state, reward, done, _ = self.env.step(action=action.cpu().item())
      episode_rewards = numpy.concatenate((episode_rewards, numpy.array([reward])), axis=0)
      average_rewards = numpy.concatenate((average_rewards, numpy.expand_dims(numpy.mean(episode_rewards), axis=0)), axis=0)

      if done:
        episode += 1
        discounted_rewards = PG_RL.get_discounted_rewards(rewards=episode_rewards, gamma=self.gamma)
        discounted_rewards -= average_rewards
        discounted_rewards /= numpy.std(discounted_rewards)
        sum_of_rewards = numpy.sum(episode_rewards)
        mask = one_hot(episode_actions, num_classes=self.env.action_space.n)
        episode_log_probs = torch.sum(mask.float() * log_softmax(episode_logits, dim=1), dim=1)
        episode_weighted_log_probs = episode_log_probs * torch.tensor(discounted_rewards).float().to(self.device)
        sum_weighted_log_probs = torch.sum(episode_weighted_log_probs).unsqueeze(dim=0)
        #show_video()
        return sum_weighted_log_probs, episode_logits, sum_of_rewards, episode
Example #2
 def agent_step(self):
     with torch.no_grad():
         action_probs, log_probs, termination_probs, q_u, q_omega = self.policy(
             self.state)
         if self.current_option is None:
             self.current_option = Categorical(
                 probs=self._epsilon_probs(q_omega[0])).sample()
         action = Categorical(
             probs=action_probs[0, self.current_option, :]).sample()
         action = action.cpu().detach().numpy()
         # action = self.env.action_space.sample()
         action = int(action)
     next_state, reward, done, info = self.env.step(action)
     # self.env.render()
     self.replay_buffer.add([
         self.state, action, self.current_option, self.previous_option,
         reward, next_state, done
     ])
     self.state = next_state
     self.previous_option = self.current_option
     if done:
         self.agent_reset()
     elif termination_probs[0, self.current_option] >= torch.rand(1):
         self.current_option = None
     return reward, done
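
Example #2 calls a self._epsilon_probs helper that is not shown. A minimal standalone sketch of an epsilon-greedy distribution over the option-value vector q_omega (the function name and the epsilon parameter are assumptions, not part of the original code):

import torch

def epsilon_option_probs(q_omega: torch.Tensor, epsilon: float) -> torch.Tensor:
    """Epsilon-greedy probabilities over options from per-option Q-values."""
    num_options = q_omega.shape[0]
    probs = torch.full_like(q_omega, epsilon / num_options)  # uniform exploration mass
    probs[q_omega.argmax()] += 1.0 - epsilon                 # remaining mass on the greedy option
    return probs

# usage sketch: option = Categorical(probs=epsilon_option_probs(q_omega[0], epsilon=0.1)).sample()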
Example #3
    def evaluate(self, true_labels, all_preds, entropies, **kwargs):
        ood_entropies = np.zeros(0)
        accuracies = []

        with torch.no_grad():
            for batch_num, batch in enumerate(self.ds_loader):
                x, y = batch
                x = x.to(self.device)

                if not self.ensemble:
                    out = self.model(x)
                else:
                    out = 0
                    for model in self.ensemble:
                        out += model(x)
                    out /= len(self.ensemble)
                probs = F.softmax(out, dim=-1)
                preds, _ = torch.max(probs, dim=-1)

                # entropy
                entropy = Categorical(probs).entropy().squeeze()
                entropies = np.concatenate(
                    (entropies, entropy.detach().cpu().numpy()))
                ood_entropies = np.concatenate(
                    (ood_entropies, entropy.cpu().numpy()))

                # accuracy
                predictions = out.argmax(dim=-1, keepdim=True).view_as(y).cpu()
                correct = y.eq(predictions).sum().item()
                acc = correct / out.shape[0]

                accuracies.append(acc)

                true_labels = np.concatenate((true_labels, np.zeros(len(x))))
                all_preds = np.concatenate((all_preds, preds.cpu().reshape(
                    (-1))))

        auroc = calculate_auroc(true_labels, all_preds)
        aupr = calculate_aupr(true_labels, all_preds)

        auroc_entropy = calculate_auroc(1 - true_labels, entropies)
        aupr_entropy = calculate_aupr(1 - true_labels, entropies)

        auroc_name = f'auroc_{self.ds_dataset}'
        aupr_name = f'aupr_{self.ds_dataset}'
        auroc_ent_name = f'auroc_entropy_{self.ds_dataset}'
        aupr_ent_name = f'aupr_entropy_{self.ds_dataset}'
        entropy_name = f'entropy_{self.ds_dataset}'
        acc_name = f"acc_{self.ds_dataset}"

        return {
            acc_name: np.mean(accuracies),
            auroc_name: auroc,
            aupr_name: aupr,
            entropy_name: np.mean(ood_entropies),
            auroc_ent_name: auroc_entropy,
            aupr_ent_name: aupr_entropy
        }
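
Examples #3 and #7 call calculate_auroc and calculate_aupr, which are not defined in this listing. If they simply wrap scikit-learn's ranking metrics (an assumption), they could look like this:

import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

def calculate_auroc(true_labels: np.ndarray, scores: np.ndarray) -> float:
    """Area under the ROC curve for binary labels and continuous scores."""
    return roc_auc_score(true_labels, scores)

def calculate_aupr(true_labels: np.ndarray, scores: np.ndarray) -> float:
    """Area under the precision-recall curve."""
    return average_precision_score(true_labels, scores)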
Example #4
    def play_ep(self):
        # reset env state after every episode
        state = self.env.reset() 
        prev_x = None
        episode_actions = torch.empty(size=(0,), dtype=torch.long, device=self.device)
        episode_logits = torch.empty(size=(0, 2), device=self.device)
        average_rewards = np.empty(shape=(0,), dtype=np.float64)
        episode_rewards = np.empty(shape=(0,), dtype=np.float64)
    
        while True:
            # render env for display 
            if self.render_env:
                self.env.render()

            # preprocess the current state and subtract the previous state to add in motion information
            cur_x = prepro(state)    
            x = cur_x - prev_x if prev_x is not None else np.zeros(self.in_sz).astype(np.float32)
            prev_x = cur_x

            # get choice from network
            action_logit = self.agent(torch.tensor(x).float().unsqueeze(0).to(self.device))
            # add to buffer
            episode_logits = torch.cat((episode_logits, action_logit), dim=0)
            # sample an action and execute it
            action = Categorical(logits=action_logit).sample()
            # add to buffer
            episode_actions = torch.cat((episode_actions, action),dim=0)

            state, reward, done, _ = self.env.step(action=action.cpu().item())

            # add to buffer 
            episode_rewards = np.concatenate((episode_rewards, np.array([reward])), axis=0)
            
            # running average of the rewards up to the current time step (state-specific baseline)
            average_rewards = np.concatenate((average_rewards, np.expand_dims(np.mean(episode_rewards), axis=0)), axis=0)

            if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
                print(('ep #: game finished, reward: %f' % (reward)) + ('' if reward == -1 else ' !!!!!!!!'))
                
            if done: # end of episode
                # get discounted rewards and normalize the return
                discounted_rewards = discount_rewards(episode_rewards, gamma=self.gamma)
                    
                # subtract baseline rewards 
                discounted_rewards -= average_rewards
                    
                # set mask for the actions executed 
                mask = one_hot(episode_actions, num_classes=2)
                
                # similar to cross-entropy for classification but with fake labels and our action confidence
                weighted_ps = torch.sum(mask.float() * log_softmax(episode_logits, dim=1), dim=1)
                    
                # weight the loss with the discounted rewards to get expected reward from distribution 
                episode_weighted_loss = weighted_ps * torch.tensor(discounted_rewards).float().to(self.device)
                
                return episode_weighted_loss, episode_logits, episode_rewards
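
Example #4 calls prepro and discount_rewards, neither of which is shown. discount_rewards computes the same rewards-to-go as the helper sketched after Example #11; prepro is presumably a Karpathy-style Pong preprocessor along these lines (the exact crop bounds and background pixel values are assumptions):

import numpy as np

def prepro(frame: np.ndarray) -> np.ndarray:
    """Crop a 210x160x3 Atari frame, downsample by 2, binarize, and flatten to float32."""
    frame = frame[35:195]              # crop to the playing field
    frame = frame[::2, ::2, 0].copy()  # downsample by a factor of 2, keep one color channel
    frame[frame == 144] = 0            # erase background (type 1)
    frame[frame == 109] = 0            # erase background (type 2)
    frame[frame != 0] = 1              # paddles and ball become 1
    return frame.astype(np.float32).ravel()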
Example #5
    def get_action(self, state_np):
        state_th = torch.tensor(state_np).float()
        action_th = self.forward(state_th)

        if self.type == 'discrete':
            action_sampled_th = Categorical(logits=action_th).sample()
        else:
            raise NotImplementedError

        action_sampled_np = action_sampled_th.cpu().detach().numpy()
        return action_sampled_np
Example #6
    def choose_action(self, states, buffer=True):
        probs, values = self.forward(states)
        # print("values:", values)
        # print("probs:", probs)
        actions = Categorical(probs).sample()

        if buffer:
            self.state_buffer.append(states)
            self.value_buffer.append(values)
            self.prob_buffer.append(probs)
            self.action_buffer.append(torch.unsqueeze(actions, 1))
        # print("actions:", actions)
        actions = actions.cpu().numpy() + 1
        values = values.detach().cpu().numpy()
        probs = probs.detach().cpu().numpy()
        return actions, values, probs
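
Example #6 passes probabilities positionally (Categorical(probs)), presumably because its forward pass already applies a softmax, while most of the other examples pass raw scores via Categorical(logits=...). The distinction matters: the first positional argument is interpreted as (unnormalized) probabilities, not logits. A small check:

import torch
from torch.distributions import Categorical

logits = torch.tensor([2.0, 0.0, -1.0])

# These two define the same distribution:
d1 = Categorical(logits=logits)
d2 = Categorical(probs=logits.softmax(dim=-1))
print(torch.allclose(d1.probs, d2.probs))  # True

# Categorical(x) treats x as probabilities, so it only matches the
# logits-based examples when x already comes out of a softmax.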
Example #7
    def evaluate(self, true_labels, all_preds, entropies, **kwargs):
        ood_entropies = np.zeros(0)

        with torch.no_grad():
            for batch_num, batch in enumerate(self.ood_loader):
                x, y = batch
                x = x.float().to(self.device)

                if not self.ensemble:
                    out = self.model(x)
                else:
                    out = 0
                    for model in self.ensemble:
                        out += model(x)
                    out /= len(self.ensemble)
                probs = F.softmax(out, dim=-1)
                preds, _ = torch.max(probs, dim=-1)

                entropy = Categorical(probs).entropy().squeeze()
                entropies = np.concatenate(
                    (entropies, entropy.detach().cpu().numpy()))
                ood_entropies = np.concatenate(
                    (ood_entropies, entropy.cpu().numpy()))

                true_labels = np.concatenate((true_labels, np.zeros(len(x))))
                all_preds = np.concatenate((all_preds, preds.cpu().reshape(
                    (-1))))

        auroc = calculate_auroc(true_labels, all_preds)
        aupr = calculate_aupr(true_labels, all_preds)

        auroc_entropy = calculate_auroc(1 - true_labels, entropies)
        aupr_entropy = calculate_aupr(1 - true_labels, entropies)

        auroc_name = f'auroc_{self.ood_dataset}'
        aupr_name = f'aupr_{self.ood_dataset}'
        auroc_ent_name = f'auroc_entropy_{self.ood_dataset}'
        aupr_ent_name = f'aupr_entropy_{self.ood_dataset}'
        entropy_name = f'entropy_{self.ood_dataset}'

        return {
            auroc_name: auroc,
            aupr_name: aupr,
            entropy_name: np.mean(ood_entropies),
            auroc_ent_name: auroc_entropy,
            aupr_ent_name: aupr_entropy
        }
Example #8
    def _step(self, obs, hiddens, masks):

        with torch.no_grad():
            values, action_probs, hiddens = self.model(obs, hiddens, masks)

        actions = Categorical(action_probs.detach()).sample()

        # Sample actions from the output distributions
        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())
        obs = torch.from_numpy(obs)
        rewards = torch.from_numpy(rewards).unsqueeze(1)
        masks = torch.from_numpy(1 - (dones)).unsqueeze(1)
        actions = actions.unsqueeze(1)

        self.rollouts.insert(
            obs,  #next
            hiddens,  #next
            actions,  #now
            action_probs,  #now
            values,  #now
            rewards,  #now
            masks)  #next
Example #9
    def get_next_batch(self, env):

        for _ in range(C.NUM_EPOCHS):

            epoch_logits = torch.empty(size=(0, self.action_space_size),
                                       device=self.DEVICE)
            epoch_weighted_log_probs = torch.empty(size=(0, ),
                                                   dtype=torch.float,
                                                   device=self.DEVICE)
            total_rewards = deque([], maxlen=C.BATCH_SIZE_PER_THREAD)

            episode_counter = 0

            while episode_counter < C.BATCH_SIZE_PER_THREAD:

                episode_counter += 1

                # reset the environment to a random initial state every epoch
                state = env.reset()

                # initialize the episode arrays
                episode_actions = torch.empty(size=(0, ),
                                              dtype=torch.long,
                                              device=self.DEVICE)
                episode_logits = torch.empty(size=(0, C.action_space_size),
                                             device=self.DEVICE)
                average_rewards = np.empty(shape=(0, ), dtype=np.float64)
                episode_rewards = np.empty(shape=(0, ), dtype=np.float64)

                # episode loop
                for step_index in range(0, C.max_simulation_length):

                    # get the action logits from the agent - (preferences)
                    action_logits = self.m(
                        torch.tensor(state).float().unsqueeze(dim=0).to(
                            self.DEVICE))

                    # append the logits to the episode logits list
                    episode_logits = torch.cat((episode_logits, action_logits),
                                               dim=0)

                    # sample an action according to the action distribution
                    action = Categorical(logits=action_logits).sample()

                    # append the action to the episode action list to obtain the trajectory
                    # we need to store the actions and logits so we could calculate the gradient of the performance
                    episode_actions = torch.cat((episode_actions, action),
                                                dim=0)

                    # take the chosen action, observe the reward and the next state
                    state, reward, done, _ = env.step(
                        action=action.cpu().item())

                    # append the reward to the rewards pool that we collect during the episode
                    # we need the rewards so we can calculate the weights for the policy gradient
                    # and the baseline of average
                    episode_rewards = np.concatenate(
                        (episode_rewards, np.array([reward])), axis=0)

                    # here the average reward is state specific
                    average_rewards = np.concatenate(
                        (average_rewards,
                         np.expand_dims(np.mean(episode_rewards), axis=0)),
                        axis=0)

                # turn the rewards we accumulated during the episode into the rewards-to-go:
                # earlier actions are responsible for more rewards than the later taken actions
                discounted_rewards_to_go = utils.get_discounted_rewards(
                    rewards=episode_rewards, gamma=C.GAMMA)
                discounted_rewards_to_go -= average_rewards  # baseline - state specific average

                # calculate the sum of the rewards for the running average metric
                sum_of_rewards = np.sum(episode_rewards)

                # after each episode append the sum of total rewards to the deque
                total_rewards.append(sum_of_rewards)

                # set the mask for the actions taken in the episode
                mask = one_hot(episode_actions,
                               num_classes=C.action_space_size)

                # calculate the log-probabilities of the taken actions
                # mask is needed to filter out log-probabilities of not related logits
                episode_log_probs = torch.sum(
                    mask.float() * log_softmax(episode_logits, dim=1), dim=1)

                # weight the episode log-probabilities by the rewards-to-go
                episode_weighted_log_probs = episode_log_probs * \
                    torch.tensor(discounted_rewards_to_go).float().to(self.DEVICE)

                # calculate the sum over trajectory of the weighted log-probabilities
                sum_weighted_log_probs = torch.sum(
                    episode_weighted_log_probs).unsqueeze(dim=0)

                # append the weighted log-probabilities of actions
                epoch_weighted_log_probs = torch.cat(
                    (epoch_weighted_log_probs, sum_weighted_log_probs), dim=0)

                # append the logits - needed for the entropy bonus calculation
                epoch_logits = torch.cat((epoch_logits, episode_logits), dim=0)

                # calculate the loss
                loss, entropy = utils.calculate_loss(
                    C.BETA,
                    epoch_logits=epoch_logits,
                    weighted_log_probs=epoch_weighted_log_probs)

            yield loss, total_rewards
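
Example #9 calls utils.calculate_loss(C.BETA, epoch_logits=..., weighted_log_probs=...), which is not shown. A plausible sketch of a REINFORCE loss with an entropy bonus that matches this call signature (the exact form used by the original utils module is an assumption):

import torch
from torch.distributions import Categorical

def calculate_loss(beta: float,
                   epoch_logits: torch.Tensor,
                   weighted_log_probs: torch.Tensor):
    """Policy-gradient loss: negative mean of the reward-weighted log-probs,
    minus an entropy bonus (weighted by beta) that encourages exploration."""
    policy_loss = -torch.mean(weighted_log_probs)
    entropy = Categorical(logits=epoch_logits).entropy().mean()
    return policy_loss - beta * entropy, entropy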
Example #10
    def _update(self, states, actions, rewards, advantages, returns, masks,
                epoch):
        old_model = copy.deepcopy(self.model)

        policy_losses = np.array([])
        entropies = np.array([])
        value_losses = np.array([])
        losses = np.array([])

        for _ in range(self.ppo_epochs):
            rand_list = (torch.randperm(self.batch_num * self.batch_size).view(
                -1, self.batch_size).tolist())

            for ind in rand_list:
                batch = states[ind]
                actor_logits, vals, _ = self.model(batch)
                log_probs = F.log_softmax(actor_logits, dim=1)
                with torch.no_grad():
                    old_actor_logits, _, _ = old_model(batch)
                    old_log_probs = F.log_softmax(old_actor_logits, dim=1)

                adv = advantages[ind].to(self.device)
                advs = advantages.to(self.device)
                adv = (adv - advs.mean()) / (advs.std() + 1e-8)

                A = returns[ind].to(self.device) - vals

                action = actions[ind].to(self.device)

                old_log_probs = old_log_probs.gather(1, action)
                log_probs = log_probs.gather(1, action)

                r = (log_probs - old_log_probs).exp()

                clip = r.clamp(min=1 - self.epsilon, max=1 + self.epsilon)
                L, _ = torch.stack([r * adv.detach(),
                                    clip * adv.detach()]).min(0)
                v_l = A.pow(2).mean()
                L = L.mean()

                entropy = Categorical(F.softmax(actor_logits,
                                                dim=1)).entropy().mean()

                loss = -L + self.v_loss_coef * v_l - self.entropy_coef * entropy

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(),
                                         self.max_grad_norm)
                self.optimizer.step()

                policy_losses = np.append(policy_losses,
                                          L.cpu().detach().numpy())
                value_losses = np.append(value_losses,
                                         v_l.cpu().detach().numpy())
                losses = np.append(losses, loss.cpu().detach().numpy())
                entropies = np.append(entropies,
                                      entropy.cpu().detach().numpy())

        policy_loss = policy_losses.mean()
        value_loss = value_losses.mean()
        loss = losses.mean()
        entropy = entropies.mean()

        self.writer.add_scalar("PolicyLoss", policy_loss, epoch + 1)
        self.writer.add_scalar("ValueLoss", value_loss, epoch + 1)
        self.writer.add_scalar("Loss", loss, epoch + 1)
        self.writer.add_scalar("Entropy", entropy, epoch + 1)

        del states, actions, rewards, advantages, returns, masks
Example #11
def play_episode(environment, device, action_space_size, agent, gamma,
                 episode: int):
    """
            Plays an episode of the environment.
            episode: the episode counter
            Returns:
                sum_weighted_log_probs: the sum of the log-prob of an action multiplied by the reward-to-go from that state
                episode_logits: the logits of every step of the episode - needed to compute entropy for entropy bonus
                finished_rendering_this_epoch: pass-through rendering flag
                sum_of_rewards: sum of the rewards for the episode - needed for the average over 200 episode statistic
        """

    agent.to('cpu')
    device = 'cpu'

    # reset the environment to a random initial state every epoch
    state = environment.reset()

    # initialize the episode arrays
    episode_actions = torch.empty(size=(0, ), dtype=torch.long, device=device)
    episode_logits = torch.empty(size=(0, action_space_size), device=device)
    average_rewards = np.empty(shape=(0, ), dtype=np.float64)
    episode_rewards = np.empty(shape=(0, ), dtype=np.float64)

    # episode loop
    while True:

        # get the action logits from the agent - (preferences)
        action_logits = agent(
            torch.tensor(state).float().unsqueeze(dim=0).to(device))

        #print('action logits is',action_logits)

        # append the logits to the episode logits list
        episode_logits = torch.cat((episode_logits, action_logits), dim=0)

        # sample an action according to the action distribution
        action = Categorical(logits=action_logits).sample()

        #print('the action after categorical is',action)

        # append the action to the episode action list to obtain the trajectory
        # we need to store the actions and logits so we could calculate the gradient of the performance
        episode_actions = torch.cat((episode_actions, action), dim=0)

        # take the chosen action, observe the reward and the next state
        state, reward, done, _ = environment.step(action=action.cpu().item())

        # append the reward to the rewards pool that we collect during the episode
        # we need the rewards so we can calculate the weights for the policy gradient
        # and the baseline of average
        episode_rewards = np.concatenate((episode_rewards, np.array([reward])),
                                         axis=0)

        # here the average reward is state specific
        average_rewards = np.concatenate(
            (average_rewards, np.expand_dims(np.mean(episode_rewards),
                                             axis=0)),
            axis=0)

        # the episode is over
        if done:

            # increment the episode
            episode += 1

            # turn the rewards we accumulated during the episode into the rewards-to-go:
            # earlier actions are responsible for more rewards than the later taken actions
            discounted_rewards_to_go = utils.get_discounted_rewards(
                rewards=episode_rewards, gamma=gamma)
            discounted_rewards_to_go -= average_rewards  # baseline - state specific average

            # # calculate the sum of the rewards for the running average metric
            sum_of_rewards = np.sum(episode_rewards)

            # set the mask for the actions taken in the episode
            mask = one_hot(episode_actions,
                           num_classes=environment.action_space.n)

            # calculate the log-probabilities of the taken actions
            # mask is needed to filter out log-probabilities of not related logits
            episode_log_probs = torch.sum(mask.float() *
                                          log_softmax(episode_logits, dim=1),
                                          dim=1)

            # weight the episode log-probabilities by the rewards-to-go
            episode_weighted_log_probs = episode_log_probs * \
                torch.tensor(discounted_rewards_to_go).float().to(device)

            # calculate the sum over trajectory of the weighted log-probabilities
            sum_weighted_log_probs = torch.sum(
                episode_weighted_log_probs).unsqueeze(dim=0)

            # device was set to 'cpu' above, so a single move is sufficient
            sum_weighted_log_probs = sum_weighted_log_probs.to(device)
            episode_logits = episode_logits.to(device)

            return sum_weighted_log_probs, episode_logits, sum_of_rewards, episode
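
Examples #1, #9, and #11 all rely on a get_discounted_rewards helper. A minimal sketch under the conventional rewards-to-go definition G_t = r_t + gamma * G_{t+1} (the original implementation may differ):

import numpy as np

def get_discounted_rewards(rewards: np.ndarray, gamma: float) -> np.ndarray:
    """Discounted rewards-to-go, computed by a backward pass over the episode."""
    discounted = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted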
Example #12
    def inference(
        self,
        sent_memory_emb,
        graph_memory_emb,
        sent_memory_mask,
        graph_memory_mask,
        max_step,
        use_sampling=False,
    ):
        batch_size, sent_memory_seq, dim = list(sent_memory_emb.shape)
        _, graph_memory_seq, _ = list(graph_memory_emb.shape)

        sent_memory_mask_inv = sent_memory_mask == 0  # [batch, sent_memory_seq]
        graph_memory_mask_inv = graph_memory_mask == 0  # [batch, sent_memory_seq]

        target_ids = [[self.BOS
                       for i in range(batch_size)]]  # [target_seq, batch]
        target_mask = [[1.0] for i in range(batch_size)]  # [batch, target_seq]
        target_prob = []  # [target_seq, batch]
        is_finish = [False for _ in range(batch_size)]
        rows = torch.arange(batch_size).to(device)
        for step in range(max_step):
            cur_seq = step + 1
            cur_emb = self.dec_word_embedding(
                torch.tensor(target_ids).to(device))  # [cur_seq, batch, dim]
            cur_emb = self.position_encoder(cur_emb)  # [cur_seq, batch, dim]

            cur_mask = torch.tensor(target_mask).to(device)
            cur_mask_inv = cur_mask == 0.0  # [batch, cur_seq]
            cur_triu_mask = torch.triu(torch.ones(cur_seq, cur_seq).to(device),
                                       diagonal=1)  # [cur_seq, cur_seq]
            cur_triu_mask.masked_fill_(cur_triu_mask == 1, -1e20)

            cur_emb = self.decoder(
                cur_emb,
                sent_memory_emb,  # [batch, sent_len, dim]
                graph_memory_emb,  # [batch, graph_len, dim]
                tgt_mask=cur_triu_mask,
                tgt_key_padding_mask=cur_mask_inv,
                sent_memory_key_padding_mask=sent_memory_mask_inv,
                graph_memory_key_padding_mask=graph_memory_mask_inv,
            )  # [batch, cur_seq, dim]

            assert has_nan(cur_emb) is False

            # break after the first time when all items are finished
            if all(is_finish) or step == max_step - 1:
                cur_len = cur_mask.sum(dim=1).long()
                target_vec = universal_sentence_embedding(
                    cur_emb, cur_mask, cur_len)
                break

            # generating step outputs
            logits = self.projector(cur_emb[:, -1, :]).view(
                batch_size, self.word_vocab_size)  # [batch, vocab]
            if use_sampling is False:
                indices = logits.argmax(dim=1)  # [batch]
            else:
                indices = Categorical(logits=logits).sample()  # [batch]

            prob = F.softmax(logits, dim=1)[rows, indices]  # [batch]
            target_prob.append(prob)
            indices = indices.cpu().tolist()
            target_ids.append(indices)
            for i in range(batch_size):
                target_mask[i].append(
                    0.0 if is_finish[i] else
                    1.0)  # based on if is_finish in the last step

            for i in range(batch_size):
                is_finish[i] |= indices[i] == self.EOS

        target_ids = list(map(list,
                              zip(*target_ids[1:])))  # [batch, target_seq]
        target_mask = torch.tensor([x[1:] for x in target_mask
                                    ]).to(device)  # [batch, target_seq]
        target_prob = torch.stack(target_prob, dim=1)  # [batch, target_seq]
        return target_vec, target_ids, target_prob, target_mask
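
Example #12 calls universal_sentence_embedding(cur_emb, cur_mask, cur_len) without showing it. Judging by the name and arguments it is a masked pooling over the sequence dimension; a mean-pool sketch under that assumption:

import torch

def universal_sentence_embedding(emb: torch.Tensor,
                                 mask: torch.Tensor,
                                 lengths: torch.Tensor) -> torch.Tensor:
    """Masked mean over the sequence dimension.

    emb:     [batch, seq, dim] token embeddings
    mask:    [batch, seq] with 1.0 for valid positions and 0.0 for padding
    lengths: [batch] number of valid positions per example
    """
    summed = (emb * mask.unsqueeze(-1)).sum(dim=1)      # [batch, dim]
    return summed / lengths.clamp(min=1).unsqueeze(-1)  # avoid division by zero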
Example #13
    def predict_mstcn(self,
                      model_dir,
                      results_dir,
                      features_path,
                      vid_list_file,
                      epoch,
                      actions_dict,
                      device,
                      sample_rate,
                      bsn_result_path,
                      mstcn_use_lbp,
                      poolingLength=99):
        self.model.eval()
        inverse_dict = {v: k for k, v in actions_dict.items()}
        lbp = LocalBarrierPooling(poolingLength)
        lbp = lbp.to(device)

        with torch.no_grad():
            self.model.to(device)
            self.model.load_state_dict(
                torch.load(model_dir + "/epoch-" + str(epoch) + ".model"))
            file_ptr = open(vid_list_file, 'r')
            list_of_vids = file_ptr.read().split('\n')[:-1]
            file_ptr.close()
            for vid in list_of_vids:
                print(vid)
                features = np.load(features_path + vid.split('.')[0] + '.npy')
                features = features[:, ::sample_rate]
                if mstcn_use_lbp:
                    num_frames = np.shape(features)[1]
                    barrier_file = bsn_result_path + vid + ".csv"
                    barrier = np.array(pd.read_csv(barrier_file))
                    temporal_scale = np.shape(barrier)[0]
                    barrier = np.transpose(barrier)
                    barrier = torch.tensor(
                        barrier, dtype=torch.float)  #size=[num_frames]
                    if temporal_scale <= num_frames:
                        resize_barrier = F.interpolate(barrier,
                                                       size=num_frames,
                                                       mode='nearest')
                    else:
                        resize_barrier = barrier
                    resize_barrier = resize_barrier.unsqueeze(0)
                    resize_barrier = resize_barrier.unsqueeze(
                        0)  # size=[1,1,num_frames]
                    resize_barrier = resize_barrier.to(device)

                input_x = torch.tensor(features, dtype=torch.float)
                input_x.unsqueeze_(0)
                input_x = input_x.to(device)
                predictions = self.model(
                    input_x, torch.ones(input_x.size(), device=device))
                predictions = predictions[-1]
                if mstcn_use_lbp:
                    if temporal_scale <= num_frames:
                        predictions = lbp(predictions, resize_barrier)
                    else:
                        predictions = F.interpolate(predictions,
                                                    size=temporal_scale,
                                                    mode='linear',
                                                    align_corners=False)
                        predictions = lbp(predictions, resize_barrier)
                        predictions = F.interpolate(predictions,
                                                    size=num_frames,
                                                    mode='linear',
                                                    align_corners=False)
                predictions = F.softmax(predictions, dim=1)
                entropy = Categorical(
                    probs=predictions.squeeze(0).transpose(1, 0)).entropy()
                entropy = entropy.cpu().numpy().astype(str)

                f_name = vid.split('/')[-1].split('.')[0]
                f_ptr = open(results_dir + "/entropy_" + f_name, "w")
                f_ptr.write(' '.join(entropy))
                f_ptr.close()

                _, predicted = torch.max(predictions.data, 1)
                predicted = predicted.squeeze()
                recognition = []
                for i in range(len(predicted)):
                    recognition = np.concatenate(
                        (recognition,
                         [inverse_dict[predicted[i].item()]] * sample_rate))
                f_name = vid.split('/')[-1].split('.')[0]
                f_ptr = open(results_dir + "/" + f_name, "w")
                f_ptr.write("### Frame level recognition: ###\n")
                f_ptr.write(' '.join(recognition))
                f_ptr.close()
Example #14
    def evaluate(self, **kwargs):
        true_labels = np.zeros(0)
        all_preds = np.zeros(0)
        all_correct = np.zeros(0)
        conf_true_labels = np.zeros(0)
        brier_scores = []
        entropies = np.zeros(0)
        acc = []
        nll = []

        with torch.no_grad():
            for batch_num, batch in enumerate(self.test_loader):
                x, y = batch
                x = x.to(self.device)

                if not self.ensemble:
                    out = self.model(x)
                else:
                    out = 0
                    for model in self.ensemble:
                        out += model(x)
                    out /= len(self.ensemble)
                # Logits to probability distribution
                probs = F.softmax(out, dim=-1)
                # Maximum softmax probability
                preds, indices = torch.max(probs, dim=-1)
                # Label predictions
                label_preds = probs.argmax(dim=-1, keepdim=True).view_as(y)
                # Compute accuracy
                corrects = y.eq(label_preds.cpu())
                correct = corrects.sum().item()
                acc.append(correct / out.shape[0])

                all_correct = np.concatenate(
                    (all_correct, corrects.cpu().numpy()))

                # Compute entropy
                entropy = Categorical(probs).entropy().squeeze()
                entropies = np.concatenate((entropies, entropy.cpu().numpy()))

                # Compute brier score
                brier_scores.append(calculate_brier_score(probs, y))

                # Compute NLL
                nll.append(-np.mean(np.log(preds.cpu().numpy())))

                true_labels = np.concatenate((true_labels, np.ones(len(x))))
                all_preds = np.concatenate((all_preds, preds.cpu().reshape(
                    (-1))))
                conf_true_labels = np.concatenate(
                    (conf_true_labels, torch.isclose(
                        y.cpu(),
                        indices.cpu()).numpy().astype(float).reshape(-1)))

        conf_auroc = calculate_auroc(conf_true_labels, all_preds)
        conf_aupr = calculate_aupr(conf_true_labels, all_preds)
        brier_score = np.mean(np.array(brier_scores))
        ece = calculate_ece(all_preds, all_correct)

        return {
            'conf_auroc': conf_auroc,
            'conf_aupr': conf_aupr,
            'brier_score': brier_score,
            'entropy': np.mean(entropies),
            'test_acc': np.mean(acc),
            'nll': np.mean(nll),
            'ece': ece,
        }, true_labels, all_preds, entropies
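
Example #14 uses calculate_brier_score and calculate_ece, neither of which appears in this listing. Sketches under the standard definitions (multi-class Brier score against a one-hot target; expected calibration error with equal-width confidence bins); the original helpers may use a different binning scheme:

import numpy as np
import torch
import torch.nn.functional as F

def calculate_brier_score(probs: torch.Tensor, targets: torch.Tensor) -> float:
    """Mean squared error between the predicted distribution and the one-hot target."""
    one_hot = F.one_hot(targets.long(), num_classes=probs.shape[-1]).float()
    return (probs.cpu() - one_hot.cpu()).pow(2).sum(dim=-1).mean().item()

def calculate_ece(confidences: np.ndarray, correct: np.ndarray, n_bins: int = 15) -> float:
    """Expected calibration error: confidence-vs-accuracy gap, weighted by bin size."""
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            ece += in_bin.mean() * abs(correct[in_bin].mean() - confidences[in_bin].mean())
    return float(ece)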
Example #15
 def act(self, x):
     with torch.no_grad():
         logits = self(x)
         m = Categorical(logits=logits).sample().squeeze()
     return m.cpu().item()
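
Example #15's act method is easy to exercise end to end. A minimal, self-contained policy using the same pattern (the two-layer MLP and its sizes are arbitrary choices for illustration):

import torch
import torch.nn as nn
from torch.distributions import Categorical

class TinyPolicy(nn.Module):
    def __init__(self, obs_dim: int = 4, n_actions: int = 2):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_dim, 32), nn.Tanh(), nn.Linear(32, n_actions))

    def forward(self, x):
        return self.net(x)

    def act(self, x):
        with torch.no_grad():
            logits = self(x)
            m = Categorical(logits=logits).sample().squeeze()
        return m.cpu().item()

policy = TinyPolicy()
obs = torch.randn(1, 4)  # a single observation with a batch dimension
print(policy.act(obs))   # prints 0 or 1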
Example #16
def main():

    # make the environments
    if args.num_envs == 1:
        env = [gym.make(args.env_name)]
    else:
        env = [gym.make(args.env_name) for i in range(args.num_envs)]

    env = MultiGym(env, render=args.render)

    n_states = env.observation_space.shape
    n_actions = env.action_space.n
    print('state shape:', n_states, 'actions:', n_actions)

    policy = ConvPolicy(n_actions).to(device)
    optimizer = optim.RMSprop(policy.parameters(), lr=args.lr)

    if args.algo == 'ppo':
        sys.path.append('../')
        from algorithms.ppo import PPO
        update_algo = PPO(policy=policy,
                          optimizer=optimizer,
                          num_steps=args.num_steps,
                          num_envs=args.num_envs,
                          state_size=(4, 105, 80),
                          entropy_coef=args.entropy,
                          gamma=args.gamma,
                          device=device,
                          epochs=args.ppo_epochs)
    else:
        sys.path.append('../')
        from algorithms.a2c import A2C
        update_algo = A2C(policy=policy,
                          optimizer=optimizer,
                          num_steps=args.num_steps,
                          num_envs=args.num_envs,
                          state_size=(4, 105, 80),
                          entropy_coef=args.entropy,
                          gamma=args.gamma,
                          device=device)

    end_rewards = []

    try:
        print('starting episodes')
        idx = 0
        d = False
        reward_sum = np.zeros((args.num_envs))
        restart = True
        frame = env.reset()
        mask = torch.ones(args.num_envs)
        all_start = time.time()

        for update_idx in range(args.num_updates):
            update_algo.policy.train()

            # stack the frames
            s = train_state_proc.proc_state(frame, mask=mask)

            # insert state before getting actions
            update_algo.states[0].copy_(s)

            start = time.time()
            for step in range(args.num_steps):

                with torch.no_grad():
                    # get probability dist and values
                    p, v = update_algo.policy(update_algo.states[step])
                    a = Categorical(p).sample()

                # take action get response
                frame, r, d = env.step(
                    a.cpu().numpy() if args.num_envs > 1 else [a.item()])
                s = train_state_proc.proc_state(frame, mask)

                update_algo.insert_experience(step=step,
                                              s=s,
                                              a=a,
                                              v=v,
                                              r=r,
                                              d=d)

                mask = torch.tensor(1. - d).float()
                reward_sum = (reward_sum + r)

                # if any episode finished append episode reward to list
                if d.any():
                    end_rewards.extend(reward_sum[d])

                # reset any rewards that finished
                reward_sum = reward_sum * mask.numpy()

                idx += 1

            with torch.no_grad():
                _, next_val = update_algo.policy(update_algo.states[-1])

            update_algo.update(next_val.view(1, args.num_envs).to(device),
                               next_mask=mask.to(device))

            if args.lr_decay:
                for params in update_algo.optimizer.param_groups:
                    params['lr'] = (
                        lr_min + 0.5 * (args.lr - lr_min) *
                        (1 + np.cos(np.pi * idx / args.num_updates)))

            # update every so often by displaying results in term
            if (update_idx % args.log_interval
                    == 0) and (len(end_rewards) > 0):
                total_steps = (idx + 1) * args.num_envs * args.num_steps
                end = time.time()
                print(end_rewards[-10:])
                print('Updates {}\t  Time: {:.4f} \t FPS: {}'.format(
                    update_idx, end - start,
                    int(total_steps / (end - all_start))))
                print(
                    'Mean Episode Rewards: {:.2f} \t Min/Max Current Rewards: {}/{}'
                    .format(np.mean(end_rewards[-10:]), reward_sum.min(),
                            reward_sum.max()))

    except KeyboardInterrupt:
        pass

    torch.save(
        update_algo.policy.state_dict(),
        '../model_weights/{}_{}_conv.pth'.format(args.env_name, args.algo))

    import pandas as pd

    out_dict = {'avg_end_rewards': end_rewards}
    out_log = pd.DataFrame(out_dict)
    out_log.to_csv('../logs/{}_{}_rewards.csv'.format(args.env_name,
                                                      args.algo),
                   index=False)

    out_dict = {
        'actor losses': update_algo.actor_losses,
        'critic losses': update_algo.critic_losses,
        'entropy': update_algo.entropy_logs
    }
    out_log = pd.DataFrame(out_dict)
    out_log.to_csv('../logs/{}_{}_training_behavior.csv'.format(
        args.env_name, args.algo),
                   index=False)

    plt.plot(end_rewards)
    plt.show()
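
Example #16 references a train_state_proc.proc_state(frame, mask) helper that is not shown. Given the (4, 105, 80) state size passed to the algorithms, it most likely grayscales, downsamples, and stacks the last four frames, zeroing the stack of any environment that just finished. A hypothetical stand-in, assuming observations arrive as a [num_envs, 210, 160, 3] uint8 array:

import numpy as np
import torch

class FrameStacker:
    """Hypothetical stand-in for train_state_proc used in Example #16."""

    def __init__(self, num_envs: int, stack: int = 4, height: int = 105, width: int = 80):
        self.frames = torch.zeros(num_envs, stack, height, width)

    def proc_state(self, frame: np.ndarray, mask: torch.Tensor) -> torch.Tensor:
        gray = torch.from_numpy(frame).float().mean(dim=-1) / 255.0  # [num_envs, 210, 160]
        small = gray[:, ::2, ::2]                                    # [num_envs, 105, 80]
        self.frames = self.frames * mask.view(-1, 1, 1, 1)           # reset finished envs
        self.frames = torch.cat((self.frames[:, 1:], small.unsqueeze(1)), dim=1)
        return self.frames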