Example #1
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(Variable(state))
    m = Multinomial(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.data[0]
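Examples #1 and #3 are written against the pre-0.4 PyTorch API: torch.autograd.Variable wrappers, indexing through .data, and a torch.distributions.Multinomial that took a probability vector and sampled class indices (the role played by Categorical in later releases). As a point of comparison, a minimal self-contained sketch of the same action-selection step on current PyTorch could look like the following; the tiny placeholder policy is an assumption, not the network from the original repository.

import torch
import torch.nn as nn
from torch.distributions import Categorical

# Placeholder policy: 4 state features -> 2 action probabilities.
policy = nn.Sequential(nn.Linear(4, 2), nn.Softmax(dim=1))
saved_log_probs = []

def select_action(state):
    state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
    probs = policy(state)              # shape (1, num_actions)
    m = Categorical(probs)             # modern counterpart of the old Multinomial(probs)
    action = m.sample()
    saved_log_probs.append(m.log_prob(action))
    return action.item()

The stored log-probabilities are what the REINFORCE update later scales by the return.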
Example #2
    def decide(self, choices: List[Any]) -> int:

        inputs = list(map(lambda choice: torch.FloatTensor(choice), choices))
        enhanced_features = list(
            map(lambda vec: self._base_network.model.forward(vec), inputs))
        action_features = list(
            map(lambda vec: self._policy_gradient.model.forward(vec.detach()),
                enhanced_features))

        # Get move
        probabilities = Function.softmax(torch.cat(list(action_features)))
        distribution = Multinomial(1, probabilities)
        move = distribution.sample()
        _, index_of_move = move.max(0)

        # Expected reward
        expected_reward = self._value_function.model(
            enhanced_features[index_of_move])
        log_probability = distribution.log_prob(move)

        # Record estimate
        self.rounds.append(
            Round(value=expected_reward, log_probability=log_probability))

        # Return
        return index_of_move.item()
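Example #2 (and Example #9 at the end of this page) draws a single trial from Multinomial(1, probabilities), which returns a one-hot vector rather than an index, and then recovers the index with .max(0). The pattern in isolation, with a made-up probability vector:

import torch
from torch.distributions import Multinomial

probabilities = torch.tensor([0.1, 0.7, 0.2])   # hypothetical move probabilities
distribution = Multinomial(1, probabilities)

move = distribution.sample()                    # one-hot, e.g. tensor([0., 1., 0.])
log_probability = distribution.log_prob(move)   # log of the chosen probability

_, index_of_move = move.max(0)                  # position of the 1 in the one-hot vector
print(index_of_move.item(), log_probability.item())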
Example #3
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs, state_value = model(Variable(state))
    m = Multinomial(probs)
    action = m.sample()
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
    return action.data[0]
Example #4
def select_action(state, variance=1, temp=10):
    # this function selects stochastic actions based on the policy probabilities
    state = torch.from_numpy(state).float().unsqueeze(0)
    action_scores = actor(state)
    prob = F.softmax(action_scores / temp, dim=1)
    m = Multinomial(vaccine_supply, prob[0])
    action = m.sample()
    log_prob = m.log_prob(action)
    entropy = -(log_prob * prob).sum(1, keepdim=True)
    return action.numpy(), log_prob, entropy
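Examples #4 through #8 come from a vaccine-allocation setting where the action is a count vector rather than a single index: Multinomial(vaccine_supply, prob[0]) distributes a fixed number of doses across regions in proportion to the softmax of the actor's scores, with temp flattening the distribution. The snippets rely on an actor network, a vaccine_supply constant and, in some of them, a myLog file handle defined elsewhere in their repositories. A standalone sketch of just the sampling step, with made-up sizes and values:

import torch
import torch.nn.functional as F
from torch.distributions import Multinomial

vaccine_supply = 10                    # assumed value: total doses allocated per step
temp = 10
action_scores = torch.randn(1, 5)      # stand-in for actor(state) over 5 regions

prob = F.softmax(action_scores / temp, dim=1)
m = Multinomial(vaccine_supply, prob[0])
action = m.sample()                    # e.g. tensor([2., 1., 3., 2., 2.]); sums to 10
log_prob = m.log_prob(action)

Because the probabilities sum to one, the entropy expression in Example #4, -(log_prob * prob).sum(1, keepdim=True), reduces to -log_prob, a single-sample estimate of the entropy, whereas Examples #5 and #6 compute -sum(p * log p) of the softmax directly.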
Example #5
def select_action(state, variance=1, temp=10):
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    action_scores = actor(state)
    print(action_scores, file=myLog)
    prob = F.softmax(action_scores / temp, dim=1)
    m = Multinomial(vaccine_supply, prob[0])
    action = m.sample()
    log_prob = m.log_prob(action)
    entropy = -torch.sum(torch.log(prob) * prob, axis=-1)
    return action.numpy(), log_prob, entropy
Example #6
def select_action(state, variance=1, temp=10):
    # this function selects stochastic actions based on the policy probabilities
    # torch.tensor(..., dtype=torch.float32) is used here rather than
    # torch.from_numpy(np.array(...)).float(), which is reportedly a bit slower
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

    action_scores = actor(state)
    print(action_scores, file=myLog)
    prob = F.softmax(action_scores / temp, dim=1)
    m = Multinomial(vaccine_supply, prob[0])
    action = m.sample()
    log_prob = m.log_prob(action)
    entropy = -torch.sum(torch.log(prob) * prob, axis=-1)
    return action.numpy(), log_prob, entropy
Example #7
    def select_action(self, state, temp=1):
        # this function selects stochastic actions based on the policy probabilities
        state = torch.tensor(state, dtype=torch.float32,
                             device=self.device).unsqueeze(0)
        logits = self.actor(state)

        # TODO: check this one later
        logits_norm = (logits - torch.mean(logits)) / \
                             (torch.std(logits) + 1e-5)

        m = Multinomial(self.args.vaccine_supply,
                        logits=logits_norm.squeeze() / temp)
        action = m.sample()
        log_prob = m.log_prob(action)
        entropy = -torch.sum(m.logits * m.probs)
        return action.to('cpu').numpy(), log_prob, entropy
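Examples #7 and #8 switch to the logits parameterisation and take the entropy bonus straight from the distribution object: for torch.distributions.Multinomial, m.probs and m.logits are the normalised category probabilities and their logarithms, so -torch.sum(m.logits * m.probs) is the entropy of the per-dose categorical distribution (not of the full count vector). A short check of that identity with made-up logits:

import torch
from torch.distributions import Categorical, Multinomial

logits = torch.randn(5)                    # stand-in for the normalised actor output
m = Multinomial(10, logits=logits)

entropy = -torch.sum(m.logits * m.probs)   # -sum(p * log p) over the 5 categories
print(torch.allclose(entropy, Categorical(logits=logits).entropy()))   # True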
Example #8
def select_action(state, variance=1, temp=1):
    # this function selects stochastic actions based on the policy probabilities    
    # torch.tensor(..., dtype=torch.float32) is used here rather than
    # torch.from_numpy(np.array(...)).float(), which is reportedly a bit slower
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    action_scores = actor(state)
    action_scores_norm = (action_scores-torch.mean(action_scores))/\
                         (torch.std(action_scores)+1e-5)
    # the logits parameterisation replaces the softmax-based
    # Multinomial(vaccine_supply, prob[0]) call used in Examples #4-#6
    m = Multinomial(vaccine_supply, logits=action_scores_norm.squeeze() / temp)
    action = m.sample()
    log_prob = m.log_prob(action)
    # m.logits are the normalised log-probabilities, so this is -sum(p * log p)
    entropy = -torch.sum(m.logits * m.probs, axis=-1)
    return action.to('cpu').numpy(), log_prob, entropy
Example #9
    def evaluate(self, possible_boards):
        # possible_boards -> neural network -> sigmoid -> last_layer_sigmoid
        last_layer_outputs = self.run_through_neural_network(possible_boards)
        # last_layer_sigmoid = list(map(lambda x: x.sigmoid(), last_layer_outputs))

        # Decide move and save log_prob for backward
        # We make sure not to affect the value fn with .detach()

        probs = self.pg_plugin._softmax(last_layer_outputs)
        distribution = Multinomial(1, probs)
        move = distribution.sample()
        self.saved_log_probabilities.append(distribution.log_prob(move))

        _, move = move.max(0)
        # calculate the value estimation and save for backward
        value_estimate = self.pg_plugin.value_model(last_layer_outputs[move])
        self.saved_value_estimations.append(value_estimate)
        return move
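Examples #2 and #9 both feed a shared feature vector into a policy head and a value head, and the .detach() call (explicit in Example #2, described in Example #9's comment) keeps the policy-gradient loss from backpropagating through the shared features. A toy illustration of what detach does to the gradient path, with hypothetical layer sizes:

import torch
import torch.nn as nn

base = nn.Linear(4, 8)                         # shared feature extractor (toy sizes)
policy_head = nn.Linear(8, 3)
value_head = nn.Linear(8, 1)

features = base(torch.randn(1, 4))
policy_out = policy_head(features.detach())    # gradient flow stops at the detach
value_out = value_head(features)               # the value path still reaches `base`

(policy_out.sum() + value_out.sum()).backward()
print(base.weight.grad.abs().sum() > 0)        # non-zero, but only via the value path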