Example 1
def init_double_heap(length: int):
    """initialize heap with 2 actions for pi2forward agent"""
    reward_statistics = []
    for action_one_idx in range(pow(2, length)):
        action_one = Utility.get_bitstring_from_decimal(action_one_idx, length)
        for action_two_idx in range(pow(2, length)):
            action_two = Utility.get_bitstring_from_decimal(action_two_idx, length)
            # (expected_reward, actions, number_of_times)
            heapq.heappush(reward_statistics, (2, action_one + action_two, 0))
    return reward_statistics
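The snippet relies on the project's Utility.get_bitstring_from_decimal helper, which is not shown in these examples. Judging from the format(idx, 'b').zfill(length) pattern used in Example 2, a minimal sketch (an assumption, not the project's actual code) would be:

def get_bitstring_from_decimal(decimal: int, length: int) -> str:
    """Assumed behaviour: binary representation of decimal, left-padded with zeros to length bits."""
    return format(decimal, 'b').zfill(length)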
Example 2
 def init_table(self) -> dict:
     """Returns a dict where every observation has an ordered list of actions with rewards"""
     table = {}
     a_length = self.environment.action_length
     o_length = self.environment.observation_length
     if o_length == 0:
         table[''] = Utility.init_heap(a_length)
     else:
         for observation_idx in range(pow(2, o_length)):
             observation = format(observation_idx, 'b').zfill(o_length)
             table[observation] = Utility.init_heap(a_length)
     return table
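Utility.init_heap itself does not appear in these examples. By analogy with init_double_heap from Example 1, a plausible single-action sketch (hedged, not the project's actual implementation) is:

import heapq

def init_heap(length: int):
    """Pre-fill one (expected_reward, action, number_of_times) entry per possible action."""
    reward_statistics = []
    for action_idx in range(pow(2, length)):
        action = format(action_idx, 'b').zfill(length)
        # start value 2 mirrors the optimistic initialisation used in init_double_heap
        heapq.heappush(reward_statistics, (2, action, 0))
    return reward_statistics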
Example 3
def trial(pair: Tuple[Type[Agent], Type[Environment]],
          sign_bit: str,
          num_trials: int = 100,
          num_cycles: int = 10000,
          seed: int = 1) -> Tuple[str, str, str, list]:
    """Trial agent in environment"""
    agent_class, environment_class = pair[0], pair[1]
    ag_name, env_name = agent_class.__name__, environment_class.__name__
    rewards = []
    sign = "positive" if sign_bit == "0" else "negative"
    a, m = 16807, (pow(2, 31) - 1)
    for trial_idx in range(num_trials):
        total_reward = 0
        environment = environment_class(sign_bit, seed=seed)
        seed = (seed * a) % m
        agent = agent_class(environment=environment, seed=seed)
        seed = (seed * a) % m
        observation = "0" * environment.observation_length
        for cycle_idx in range(num_cycles):
            action = agent.calculate_action(observation)
            observation, reward = environment.calculate_percept(action)
            total_reward += Utility.get_reward_from_bitstring(reward)
            agent.train(reward)
        total_reward /= num_cycles * environment_class.max_average_reward_per_cycle
        rewards.append(total_reward)
    return ag_name, env_name, sign, rewards
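The function expects a concrete (agent, environment) pair. The class names below are placeholders rather than classes defined in these examples:

# SomeAgent and SomeEnvironment are hypothetical Agent/Environment subclasses
name_a, name_e, sign, rewards = trial((SomeAgent, SomeEnvironment), sign_bit="0",
                                      num_trials=10, num_cycles=1000, seed=1)
print(name_a, name_e, sign, sum(rewards) / len(rewards))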
Example 4
 def calculate_action(self, observation: str) -> str:
     """Feed the percept into the nn for every action and pick the one with the best predicted reward (occasionally a random action for exploration). Returns the action."""
     action = ""
     reward = -2
     # calculate expected reward by trying every action
     number_of_actions = pow(2, self.environment.action_length)
     if self.seeded_rand_range(0, 10) == 0:
         action_idx = self.seeded_rand_range(0, number_of_actions)
         action_string = format(action_idx, 'b').zfill(self.environment.action_length)
         nn_input = NNUtility.bitstr_to_narray(observation + action_string)
         nn_output = self.nn.forward(nn_input)
         action = action_string
         self.activations = nn_output
     else:
         for action_idx in range(number_of_actions):
             action_string = format(action_idx, 'b').zfill(self.environment.action_length)
             nn_input = NNUtility.bitstr_to_narray(observation + action_string)
             nn_output = self.nn.forward(nn_input)
             reward_string = NNUtility.narray_to_bitstr(nn_output[1][-1])
             action_reward = Utility.get_reward_from_bitstring(reward_string)
             if action_reward == reward:
                 i = self.seeded_rand_range()
                 if i == 1:
                     action = action_string
                     self.activations = nn_output
             else:
                 if action_reward > reward:
                     action = action_string
                     reward = action_reward
                     self.activations = nn_output
     return action
Example 5
 def train(self, reward: str):
     """Add returned reward to statistic"""
     reward_value = -1 * Utility.get_reward_from_bitstring(reward)
     expected_reward = self.action_statistic[0]
     action = self.action_statistic[1]
     cnt = self.action_statistic[2]
     new_expected_reward = (expected_reward * cnt + reward_value) / (cnt + 1)
     heapq.heappush(self.table[self.observation], (new_expected_reward, action, cnt + 1))
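Because rewards are negated before averaging, Python's min-heap keeps the action with the highest observed average reward at the root, so the current best action for an observation can be read off without popping (a sketch assuming the table layout of Example 2):

# heap root = action with the best (most negative stored) expected reward so far
best_expected_reward, best_action, count = table[observation][0]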
Example 6
 def train(self, reward: str):
     """Add returned reward to statistic"""
     self.cnt += 1
     self.r = self.nr
     self.nr = -1 * Utility.get_reward_from_bitstring(reward)
     action_heap = self.table[self.sl_o][self.sl_a][self.l_o]
     for idx in range(len(action_heap)):
         if action_heap[idx][1] == self.l_a + self.a:
             expected_reward = action_heap[idx][0]
             reward = self.r + self.nr
             cnt = action_heap[idx][2]
             new_expected_reward = (expected_reward * cnt + reward) / (cnt + 1)
             action_heap[idx] = (new_expected_reward, action_heap[idx][1], cnt + 1)
             if expected_reward < reward:
                 Utility.heapq_siftup(action_heap, idx)
             else:
                 Utility.heapq_siftdown(action_heap, 0, idx)
             break
Example 7
 def init_table(self) -> dict:
     """Returns a dict with reward statistics for every combination of oao|ar"""
     table = {}
     a_length = self.environment.action_length
     o_length = self.environment.observation_length
     if o_length == 0:
         table[''] = {}
         for action_idx in range(pow(2, a_length)):
             action = Utility.get_bitstring_from_decimal(
                 action_idx, a_length)
             table[''][action] = {}
             table[''][action][''] = Utility.init_heap(a_length)
     else:
         for last_observation_idx in range(pow(2, o_length)):
             last_observation = Utility.get_bitstring_from_decimal(
                 last_observation_idx, o_length)
             table[last_observation] = {}
             for action_idx in range(pow(2, a_length)):
                 action = Utility.get_bitstring_from_decimal(
                     action_idx, a_length)
                 table[last_observation][action] = {}
                 for observation_idx in range(pow(2, o_length)):
                     observation = Utility.get_bitstring_from_decimal(
                         observation_idx, o_length)
                     table[last_observation][action][
                         observation] = Utility.init_heap(a_length)
     return table
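The resulting structure is keyed as table[last_observation][action][observation], with each leaf holding an action heap. A lookup sketch (variable names assumed):

# statistics for the percept sequence last_observation, action, observation
action_heap = table[last_observation][action][observation]
best_expected_reward, best_action, count = action_heap[0]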
Example 8
class NNAgentSigmoid(Agent):
    """Template for neural network based agents"""

    data_dir = Utility.get_data_path()
    learning_rate = 0.01

    def __init__(self, environment: Environment, seed: int, hidden_size: [int], activation_name: str = "relu"):
        """Load appropriate parameters depending on environment and learning time"""
        super().__init__(environment, seed)
        input_size = [self.environment.observation_length + self.environment.action_length]
        output_size = [self.environment.reward_length]
        self.nn = NeuralNetworkSigmoid(activation_name, input_size + hidden_size + output_size)
        self.activations = ([], [])

    def calculate_action(self, observation: str) -> str:
        """Feed the percept into the nn for every action and pick the one with the best predicted reward (occasionally a random action for exploration). Returns the action."""
        action = ""
        reward = -2
        # calculate expected reward by trying every action
        number_of_actions = pow(2, self.environment.action_length)
        if self.seeded_rand_range(0, 10) == 0:
            action_idx = self.seeded_rand_range(0, number_of_actions)
            action_string = format(action_idx, 'b').zfill(self.environment.action_length)
            nn_input = NNUtility.bitstr_to_narray(observation + action_string)
            nn_output = self.nn.forward(nn_input)
            action = action_string
            self.activations = nn_output
        else:
            for action_idx in range(number_of_actions):
                action_string = format(action_idx, 'b').zfill(self.environment.action_length)
                nn_input = NNUtility.bitstr_to_narray(observation + action_string)
                nn_output = self.nn.forward(nn_input)
                reward_string = NNUtility.narray_to_bitstr(nn_output[1][-1])
                action_reward = Utility.get_reward_from_bitstring(reward_string)
                if action_reward == reward:
                    i = self.seeded_rand_range()
                    if i == 1:
                        action = action_string
                        self.activations = nn_output
                else:
                    if action_reward > reward:
                        action = action_string
                        reward = action_reward
                        self.activations = nn_output
        return action

    def train(self, reward: str):
        """Train agent on received reward"""
        self.nn.backward(self.activations, NNUtility.bitstr_to_narray(reward))
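A hedged instantiation sketch, reusing the percept loop from Example 3; SomeEnvironment stands in for a concrete Environment subclass:

environment = SomeEnvironment(sign_bit="0", seed=1)  # hypothetical Environment subclass
agent = NNAgentSigmoid(environment=environment, seed=2, hidden_size=[32, 32])
observation = "0" * environment.observation_length
for _ in range(1000):
    action = agent.calculate_action(observation)
    observation, reward = environment.calculate_percept(action)
    agent.train(reward)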
Example 9
import pickle
import matplotlib.pyplot as plt
from python.src import Utility
from matplotlib import rc
rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 12})
rc('text', usetex=True)
plt.rcParams['axes.axisbelow'] = True

data_dir_path = Utility.get_data_path()
plots_path = Utility.get_plots_path()

apiq_dict_path = data_dir_path.joinpath("apiq_dict.apiq")
apiq_dict = pickle.load(apiq_dict_path.open("rb"))

# bar plot of apiq values
f = plt.figure(figsize=(6, 4), dpi=400)
agents = list(apiq_dict.keys())
apiq_values = [v["mean"] for v in apiq_dict.values()]
apiq_errors = [v["error"] for v in apiq_dict.values()]
ypos = [len(agents) - 1 - y for y in range(len(agents))]
plt.barh(ypos[:4], apiq_values[:4], color="#076678")
plt.barh(ypos[4:5], apiq_values[4:5], color="#689d6a")
plt.barh(ypos[5:], apiq_values[5:], color="#8f3f71")
plt.errorbar(apiq_values,
             ypos,
             xerr=apiq_errors,
             fmt=',',
             ecolor='black',
             capsize=4)
plt.yticks(
    ypos,