def init_double_heap(length: int):
    """Initialize heap with 2 actions for the pi2forward agent"""
    reward_statistics = []
    for action_one_idx in range(pow(2, length)):
        action_one = Utility.get_bitstring_from_decimal(action_one_idx, length)
        for action_two_idx in range(pow(2, length)):
            action_two = Utility.get_bitstring_from_decimal(action_two_idx, length)
            # (expected_reward, actions, number_of_times)
            heapq.heappush(reward_statistics, (2, action_one + action_two, 0))
    return reward_statistics
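# Illustrative sketch (not part of the agent code): Python's heapq is a min-heap over
# tuples compared element-wise, so the entry with the smallest first element sits at
# index 0 of the underlying list. The tuple values below are hypothetical.
if __name__ == "__main__":
    import heapq as _heapq_demo
    _demo = []
    for _entry in [(-0.5, "0110", 3), (-0.9, "0011", 5), (0.2, "1100", 1)]:
        _heapq_demo.heappush(_demo, _entry)
    assert _demo[0] == (-0.9, "0011", 5)  # smallest stored value surfaces first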
def init_table(self) -> dict:
    """Returns a dict where every observation has an ordered list of actions with rewards"""
    table = {}
    a_length = self.environment.action_length
    o_length = self.environment.observation_length
    if o_length == 0:
        table[''] = Utility.init_heap(a_length)
    else:
        for observation_idx in range(pow(2, o_length)):
            observation = format(observation_idx, 'b').zfill(o_length)
            table[observation] = Utility.init_heap(a_length)
    return table
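# Illustrative sketch: format(i, 'b').zfill(n) as used above enumerates all n-bit
# observation keys in order, e.g. "00".."11" for n = 2.
if __name__ == "__main__":
    assert [format(i, 'b').zfill(2) for i in range(pow(2, 2))] == ["00", "01", "10", "11"]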
def trial(pair: Tuple[Type[Agent], Type[Environment]], sign_bit: str,
          num_trials: int = 100, num_cycles: int = 10000,
          seed: int = 1) -> Tuple[str, str, str, list]:
    """Run the agent in the environment for num_trials trials of num_cycles cycles each"""
    agent_class, environment_class = pair[0], pair[1]
    ag_name, env_name = agent_class.__name__, environment_class.__name__
    rewards = []
    sign = "positive" if sign_bit == "0" else "negative"
    # constants used to derive a fresh deterministic seed per trial
    a, m = 16807, (pow(2, 31) - 1)
    for trial_idx in range(num_trials):
        total_reward = 0
        environment = environment_class(sign_bit, seed=seed)
        seed = (seed * a) % m
        agent = agent_class(environment=environment, seed=seed)
        seed = (seed * a) % m
        observation = "0" * environment.observation_length
        for cycle_idx in range(num_cycles):
            action = agent.calculate_action(observation)
            observation, reward = environment.calculate_percept(action)
            total_reward += Utility.get_reward_from_bitstring(reward)
            agent.train(reward)
        # normalize by the maximum achievable average reward per cycle
        total_reward /= num_cycles * environment_class.max_average_reward_per_cycle
        rewards.append(total_reward)
    return ag_name, env_name, sign, rewards
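# Illustrative sketch: the seed update in trial() is the Park-Miller "minimal
# standard" Lehmer generator (a = 16807, m = 2**31 - 1); each multiplication yields
# the next deterministic seed, which keeps trials reproducible. The helper below is
# hypothetical and only restates the recurrence in isolation.
def _next_seed(seed: int, a: int = 16807, m: int = pow(2, 31) - 1) -> int:
    return (seed * a) % m


if __name__ == "__main__":
    assert _next_seed(1) == 16807
    assert _next_seed(16807) == 282475249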
def calculate_action(self, observation: str) -> str:
    """Feed percept into nn and calculate best activations action. Returns action."""
    action = ""
    reward = -2  # sentinel below any achievable reward
    # calculate expected reward by trying every action
    number_of_actions = pow(2, self.environment.action_length)
    if self.seeded_rand_range(0, 10) == 0:
        # explore: occasionally pick a uniformly random action
        action_idx = self.seeded_rand_range(0, number_of_actions)
        action_string = format(action_idx, 'b').zfill(self.environment.action_length)
        nn_input = NNUtility.bitstr_to_narray(observation + action_string)
        nn_output = self.nn.forward(nn_input)
        action = action_string
        self.activations = nn_output
    else:
        # exploit: evaluate every action and keep the highest predicted reward
        for action_idx in range(number_of_actions):
            action_string = format(action_idx, 'b').zfill(self.environment.action_length)
            nn_input = NNUtility.bitstr_to_narray(observation + action_string)
            nn_output = self.nn.forward(nn_input)
            reward_string = NNUtility.narray_to_bitstr(nn_output[1][-1])
            action_reward = Utility.get_reward_from_bitstring(reward_string)
            if action_reward == reward:
                # break ties at random
                if self.seeded_rand_range() == 1:
                    action = action_string
                    self.activations = nn_output
            elif action_reward > reward:
                action = action_string
                reward = action_reward
                self.activations = nn_output
    return action
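# Illustrative sketch: the exploration branch above follows an epsilon-greedy rule
# (roughly one exploration step in ten). The standalone helper below is hypothetical;
# the agent itself uses its seeded_rand_range for reproducibility.
import random


def _epsilon_greedy(values, rng: random.Random, epsilon: float = 0.1) -> int:
    if rng.random() < epsilon:
        return rng.randrange(len(values))                         # explore
    return max(range(len(values)), key=lambda i: values[i])       # exploit


if __name__ == "__main__":
    assert _epsilon_greedy([0.1, 0.9, 0.3], random.Random(0), epsilon=0.0) == 1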
def train(self, reward: str):
    """Add the returned reward to the statistic of the last chosen action"""
    # rewards are stored negated, so the min-heap yields the best action first
    reward_value = -1 * Utility.get_reward_from_bitstring(reward)
    expected_reward = self.action_statistic[0]
    action = self.action_statistic[1]
    cnt = self.action_statistic[2]
    # incremental mean over all rewards observed for this action
    new_expected_reward = (expected_reward * cnt + reward_value) / (cnt + 1)
    heapq.heappush(self.table[self.observation],
                   (new_expected_reward, action, cnt + 1))
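# Illustrative sketch: the update above is an incremental (running) mean; the new
# expected reward follows from the previous mean and count without storing the full
# reward history. The helper name below is hypothetical.
def _incremental_mean(prev_mean: float, cnt: int, sample: float) -> float:
    return (prev_mean * cnt + sample) / (cnt + 1)


if __name__ == "__main__":
    assert _incremental_mean(0.0, 0, 1.0) == 1.0
    assert _incremental_mean(1.0, 1, 0.0) == 0.5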
def train(self, reward: str):
    """Add the returned reward to the statistic of the last chosen action pair"""
    self.cnt += 1
    self.r = self.nr
    self.nr = -1 * Utility.get_reward_from_bitstring(reward)
    action_heap = self.table[self.sl_o][self.sl_a][self.l_o]
    for idx in range(len(action_heap)):
        if action_heap[idx][1] == self.l_a + self.a:
            expected_reward = action_heap[idx][0]
            combined_reward = self.r + self.nr
            cnt = action_heap[idx][2]
            new_expected_reward = (expected_reward * cnt + combined_reward) / (cnt + 1)
            action_heap[idx] = (new_expected_reward, action_heap[idx][1], cnt + 1)
            # restore the heap invariant around the modified entry
            if expected_reward < combined_reward:
                Utility.heapq_siftup(action_heap, idx)
            else:
                Utility.heapq_siftdown(action_heap, 0, idx)
            break
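# Illustrative sketch: after changing one entry's key in place, the heap invariant
# could also be restored wholesale with heapq.heapify at O(n) cost, whereas the
# targeted sift used above repairs only the affected path. The values below are
# hypothetical.
if __name__ == "__main__":
    import heapq as _heapq_demo
    _h = [(-0.9, "0011", 5), (-0.5, "0110", 3), (0.2, "1100", 1)]
    _heapq_demo.heapify(_h)
    _h[1] = (-1.0, "0110", 4)   # in-place update breaks the invariant
    _heapq_demo.heapify(_h)     # full O(n) repair
    assert _h[0] == (-1.0, "0110", 4)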
def init_table(self) -> dict:
    """Returns a dict with reward statistics for every combination of oao|ar"""
    table = {}
    a_length = self.environment.action_length
    o_length = self.environment.observation_length
    if o_length == 0:
        table[''] = {}
        for action_idx in range(pow(2, a_length)):
            action = Utility.get_bitstring_from_decimal(action_idx, a_length)
            table[''][action] = {}
            table[''][action][''] = Utility.init_heap(a_length)
    else:
        for last_observation_idx in range(pow(2, o_length)):
            last_observation = Utility.get_bitstring_from_decimal(
                last_observation_idx, o_length)
            table[last_observation] = {}
            for action_idx in range(pow(2, a_length)):
                action = Utility.get_bitstring_from_decimal(action_idx, a_length)
                table[last_observation][action] = {}
                for observation_idx in range(pow(2, o_length)):
                    observation = Utility.get_bitstring_from_decimal(
                        observation_idx, o_length)
                    table[last_observation][action][observation] = \
                        Utility.init_heap(a_length)
    return table
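# Illustrative sketch: the resulting table maps last_observation -> action ->
# observation to a reward-statistics heap. For 1-bit observations and actions the
# same key structure looks like this (empty lists stand in for initialized heaps):
if __name__ == "__main__":
    _demo_table = {lo: {a: {o: [] for o in ("0", "1")}
                        for a in ("0", "1")}
                   for lo in ("0", "1")}
    assert list(_demo_table["0"]["1"].keys()) == ["0", "1"]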
class NNAgentSigmoid(Agent):
    """Template for neural network based agents"""
    data_dir = Utility.get_data_path()
    learning_rate = 0.01

    def __init__(self, environment: Environment, seed: int,
                 hidden_size: List[int], activation_name: str = "relu"):
        """Load appropriate parameters depending on environment and learning time"""
        super().__init__(environment, seed)
        input_size = [self.environment.observation_length
                      + self.environment.action_length]
        output_size = [self.environment.reward_length]
        self.nn = NeuralNetworkSigmoid(activation_name,
                                       input_size + hidden_size + output_size)
        self.activations = ([], [])

    def calculate_action(self, observation: str) -> str:
        """Feed percept into nn and calculate best activations action. Returns action."""
        action = ""
        reward = -2  # sentinel below any achievable reward
        # calculate expected reward by trying every action
        number_of_actions = pow(2, self.environment.action_length)
        if self.seeded_rand_range(0, 10) == 0:
            # explore: occasionally pick a uniformly random action
            action_idx = self.seeded_rand_range(0, number_of_actions)
            action_string = format(action_idx, 'b').zfill(self.environment.action_length)
            nn_input = NNUtility.bitstr_to_narray(observation + action_string)
            nn_output = self.nn.forward(nn_input)
            action = action_string
            self.activations = nn_output
        else:
            # exploit: evaluate every action and keep the highest predicted reward
            for action_idx in range(number_of_actions):
                action_string = format(action_idx, 'b').zfill(self.environment.action_length)
                nn_input = NNUtility.bitstr_to_narray(observation + action_string)
                nn_output = self.nn.forward(nn_input)
                reward_string = NNUtility.narray_to_bitstr(nn_output[1][-1])
                action_reward = Utility.get_reward_from_bitstring(reward_string)
                if action_reward == reward:
                    # break ties at random
                    if self.seeded_rand_range() == 1:
                        action = action_string
                        self.activations = nn_output
                elif action_reward > reward:
                    action = action_string
                    reward = action_reward
                    self.activations = nn_output
        return action

    def train(self, reward: str):
        """Train agent on received reward"""
        self.nn.backward(self.activations, NNUtility.bitstr_to_narray(reward))
import pickle
import matplotlib.pyplot as plt
from python.src import Utility
from matplotlib import rc

rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 12})
rc('text', usetex=True)
plt.rcParams['axes.axisbelow'] = True

data_dir_path = Utility.get_data_path()
plots_path = Utility.get_plots_path()
apiq_dict_path = data_dir_path.joinpath("apiq_dict.apiq")
apiq_dict = pickle.load(apiq_dict_path.open("rb"))

# bar plot of apiq values
f = plt.figure(figsize=(6, 4), dpi=400)
agents = list(apiq_dict.keys())
apiq_values = [v["mean"] for v in apiq_dict.values()]
apiq_errors = [v["error"] for v in apiq_dict.values()]
ypos = [len(agents) - 1 - y for y in range(len(agents))]
plt.barh(ypos[:4], apiq_values[:4], color="#076678")
plt.barh(ypos[4:5], apiq_values[4:5], color="#689d6a")
plt.barh(ypos[5:], apiq_values[5:], color="#8f3f71")
plt.errorbar(apiq_values, ypos, xerr=apiq_errors, fmt=',', ecolor='black', capsize=4)
plt.yticks(
    ypos,