def __init__(self, state, comm, action_id, reward, state_prime, comm_prime, done, env):
    # convert state and state_prime to scaled versions
    scaled_state = scale_state(state, env)
    scaled_state_prime = scale_state(state_prime, env)
    # this might need to be changed based on what is communicated
    scaled_comm = scale_state(comm, env)
    scaled_comm_prime = scale_state(comm_prime, env)
    # one-hot encode the action
    action = one_hot(action_id, nr_actions=env.nr_actions)

    self.scaled_state = scaled_state
    self.scaled_comm = scaled_comm
    self.action = action
    self.reward = reward
    self.scaled_state_prime = scaled_state_prime
    self.scaled_comm_prime = scaled_comm_prime
    self.done = done
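# The constructor above relies on two helpers defined elsewhere in the project,
# scale_state and one_hot, whose implementations are not shown here. The
# following is a minimal sketch of plausible versions, assuming env exposes
# x_limit / y_limit grid bounds (hypothetical attribute names).
import numpy as np

def scale_state_sketch(state, env):
    # Normalize a (1, 2) grid position into [0, 1] per coordinate.
    limits = np.array([[env.x_limit, env.y_limit]], dtype=float)
    return np.asarray(state, dtype=float) / limits

def one_hot_sketch(action_id, nr_actions):
    # Return a (1, nr_actions) row vector with a single 1 at action_id.
    encoding = np.zeros((1, nr_actions))
    encoding[0, action_id] = 1.0
    return encoding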
state = np.array([[0, 0]])
state_history = []
reward_history = []
action_history = []
env.terminated = False
steps = 0

while not env.terminated:
    action_id = agent.action_based_on_policy(state)
    new_state, reward = env.step(action_id, state)
    reward_history.append(reward)
    state_history.append(state)
    action_history.append(one_hot(action_id, 4))
    state = new_state + 0  # "+ 0" forces a copy of the array so the history keeps distinct states
    steps += 1

print("... terminated at: " + str(steps))
print("... reshaping the data")
state_history = shape_adopter(state_history, 2)
action_history = shape_adopter(action_history, 4)
reward_history = shape_adopter(reward_history, 1)
reward_to_go = calulate_reward_to_go(reward_history, gamma=0.95)
reward_weighted_actions = np.multiply(action_history, reward_to_go)
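# calulate_reward_to_go and shape_adopter come from elsewhere in the project.
# As a sketch of what the reward-to-go step computes, under the usual definition
# R_t = sum_{k >= t} gamma^(k - t) * r_k, a minimal stand-alone version could be:
import numpy as np

def reward_to_go_sketch(rewards, gamma=0.95):
    # rewards: array-like of shape (T, 1); returns discounted rewards-to-go, same shape.
    rewards = np.asarray(rewards, dtype=float).reshape(-1)
    out = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out.reshape(-1, 1)

# e.g. reward_to_go_sketch([[0], [0], [1]]) -> [[0.9025], [0.95], [1.0]]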
print("... initial state is "+str(initial_state)) state_1, terminated_1, steps_1 = initializer(initial_state[0, 0], initial_state[0, 1]) state_2, terminated_2, steps_2 = initializer(initial_state[0, 0], initial_state[0, 1]) step = 0 filename = '' animations_dir = 'animations/' os.makedirs(animations_dir) while not terminated_1 and not terminated_2: # the first agent # print("agent 1") action_id = agent_1.action_based_on_policy(state_1, env) one_hot_action = one_hot(action_id, nr_actions) new_state, reward, terminated_1 = env.step(action_id, state_1) scaled_state_1 = scale_state(state_1, env) #histories_1.appending(reward, scaled_state_1, one_hot_action) plt.scatter(state_1[0, 0], state_1[0, 1], s=100, c='#C1C7C9', marker='s') plt.scatter(new_state[0, 0], new_state[0, 1], s=50, c='red') plt.show() plt.pause(0.1) state_1, steps_1 = update_state_step(new_state, steps_1) # the second agent # print("agent 2") action_id = agent_2.action_based_on_policy(state_2, env) one_hot_action = one_hot(action_id, nr_actions) new_state, reward, terminated_2 = env.step(action_id, state_2)
def prepare_learning_materials(self, events, env):
    '''
    Create the y vector (the learning targets).
    The y vector is
        y(s, c, a) := r(s, c, a) + gamma * Q_t(s', c', argmax_a' Q(s', c', a'))
    with Q_t the target net. For terminal transitions (done), only the reward is kept.

    Keyword arguments:
    events -- a list of events
    env -- the environment

    returns:
    y vector
    '''
    debug = False
    nr_samples = len(events)

    s_primes = [x.scaled_state_prime for x in events]
    s_primes = np.array(s_primes)
    s_primes = np.reshape(s_primes, (nr_samples, -1))

    c_primes = [x.scaled_comm_prime for x in events]
    c_primes = np.array(c_primes)
    c_primes = np.reshape(c_primes, (nr_samples, -1))

    r = [x.reward for x in events]
    r = np.array(r)
    r = np.reshape(r, (nr_samples, 1))

    done = [x.done for x in events]
    done = np.array(done)
    done = np.reshape(done, (nr_samples, 1))

    nr_actions = env.nr_actions

    if debug:
        pdb.set_trace()

    # evaluate the online net Q for every action to find argmax_a' Q(s', c', a')
    for action_id in range(nr_actions):
        action = one_hot(action_id, nr_actions=env.nr_actions)
        actions = np.full((nr_samples, nr_actions), action)
        inputs_for_Q = {
            'Q_input_sa': np.concatenate((s_primes, actions), axis=1),
            'Q_input_comm': c_primes}
        if action_id == 0:
            tmp = self.Q.predict(inputs_for_Q)
        else:
            tmp = np.concatenate((tmp, self.Q.predict(inputs_for_Q)), axis=1)

    tmp = np.argmax(tmp, axis=1)

    if debug:
        pdb.set_trace()

    # evaluate the target net Q_t at the argmax actions chosen by the online net
    inputs_for_Q_t = {
        'Qt_input_sa': np.concatenate((s_primes, tf.one_hot(tmp, depth=nr_actions)), axis=1),
        'Qt_input_comm': c_primes}
    y = r + self.gamma * self.Q_t.predict(inputs_for_Q_t) * (1 - done)
    return y
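# prepare_learning_materials only builds the targets; the fit on the online
# network and the target-network sync live elsewhere. A minimal sketch of those
# two steps, assuming self.Q is a Keras model with the same named inputs used
# above and that the target net is synced by a hard weight copy (both are
# assumptions, not the project's confirmed API):
import numpy as np

def learn_from_events_sketch(self, events, env):
    nr_samples = len(events)
    s = np.reshape(np.array([x.scaled_state for x in events]), (nr_samples, -1))
    c = np.reshape(np.array([x.scaled_comm for x in events]), (nr_samples, -1))
    a = np.reshape(np.array([x.action for x in events]), (nr_samples, -1))
    y = self.prepare_learning_materials(events, env)
    inputs = {'Q_input_sa': np.concatenate((s, a), axis=1),
              'Q_input_comm': c}
    # Regress Q(s, c, a) towards the bootstrapped targets y.
    self.Q.fit(inputs, y, epochs=1, verbose=0)

def sync_target_sketch(self):
    # Hard update: copy the online weights into the target network.
    self.Q_t.set_weights(self.Q.get_weights())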
def action_based_on_Q_target(self, agent_state, comm, env, epsilon):
    '''
    Chooses an action from an epsilon-smoothed softmax policy over the Q-target values:
    1 - for the given agent_state/comm, predict the Q-target value of every action
    2 - turn the Q values into probabilities with a softmax
    3 - mix those probabilities with a uniform distribution controlled by epsilon and sample an action

    Keyword arguments:
    agent_state -- the current agent state
    comm -- the communicated state
    env -- the environment
    epsilon -- the exploration parameter (epsilon = 0 gives the pure softmax policy)

    returns:
    the id of the chosen action
    '''
    debug = False
    nr_samples = 1
    nr_actions = self.nr_actions

    scaled_state = scale_state(agent_state, env)
    scaled_state = np.array(scaled_state)
    scaled_state = np.reshape(scaled_state, (nr_samples, -1))

    # this part might need to change, depending on what is communicated
    scaled_comm = scale_state(comm, env)
    scaled_comm = np.array(scaled_comm)
    scaled_comm = np.reshape(scaled_comm, (nr_samples, -1))

    if debug:
        print("scaled_state", scaled_state)
        pdb.set_trace()

    # collect the predicted Q-target value for every action
    for action_id in range(nr_actions):
        action = one_hot(action_id, nr_actions=nr_actions)
        inputs_for_Q_t = {
            'Qt_input_sa': np.concatenate((scaled_state, action), axis=1),
            'Qt_input_comm': scaled_comm}
        if action_id == 0:
            tmp = self.Q_t.predict(inputs_for_Q_t)
            if debug:
                print("the predicted Q for action", action_id, "is", tmp)
        else:
            tmp = np.concatenate((tmp, self.Q_t.predict(inputs_for_Q_t)), axis=1)
            if debug:
                print("the predicted Q for action", action_id, "is", tmp)

    tmp = tmp[0]
    probabilities = tf.math.softmax(tmp)
    probabilities = (probabilities + epsilon) / (1.0 + epsilon * nr_actions)
    probabilities = probabilities.numpy()
    probabilities = probabilities / np.sum(probabilities)
    if debug:
        print(probabilities, np.sum(probabilities) - 1)

    chosen_act = np.random.choice(nr_actions, p=probabilities)
    if debug:
        print("chosen_act", chosen_act)
        pdb.set_trace()
    return chosen_act
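# The epsilon smoothing above mixes the softmax distribution with a uniform one:
# (p_i + epsilon) / (1 + epsilon * nr_actions) still sums to 1 and keeps every
# action's probability at least epsilon / (1 + epsilon * nr_actions), so some
# exploration always remains. A small numeric check with plain numpy,
# independent of any Q network:
import numpy as np

q_values = np.array([1.0, 0.5, 0.5, -1.0])
epsilon = 0.1
p = np.exp(q_values - np.max(q_values))
p = p / p.sum()                                   # softmax of the Q values
p = (p + epsilon) / (1.0 + epsilon * len(q_values))
print(p, p.sum())                                 # the probabilities still sum to 1.0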