def __init__(self, state, comm, action_id, reward, state_prime, comm_prime,
                 done, env):
        # scale state and state_prime to the environment's normalized range
        scaled_state = scale_state(state, env)
        scaled_state_prime = scale_state(state_prime, env)

        # this may need to change depending on what is communicated
        scaled_comm = scale_state(comm, env)
        scaled_comm_prime = scale_state(comm_prime, env)
        # one-hot encode the action
        action = one_hot(action_id, nr_actions=env.nr_actions)

        self.scaled_state = scaled_state
        self.scaled_comm = scaled_comm
        self.action = action
        self.reward = reward
        self.scaled_state_prime = scaled_state_prime
        self.scaled_comm_prime = scaled_comm_prime
        self.done = done
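# The class header for the constructor above is not part of this snippet; it reads like
# a transition record for a replay buffer. Below is a minimal sketch of such a buffer,
# assuming the record class is called Event (name assumed) and is built with the
# constructor above. This is an illustration, not the project's actual buffer.
from collections import deque
import random


class ReplayBufferSketch:
    def __init__(self, capacity=10000):
        # keep at most `capacity` transitions, dropping the oldest first
        self.buffer = deque(maxlen=capacity)

    def push(self, event):
        self.buffer.append(event)

    def sample(self, batch_size):
        # uniform sampling, capped at the current buffer size
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))


# usage sketch (names assumed):
# event = Event(state, comm, action_id, reward, state_prime, comm_prime, done, env)
# buffer.push(event)
# events = buffer.sample(32)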
Example #2
    state = np.array([[0, 0]])
    state_history = []
    reward_history = []
    action_history = []
    env.terminated = False
    steps = 0

    while not env.terminated:

        action_id = agent.action_based_on_policy(state)

        new_state, reward = env.step(action_id, state)

        reward_history.append(reward)
        state_history.append(state)
        action_history.append(one_hot(action_id, 4))

        state = new_state + 0  # '+ 0' copies the array so the stored history is not aliased
        steps += 1

    print("...     terminated at: " + str(steps))

    print("...     reshaping the data")
    state_history = shape_adopter(state_history, 2)
    action_history = shape_adopter(action_history, 4)
    reward_history = shape_adopter(reward_history, 1)

    reward_to_go = calulate_reward_to_go(reward_history, gamma=0.95)

    reward_weighted_actions = np.multiply(action_history, reward_to_go)
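# The loop above calls calulate_reward_to_go (identifier as in the source). A minimal
# sketch of what such a discounted reward-to-go computation might look like, assuming
# reward_history has shape (T, 1) after shape_adopter; the function name and layout
# here are illustrative assumptions.
import numpy as np


def reward_to_go_sketch(rewards, gamma=0.95):
    # rewards: array of shape (T, 1); returns the same shape with
    # out[t] = sum_{k >= t} gamma**(k - t) * rewards[k]
    rewards = np.asarray(rewards, dtype=float)
    out = np.zeros_like(rewards)
    running = np.zeros(rewards.shape[1:])
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out


# e.g. reward_to_go_sketch(np.array([[0.0], [0.0], [1.0]]))
# -> [[0.9025], [0.95], [1.0]]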
Example #3
print("...    initial state is "+str(initial_state))

state_1, terminated_1, steps_1 = initializer(initial_state[0, 0], initial_state[0, 1])
state_2, terminated_2, steps_2 = initializer(initial_state[0, 0], initial_state[0, 1])

step = 0
filename = ''
animations_dir = 'animations/'
os.makedirs(animations_dir, exist_ok=True)

while not terminated_1 and not terminated_2:

    # the first agent
    # print("agent 1")
    action_id = agent_1.action_based_on_policy(state_1, env)
    one_hot_action = one_hot(action_id, nr_actions)
    new_state, reward, terminated_1 = env.step(action_id, state_1)
    scaled_state_1 = scale_state(state_1, env)
    #histories_1.appending(reward, scaled_state_1, one_hot_action)
    plt.scatter(state_1[0, 0], state_1[0, 1], s=100, c='#C1C7C9', marker='s')
    plt.scatter(new_state[0, 0], new_state[0, 1], s=50, c='red')
    plt.draw()
    plt.pause(0.1)  # brief pause so the frame renders without blocking the loop

    state_1, steps_1 = update_state_step(new_state, steps_1)

    # the second agent
    # print("agent 2")
    action_id = agent_2.action_based_on_policy(state_2, env)
    one_hot_action = one_hot(action_id, nr_actions)
    new_state, reward, terminated_2 = env.step(action_id, state_2)
    def prepare_learning_materials(self, events, env):
        '''
        Create the y vector (learning targets).
        The target is
        y(s,c,a) := r(s,c,a) + gamma * Q_t(s', c', argmax_a' Q(s', c', a'))
        with Q_t -- the target net and Q -- the online net

        Keyword arguments:
        events -- a list of events
        env -- the environment

        returns:
        y vector
        '''
        debug = False

        nr_samples = len(events)

        s_primes = [x.scaled_state_prime for x in events]
        s_primes = np.array(s_primes)
        s_primes = np.reshape(s_primes, (nr_samples, -1))

        c_primes = [x.scaled_comm_prime for x in events]
        c_primes = np.array(c_primes)
        c_primes = np.reshape(c_primes, (nr_samples, -1))

        r = [x.reward for x in events]
        r = np.array(r)
        r = np.reshape(r, (nr_samples, 1))

        done = [x.done for x in events]
        done = np.array(done)
        done = np.reshape(done, (nr_samples, 1))

        nr_actions = env.nr_actions

        if (debug):
            pdb.set_trace()

        # For every candidate action, predict Q(s', c', a) with the online net and
        # stack the predictions column-wise: tmp ends up with shape (nr_samples, nr_actions)
        for action_id in range(nr_actions):
            action = one_hot(action_id, nr_actions=env.nr_actions)
            actions = np.full((nr_samples, nr_actions), action)
            inputs_for_Q = {
                'Q_input_sa': np.concatenate((s_primes, actions), axis=1),
                'Q_input_comm': c_primes}

            if action_id == 0:
                tmp = self.Q.predict(inputs_for_Q)
            else:
                tmp = np.concatenate((tmp, self.Q.predict(inputs_for_Q)), axis=1)

        # double-DQN style action selection: argmax over the online net's Q values
        tmp = np.argmax(tmp, axis=1)

        if (debug):
            pdb.set_trace()

        # Evaluate the selected actions with the target net and form the TD target;
        # (1 - done) zeroes the bootstrap term for terminal transitions
        inputs_for_Q_t = {
            'Qt_input_sa': np.concatenate((s_primes, tf.one_hot(tmp, depth=nr_actions)), axis=1),
            'Qt_input_comm': c_primes}

        y = r + self.gamma * self.Q_t.predict(inputs_for_Q_t) * (1 - done)

        return y
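    # A minimal sketch of how the targets from prepare_learning_materials might be used
    # to fit the online net. The replay-buffer attribute, batch_size, and the use of
    # Keras model.fit are assumptions; only the 'Q_input_sa'/'Q_input_comm' input layout
    # is taken from the predict calls in this file.
    def train_on_batch_sketch(self, env, batch_size=32):
        events = self.replay_buffer.sample(batch_size)    # assumed buffer attribute
        y = self.prepare_learning_materials(events, env)  # TD targets, shape (batch, 1)

        # rebuild the (state, action) and comm inputs for the online net
        s = np.reshape(np.array([e.scaled_state for e in events]), (len(events), -1))
        c = np.reshape(np.array([e.scaled_comm for e in events]), (len(events), -1))
        a = np.reshape(np.array([e.action for e in events]), (len(events), -1))

        inputs_for_Q = {
            'Q_input_sa': np.concatenate((s, a), axis=1),
            'Q_input_comm': c}

        # one gradient pass on the TD targets (Keras-style fit; assumed interface)
        self.Q.fit(inputs_for_Q, y, epochs=1, verbose=0)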
    def action_based_on_Q_target(self, agent_state, comm, env, epsilon):
        '''
        Chooses an action with an epsilon-smoothed softmax policy over the Q-target:
        1 - for the given agent_state/comm, predict the Q value of every action
        2 - turn the Q values into probabilities with a softmax
        3 - mix the probabilities with epsilon towards uniform and sample an action

        Keyword arguments:

        agent_state -- current agent_state
        comm -- the communicated information
        env -- the environment
        epsilon -- the epsilon in the epsilon greedy approach

        returns:

        the id of the chosen action
        '''

        debug = False
        nr_samples = 1
        nr_actions = self.nr_actions
        scaled_state = scale_state(agent_state, env)
        scaled_state = np.array(scaled_state)
        scaled_state = np.reshape(scaled_state, (nr_samples, -1))

        # this part might need to change, depending on what is communicated
        scaled_comm = scale_state(comm, env)
        scaled_comm = np.array(scaled_comm)
        scaled_comm = np.reshape(scaled_comm, (nr_samples, -1))

        if debug:
            print("scaled_state", scaled_state)
            pdb.set_trace()

        # Predict Q_t(s, c, a) for every action and stack the results
        # column-wise: tmp ends up with shape (1, nr_actions)
        for action_id in range(nr_actions):

            action = one_hot(action_id, nr_actions=nr_actions)

            inputs_for_Q_t = {
                'Qt_input_sa': np.concatenate((scaled_state, action), axis=1),
                'Qt_input_comm': scaled_comm}

            if action_id == 0:
                tmp = self.Q_t.predict(inputs_for_Q_t)
                if debug:
                    print("the predicted Q for action", action_id, " is ", tmp)
            else:
                tmp = np.concatenate((tmp, self.Q_t.predict(inputs_for_Q_t)), axis=1)
                if debug:
                    print("the predicted Q for action", action_id, " is ", tmp)

        # Convert the Q values into an epsilon-smoothed softmax distribution;
        # every action keeps at least epsilon / (1 + epsilon * nr_actions) probability
        tmp = tmp[0]
        probabilities = tf.math.softmax(tmp)
        probabilities = (probabilities + epsilon) / (1.0 + epsilon * nr_actions)
        probabilities = probabilities.numpy()
        # renormalize to guard against floating point drift before sampling
        probabilities = probabilities / np.sum(probabilities)
        if debug:
            print(probabilities, np.sum(probabilities) - 1)
        chosen_act = np.random.choice(nr_actions, p=probabilities)
        if debug:
            print("chosen_act", chosen_act)
            pdb.set_trace()

        return chosen_act
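# Usage sketch for action_based_on_Q_target with a simple epsilon decay schedule.
# `agent`, `env`, `state`, and `comm` are assumed to exist as in the snippets above;
# the schedule constants are illustrative, not taken from the source.
epsilon = 1.0
epsilon_min = 0.05
epsilon_decay = 0.995

for episode in range(100):
    # ... reset `state` and `comm` for the new episode here ...
    action_id = agent.action_based_on_Q_target(state, comm, env, epsilon)
    # ... step the environment, store the transition, train the nets ...
    epsilon = max(epsilon_min, epsilon * epsilon_decay)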