Example #1
import os

import numpy as np
import matplotlib.pyplot as plt

#np.random.seed(2)

plt.close('all')
plt.ion()
n = SYSTEM_SIZE
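# visualize the n x n grid: an orange square at (n-1, n-1) (presumably the
# terminal cell) and the start cell (0, 0) in red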
plt.scatter(n-1, n-1, s=100, c='orange', marker='s')
plt.axis([-1, n, -1, n])
plt.gca().set_aspect('equal')
plt.scatter(0, 0, c='red')


initial_state = np.array([[0, 0]])

print("...    initial state is "+str(initial_state))

state_1, terminated_1, steps_1 = initializer(initial_state[0, 0], initial_state[0, 1])
state_2, terminated_2, steps_2 = initializer(initial_state[0, 0], initial_state[0, 1])

step = 0
filename = ''
animations_dir = 'animations/'
os.makedirs(animations_dir, exist_ok=True)

while not terminated_1 and not terminated_2:

    # the first agent
    # print("agent 1")
    action_id = agent_1.action_based_on_policy(state_1, env)
    one_hot_action = one_hot(action_id, nr_actions)
    new_state, reward, terminated_1 = env.step(action_id, state_1)
    scaled_state_1 = scale_state(state_1, env)
for u in range(U):

    print("update Q-t round :" + str(u))

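    # linearly decay epsilon from 0.1 towards 0.001 over the U update rounds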
    epsilon = max(0.001, 0.1 - (0.1 - 0.001) * (u / U))

    print("...    epsilon in exploring is :" + str(epsilon * 100) +
          "%,  buffer_size is " + str(replay_buffer.size))

    for n in range(N):
        print("...    update the Q and replay_buffer :" + str(n))

        initial_state = env.reset()

        state, terminated, steps = initializer(initial_state)

        state = single_shape_adaptor(state, nr_features)

        while not terminated:

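            # epsilon-greedy action with respect to the target network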
            action_id = agent_ler.action_based_on_Q_target(state,
                                                           env,
                                                           epsilon=epsilon)

            new_state, reward, terminated, info = env.step(action_id)

            new_state = single_shape_adaptor(new_state, nr_features)

            this_event = event(state, action_id, reward, new_state, terminated,
                               env)
        str(timestep) + '-' + str(agent_id))
    agent_lst[agent_id].update_Q_to_Q_t()

# let's simulate

global_state_space_size = len(env.states)
nr_features_per_agent = agent_lst[0].nr_features
nr_comm_features_per_agent = agent_lst[0].nr_features
# + agent_lst[0].nr_actions

initial_state_global = env.reset()

for agent_id in range(len(agent_lst)):
    agent_lst[agent_id].terminated = False

state_global, terminated, steps = initializer(initial_state_global)
state_global = single_shape_adaptor(state_global, global_state_space_size)

while not terminated:

    action_id_lst = []
    # choosing actions for each agent
    for agent_id in range(env.nr_agents):
        agent = agent_lst[agent_id]
        state, comm = extract_state_and_comm_from_global_state(
            state_global, agent_id)
        action_id = agent.action_based_on_Q_target(state,
                                                   comm,
                                                   env,
                                                   epsilon=10)
        if just_forward:
expr_per_learn = 1  #env.SYSTEM_SIZE * env.SYSTEM_SIZE * nr_actions

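# save the untrained policy as a reference point before training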
agent.policy.save('./training-results/not-trained-agent-system-size-' +
                  str(SYSTEM_SIZE))

for training_id in range(ROUNDS_OF_TRAINING):

    print("\nround: " + str(training_id))

    # finding an initial state which is not the terminal_state
    initial_state = find_initial_state(env)
    #initial_state = np.array([[0, 0]])
    print("...    initial state is " + str(initial_state))

    state, terminated, steps = initializer(initial_state[0, 0],
                                           initial_state[0, 1])

    while not terminated:

        action_id = agent.action_based_on_policy(state, env)
        one_hot_action = one_hot(action_id, nr_actions)

        new_state, reward, terminated = env.step(action_id, state)

        scaled_state = scale_state(state, env)

        histories.appending(reward, scaled_state, one_hot_action)

        #print("state", state, "scla", scaled_state, "one_hot_action", one_hot_action, "new_state", new_state, "reward", reward)
        state, steps = update_state_step(new_state, steps)
def testing_performance(nr_episodes, agents_lst, env, epsilon):
    ''' Runs a number of episodes and returns the average performance.
    The actions that the agents take are based on the Q-target network
    with an epsilon-greedy approach.

    Keyword arguments:

    nr_episodes -- number of episodes to run
    agents_lst -- the list of agents
    env -- the environment
    epsilon -- the epsilon for the epsilon-greedy approach

    returns:

    the average individual rewards per agent and their total
    '''
    print("...    the test is started")

    accumulative_rewards = np.zeros((1, env.nr_agents))
    global_state_space_size = len(env.states)

    for training_id in range(nr_episodes):

        print("\nround: " + str(training_id))

        initial_state_global = env.reset()

        state_global, terminated, steps = initializer(initial_state_global)
        state_global = single_shape_adaptor(state_global,
                                            global_state_space_size)

        while not terminated:

            action_id_lst = []
            # choosing actions for each agent
            for agent_id in range(env.nr_agents):
                agent = agents_lst[agent_id]
                state, comm = extract_state_and_comm_from_global_state(
                    state_global, agent_id)
                action_id = agent.action_based_on_Q_target(
                    state, comm, env, epsilon)
                action_id_lst.append(action_id)

            # taking a step with all the agents
            new_state_global, reward_lst, terminated_lst, _ = env.step(
                action_id_lst)
            new_state_global = single_shape_adaptor(new_state_global,
                                                    global_state_space_size)

            accumulative_rewards += reward_lst

            terminated = np.array(terminated_lst).all()
            state_global = new_state_global
            steps = steps + 1

        print("...    the terminal_state is reached after " + str(steps))

    accumulative_rewards = accumulative_rewards / (nr_episodes + 0.0)

    output = np.array(accumulative_rewards.flatten())
    output = np.append(output, np.sum(accumulative_rewards))
    return output
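

# A minimal evaluation sketch, assuming `agent_lst` and `env` are the trained
# agents and environment from the simulation code above; the episode count and
# epsilon used here are illustrative, not taken from the original script.
test_scores = testing_performance(nr_episodes=10,
                                  agents_lst=agent_lst,
                                  env=env,
                                  epsilon=0.001)
print("...    per-agent average rewards: " + str(test_scores[:-1]))
print("...    total average reward: " + str(test_scores[-1]))
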
def update_replay_buffer_with_episodes(nr_episodes, agents_lst,
                                       replay_buffer_lst, env, epsilon):
    '''
    Fills the replay buffers with events, i.e. (s, a, r, s', done),
    that happen during a number of episodes.
    The actions that the agents take are based on the Q-target network
    with an epsilon-greedy approach.

    Keyword arguments:

    nr_episodes -- number of episodes to run
    agents_lst -- the list of agents
    replay_buffer_lst -- the list of replay buffers, one per agent
    env -- the environment
    epsilon -- the epsilon for the epsilon-greedy approach

    returns:

    the list of replay buffers
    '''

    global_state_space_size = len(env.states)
    nr_features_per_agent = agents_lst[0].nr_features
    nr_comm_features_per_agent = agents_lst[0].nr_features
    # + agents_lst[0].nr_actions

    for training_id in range(nr_episodes):

        print("\nround: " + str(training_id))

        initial_state_global = env.reset()

        for agent_id in range(len(agents_lst)):
            agents_lst[agent_id].terminated = False

        state_global, terminated, steps = initializer(initial_state_global)
        state_global = single_shape_adaptor(state_global,
                                            global_state_space_size)

        while not terminated:

            action_id_lst = []
            # choosing actions for each agent
            for agent_id in range(env.nr_agents):
                agent = agents_lst[agent_id]
                state, comm = extract_state_and_comm_from_global_state(
                    state_global, agent_id)
                action_id = agent.action_based_on_Q_target(
                    state, comm, env, epsilon)
                action_id_lst.append(action_id)

            # STEP : taking a step with all the agents
            new_state_global, reward_lst, terminated_lst, info = env.step(
                action_id_lst)
            new_state_global = single_shape_adaptor(new_state_global,
                                                    global_state_space_size)

            # extracting the current event for each agent and saving it in its replay buffer
            for agent_id in range(env.nr_agents):

                # extraction
                new_state, new_comm = extract_state_and_comm_from_global_state(
                    new_state_global, agent_id)
                state, comm = extract_state_and_comm_from_global_state(
                    state_global, agent_id)

                # reshaping to the per-agent feature dimensions
                state = single_shape_adaptor(state, nr_features_per_agent)
                comm = single_shape_adaptor(comm, nr_comm_features_per_agent)
                new_state = single_shape_adaptor(new_state,
                                                 nr_features_per_agent)
                new_comm = single_shape_adaptor(new_comm,
                                                nr_comm_features_per_agent)

                action_id = action_id_lst[agent_id]

                # putting the data in event format
                this_event = event(state, comm, action_id,
                                   reward_lst[agent_id], new_state, new_comm,
                                   terminated_lst[agent_id], env)
                # debug breakpoint: stop if action [[0, 1]] left the scaled
                # state unchanged although the episode is not done
                if (np.array_equal(this_event.action, np.array([[0, 1]]))
                        and np.array_equal(this_event.scaled_state,
                                           this_event.scaled_state_prime)
                        and not this_event.done):
                    import pdb
                    pdb.set_trace()
                # Add the event to the replay buffer as long as this agent has
                # not terminated yet. The next if-statement, which updates the
                # agent's flag (i.e. agent.terminated), is essential: together
                # they make sure that the agent's last step is written to the
                # buffer while all later steps are not. This is a multi-agent
                # specific issue, since in the single-agent case the agent and
                # the environment finish at the same time.

                if not agents_lst[agent_id].terminated:
                    replay_buffer_lst[agent_id].consider_this_event(this_event)

                if terminated_lst[agent_id]:
                    agents_lst[agent_id].terminated = True

            terminated = np.array(terminated_lst).all()
            state_global = new_state_global
            steps = steps + 1

        print("...    the terminal_state is reached after " + str(steps))

    return replay_buffer_lst
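

# A minimal usage sketch, assuming `agent_lst` and `env` from the training code
# above and a per-agent list of replay buffers; the name `replay_buffer_lst`,
# the episode count, and the epsilon value here are illustrative.
replay_buffer_lst = update_replay_buffer_with_episodes(
    nr_episodes=5,
    agents_lst=agent_lst,
    replay_buffer_lst=replay_buffer_lst,
    env=env,
    epsilon=0.1)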