def test(ag1, ag2, render=False, load_model=False):
    """Run TEST_NUM greedy evaluation episodes and return the mean episode length."""
    test_env = CMOTP()
    if load_model:
        ag1.load_model()
        ag2.load_model()
    ep_log = []
    for iii in range(TEST_NUM):
        print('test {}'.format(iii))
        state1, state2 = test_env.reset()
        ep_len = 0
        while True:
            if render:
                test_env.render()
                time.sleep(1)
            # evaluate greedily: the extra arguments switch exploration off (epsilon = 0.0)
            action1 = ag1.choose_action(state1, 0, 0.0)
            action2 = ag2.choose_action(state2, 0, 0.0)
            next_state_, reward_, done_, _ = test_env.step([action1, action2])
            next_state1, next_state2 = next_state_
            state1 = next_state1
            state2 = next_state2
            ep_len += 1
            # end the episode when the task is solved or after 1000 steps to avoid hanging
            if done_[0] or ep_len >= 1000:
                ep_log.append(ep_len)
                break
    return np.mean(ep_log)
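
# The snippet above relies on module-level names defined elsewhere in the
# original file. A minimal sketch of that setup (it would sit at the top of the
# file); the constant values are placeholders, not the original configuration:
import time

import numpy as np

from Env.cmotp_IL import CMOTP

TRAIN = True            # switch between training and rendered evaluation
TRAIN_EPISODES = 5000   # placeholder value
TEST_NUM = 10           # evaluation episodes per call to test()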
Example #2
def main():
    env = CMOTP()

    # fix random seeds for reproducibility
    lucky_no = 5
    set_seed(lucky_no)

    # experience replay memory size used by both agents
    erm_size = 10000

    agent1 = LenientDQNAgent(env, [256, 256],
                             'LenientAgent1',
                             learning_rate=1e-4,
                             replay_memory_size=erm_size,
                             use_tau=True,
                             tau=1e-3,
                             logdir='logs1',
                             savedir='save1',
                             batch_size=50)

    agent2 = LenientDQNAgent(env, [256, 256],
                             'LenientAgent2',
                             learning_rate=1e-4,
                             replay_memory_size=erm_size,
                             use_tau=True,
                             tau=1e-3,
                             logdir='logs2',
                             savedir='save2',
                             batch_size=50)
    print('after init')
    begintime = time.time()

    if TRAIN:

        episodes_recorder = []
        train_log = []
        train_num = 0

        for i in range(TRAIN_EPISODES):
            state1, state2 = env.reset()
            episode_len = 0
            episode1, episode2 = [], []

            while True:
                action1 = agent1.choose_action(state1)
                action2 = agent2.choose_action(state2)
                next_state, reward, done, _ = env.step([action1, action2])
                next_state1, next_state2 = next_state
                reward1, reward2 = reward
                done1, done2 = done
                # leniency is derived from the temperature of the visited
                # (state, action) pair: hot (rarely decayed) pairs are treated leniently
                leniency1 = agent1.leniency_calculator.calc_leniency(
                    agent1.temp_recorder.get_state_action_temp(
                        state1, action1))
                leniency2 = agent2.leniency_calculator.calc_leniency(
                    agent2.temp_recorder.get_state_action_temp(
                        state2, action2))

                # print('state: ', state1, state2)
                # print('action: ', action1, action2)
                # print('reward: ', reward1, reward2)
                # print('done: ', done1, done2)
                # print('next_state: ', next_state1, next_state2)
                # print('leniencies: ', leniency1, leniency2)

                # store each transition together with the leniency it was computed under
                agent1.store(state1, action1, reward1, next_state1, done1,
                             leniency1)
                agent2.store(state2, action2, reward2, next_state2, done2,
                             leniency2)

                episode1.append((state1, action1))
                episode2.append((state2, action2))

                agent1.train()
                agent2.train()
                train_num += 1

                episode_len += 1

                state1 = next_state1
                state2 = next_state2

                if done1:
                    # episode finished: log step count and the agents' average temperature
                    this_train_log = (train_num, i,
                                      agent1.temp_recorder.get_ave_temp(),
                                      episode_len)
                    train_log.append(this_train_log)
                    print('train_cnt: {}, episode: {}, ave_temp: {}, len: {}'.
                          format(*this_train_log))
                    episodes_recorder.append(episode_len)

                    agent1.temp_recorder.show_temp(big=True, narrow=True)

                    # periodic greedy evaluation every 100 training episodes
                    if i > 0 and i % 100 == 0:
                        print('testing...')
                        print(
                            'average episode length: ',
                            test(agent1,
                                 agent2,
                                 render=False,
                                 load_model=False))

                    break

            # decay the temperature of every (state, action) pair visited this episode
            agent1.temp_recorder.decay_temp(episode1)
            agent2.temp_recorder.decay_temp(episode2)

        endtime = time.time()
        print('training time: {}'.format(endtime - begintime))

        # np.save('{}-{}.npy'.format(erm_size, lucky_no), episodes_recorder)
        np.save('train_log_{}_{}.npy'.format(erm_size, lucky_no), train_log)
    else:
        test(agent1, agent2, render=True, load_model=True)

    env.close()
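
# For context: a minimal sketch of the leniency machinery the agents above rely
# on, following the usual lenient-learning formulation (leniency = 1 - exp(-K * T),
# where the temperature T of a (state, action) pair decays as it is revisited).
# Class names and parameters here are illustrative, not the actual
# LenientDQNAgent internals.
import math
from collections import defaultdict

import numpy as np


class SimpleTempRecorder:
    """Track a per-(state, action) temperature that cools as pairs are revisited."""

    def __init__(self, init_temp=1.0, decay=0.99):
        self.temps = defaultdict(lambda: init_temp)
        self.decay = decay

    @staticmethod
    def _key(state, action):
        return tuple(np.asarray(state).flatten()), action

    def get_state_action_temp(self, state, action):
        return self.temps[self._key(state, action)]

    def decay_temp(self, episode):
        # cool every (state, action) pair visited during the episode
        for state, action in episode:
            self.temps[self._key(state, action)] *= self.decay


class SimpleLeniencyCalculator:
    """Map a temperature in [0, inf) to a leniency in [0, 1)."""

    def __init__(self, k=2.0):
        self.k = k

    def calc_leniency(self, temperature):
        # hot (rarely decayed) pairs yield high leniency, so updates that would
        # lower Q(s, a) are skipped more often early in training
        return 1.0 - math.exp(-self.k * temperature)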
Example #3
            dns_1 = np.asarray(dns_1, dtype=bool).flatten()
            ln_1 = np.asarray(ln_1, dtype=np.float32).flatten()

            sts_2 = np.asarray(sts_2, dtype=np.float32).reshape(train_input_shape)
            acts_2 = np.asarray(acts_2, dtype=np.int32).flatten()
            rwds_2 = np.asarray(rwds_2, dtype=np.float32).flatten()
            n_sts_2 = np.asarray(n_sts_2, dtype=np.float32).reshape(train_input_shape)
            dns_2 = np.asarray(dns_2, dtype=bool).flatten()
            ln_2 = np.asarray(ln_2, dtype=np.float32).flatten()

            # one gradient step per agent, fed the assembled minibatch directly
            agent1.train_without_replaybuffer(sts_1, acts_1, rwds_1, ln_1)
            agent2.train_without_replaybuffer(sts_2, acts_2, rwds_2, ln_2)

    else:
        # evaluation mode: restore the saved models and render greedy play
        test_env = CMOTP()
        agent1.load_model()
        agent2.load_model()
        for i in range(TEST_NUM):
            state1, state2 = test_env.reset()
            while True:
                test_env.render()
                time.sleep(1)
                action1 = agent1.choose_action(state1, 0, 0.0)
                action2 = agent2.choose_action(state2, 0, 0.0)
                next_state, reward, done, _ = test_env.step([action1, action2])
                next_state1, next_state2 = next_state
                state1 = next_state1
                state2 = next_state2
                if done[0]:
                    break
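
# The code that produces sts_1, acts_1, ... is cut off above. A rough sketch of
# how such per-agent minibatches are typically assembled from a list of
# (state, action, reward, next_state, done, leniency) transitions; the helper
# name and shapes below are illustrative only.
import numpy as np


def build_batch(transitions, input_shape):
    states, actions, rewards, next_states, dones, leniencies = zip(*transitions)
    return (np.asarray(states, dtype=np.float32).reshape(input_shape),
            np.asarray(actions, dtype=np.int32).flatten(),
            np.asarray(rewards, dtype=np.float32).flatten(),
            np.asarray(next_states, dtype=np.float32).reshape(input_shape),
            np.asarray(dones, dtype=bool).flatten(),
            np.asarray(leniencies, dtype=np.float32).flatten())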
Example #4
#             next_state, reward, done, _ = env.step(action_n)
#
#             len_this_episode += 1
#             state = next_state
#
#             if reward == 10.:
#                 break
#         print(i, len_this_episode)
#         len_episodes.append(len_this_episode)
#
#     print(np.mean(len_episodes))

from Env.cmotp_IL import CMOTP
import time
if __name__ == '__main__':
    env = CMOTP()
    state = env.reset()
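    # Presumed cell encoding in env.map, inferred from the layout below rather
    # than from the CMOTP source: -1 = wall, 0 = free cell, 1 and 2 = the two
    # agents, 3 = the object they must carry together.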
    env.map = [[-1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1],
               [-1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1],
               [0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, -1],
               [0, 0, 0, 0, 2, 3, 1, 0, 0, 0, 0, -1],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1],
               [-1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1]]
    env.render()
    # keep the render window open long enough to inspect the layout
    time.sleep(100)