Code Example #1
    def test_memory(self):
        # Capacity of 1,000,000 transitions, history length 4.
        memory = ActionReplayMemory(1000000, 4)
        # Fill loop left disabled; enabling it writes 100,000 random frames:
        # for index in range(100000):
        #     axr = np.random.randint(0, 100, (84, 84, 4))
        #     memory.append(axr, 4, 5)
        #     sys.stdout.write('\r{}/{}'.format(index, 100000))
        #     sys.stdout.flush()
        print(memory.size())  # reports 0 while the fill loop stays commented out
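The commented-out loop reads like a fill benchmark that was disabled because writing that many frames is slow. A standalone sketch of the same idea at a configurable scale, timed with time.perf_counter; the import path for ActionReplayMemory is a placeholder, not the project's actual module:

import time
import numpy as np
# Hypothetical import path; adjust to wherever ActionReplayMemory lives.
from replay import ActionReplayMemory

def fill_benchmark(n=100000):
    memory = ActionReplayMemory(1000000, 4)
    start = time.perf_counter()
    for index in range(n):
        frame = np.random.randint(0, 100, (84, 84, 4))
        memory.append(frame, 4, 5)  # (state, action, reward), as in the test
    elapsed = time.perf_counter() - start
    print('{} appends in {:.1f}s ({:.0f}/s)'.format(n, elapsed, n / elapsed))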
Code Example #2
    def test_detail(self):
        memory = ActionReplayMemory(250, 4)          # memory under test
        memory_old = ActionReplayMemoryOld(250, 4)   # reference implementation

        h_prep = HistoryPreprocessor(4)
        np_prep = NumpyPreprocessor()
        preprocessors = PreprocessorSequence([h_prep, np_prep])

        # Fill both memories with identical random 84x84 frames.
        for _ in range(1000):
            axr = np.random.randint(0, 100, (84, 84))
            prep_state = preprocessors.process_state_for_memory(axr)
            memory.append(prep_state, 4, 5)
            memory_old.append(prep_state, 4, 5)

        for _ in range(10):
            batch_size = 32
            # Sample both memories at the same explicit indexes so the
            # returned batches can be compared element-wise.
            indexes = np.random.randint(0, memory._filled_size,
                                        size=batch_size).tolist()
            (curr_arr, next_arr, reward_arr, action_arr,
             terminal_arr) = memory.sample(batch_size, indexes)
            (curr_arr2, next_arr2, reward_arr2, action_arr2,
             terminal_arr2) = memory_old.sample(batch_size, indexes)

            for i, terminal in enumerate(terminal_arr):
                empty_arr = np.zeros((84, 84))
                for d in range(4):
                    # No sampled channel should be all-zero padding, and both
                    # implementations must agree on every history channel.
                    self.assertFalse(np.all(curr_arr[i][:, :, d] == empty_arr))
                    self.assertTrue(
                        np.all(curr_arr[i][:, :, d] == curr_arr2[i][:, :, d]))

                if indexes[i] >= 4:
                    # Channel d of the stacked state holds the frame stored
                    # d steps earlier (channel 0 is the newest frame).
                    for d in range(1, 4):
                        self.assertTrue(
                            np.all(curr_arr[i][:, :, d] ==
                                   memory.survey(indexes[i] - d)))
                self.assertTrue(
                    np.all(curr_arr[i][:, :, 0] == curr_arr2[i][:, :, 0]))
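Taken together, these assertions pin down the interface both memories share: append(state, action, reward), sample(batch_size, indexes=None), survey(index) for raw frame lookup, and a _filled_size counter. A minimal ring-buffer sketch of that interface follows; only the method signatures come from the tests, everything about the internals is an assumption (episode-boundary handling is omitted entirely):

import numpy as np

class RingReplayMemory:
    """Hypothetical minimal stand-in for ActionReplayMemory."""

    def __init__(self, max_size, history_length):
        self._max_size = max_size
        self._history_length = history_length
        self._frames = [None] * max_size
        self._actions = np.zeros(max_size, dtype=np.int32)
        self._rewards = np.zeros(max_size, dtype=np.float32)
        self._head = 0
        self._filled_size = 0

    def append(self, frame, action, reward):
        # Overwrite the oldest slot once the buffer wraps around.
        self._frames[self._head] = frame
        self._actions[self._head] = action
        self._rewards[self._head] = reward
        self._head = (self._head + 1) % self._max_size
        self._filled_size = min(self._filled_size + 1, self._max_size)

    def survey(self, index):
        # Raw single-frame lookup, as used by test_detail.
        return self._frames[index % self._max_size]

    def _stack(self, index):
        # Channel d holds the frame stored d steps earlier (newest first),
        # matching the survey() assertions above.
        return np.stack([self._frames[(index - d) % self._max_size]
                         for d in range(self._history_length)], axis=-1)

    def sample(self, batch_size, indexes=None):
        if indexes is None:
            indexes = np.random.randint(self._history_length,
                                        self._filled_size - 1,
                                        size=batch_size).tolist()
        curr = np.array([self._stack(i) for i in indexes])
        nxt = np.array([self._stack(i + 1) for i in indexes])
        terminals = np.zeros(len(indexes), dtype=bool)  # episode ends ignored
        return curr, nxt, self._rewards[indexes], self._actions[indexes], terminals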
Code Example #3
    def test_seq(self):
        memory = ActionReplayMemory(250, 4)  # memory under test
        for _ in range(1000):
            axr = np.random.randint(0, 100, (84, 84, 4))
            memory.append(axr, 4, 5)

        # The original outer loop reused `i`, shadowed by the enumerate
        # variable below; use `_` for the repetition counter instead.
        for _ in range(10):
            (curr_arr, next_arr, reward_arr, action_arr,
             terminal_arr) = memory.sample(10)
            for i, terminal in enumerate(terminal_arr):
                # No channel of a sampled state should be all-zero padding.
                empty_arr = np.zeros((84, 84))
                for d in range(4):
                    self.assertFalse(np.all(curr_arr[i][:, :, d] == empty_arr))
Code Example #4
    def test_memory(self):
        memory = ActionReplayMemory(250, 4)  # memory under test
        index = 0
        while index < 1000:
            axr = np.random.randint(0, 100, (84, 84, 4))
            memory.append(axr, 4, 5)
            # Close an episode every 50 appends with a terminal frame.
            if (index + 1) % 50 == 0:
                axr = np.random.randint(0, 100, (84, 84, 4))
                memory.end_episode(axr, True)
                index += 1
            index += 1

        for _ in range(10):
            # Sampling invariants: consecutive states overlap by three frames,
            # and every stored reward/action is the constant written above.
            (curr_arr, next_arr, reward_arr, action_arr,
             terminal_arr) = memory.sample(10)
            for i, terminal in enumerate(terminal_arr):
                self.assertTrue(
                    np.all(curr_arr[i][:, :, 0] == next_arr[i][:, :, 1]))
            self.assertTrue(np.sum(reward_arr - 5) == 0)
            self.assertTrue(np.sum(action_arr - 4) == 0)
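The first assertion in the sampling loop encodes the frame-stacking convention: channel 0 is the newest frame, so the state at step t and the state at step t+1 overlap in three frames. A small self-contained numpy check of that relation, independent of the memory class:

import numpy as np

frames = [np.random.randint(0, 100, (84, 84)) for _ in range(6)]

def stack(t, history=4):
    # Channel d holds the frame from d steps earlier (newest at channel 0).
    return np.stack([frames[t - d] for d in range(history)], axis=-1)

curr_state = stack(4)
next_state = stack(5)
# The newest frame of curr_state reappears one channel deeper in next_state.
assert np.all(curr_state[:, :, 0] == next_state[:, :, 1])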
Code Example #5
def main():
    # Model name and environment can be overridden on the command line:
    #   python <script>.py <model_name> <gym_env_id>
    model_name = "result-q4"
    if len(sys.argv) >= 2:
        model_name = sys.argv[1]

    if len(sys.argv) >= 3:
        env = gym.make(sys.argv[2])
    else:
        #env = gym.make("Enduro-v0")
        env = gym.make("SpaceInvaders-v0")
        #env = gym.make("Breakout-v0")

    # Disable the emulator's built-in frame skipping.
    env.frameskip = 1

    input_shape = (84, 84)
    batch_size = 32
    num_actions = env.action_space.n
    memory_size = 1000000
    memory_burn_in_num = 50000
    start_epsilon = 1
    end_epsilon = 0.01
    decay_steps = 1000000
    target_update_freq = 10000
    train_freq = 4  # how often the network is trained
    history_size = 4

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    # Preprocessors are applied from left to right.
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])

    policy = LinearDecayGreedyEpsilonPolicy(start_epsilon, end_epsilon,
                                            decay_steps)

    linear_model = create_model(history_size, input_shape, num_actions,
                                model_name)
    linear_model.summary()
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                     decay=0.0)
    loss_func = huber_loss

    # Pre-fill the replay memory from a random policy before training.
    random_policy = UniformRandomPolicy(num_actions)
    memory = ActionReplayMemory(memory_size, 4)
    memory_burn_in(env, memory, preprocessors, memory_burn_in_num,
                   random_policy)

    agent = DoubleQNAgent(linear_model, preprocessors, memory, policy, 0.99,
                          target_update_freq, None, train_freq, batch_size)
    agent.compile(optimizer, loss_func)
    agent.save_models()
    agent.fit(env, 1000000, 100000)


if __name__ == "__main__":
    main()
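main() relies on a memory_burn_in helper to pre-fill the replay memory before training starts, but its implementation isn't shown here. A plausible sketch built only from calls that do appear in these examples (process_state_for_memory, append, end_episode); the select_action method on the policy is an assumption about the UniformRandomPolicy API:

def memory_burn_in(env, memory, preprocessors, num_steps, policy):
    """Hypothetical pre-fill loop: act randomly and record transitions."""
    state = env.reset()
    for _ in range(num_steps):
        frame = preprocessors.process_state_for_memory(state)
        action = policy.select_action()  # assumed UniformRandomPolicy API
        next_state, reward, done, _ = env.step(action)
        if done:
            # Close the episode with a terminal frame, as in Code Example #4.
            memory.end_episode(frame, True)
            state = env.reset()
        else:
            memory.append(frame, action, reward)
            state = next_state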